Machine Learning 2 - Logistic Regression


Contents

1. Formal Definition
2. Solving Logistic Regression
   Worked Example
3. Logistic Regression Code Implementation
4. Regularization of Logistic Regression
   1) Theory
   2) Code Implementation
5. Multi-class Classification with Logistic Regression
   1) Principle
6. Logistic Regression with sklearn
7. Case Study: Iris Classification
8. Case Study: Handwritten Digit Recognition

1. Formal Definition

Logistic regression addresses binary classification problems, where the two classes are labeled 0 and 1.
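
The hypothesis maps a linear combination of the features through the sigmoid function and outputs the probability that a sample belongs to class 1. As a brief summary of the standard setup that the code in section 3 implements:

$$h_\theta(x) = \sigma(\theta^T x) = \frac{1}{1 + e^{-\theta^T x}}, \qquad \sigma(z) = \frac{1}{1 + e^{-z}}$$

The prediction is 1 when $h_\theta(x) \ge 0.5$ and 0 otherwise, and the parameters $\theta$ are chosen to minimize the cross-entropy cost over the $m$ training samples:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\Big[y^{(i)}\log h_\theta(x^{(i)}) + \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big]$$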

2. Solving Logistic Regression

Worked Example
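
Minimizing $J(\theta)$ has no closed-form solution, so gradient descent is used: each parameter is moved against its partial derivative with learning rate $\alpha$. This is exactly the update that `gradientDescent` implements in the next section:

$$\theta_j := \theta_j - \frac{\alpha}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\,x_j^{(i)}$$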

3. Logistic Regression Code Implementation

```python
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score


# Load the data
def loaddata():
    data = np.loadtxt('data2.txt', delimiter=',')
    n = data.shape[1] - 1  # number of features
    X = data[:, 0:n]
    y = data[:, -1].reshape(-1, 1)
    return X, y


# Scatter plot of the two classes
def scatter(X, y):
    # Find the indices where y == 1 and y == 0
    pos = np.where(y == 1)
    neg = np.where(y == 0)
    plt.scatter(X[pos[0], 0], X[pos[0], 1], marker='x')
    plt.scatter(X[neg[0], 0], X[neg[0], 1], marker='o')
    plt.xlabel('Exam 1 score')
    plt.ylabel('Exam 2 score')  # fixed: was a second xlabel call
    plt.show()


# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))


# Hypothesis h(x) = sigmoid(X @ theta)
def hypothesis(X, theta):
    return sigmoid(np.dot(X, theta))


# Cross-entropy cost
def computeCost(X, y, theta):
    m = X.shape[0]
    l = -y * np.log(hypothesis(X, theta)) - (1 - y) * np.log(1 - hypothesis(X, theta))
    return np.sum(l) / m


# Gradient descent
def gradientDescent(X, y, theta, iterations, alpha):
    m = X.shape[0]  # number of samples
    # Prepend a column of ones to X for the intercept term
    # (np.vstack stacks vertically, np.hstack stacks horizontally)
    X = np.hstack((np.ones((m, 1)), X))
    for i in range(iterations):
        for j in range(len(theta)):
            theta[j] = theta[j] - (alpha / m) * np.sum(
                (hypothesis(X, theta) - y) * X[:, j].reshape(-1, 1))
        if i % 10000 == 0:
            # Report the loss every 10000 iterations
            print('Iteration', i, '- current loss:', computeCost(X, y, theta), 'theta =', theta)
    return theta


# Plot the decision boundary
def plotDecisionBoundary(X, y, theta):
    cm_dark = mpl.colors.ListedColormap(['g', 'r'])  # colors for the samples
    plt.xlabel('Exam 1 score')
    plt.ylabel('Exam 2 score')
    # c selects the color from cmap according to the value of y
    plt.scatter(X[:, 0], X[:, 1], c=np.array(y).squeeze(), cmap=cm_dark, s=30)
    # Decision boundary: theta0 + theta1*x1 + theta2*x2 = 0
    x1 = np.arange(min(X[:, 0]), max(X[:, 0]), 0.1)
    x2 = -(theta[0] + theta[1] * x1) / theta[2]
    plt.plot(x1, x2)
    plt.show()


# Prediction function
def predict(X, theta):
    m = X.shape[0]
    # Prepend the column of ones, as in training
    X = np.hstack((np.ones((m, 1)), X))
    h = hypothesis(X, theta)
    # Threshold the probability at 0.5
    h[h >= 0.5] = 1
    h[h < 0.5] = 0
    return h


if __name__ == '__main__':
    X, y = loaddata()
    scatter(X, y)
    n = X.shape[1]  # number of features
    # theta is a column vector; n + 1 because of the extra all-ones column
    theta = np.zeros(n + 1).reshape(n + 1, 1)
    iterations = 250000
    alpha = 0.08
    theta = gradientDescent(X, y, theta, iterations, alpha)
    print('theta=\n', theta)
    plotDecisionBoundary(X, y, theta)
    p = predict(X, theta)
    print('Accuracy =', np.mean(y == p))
    print('Accuracy =', accuracy_score(y, p))
```

Output:

```
Iteration 0 - current loss: 35.99966432903636 theta = [[ 0.008 ] [ 0.9502343 ] [-1.74785255]]
Iteration 10000 - current loss: nan theta = [[-59.02504686] [ 1.14700554] [ 1.84509124]]
Iteration 20000 - current loss: nan theta = [[-1.05585422e+02] [ 4.68578082e-02] [ 1.62067270e+00]]
Iteration 30000 - current loss: nan theta = [[-148.80611341] [ 3.99030508] [ -0.81613523]]
Iteration 40000 - current loss: nan theta = [[-180.59920695] [ 2.5770718 ] [ 1.10286811]]
Iteration 50000 - current loss: nan theta = [[-198.97684916] [ 2.14943403] [ 1.60796969]]
Iteration 60000 - current loss: nan theta = [[-2.09611292e+02] [ 3.50516521e+00] [ 1.82175507e-01]]
Iteration 70000 - current loss: nan theta = [[-216.42044919] [ 3.5220893 ] [ 0.25329443]]
Iteration 80000 - current loss: nan theta = [[-223.27932611] [ 1.0536881 ] [ 2.34639088]]
Iteration 90000 - current loss: nan theta = [[-231.07351365] [ 0.93703815] [ 2.99771369]]
Iteration 100000 - current loss: nan theta = [[-238.25400395] [ 2.93014951] [ 0.90258989]]
Iteration 110000 - current loss: nan theta = [[-245.04435034] [ 1.93799763] [ 1.55619345]]
Iteration 120000 - current loss: nan theta = [[-247.34932834] [ 2.33836033] [ 1.58126479]]
Iteration 130000 - current loss: nan theta = [[-247.77897791] [ 1.71475888] [ 2.42291139]]
Iteration 140000 - current loss: nan theta = [[-248.1731205 ] [ 1.68472485] [ 2.28012846]]
Iteration 150000 - current loss: nan theta = [[-247.4555695 ] [ 1.71254006] [ 2.41912703]]
Iteration 160000 - current loss: nan theta = [[-247.6602936 ] [ 1.96919617] [ 1.83604848]]
Iteration 170000 - current loss: nan theta = [[-247.982795  ] [ 2.33792003] [ 1.59153989]]
Iteration 180000 - current loss: nan theta = [[-248.04938621] [ 1.3140158 ] [ 2.45261571]]
Iteration 190000 - current loss: nan theta = [[-247.07525744] [ 0.8776164 ] [ 3.20085982]]
Iteration 200000 - current loss: nan theta = [[-246.95287181] [ 1.96807141] [ 1.84921964]]
Iteration 210000 - current loss: nan theta = [[-247.42955952] [ 1.68995524] [ 2.2696988 ]]
Iteration 220000 - current loss: nan theta = [[-247.7493172 ] [ 2.04258878] [ 2.16660484]]
Iteration 230000 - current loss: nan theta = [[-247.97539291] [ 2.37657393] [ 1.55682769]]
Iteration 240000 - current loss: nan theta = [[-248.04658161] [ 2.99387595] [ 0.87684819]]
theta=
 [[-247.44837176] [ 2.78836624] [ 1.56814101]]
Accuracy = 0.88
Accuracy = 0.88
```

Note that the reported loss becomes nan after the early iterations. This does not mean training has failed: once $\theta^T x$ grows large in magnitude, the sigmoid saturates to exactly 0 or 1 in floating point, so the cost evaluates np.log(0) and produces nan. The gradient updates themselves do not go through the log and keep improving the boundary, which is why the final model still classifies 88% of the samples correctly.
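
The nan can be avoided by keeping the hypothesis strictly inside (0, 1) before taking the log. A minimal sketch of such a variant (the eps value is an arbitrary choice, and `hypothesis` is the function defined above; this helper is not part of the original post):

```python
import numpy as np

def computeCostStable(X, y, theta, eps=1e-12):
    # Clip the hypothesis away from exactly 0 and 1 so that np.log
    # never receives 0 and the cross-entropy loss stays finite
    m = X.shape[0]
    h = np.clip(hypothesis(X, theta), eps, 1 - eps)
    l = -y * np.log(h) - (1 - y) * np.log(1 - h)
    return np.sum(l) / m
```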

4. Regularization of Logistic Regression

1) Theory
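
With L2 regularization, a penalty on the size of the weights is added to the cost; by the usual convention the bias term $\theta_0$ is not penalized:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\Big[y^{(i)}\log h_\theta(x^{(i)}) + \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$

The gradient-descent update then gains a $\lambda\theta_j$ term for $j \ge 1$:

$$\theta_j := \theta_j - \frac{\alpha}{m}\Big[\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)x_j^{(i)} + \lambda\,\theta_j\Big]$$

A larger $\lambda$ shrinks the weights more strongly and reduces overfitting; a smaller $\lambda$ leaves the model closer to the unregularized fit.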

2) Code Implementation

Starting from the plain implementation above, two things change: the cost function gains the regularization term, and the update formula inside gradient descent gains the corresponding λθⱼ term.

```python
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score


# Load the data
def loaddata():
    data = np.loadtxt('data2.txt', delimiter=',')
    n = data.shape[1] - 1  # number of features
    X = data[:, 0:n]
    y = data[:, -1].reshape(-1, 1)
    return X, y


# Scatter plot of the two classes
def scatter(X, y):
    # Find the indices where y == 1 and y == 0
    pos = np.where(y == 1)
    neg = np.where(y == 0)
    plt.scatter(X[pos[0], 0], X[pos[0], 1], marker='x')
    plt.scatter(X[neg[0], 0], X[neg[0], 1], marker='o')
    plt.xlabel('Exam 1 score')
    plt.ylabel('Exam 2 score')
    plt.show()


# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))


# Hypothesis h(x) = sigmoid(X @ theta)
def hypothesis(X, theta):
    return sigmoid(np.dot(X, theta))


# Regularized cost (changed): cross-entropy plus (lamda / 2m) * ||theta||^2,
# where the squared norm is np.sum(np.power(theta, 2)); following convention,
# the bias theta[0] is left out of the penalty
def computeCost(X, y, theta, lamda):
    m = X.shape[0]
    l = -y * np.log(hypothesis(X, theta)) - (1 - y) * np.log(1 - hypothesis(X, theta))
    return np.sum(l) / m + (lamda / (2 * m)) * np.sum(np.power(theta[1:], 2))


# Gradient descent (changed): the update gains a lamda * theta[j] term
def gradientDescent(X, y, theta, iterations, alpha, lamda):
    m = X.shape[0]  # number of samples
    # Prepend a column of ones to X for the intercept term
    X = np.hstack((np.ones((m, 1)), X))
    for i in range(iterations):
        for j in range(len(theta)):
            gradient = np.sum((hypothesis(X, theta) - y) * X[:, j].reshape(-1, 1))
            if j > 0:
                gradient += lamda * theta[j]  # do not regularize the bias term
            theta[j] = theta[j] - (alpha / m) * gradient
        if i % 10000 == 0:
            # Report the loss every 10000 iterations
            print('Iteration', i, '- current loss:', computeCost(X, y, theta, lamda), 'theta =', theta)
    return theta


# Plot the decision boundary
def plotDecisionBoundary(X, y, theta):
    cm_dark = mpl.colors.ListedColormap(['g', 'r'])  # colors for the samples
    plt.xlabel('Exam 1 score')
    plt.ylabel('Exam 2 score')
    # c selects the color from cmap according to the value of y
    plt.scatter(X[:, 0], X[:, 1], c=np.array(y).squeeze(), cmap=cm_dark, s=30)
    # Decision boundary: theta0 + theta1*x1 + theta2*x2 = 0
    x1 = np.arange(min(X[:, 0]), max(X[:, 0]), 0.1)
    x2 = -(theta[0] + theta[1] * x1) / theta[2]
    plt.plot(x1, x2)
    plt.show()


# Prediction function
def predict(X, theta):
    m = X.shape[0]
    X = np.hstack((np.ones((m, 1)), X))  # same all-ones column as in training
    h = hypothesis(X, theta)
    # Threshold the probability at 0.5
    h[h >= 0.5] = 1
    h[h < 0.5] = 0
    return h


if __name__ == '__main__':
    X, y = loaddata()
    scatter(X, y)
    n = X.shape[1]  # number of features
    # theta is a column vector; n + 1 because of the extra all-ones column
    theta = np.zeros(n + 1).reshape(n + 1, 1)
    iterations = 250000
    alpha = 0.08
    lamda = 0.01
    theta = gradientDescent(X, y, theta, iterations, alpha, lamda)
    print('theta=\n', theta)
    plotDecisionBoundary(X, y, theta)
    p = predict(X, theta)
    print('Accuracy =', np.mean(y == p))
    print('Accuracy =', accuracy_score(y, p))
```

5. Multi-class Classification with Logistic Regression

1) Principle
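
The usual way to extend binary logistic regression to K classes is one-vs-rest (OvR): train K binary classifiers, the k-th one treating class k as positive and every other class as negative, then predict the class whose classifier outputs the highest probability. A minimal sketch built on the binary implementation from section 3 (it reuses that section's `sigmoid` and `gradientDescent`; the helper names `train_one_vs_rest` and `predict_one_vs_rest` are illustrative, not from the original post):

```python
import numpy as np

def train_one_vs_rest(X, y, num_classes, iterations, alpha):
    """Train one binary logistic regression per class (one-vs-rest)."""
    n = X.shape[1]
    all_theta = np.zeros((num_classes, n + 1))       # one parameter row per class
    for k in range(num_classes):
        y_k = (y == k).astype(float).reshape(-1, 1)  # class k -> 1, everything else -> 0
        theta_k = np.zeros((n + 1, 1))
        theta_k = gradientDescent(X, y_k, theta_k, iterations, alpha)
        all_theta[k] = theta_k.ravel()
    return all_theta

def predict_one_vs_rest(X, all_theta):
    """Predict the class whose binary classifier gives the highest probability."""
    m = X.shape[0]
    X = np.hstack((np.ones((m, 1)), X))  # same all-ones column as in training
    probs = sigmoid(X @ all_theta.T)     # shape (m, num_classes)
    return np.argmax(probs, axis=1)
```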

6. Logistic Regression with sklearn

```python
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Load the data
def loaddata():
    data = np.loadtxt('data2.txt', delimiter=',')
    n = data.shape[1] - 1
    X = data[:, 0:n]
    y = data[:, -1].ravel()  # sklearn expects a 1-D label array, not reshape(-1, 1)
    return X, y


# Plot the decision boundary
def plotDecisionBoundary(X, y, theta):
    cm_dark = mpl.colors.ListedColormap(['g', 'r'])
    plt.xlabel('Exam 1 score')
    plt.ylabel('Exam 2 score')
    # c selects the color from cmap according to y; squeeze() makes y a rank-1 array
    plt.scatter(X[:, 0], X[:, 1], c=np.array(y).squeeze(), cmap=cm_dark, s=30)
    # Decision boundary: theta0 + theta1*x1 + theta2*x2 = 0
    x1 = np.arange(min(X[:, 0]), max(X[:, 0]), 0.1)
    x2 = -(theta[0] + theta[1] * x1) / theta[2]
    plt.plot(x1, x2)
    plt.show()


if __name__ == '__main__':
    X, y = loaddata()
    # Commonly used parameters:
    # C: inverse of the regularization strength (1/lambda). Larger C -> weaker penalty,
    #    easier to overfit; smaller C -> stronger penalty, better generalization.
    # multi_class: 'ovr' (one-vs-rest) or 'multinomial' (softmax)
    # max_iter: maximum number of solver iterations
    # penalty: type of regularization, 'l2' or 'l1'
    model = LogisticRegression(C=50, max_iter=2000)
    model.fit(X, y)
    print('theta = ', model.coef_)         # theta_1, theta_2
    print('theta_0 = ', model.intercept_)  # intercept
    y_predict = model.predict(X)
    print('Accuracy = ', accuracy_score(y, y_predict))
    theta = np.append(model.intercept_, model.coef_)
    plotDecisionBoundary(X, y, theta)
```

Output:

```
theta =  [[0.20621451 0.20145397]]
theta_0 =  [-25.15918048]
Accuracy =  0.89
```
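
Beyond the hard 0/1 labels, the fitted sklearn model also exposes class probabilities via predict_proba. A short sketch that could be appended to the script above (the two score pairs are made-up illustrative values, not from data2.txt):

```python
import numpy as np

# Two hypothetical exam-score pairs (illustrative values only)
samples = np.array([[45.0, 85.0],
                    [60.0, 40.0]])
print(model.predict_proba(samples))  # per row: [P(y=0), P(y=1)]
print(model.predict(samples))        # hard labels from the 0.5 threshold
```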

7. Case Study: Iris Classification

```python
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the data
iris = load_iris()
# print(iris.DESCR)
# print(iris)

# Features and targets
X = iris.data
y = iris.target

model = LogisticRegression(C=100)  # may emit a ConvergenceWarning; raising max_iter silences it
model.fit(X, y)
print('coef = \n', model.coef_)
print('intercept =', model.intercept_)
y_hat = model.predict(X)
print('Accuracy = ', accuracy_score(y, y_hat))

# Visualize the classification result using two of the four features
feature = 2
feature_other = 3
X_2 = X[:, [feature, feature_other]]
model_2 = LogisticRegression(C=100.0)
model_2.fit(X_2, y)

# meshgrid builds two grid matrices - a dense lattice of points to be colored by class
h = 0.02  # grid step
x_min, x_max = X[:, feature].min() - 0.5, X[:, feature].max() + 0.5
y_min, y_max = X[:, feature_other].min() - 0.5, X[:, feature_other].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# print(xx)
# print(yy)

# ravel flattens a multi-dimensional array; np.c_ joins columns
grid_point = np.c_[xx.ravel(), yy.ravel()]
z = model_2.predict(grid_point)
z = z.reshape(xx.shape)  # back to the grid's shape

# pcolormesh colors each grid cell, which makes the decision regions visible
cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
# plt.pcolormesh(xx, yy, z, cmap=plt.cm.Paired)
plt.pcolormesh(xx, yy, z, cmap=cm_light)
plt.scatter(X[0:50, feature], X[0:50, feature_other], color='red', marker='o', label='setosa')          # first 50 samples
plt.scatter(X[50:100, feature], X[50:100, feature_other], color='blue', marker='x', label='versicolor')  # middle 50
plt.scatter(X[100:, feature], X[100:, feature_other], color='green', marker='+', label='virginica')      # last 50 samples
plt.legend()  # the label arguments only show up once a legend is drawn
plt.show()
```

Output:

```
coef =
 [[-0.39061968  3.40817576 -6.40535743 -3.51065375]
 [ 1.35573005  0.44817256 -0.51233169 -4.41445438]
 [-0.96511037 -3.85634833  6.91768912  7.92510812]]
intercept = [ 19.94481478   5.2315545  -25.17636928]
Accuracy =  0.9866666666666667
```
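
The 0.9867 accuracy above is measured on the training data itself. For an honest estimate, a held-out split can be used; a minimal sketch with sklearn's train_test_split (the 0.3 test fraction and random_state are arbitrary choices, not from the original post):

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

iris = load_iris()
# Hold out 30% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=0)

model = LogisticRegression(C=100, max_iter=1000)  # raise max_iter to avoid a ConvergenceWarning
model.fit(X_train, y_train)
print('Test accuracy =', accuracy_score(y_test, model.predict(X_test)))
```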

8. Case Study: Handwritten Digit Recognition

```python
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Helper file for reading the MNIST image files
from tool.Read_Minist_Tool import *

# Load the data
train_images = load_train_images()
train_labels = load_train_labels()
test_images = load_test_images()
test_labels = load_test_labels()

# Inspect the data shape
print('Data shape =\n', train_images.shape)

# Look at the first five images and their labels
for i in range(5):
    print(train_labels[i])
    plt.imshow(train_images[i], cmap='gray')
    plt.show()

# Build the training features:
# flatten each image so the 3-D array becomes a 2-D (samples x pixels) matrix
X = train_images.reshape(train_images.shape[0], -1)
# Normalize the pixel values to [0, 1]
X = X / 255

# Train the model
model = LogisticRegression(C=50)
model.fit(X, train_labels)
print('coef = ', model.coef_)
print('intercept =', model.intercept_)

# Evaluate the model
# Training-set accuracy
y_train_predict = model.predict(X)
print('Training accuracy:', accuracy_score(train_labels, y_train_predict))
# Test-set accuracy
X_test = test_images.reshape(test_images.shape[0], -1)
X_test = X_test / 255
y_test_predict = model.predict(X_test)
print('Test accuracy:', accuracy_score(test_labels, y_test_predict))
```

Output:

```
intercept = [-1.41537142  1.29661244  0.07179749 -0.59472239  0.27552796  1.97940312
 -0.52777525  1.56939798 -2.14293741 -0.5119325 ]
Training accuracy: 0.9352666666666667
Test accuracy: 0.9255
```
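
The loader above comes from a custom helper file shipped with the course material. If it is not available, roughly the same data can be fetched through sklearn; a sketch under that assumption (fetch_openml downloads the dataset on first use, and mnist_784 stores the images as flat 784-pixel rows):

```python
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 70000 x 784 flat vectors; the first 60000 form the usual training split
X_all, y_all = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
X_all = X_all / 255.0  # same normalization as above
X_train, y_train = X_all[:60000], y_all[:60000]
X_test, y_test = X_all[60000:], y_all[60000:]

model = LogisticRegression(C=50, max_iter=1000)  # raise max_iter; the solver may otherwise stop early
model.fit(X_train, y_train)
print('Test accuracy:', accuracy_score(y_test, model.predict(X_test)))
```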