基于Scikit-Learn的机器学习入门系列(一)

tech2024-12-04  25

【学习交流QQ:1244180262】 0 | 引言 Scikit-Learn 的简要介绍

基于 Python;基于 Numpy、Scipy、matplotlib;算法丰富;自带测试数据集;开源;文档比较完备。

解决任务的类型:

回归任务问题、分类任务问题、聚类任务问题、降维任务问题。参考资料:scikit-learn官网、scikit-learn用户指南。

1| 先导部分 1.1 导入相应的库,并查看版本

import numpy as np import matplotlib import sklearn #作用:绘图框显示在单元框内,不至于弹框显示 %matplotlib inline print (np.__version__) print (matplotlib.__version__) print(sklearn.__version__)

1.2 利用numpy库中random()模块,练习一下基本操作

# 利用numpy库,随机生成3*5的矩阵 X = np.random.random((3,5)) X

查看X中的第一行,第一列

#查看X中的第一行,第一列 X[0,0]

查看X中第一行

#查看X中第一行 X[0]

查看矩阵X中第一列

#查看矩阵X中第一列 X[:,0]

1.3 利用numpy库中linspace()函数 练习一下基本操作

y = np.linspace(0,12,5) y

1.4 利用Scipy库中的sparse模块功能(稀疏矩阵)

生成稀疏矩阵

from scipy import sparse #随机生成10*5的随机矩阵 z = np.random.random((10,5)) z

将随机矩阵z中小于0.5的元素置为0

# 将随机矩阵z中的元素为小于0.5,置为0 z[ z <0.5 ]=0 z

将随机矩阵z设置成稀疏矩阵形式

# 将随机矩阵z设置成稀疏矩阵形式 z_csr = sparse.csr_matrix(z) print(z_csr)

稀疏矩阵还原

# 稀疏矩阵还原 print(z_csr.toarray())

2| Scikit-Learn

2.1 利用sklearn中自带的测试数据集

(一)iris数据集

# iris数据集 from sklearn.datasets import load_iris iris = load_iris()

1.查看iris 数据集的属性

iris.keys()

target:标签 target_names:标签名称 feature_names: 特征名称 data:特征参数

2.查看iris数据中data的样本数目和样本维度

n_sample,n_features = iris.data.shape print(n_sample,n_features)

n_sample:样本数目: 150个 n_features: 特征维度 : 4

3. 查看iris中target 的数目 iris.target.shape

4. 查看iris中target_names

iris.target_names

5. 查看iris中特征名称

iris.feature_names

6. 可视化iris数据集

import matplotlib.pyplot as plt

# Columns of iris.data shown on the x and y axes.
x_index = 0
y_index = 1

# Scatter the two chosen features, colouring each sample by its class label.
plt.scatter(iris.data[:, x_index], iris.data[:, y_index], c=iris.target)

# Call the label functions with the real feature names. Assigning to
# plt.xlabel / plt.ylabel would replace the functions and label nothing.
plt.xlabel(iris.feature_names[x_index])
plt.ylabel(iris.feature_names[y_index])
plt.show()

(二)手写体数据集digits

# digits 手写体数据集 from sklearn.datasets import load_digits digits = load_digits()

1.查看digits 数据集的属性

digits.keys()

2.查看digits数据中data的样本数目和样本维度

digits.images.shape n_samples,n_features = digits.data.shape print(n_samples, n_features)

3. 可视化digits数据集 %matplotlib inline import matplotlib.pyplot as plt fig = plt.figure(figsize = (8,8)) fig.subplots_adjust(left=0, right=1,bottom=0, top=1, hspace =0.05, wspace=0) for i in range(64): ax=fig.add_subplot(8,8,i+1,xticks=[],yticks=[]) ax.imshow(digits.images[i])

4. 查看digits中标签名称(target_names)

digits.target_names

5. 查看digits中target 的数目 digits.target.shape

2.2 回归任务问题

问题类型:regression 回归数据集: load_boston:–波士顿房价模型: LinearRegression():—线性模型性能指标:平均绝对误差: mean_absolute_error:MAE import numpy as np import matplotlib.pyplot as plt import sklearn from sklearn.linear_model import LinearRegression #模型 clf = LinearRegression() from sklearn.datasets import load_boston # 数据集 Boston = load_boston() #数据参数 n_samples, n_features = Boston.data.shape print(n_samples, n_features) print(Boston.target.shape)

可视化数据

column_i = 5 plt.scatter(Boston.data[:,column_i], Boston.target) print (Boston.feature_names[5])

# MAE from sklearn.metrics import mean_absolute_error clf.fit(Boston.data, Boston.target) #预测值 predicted = clf.predict(Boston.data) #预测误差 mean_absolute_error(Boston.target, predicted)

可视化预测值与真实房价值

问题类型:regression 回归数据集: load_boston:–波士顿房价模型: DecisionTreeRegressor():—决策树模型性能指标:平均绝对误差: mean_absolute_error:MAE from sklearn.tree import DecisionTreeRegressor import matplotlib.pyplot as plt import numpy as np import sklearn from sklearn.datasets import load_boston # 数据集 Boston = load_boston() #训练模型 clf2 = DecisionTreeRegressor() clf2.fit(Boston.data, Boston.target) #测试模型 predicted2 = clf2.predict(Boston.data) #可视化 plt.scatter(Boston.target,predicted2) plt.plot(Boston.target,predicted2,color ='red') plt.xlabel("True price") plt.ylabel("Predicted price") plt.show()

性能指标

#性能评价 overfit:过拟合 from sklearn.metrics import mean_absolute_error mean_absolute_error(Boston.target, predicted2)

2.3 分类任务问题

# Problem type: classification
# Dataset: load_iris()
# Model: LogisticRegression()
# Metric: accuracy_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

iris = load_iris()

clf = LogisticRegression()
clf.fit(iris.data, iris.target)

# Predict with the classifier fitted just above (clf). Using clf2 here would
# score a model left over from a different section.
predicted = clf.predict(iris.data)
probas = clf.predict_proba(iris.data)
print(iris.data.shape, probas.shape)

Result_score = accuracy_score(iris.target, predicted)
print(Result_score)

问题类型: classification数据集: load_iris()模型: SVC()性能指标:精度(正检率):accuracy_score() from sklearn.datasets import load_iris iris =load_iris() from sklearn.svm import SVC clf2=SVC() clf2.fit(iris.data,iris.target) predicted2 = clf2.predict(iris.data) from sklearn.metrics import accuracy_score Result_score = accuracy_score(iris.target, predicted2) print(Result_score)

2.4 降维任务问题

PCA from sklearn.decomposition import PCA pca = PCA(n_components=2,whiten =True) from sklearn.datasets import load_iris iris =load_iris() pca.fit(iris.data) #查看前2个主成分: pca.components_

查看方差解释率

#查看方差解释率 pca.explained_variance_ratio_

查看累计方差贡献率

#查看累计方差贡献率 pca.explained_variance_ratio_.sum()

原数据集经过PCA转换后的新数据x_pca

#原数据集经过PCA转换后的新数据x_pca x_pca = pca.transform(iris.data) print(iris.data.shape, x_pca.shape)

PCA可视化

# PCA visualization
# In a Jupyter notebook, run "%matplotlib inline" beforehand so that figures
# render inside the cell output.
import matplotlib.pyplot as plt
from itertools import cycle


def plot_PCA_2D(data, target, target_names, title="PCA real labels"):
    """Scatter-plot columns 0 and 1 of ``data``, one colour per class.

    Args:
        data: 2-D array; column 0 is drawn on the "PC1" axis and column 1 on
            the "PC2" axis.
        target: array of integer class ids, one per row of ``data``; it is
            compared element-wise against each class id to select rows.
        target_names: sequence of legend labels; class id ``i`` is labelled
            ``target_names[i]``.
        title: figure title (defaults to the original hard-coded title).
    """
    colors = cycle("rgbcmykw")
    plt.figure()
    plt.title(title)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    for class_id, (color, label) in enumerate(zip(colors, target_names)):
        mask = target == class_id
        plt.scatter(data[mask, 0], data[mask, 1], c=color, label=label)
    # Without a legend the per-class labels passed to scatter() never appear.
    plt.legend()


plot_PCA_2D(x_pca, iris.target, iris.target_names)

kmeans可视化

# K-means visualization
from sklearn.cluster import KMeans

# Pin n_init and random_state so the cluster assignment (and therefore the
# colours in the plot) is reproducible from run to run.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(x_pca)

plot_PCA_2D(x_pca, kmeans.labels_, ['c0', 'c1', 'c2'])
# plot_PCA_2D sets its own title; replace it for the cluster plot.
plt.title('kmeans labels')
plt.xlabel("PC1")
plt.ylabel("PC2")

2.5 交叉验证分类

数据集: digits 模型: svc;GaussianNB 性能指标: accuracy_score

import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, x, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot training and cross-validation scores against training-set size.

    Args:
        estimator: model passed unchanged to ``learning_curve``.
        title: figure title.
        x: feature matrix passed to ``learning_curve``.
        y: target vector passed to ``learning_curve``.
        ylim: optional ``(ymin, ymax)`` pair unpacked into ``plt.ylim``.
        cv: cross-validation strategy forwarded to ``learning_curve``.
        n_jobs: number of parallel jobs forwarded to ``learning_curve``.
        train_sizes: training-set sizes forwarded to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can chain further calls.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Aggregate over axis 1 to get one mean / std value per training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()
    # Shade mean +/- one standard deviation. The lower bound must be
    # "mean - std"; passing mean and std as separate arguments shades the
    # wrong region and pushes the upper bound into the ``where`` parameter.
    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation")
    plt.legend(loc='best')
    return plt


# In a Jupyter notebook, run "%matplotlib inline" beforehand so that figures
# render inside the cell output.
digits = load_digits()
x, y = digits.data, digits.target

# Naive Bayes: many shuffle splits.
title = "Learning Curves (Naive Bayes)"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = GaussianNB()
plot_learning_curve(estimator, title, x, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

# SVM with an RBF kernel: fewer splits. The raw string keeps the backslash in
# the mathtext "\gamma" from being read as a string escape.
title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = SVC(gamma=0.001)
plot_learning_curve(estimator, title, x, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

最新回复(0)