Top 10 Machine Learning Algorithms (3): KMeans, an Unsupervised Clustering Algorithm


KMeans

KMeans is an unsupervised clustering algorithm, meaning it needs no labels. When the data comes with no predefined partitioning criterion, it groups similar points together, following the idea that "birds of a feather flock together."
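Before walking through the algorithm by hand, here is a minimal sketch (using made-up toy points, not data from this post) of the same idea with scikit-learn's built-in KMeans: six 2-D points are split into two clusters and the learned centers are printed.

```python
import numpy as np
from sklearn.cluster import KMeans

# Toy data: two obvious groups of 2-D points (assumed example)
X = np.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0]])

kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
print(kmeans.labels_)           # cluster index assigned to each point
print(kmeans.cluster_centers_)  # the two learned cluster centers
```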

1. Implementation from Scratch

(1) First, randomly generate a set of points $[x, y]$ and try to cluster them.

```python
import random
import matplotlib.pyplot as plt

points_num = 100
random_x = [random.randint(-100, 100) for _ in range(points_num)]
random_y = [random.randint(-100, 100) for _ in range(points_num)]
random_points = [(x, y) for x, y in zip(random_x, random_y)]

plt.scatter(random_x, random_y)
```

(2) Randomly generate five points as the initial cluster centers.

(In this example, the choice of initial points mainly affects how many iterations are needed; note, though, that a poor initialization can also lead KMeans to a different local optimum.)

```python
# Generate one random initial center point
def generate_random_point(min_, max_):
    return random.randint(min_, max_), random.randint(min_, max_)

# Number of clusters and the five initial center points
K = 5
previous_kernels = [generate_random_point(-100, 100) for _ in range(K)]
print(previous_kernels)

# Plot the five initial centers
for (x, y), c in zip(previous_kernels, ['red', 'green', 'yellow', 'blue', 'purple']):
    plt.scatter([x], [y], color=c, s=100)

# Plot the random points [x, y]
plt.scatter(random_x, random_y)
```

(3) Assign every point to a cluster using the distance function.

Distance function: Euclidean distance, $d(p, q) = \sqrt{(p_x - q_x)^2 + (p_y - q_y)^2}$.

```python
import numpy as np

# Euclidean distance between two points
def dis(x, y):
    return np.sqrt((x[0] - y[0]) ** 2 + (x[1] - y[1]) ** 2)

kernel_colors = ['red', 'green', 'yellow', 'blue', 'purple']

# Updated centers, filled in during the next step
new_kernels = []
# groups[i] holds the points assigned to center i
groups = [[] for _ in range(K)]

# Compute the distance from every point to each current center
# and assign the point to its nearest center
for p in random_points:
    distances = [dis(p, k) for k in previous_kernels]
    min_index = np.argmin(distances)
    groups[min_index].append(p)

# Plot the current centers
for i, p in enumerate(previous_kernels):
    plt.scatter([p[0]], [p[1]], color=kernel_colors[i], s=100)
```

(4) Update the cluster centers.

```python
# The current centers are not necessarily final, so update them:
# move each center to the mean of the points assigned to it
for i, g in enumerate(groups):
    g_x = [_x for _x, _y in g]
    g_y = [_y for _x, _y in g]
    n_k_x, n_k_y = np.mean(g_x), np.mean(g_y)
    new_kernels.append((n_k_x, n_k_y))
    print(kernel_colors[i])
    plt.scatter(g_x, g_y, color=kernel_colors[i])
    plt.scatter([n_k_x], [n_k_y], color=kernel_colors[i], alpha=0.5, s=200)
    print('Distance between the new center (from the updated group) and the previous center: {}'.format(
        dis((n_k_x, n_k_y), previous_kernels[i])))

# Replace the centers for the next iteration
previous_kernels = new_kernels
```

(5) Repeat the assignment and update steps; once the centers stop moving, the final cluster centers have been found.
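The post does not show the loop itself, so the following is a minimal sketch (a hypothetical wrapper, reusing the `dis` function, `random_points`, `previous_kernels`, and `K` defined above) that repeats the assignment and update steps until no center moves more than a small tolerance.

```python
# Minimal sketch of the full iteration (not from the original post);
# it reuses dis(), random_points, previous_kernels and K from the steps above.
max_iters = 100
tolerance = 1e-4

for it in range(max_iters):
    # Step (3): assign every point to its nearest center
    groups = [[] for _ in range(K)]
    for p in random_points:
        distances = [dis(p, k) for k in previous_kernels]
        groups[np.argmin(distances)].append(p)

    # Step (4): move each center to the mean of its group
    new_kernels = []
    for i, g in enumerate(groups):
        if not g:  # empty cluster: keep the old center
            new_kernels.append(previous_kernels[i])
            continue
        g_x = [x for x, _ in g]
        g_y = [y for _, y in g]
        new_kernels.append((np.mean(g_x), np.mean(g_y)))

    # Stop when no center moves more than the tolerance
    shift = max(dis(n, o) for n, o in zip(new_kernels, previous_kernels))
    previous_kernels = new_kernels
    if shift < tolerance:
        print('Converged after {} iterations'.format(it + 1))
        break
```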

2. Applications of KMeans

(1) Choosing the optimal K: the elbow method

The elbow method runs KMeans for a range of K values, records the within-cluster sum of squared errors (SSE) for each K, and plots SSE against K; the point where the curve bends like an elbow and starts to flatten is taken as the optimal K.
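As a rough sketch of how the elbow method can be coded (`X` stands for any numeric feature matrix, for example the scaled customer features from the next subsection; it is an assumed placeholder, not defined in the original post):

```python
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# X: assumed placeholder for a numeric feature matrix (n_samples, n_features)
sse = []
k_range = range(1, 11)
for k in k_range:
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    sse.append(km.inertia_)  # within-cluster sum of squared distances

plt.plot(k_range, sse, marker='o')
plt.xlabel('K')
plt.ylabel('SSE (inertia)')
plt.show()
```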

(2) Clustering mall customers with KMeans

This example uses matplotlib.pyplot to draw a dual-y-axis bar chart and seaborn to draw the distribution plots.

The data comes from Mall_Customers.csv; the code below keeps four columns per customer for clustering: Gender, Age, annual income, and spending score (the first column, a customer ID, is dropped).

```python
# Mall customer segmentation
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, Normalizer

# Load the data
data = pd.read_csv("Mall_Customers.csv")
# print(data.describe(), data.info())

# Keep the four feature columns used for clustering (drop the customer ID)
train_x = data.iloc[:, 1:5]

# Label-encode the Gender column
le = LabelEncoder()
train_x['Gender'] = le.fit_transform(train_x['Gender'])
# col = train_x.columns.to_list()  # how to get column names: https://www.cnblogs.com/wqbin/p/11845042.html

# Scale every feature to the [0, 1] range
min_max_scaler = preprocessing.MinMaxScaler()
train_x1 = min_max_scaler.fit_transform(train_x)
pd.DataFrame(train_x1, columns=train_x.columns).to_csv('temp.csv', index=False)

# Alternative: scale each sample to unit norm
# normalizer = Normalizer()
# train_x1 = normalizer.fit_transform(train_x)
# print(train_x1)

# Cluster with KMeans; note that the model is fitted on the unscaled train_x here
# (the scaled train_x1 is only written to temp.csv)
kmeans = KMeans(n_clusters=3)
kmeans.fit(train_x)
pre_y = kmeans.predict(train_x)

# Append the cluster labels to the feature table
results = pd.concat((train_x, pd.DataFrame(pre_y, columns=['Group'])), axis=1)
results.to_csv('Customers_Group.csv', index=False)
results.head(20)
```
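Note that the block above scales the features into `train_x1` but then fits KMeans on the unscaled `train_x`. The following sketch is an alternative not taken in the original code: it clusters on the scaled features instead, so that no single column (such as income) dominates the Euclidean distances.

```python
# Alternative sketch: cluster on the MinMax-scaled features instead of the raw ones
kmeans_scaled = KMeans(n_clusters=3, n_init=10, random_state=0)
pre_y_scaled = kmeans_scaled.fit_predict(train_x1)

results_scaled = pd.concat(
    (train_x, pd.DataFrame(pre_y_scaled, columns=['Group'])), axis=1)
print(results_scaled.head(20))
```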

Result analysis:

```python
# Build per-group descriptive statistics for visualization
group_list = []
for i in range(3):
    group = results[results["Group"] == i].copy()
    group.columns = ['gender', 'age', 'income', 'spending', 'group']
    group_list.append(group.describe())  # descriptive statistics of each group
print(group_list)

# Collect the mean spending, mean income and customer count of each group
group_attr = {}
for i in range(3):
    group_attr[i] = [group_list[i].spending['mean'],
                     group_list[i].income['mean'],
                     group_list[i].age['count']]
group_attr = pd.DataFrame(group_attr.values(), index=group_attr.keys(),
                          columns=['spending', 'income', 'count'])
group_attr
```

Analyze the characteristics of each group:

The resulting table lists, for each group, the mean spending, the mean income, and the number of customers (count).

Now visualize these results:

```python
# Visualize the group characteristics on a new figure
fig = plt.figure(figsize=[18, 18])
plt.rcParams['font.sans-serif'] = ['SimHei']   # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False     # render the minus sign correctly when saving figures
plt.suptitle('商场顾客分类:KMeans聚类', fontsize=30)

# First chart: spending, income and count of each group (dual y-axis bar chart)
ax1 = plt.subplot(311)
ax2 = ax1.twinx()
plt.title('各群体的spending, income, count特征', size=20)

# Bar positions
bar_width = 0.2
bar_1 = group_attr.index.tolist()
bar_2 = [i + bar_width for i in bar_1]
bar_3 = [i + bar_width for i in bar_2]

# Draw the bars: spending and income on the left axis, count on the right axis
ax1.bar(bar_1, group_attr['spending'], width=bar_width, label='spending', alpha=0.3)
ax1.bar(bar_2, group_attr['income'], width=bar_width, label='income', alpha=0.3)
ax2.bar(bar_3, group_attr['count'], width=bar_width, label='count', color='green', alpha=0.3)

# Axis labels
# ax1.set_xlabel('Group', size=20)
ax1.set_ylabel('元', size=20)
ax2.set_ylabel('数量', size=20)

# x-axis ticks
ax1.set_xticks(bar_2)
ax1.set_xticklabels(['group1', 'group2', 'group3'], size=15)

# Combine the legends of both axes
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
plt.legend(handles1 + handles2, labels1 + labels2, loc='upper right')

# Grid
ax1.grid(alpha=0.1)

# Second row of charts: age distribution within each group
# (sns.distplot is deprecated in newer seaborn; sns.histplot / sns.displot are the modern replacements)
for i in range(3):
    plt.subplot(3, 3, i + 4)
    plt.title('group' + str(i + 1) + ':年龄分布特性', size=20)
    sns.distplot(results[results.Group == i]['Age'].tolist(),
                 rug_kws={"color": "g"},
                 kde_kws={'alpha': 0.5, "lw": 4, "label": "kde", 'color': 'g'},
                 hist_kws={"label": 'hist'},
                 axlabel='')
```

Because the dataset is small, the customers are split into only three clusters. A rough analysis shows:

group1: a relatively small group, the youngest on average and the strongest spenders; income is relatively high, but essentially all of it goes to spending.
group2: the widest age range, the "typical" customer; income is lower, yet spending outpaces income, so this group tends to spend beyond its means.
group3: steady consumers, roughly 20 to 50 years old, about the same size as group1; income is high and exceeds spending, leaving surplus money available for future consumption.
