KMeans属于无监督(即无标签)聚类算法,在不知道数据没有具体的划分标准时,通过物以类聚的方法,将相似数据放在一起。
(初始点的选择不影响最后的分类结果,只影响迭代次数)
#随机初始点 def generate_random_point(min_, max_): return random.randint(min_, max_), random.randint(min_, max_) #随机生成五个点 generate_random_point(-100, 100) K = 5 #初始中心点 pervious_kernels = [generate_random_point(-100, 100) for _ in range(K)] print(pervious_kernels) #将五个点在图中画出 plt.scatter([pervious_kernels[0][0]], [pervious_kernels[0][1]], color='red', s=100) plt.scatter([pervious_kernels[1][0]], [pervious_kernels[1][1]], color='green', s=100) plt.scatter([pervious_kernels[2][0]], [pervious_kernels[2][1]], color='yellow', s=100) plt.scatter([pervious_kernels[3][0]], [pervious_kernels[3][1]], color='blue', s=100) plt.scatter([pervious_kernels[4][0]], [pervious_kernels[4][1]], color='purple', s=100) #画出随机的点[x,y] plt.scatter(random_x, random_y)距离函数:欧氏距离
import numpy as np #欧氏距离 def dis(x, y): return np.sqrt((x[0] - y[0]) ** 2 + (x[1] - y[1]) ** 2) kernel_colors = ['red', 'green', 'yellow', 'blue', 'purple'] #存放迭代更新后的中心点 new_kernels = [] #存放与相应的中心点为一类的点 groups = [[] for _ in range(K)] #对所有点与现在的中心点求距离,并将其与最小距离的点归为同一类 for p in random_poinsts: distances = [dis(p, k) for k in previous_kernels] min_index = np.argmin(distances) groups[min_index].append(p) #画出当前的中心的点与其集合中的点 for i, p in enumerate(previous_kernels): plt.scatter([p[0]], [p[1]], color=kernel_colors[i], s=100) #plt.scatter([previous_kernels[1][0]], [previous_kernels[1][1]], color=kernel_colors[1], s=100) #plt.scatter([previous_kernels[2][0]], [previous_kernels[2][1]], color=kernel_colors[2], s=100)
手肘法
matplotlib.pyplot 绘制双 y 轴图像 seaborn 绘制数据分布图
数据内容如下:
#商场顾客分类 import pandas as pd import numpy as np import matplotlib import matplotlib.pyplot as plt import seaborn as sns from sklearn.cluster import KMeans from sklearn import preprocessing from sklearn.preprocessing import Normalizer #加载数据 data = pd.read_csv("Mall_Customers.csv") # data = pd.DataFrame(data) # print(data.describe(), data.info()) #取出需要分类的数据 train_x = data.iloc[:,1:5] #标签编码, LabelEncoder from sklearn.preprocessing import LabelEncoder le = LabelEncoder() train_x['Gender'] = le.fit_transform(train_x['Gender']) # col = train_x.columns.to_list() #如何取列名:https://www.cnblogs.com/wqbin/p/11845042.html # 规范化到 [0,1] 空间 min_max_scaler = preprocessing.MinMaxScaler() train_x1 = min_max_scaler.fit_transform(train_x) pd.DataFrame(train_x1, columns =train_x.columns).to_csv('temp.csv', index=False) # print(train_x2)) #正态分布的规范化 # normalizer = Normalizer() # train_x1 = normalizer.fit_transform(train_x) # print(train_x1) #使用KMeans聚类 kmeans = KMeans(n_clusters = 3) kmeans.fit(train_x) pre_y = kmeans.predict(train_x) #将数据与源数据拼接 results = pd.concat((train_x, pd.DataFrame(pre_y, columns = ['Group'])), axis = 1) results.to_csv('Customers_Group.csv', index = False) results.head(20)结果分析:
#得到数据可视化内容 group = pd.DataFrame() group_list = [group, group, group] for i in range(3): group = pd.DataFrame(results.iloc[results[results["Group"] == i].index]) group.columns = ['gender', 'age', 'income', 'spending','group'] group_list[i] = group.describe() #得到描述统计的列表 print(group_list) group_attr = {} for i in range(3): group_attr[i] = [] group_attr[i] += [group_list[i].spending['mean'], group_list[i].income['mean'], group_list[i].age['count']] group_attr = pd.DataFrame(group_attr.values(), index = group_attr.keys(), columns = ['spending', 'income', 'count']) group_attr对各群体的特性进行分析:
spending、income 在各群体中的均值,各群体中顾客的数量 count 如表所示。
对其进行可视化分析:
#Group特征分析 #新建画布可视化分析 fig = plt.figure(figsize = [18,18]) plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签 plt.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题 plt.suptitle('商场顾客分类:KMeans聚类', fontsize=30) #绘制第一张图:各群体的spending, income, count特征 #添加标题 ax1 = plt.subplot(311) ax2 = ax1.twinx() plt.title('各群体的spending, income, count特征', size=20) #设置参数 bar_width = 0.2 bar_1 = group_attr.index.tolist() bar_2 = [i+bar_width for i in bar_1] bar_3 = [i+bar_width for i in bar_2] #导入数据,绘制条形图 ax1.bar(bar_1, group_attr['spending'], width=bar_width, label='spending', alpha = 0.3) ax1.bar(bar_2, group_attr['income'], width=bar_width, label='income', alpha = 0.3) ax2.bar(bar_3, group_attr['count'], width=bar_width, label='count',color = 'green', alpha = 0.3) #添加xy轴 # ax1.set_xlabel('Group',size = 20) ax1.set_ylabel('元', size = 20) ax2.set_ylabel('数量', size = 20) #x轴刻度 ax1.set_xticks(bar_2) ax1.set_xticklabels( ['group1', 'group2', 'group3'], size = 15) # 图例 handles1, labels1 = ax1.get_legend_handles_labels() handles2, labels2 = ax2.get_legend_handles_labels() plt.legend(handles1+handles2, labels1+labels2, loc='upper right') #网格 ax1.grid(alpha = 0.1) #绘制第二张图:各群体的年龄分布 # # plt.subplot(3,3,4) # sns.set(font='SimHei',font_scale=1.5) # 解决Seaborn中文显示问题并调整字体大小 # fig1, axes = plt.subplots(1,3, figsize=(18, 7)) # sns.set(color_codes=True, font_scale = 1.25) # # sns.distplot(results.Age[[result.Group == 0].index], ax=axes[0]) # sns.distplot(results[results.Group == 0]['Age'].tolist(), rug_kws={"color": "g"}, kde_kws={ 'alpha' : 0.5, "lw":4, "label": "kde", 'color': 'g'}, hist_kws={"label" : 'hist'}, axlabel = 'group1:年龄分布', ax = axes[0]) # # plt.subplot(3,3,5) # sns.distplot(results[results.Group == 1]['Age'].tolist(), rug_kws={"color": "g"}, kde_kws={ 'alpha' : 0.5, "lw":4, "label": "kde", 'color': 'g'}, hist_kws={"label" : 'hist'}, axlabel = 'group2:年龄分布', ax = axes[1]) # # plt.subplot(3,3,6) # sns.distplot(results[results.Group == 2]['Age'].tolist(), rug_kws={"color": "g"}, kde_kws={ 'alpha' : 0.5, "lw":4, "label": "kde", 'color': 'g'}, hist_kws={"label" : 'hist'}, axlabel = 'group3:年龄分布', ax = axes[2]) for i in range(3): plt.subplot(3,3, i +4) plt.title('group'+str(i + 1)+':年龄分布特性', size = 20) sns.distplot(results[results.Group == i]['Age'].tolist(), rug_kws={"color": "g"}, kde_kws={ 'alpha' : 0.5, "lw":4, "label": "kde", 'color': 'g'}, hist_kws={"label" : 'hist'}, axlabel = '')由于数据量小,分类仅分为三类。粗略分析可以发现:
group1:群体数量相对较少,相对年龄最小,消费能力最强,收入相对较高,但均用于消费; group2:相对年龄跨度最大,属于普遍状态,但收入较低,消费能力强于收入能力,超前消费较多; group3:稳健型消费者,收入高且收入大于消费,年龄在20 ~ 50岁之间,数量与 group1 数量基本相当,有剩余资金可以进行消费。