聚类算法相关:
聚类算法(一)——DBSCAN
聚类算法(二)—— 优缺点对比
聚类算法(三)—— 评测方法1
聚类算法(三)—— 评测方法2
聚类算法(三)—— 评测方法3(代码)
聚类算法(四)—— 基于词语相似度的聚类算法(含代码)
聚类算法(五)——层次聚类 linkage (含代码)
聚类算法(六)——谱聚类 (含代码)
"""Pair-counting evaluation of a predicted clustering against labelled clusters.

The script loads a predicted clustering and a labelled clustering from Excel
files, keeps the words present in both, and reports RI, ARI, FMI, Jaccard,
Dice, precision, recall and F-value computed over all word pairs.
"""
import itertools
import math

import pandas as pd

try:
    from sklearn import metrics
except ImportError:  # scikit-learn is only needed by eval_2
    metrics = None


def _is_blank(cell):
    """Return True when a spreadsheet cell stringifies to 'nan' (empty cell)."""
    return str(cell) == 'nan'


def load_cluster_pred(file_name):
    """Load the predicted clustering from the first sheet of an Excel file.

    The sheet must have the columns '核心词' (cluster representative) and
    '包含词' (member word). Rows whose representative is blank or equals
    '未分类词语' are skipped, as are rows whose word cell is blank.

    Words are converted to ``str`` so that they compare equal to the keys
    produced by ``load_cluster_label``.

    Returns:
        (word_pred_dic, cluster_pred_dic): word -> representative, and
        representative -> list of member words.
    """
    df = pd.read_excel(file_name, sheet_name=0)
    word_pred_dic = {}
    cluster_pred_dic = {}
    for rep, word in zip(df['核心词'], df['包含词']):
        if _is_blank(rep) or str(rep) == '未分类词语':
            continue
        if _is_blank(word):
            continue
        word = str(word)
        word_pred_dic[word] = rep
        cluster_pred_dic.setdefault(rep, []).append(word)
    return word_pred_dic, cluster_pred_dic


def load_cluster_label(file_name):
    """Load the labelled clustering from an Excel file.

    Every column is treated as one cluster; the column's position is used as
    the cluster id and its non-blank cells (converted to ``str``) are the
    member words. The column names are printed.

    Returns:
        (word_label_dic, cluster_label_dic): word -> column index, and
        column index -> list of member words.
    """
    df = pd.read_excel(file_name)
    columns = df.columns.values.tolist()
    print(columns)
    word_label_dic = {}
    cluster_label_dic = {}
    for index, column in enumerate(columns):
        for item in df[column].tolist():
            if _is_blank(item):
                continue
            word = str(item)
            word_label_dic[word] = index
            cluster_label_dic.setdefault(index, []).append(word)
    return word_label_dic, cluster_label_dic


def calculate_sum_aibj(cluster_dic, intersect_keywords):
    """Sum k * (k - 1) / 2 over clusters, k = members in intersect_keywords.

    Args:
        cluster_dic: mapping of cluster id -> list of member words.
        intersect_keywords: the words that take part in the evaluation.

    Returns:
        The number of within-cluster word pairs, restricted to
        ``intersect_keywords``.
    """
    # Build the lookup once; testing against the list would be O(len) per word.
    shared = set(intersect_keywords)
    sum_aibi = 0
    for members in cluster_dic.values():
        cluster_effect_num = sum(1 for word in members if word in shared)
        sum_aibi += cluster_effect_num * (cluster_effect_num - 1) / 2
    return sum_aibi


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    if denominator == 0:
        return 0.0
    return numerator / denominator


def evaluate(word_label_dic, word_pred_dic, cluster_label_dic, cluster_pred_dic,
             pair_detail_path=None):
    """Print pair-counting clustering metrics and return the Rand index.

    Only words present in both ``word_label_dic`` and ``word_pred_dic`` are
    evaluated. Every unordered pair of those words is classified by whether
    the two words share a labelled cluster and whether they share a predicted
    cluster.

    Args:
        word_label_dic: word -> labelled cluster id.
        word_pred_dic: word -> predicted cluster id.
        cluster_label_dic: labelled cluster id -> list of words.
        cluster_pred_dic: predicted cluster id -> list of words.
        pair_detail_path: optional path; when given, one row per word pair is
            written there with ``DataFrame.to_excel``. The per-pair table is
            only built in that case because it grows quadratically.

    Returns:
        The Rand index.
    """
    keywords = list(set(word_label_dic.keys()).intersection(word_pred_dic.keys()))
    print('intesection keywords num: {}'.format(len(keywords)))
    sum_ai = calculate_sum_aibj(cluster_label_dic, keywords)
    sum_bj = calculate_sum_aibj(cluster_pred_dic, keywords)
    n = len(keywords)
    a = 0  # same labelled cluster and same predicted cluster
    b = 0  # different labelled clusters but same predicted cluster
    c = 0  # same labelled cluster but different predicted clusters
    d = 0  # different labelled clusters and different predicted clusters
    e = 0  # same labelled cluster (a + c)
    f = 0  # same predicted cluster (a + b)
    collect_pairs = pair_detail_path is not None
    data = []
    for wi, wj in itertools.combinations(keywords, 2):
        tag_label = word_label_dic[wi] == word_label_dic[wj]
        tag_predict = word_pred_dic[wi] == word_pred_dic[wj]
        if tag_label and tag_predict:
            a += 1
        elif tag_label:
            c += 1
        elif tag_predict:
            b += 1
        else:
            d += 1
        if tag_label:
            e += 1
        if tag_predict:
            f += 1
        if collect_pairs:
            data.append([wi, wj, word_label_dic[wi], word_label_dic[wj],
                         word_pred_dic[wi], word_pred_dic[wj],
                         tag_label, tag_predict])
    print("count number: a:{}\t b:{}\t c:{}\t d:{}".format(a, b, c, d))
    print("count number: e:{}\t f:{}".format(e, f))
    ri = ri_eval(a, d, n)
    ari = ari_eval(a, sum_ai, sum_bj, n)
    fmi = fmi_eval(a, b, c)
    jc = jc_eval(a, b, c)
    di = di_eval(a, b, c)
    # Precision is taken over predicted-same pairs, recall over labelled-same
    # pairs; each is reported as 0.0 when there are no such pairs.
    p = _ratio(a, a + b)
    r = _ratio(a, a + c)
    f_value = _ratio(2 * p * r, p + r)
    print('RI: {}\n ARI: {}\n FMI: {}\n JC: {}\n DI: {}'.format(ri, ari, fmi, jc, di))
    print('precision:{}\t recall:{}\tF-value:{}\n'.format(p, r, f_value))
    if collect_pairs:
        df_r = pd.DataFrame(data=data,
                            columns=['wi', 'wj', 'wi_lab_rep', 'wj_lab_rep',
                                     'wi_pred_rep', 'wj_pred_rep',
                                     'samecluster_label', 'samecluster_pred'])
        df_r.to_excel(pair_detail_path, index=False)
    return ri


def ri_eval(a, d, n):
    """Return the Rand index 2 * (a + d) / (n * (n - 1)).

    With fewer than two items there are no pairs to disagree on, so 1.0 is
    returned instead of dividing by zero.
    """
    if n < 2:
        return 1.0
    return 2 * (a + d) / (n * (n - 1))


def ari_eval(a, sum_ai, sum_bj, n):
    """Return the adjusted Rand index from pair counts.

    Args:
        a: number of pairs placed together by both clusterings.
        sum_ai: number of pairs placed together by the labelled clustering.
        sum_bj: number of pairs placed together by the predicted clustering.
        n: number of items.

    Returns:
        (a - expected) / ((sum_ai + sum_bj) / 2 - expected), where
        expected = 2 * sum_ai * sum_bj / (n * (n - 1)). Returns 1.0 when
        ``n < 2`` or when the denominator is zero; for consistent pair counts
        the latter only occurs when both clusterings put everything in one
        cluster or both put every item in its own cluster.
    """
    if n < 2:
        return 1.0
    expected = 2 * sum_ai * sum_bj / (n * (n - 1))
    numerator = a - expected
    denominator = (sum_ai + sum_bj) / 2 - expected
    if denominator == 0:
        return 1.0
    return numerator / denominator


def fmi_eval(a, b, c):
    """Return the Fowlkes-Mallows index sqrt(a/(a+b) * a/(a+c)).

    Returns 0.0 when ``a`` is zero, which also covers the case where a
    denominator would be zero.
    """
    if a == 0:
        return 0.0
    return math.sqrt((a / (a + b)) * (a / (a + c)))


def jc_eval(a, b, c):
    """Return the Jaccard coefficient a / (a + b + c), or 0.0 if all are zero."""
    return _ratio(a, a + b + c)


def di_eval(a, b, c):
    """Return the Dice index 2a / (2a + b + c), or 0.0 if all are zero."""
    return _ratio(2 * a, 2 * a + b + c)


def hcv(labels, preds):
    """Validate that ``labels`` and ``preds`` have the same length.

    NOTE(review): this looks like an unfinished stub; it computes nothing and
    returns None.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(labels) != len(preds):
        raise ValueError('Length error!')


def eval_2(labels, preds):
    """Print ARI, AMI, homogeneity, completeness and V-measure via scikit-learn.

    Raises:
        ImportError: if scikit-learn is not installed.
    """
    if metrics is None:
        raise ImportError('scikit-learn is required for eval_2')
    ari = metrics.adjusted_rand_score(labels, preds)
    ami = metrics.adjusted_mutual_info_score(labels, preds)
    # Homogeneity: each cluster contains only members of a single class.
    homogeneity = metrics.homogeneity_score(labels, preds)
    # Completeness: all members of a given class are assigned to the same cluster.
    completeness = metrics.completeness_score(labels, preds)
    # V-measure: harmonic mean of homogeneity and completeness.
    V_measure = metrics.v_measure_score(labels, preds)
    print('matrix \nARI:{}\nAMI:{}\n'.format(ari, ami))
    print('homogeneity:{}\tcompleteness:{}\tV-measure:{}\n'.format(homogeneity, completeness, V_measure))
转载请注明出处
觉得有用,麻烦点个赞,喜欢我请关注!!!

