特征工程实战2:随机森林模型用于疾病的判断工程

tech2022-08-26  113

x_train、y_train:特征和标签最初放在同一张表里。确定 x、y 的时候,用 drop 函数从整张表中删去 y(target)所在的那一列,剩下的列都是 x。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split  # train / test splitting

# The three model-explanation packages used in this walkthrough.
import eli5
from eli5.sklearn import PermutationImportance
import shap  # compares how features push the model output up or down
from pdpbox import pdp, info_plots

# Seed NumPy's global RNG so that repeated runs give the same results
# (same purpose as passing random_state to an estimator).
np.random.seed(123)

# NOTE(review): absolute path on the author's machine - adjust before running.
dt = pd.read_csv('C:/Users/lb/Desktop/test/heart.csv')
dt.head()
dt.info()  # the author reports no missing values in this output

# Rename the columns to more readable names.
# NOTE(review): assumes the CSV has exactly these 14 columns in this order -
# confirm against the file.
dt.columns = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
]

# Replace the integer codes of the categorical features with readable labels,
# so the one-hot columns produced afterwards have self-describing names.
# Codes that are not listed in a mapping are left unchanged.
# NOTE(review): confirm these codes match the CSV - some releases of this
# dataset start chest_pain_type / st_slope at 0 rather than 1.
category_labels = {
    'sex': {0: 'female', 1: 'male'},
    # Chest pain experienced.
    'chest_pain_type': {
        1: 'typical angina',
        2: 'atypical angina',
        3: 'non-anginal pain',
        4: 'asymptomatic',
    },
    # Fasting blood sugar.
    'fasting_blood_sugar': {
        0: 'lower than 120mg/ml',
        1: 'greater than 120mg/ml',
    },
    # Resting electrocardiogram measurement.
    'rest_ecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        # Probable or definite left ventricular hypertrophy (Estes' criteria).
        2: 'left ventricular hypertrophy',
    },
    # Exercise-induced angina.
    'exercise_induced_angina': {0: 'no', 1: 'yes'},
    # Slope of the peak-exercise ST segment.
    'st_slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    # Blood disorder called thalassemia.
    'thalassemia': {
        1: 'normal',
        2: 'fixed defect',
        3: 'reversable defect',
    },
}

for column, labels in category_labels.items():
    # Assign the whole column in one step. The original chained form
    # dt[col][mask] = value may write to a temporary copy and is not
    # supported under pandas Copy-on-Write.
    relabelled = dt[column].map(lambda code, labels=labels: labels.get(code, code))
    # Keep the columns as object dtype so pd.get_dummies treats them as
    # categorical; this does NOT convert the labels back to numbers.
    dt[column] = relabelled.astype('object')

# One-hot encode the categorical columns with pandas.
# Passing drop_first=True would drop the first dummy column of every encoded
# feature; with the default used here all dummy columns are kept.
dt = pd.get_dummies(dt)
dt.head()

# Split features and target into training and test sets.
# x is every column except 'target'; y is the 'target' column.
# test_size=0.2 holds out 20% of the rows (e.g. 100 rows -> 80 train, 20 test).
# A fixed random_state (here 10; in practice a larger number is often used
# instead of 0 or 1) makes the split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    dt.drop('target', axis=1),
    dt['target'],
    test_size=0.2,
    random_state=10,
)

# Train a random forest whose trees are limited to a depth of 5.
model = RandomForestClassifier(max_depth=5)
model.fit(x_train, y_train)

# Notes on RandomForestClassifier parameters:
# - bootstrap=True: whether to build trees on bootstrap samples (sampling
#   with replacement); True is the default.
# - class_weight="balanced": adjusts weights automatically, each class
#   getting n_samples / (n_classes * np.bincount(y)).
# - criterion: split quality measure, "gini" (default) or "entropy".
# - max_depth=5: trees are at most 5 levels deep.
# - max_features: int, float, string or None - number of features considered
#   when looking for the best split:
#     int   -> that many features,
#     float -> that fraction of the features,
#     "sqrt" -> sqrt(n_features) ("auto" meant the same in older releases),
#     "log2" -> log2(n_features),
#     None  -> n_features.
# - max_leaf_nodes=None: maximum number of leaf nodes.
# - min_impurity_decrease=0: minimum impurity decrease required to split.
# - n_estimators: number of trees in the forest.
# - n_jobs: number of parallel jobs; -1 uses all processors.
# - oob_score (default False): whether to estimate accuracy on out-of-bag
#   samples.
# - random_state=None: random seed; fix it to make reruns reproducible.
# - verbose: controls how much is logged while fitting and predicting.
# - warm_start (default False): if True, reuse the previous fit and add more
#   trees to it; otherwise fit a brand-new forest.
print(model)  # shows the forest's parameters

# Export one tree of the forest to Graphviz and render it as a PNG.
# Graphviz itself (the `dot` executable) must be installed separately.

# A single fitted tree from the forest; any index into estimators_ would do.
estimator = model.estimators_[0]

# Feature names, in the column order used for training.
feature_names = list(x_train.columns)

# Class names in ascending order of the target values:
# 0 means no heart disease, 1 means heart disease.
# export_graphviz expects one name per class, not one label per sample.
class_names = ['no disease', 'disease']

export_graphviz(
    estimator,
    out_file='tree.dot',
    feature_names=feature_names,
    class_names=class_names,
    rounded=True,     # draw node boxes with rounded corners
    proportion=True,  # show proportions instead of raw sample counts
    label='root',     # print the informative labels on the root node only
    precision=2,      # digits of precision for impurity, threshold and values
    filled=True,      # colour the nodes
)

# Convert the .dot file to PNG with the Graphviz command-line tool:
#   dot          - the Graphviz command that renders the graph
#   -Tpng        - output image type is PNG
#   tree.dot     - the file written by out_file above
#   -o tree.png  - output file name
#   -Gdpi=600    - 600 pixels per inch
import subprocess

# check=True raises if dot fails, instead of silently leaving a stale image.
subprocess.run(
    ['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'],
    check=True,
)

# Display the image in a Jupyter notebook.
from IPython.display import Image

Image(filename='tree.png')

重要:可以查看预测概率和分类结果。

训练完模型之后,可以使用三个插件(eli5、pdpbox、shap)来解释模型。

# Compute permutation importance for the trained forest on the test set.
# random_state=1 fixes the shuffling so the result is the same on every run.
# The split creates x_test (lower case), so that is the name used here.
perm = PermutationImportance(model, random_state=1).fit(x_test, y_test)

# Show the importance weight of every feature.
eli5.show_weights(perm, feature_names=x_test.columns.tolist())

# Author's reading of the output: the chest-pain feature in the first row has
# a high weight, so it matters a lot for predicting heart disease, while
# non-anginal pain and normal thalassemia (middle of the table) contribute
# almost nothing.

看一下别的因素

# Partial dependence plot: how the predicted outcome changes with one feature.
feat_name = 'age'

# All model input features, i.e. every column of the test feature table.
base_features = x_test.columns.tolist()

# NOTE(review): pdp_isolate / pdp_plot are the older pdpbox API - confirm the
# installed pdpbox version still provides them.
pdp_dist = pdp.pdp_isolate(
    model=model,
    dataset=x_test,
    model_features=base_features,
    feature=feat_name,
)
pdp.pdp_plot(pdp_dist, feat_name)
plt.show()

# Author's reading of the plot: the predicted heart-disease risk falls as age
# rises, except for ages 37 to 42, where the risk is higher.

# SHAP summary plot over the whole test set.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(x_test)

# Index 1 selects the SHAP values of class 1 (heart disease).
# NOTE(review): assumes shap_values is indexable per class, as in older shap
# releases - confirm for the installed version.
shap.summary_plot(shap_values[1], x_test)

# Author's reading of the plot: for chest_pain_type the points go from blue
# to red as the value grows, i.e. a larger value means a more severe case.

def heart_disease_risk_factors(model, patient):
    """Build a SHAP force plot explaining the prediction for one patient.

    Args:
        model: the fitted tree-based model passed to shap.TreeExplainer.
        patient: the feature values of the patient to explain.

    Returns:
        The object returned by shap.force_plot for class 1 (heart disease).
    """
    explainer = shap.TreeExplainer(model)
    # SHAP values of every feature for this patient.
    shap_values = explainer.shap_values(patient)
    # Load the JavaScript that the force plot needs to render in a notebook.
    shap.initjs()
    # Index 1 selects the expected value and SHAP values of class 1.
    return shap.force_plot(explainer.expected_value[1], shap_values[1], patient)


# Take the test sample at position 1 (the second row, since iloc is
# zero-based) and cast all of its features to float.
data_for_prediction = x_test.iloc[1, :].astype(float)
heart_disease_risk_factors(model, data_for_prediction)

# Author's reading of the plot: in red, chest_pain_type = 2 (atypical angina)
# pushes strongly towards "no heart disease"; in blue, num_major_vessels = 1 -
# the fewer major vessels, the stronger the push towards heart disease.

最新回复(0)