特征工程实战2：随机森林模型用于疾病的判断工程

tech2022-08-26 176

x_train、y_train 最初在同一张表里。划分 x 和 y 时，用 drop 函数从整张表中删去 y（target）那一列，剩下的列就是 x。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split  # train/test partitioning

# The three model-interpretation libraries used after training.
import eli5
from eli5.sklearn import PermutationImportance
import shap  # compares how features push the model output up or down
from pdpbox import pdp, info_plots

# Seed NumPy's global RNG; like random_state, this keeps results the same
# from one run to the next.
np.random.seed(123)

# Load the heart-disease table and take a first look at it.
dt = pd.read_csv('C:/Users/lb/Desktop/test/heart.csv')
dt.head()
dt.info()  # the author found no missing values in this output

# Rename the columns to more readable names (adjust to taste). The list is
# positional: it must contain one name per existing column, in order.
dt.columns = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
]

# Replace the integer codes of the categorical features with readable labels.
#
# The mapping is applied with Series.replace on the column itself instead of
# chained indexing (dt[col][mask] = value). Chained assignment raises
# SettingWithCopyWarning and silently does nothing under pandas
# Copy-on-Write.
#
# Codes that do not appear in a mapping are left unchanged.
# NOTE(review): confirm these codes against the actual heart.csv; some
# versions of the dataset number chest_pain_type / st_slope from 0.
_CATEGORY_LABELS = {
    'sex': {0: 'female', 1: 'male'},
    # Type of chest pain experienced.
    'chest_pain_type': {
        1: 'typical angina',
        2: 'atypical angina',
        3: 'non-anginal pain',
        4: 'asymptomatic',
    },
    # Fasting blood sugar relative to the 120 threshold.
    'fasting_blood_sugar': {
        0: 'lower than 120mg/ml',
        1: 'greater than 120mg/ml',
    },
    # Resting electrocardiogram result.
    'rest_ecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',  # probable/definite by Estes' criteria
    },
    # Exercise-induced angina.
    'exercise_induced_angina': {0: 'no', 1: 'yes'},
    # Slope of the peak-exercise ST segment.
    'st_slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    # Thalassemia (a blood disorder).
    'thalassemia': {1: 'normal', 2: 'fixed defect', 3: 'reversable defect'},
}

for column, labels in _CATEGORY_LABELS.items():
    # astype('object') does not turn the labels back into numbers; it keeps
    # the column as generic objects so that pd.get_dummies treats it as
    # categorical and one-hot encodes it.
    dt[column] = dt[column].replace(labels).astype('object')

# One-hot encode the categorical (object) columns with pandas.
# drop_first=True would drop the first level of every encoded feature; it is
# not passed here, so each level keeps its own indicator column.
dt = pd.get_dummies(data=dt)
dt.head()

# Hold out 20% of the rows as a test set. The features are every column
# except 'target'; the label is the 'target' column. A fixed random_state
# makes the shuffle reproducible, so the same rows land in the test set on
# every run.
x_train, x_test, y_train, y_test = train_test_split(
    dt.drop('target', axis=1),
    dt['target'],
    test_size=0.2,
    random_state=10,
)

# Fit a random forest whose trees are limited to a depth of 5.
model = RandomForestClassifier(max_depth=5)
model.fit(x_train, y_train)

# Author's notes on RandomForestClassifier parameters:
# - bootstrap=True: whether to bootstrap (resample with replacement);
#   defaults to True.
# - class_weight="balanced": weights each class by
#   n_samples / (n_classes * np.bincount(y)).
# - criterion: split rule, "gini" (default) or "entropy".
# - max_depth=5: maximum depth of each tree.
# - max_features: number of features considered when looking for the best
#   split. int -> that many features; float -> that fraction of features;
#   "auto"/"sqrt" -> sqrt(n_features); "log2" -> log2(n_features);
#   None -> n_features.
#   NOTE(review): accepted values and the default differ between
#   scikit-learn versions; check the installed version.
# - max_leaf_nodes=None: maximum number of leaf nodes.
# - min_impurity_decrease=0: minimum impurity decrease required to split.
# - n_estimators: number of trees in the forest.
# - n_jobs: number of parallel jobs; -1 uses all processors.
# - oob_score=False: whether to estimate accuracy on out-of-bag samples.
# - random_state=None: RNG seed; set it for reproducible runs.
# - verbose: controls how much is logged while fitting.
# - warm_start=False: if True, reuse the previous fit and add more trees;
#   otherwise fit a brand-new forest.
print(model)  # shows the forest's parameters

# Render one tree of the forest with Graphviz (installed separately; the
# `dot` executable must be on PATH).

# Any fitted tree of the forest will do for illustration.
estimator = model.estimators_[0]
feature_names = list(x_train.columns)

# export_graphviz expects the class names in ascending order of the class
# labels, not one label per training sample: 0 = no heart disease,
# 1 = heart disease.
class_names = ['no disease', 'disease']

export_graphviz(
    estimator,
    out_file='tree.dot',
    feature_names=feature_names,
    class_names=class_names,
    rounded=True,      # draw node boxes with rounded corners
    proportion=True,   # show proportions/percentages instead of raw counts
    label='root',      # print the impurity/sample labels on the root only
    precision=2,       # digits of precision for impurity, threshold, values
    filled=True,       # colour the nodes by majority class
)

# Convert the .dot file to PNG with the Graphviz command line:
#   dot            the Graphviz layout command
#   -Tpng          output image type
#   tree.dot       the file written by export_graphviz above
#   -o tree.png    output file name
#   -Gdpi=600      600 pixels per inch
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display the image in a Jupyter notebook.
from IPython.display import Image
Image(filename='tree.png')

重要的是：模型训练完成后，可以查看它输出的预测概率和分类结果。

训练完模型之后，可以使用三个模型解释工具库（eli5、shap、pdpbox）来分析各个特征的作用。

# Permutation importance of every feature, measured on the held-out test set.
# random_state=1 keeps the shuffling, and therefore the scores, reproducible.
# The test frame is the lower-case `x_test` produced by train_test_split.
perm = PermutationImportance(model, random_state=1).fit(x_test, y_test)

# Show the weights with the real column names attached.
eli5.show_weights(perm, feature_names=x_test.columns.tolist())
# Author's reading of the output: the chest-pain feature in the first row has
# a high weight, so it matters for predicting heart disease; non-anginal pain
# and normal thalassemia in the middle rows contribute next to nothing.

看一下别的因素

# Partial dependence of the model's prediction on a single feature.
feat_name = 'age'

# The model was fitted on every column of the feature frame, so those column
# names are the model features (the target column is already excluded).
base_features = x_test.columns.tolist()

pdp_dist = pdp.pdp_isolate(
    model=model,
    dataset=x_test,
    model_features=base_features,
    feature=feat_name,
)
pdp.pdp_plot(pdp_dist, feat_name)
plt.show()
# Author's reading of the plot: predicted risk falls as age rises, but ages
# of roughly 37 to 42 stand out as a higher-risk range.

# SHAP summary plot of the test set.
# Author's reading: for chest_pain_type the points go from blue to red as the
# value grows, i.e. a larger value has a stronger effect.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(x_test)

# NOTE(review): indexing with [1] assumes shap_values holds one array per
# class (index 1 = the positive class) -- confirm for the installed shap
# version.
shap.summary_plot(shap_values[1], x_test)

def heart_disease_risk_factors(model, patient):
    """Return a SHAP force plot explaining the model's output for one patient.

    Args:
        model: the fitted tree model handed to ``shap.TreeExplainer``.
        patient: the feature values of the patient to explain.

    Returns:
        The object returned by ``shap.force_plot`` for index 1 of the
        explainer's expected values and SHAP values.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(patient)
    shap.initjs()  # load the JavaScript needed to render the plot in a notebook
    # NOTE(review): index 1 is presumably the positive (disease) class --
    # confirm for the installed shap version.
    return shap.force_plot(explainer.expected_value[1], shap_values[1], patient)


# Explain a single test patient. iloc[1] is the SECOND row of the test set;
# all of its features are cast to float.
data_for_prediction = x_test.iloc[1, :].astype(float)
heart_disease_risk_factors(model, data_for_prediction)
# Author's reading of the plot: the red chest_pain_type = 2 (atypical angina)
# bar contributes strongly, and the blue num_major_vessels = 1 bar shows that
# fewer major vessels pushes towards heart disease.

最新回复(0)