最后记录一个从网上看到的代码：以指定的键列为准，从左表中去除键值在右表中也出现的记录，只保留左表中剩下的行（相当于 left anti join）。
from pyspark.sql import functions def LeftDeleteRight(test_left,test_right,left_col = 'user_pin',right_col = 'user_pin'): print('right data process ...') columns_right = test_right.columns test_right = test_right.withColumn('user_pin_right', test_right[right_col]) test_right = test_right.withColumn('notDelete', functions.lit(0)) # 删除其余的 for col in columns_right: test_right = test_right.drop(col) # 合并 print('rbind left and right data ...') test_left = test_left.join(test_right, test_left[left_col] == test_right['user_pin_right'], "left") test_left = test_left.fillna(1) test_left = test_left.where('notDelete =1') # 去掉多余的字段 for col in ['user_pin_right','notDelete']: test_left = test_left.drop(col) return test_left %time test_left = LeftDeleteRight(test_b,test_a,left_col = 'user_pin',right_col = 'user_pin')