Knowledge map of family tree (向氏家谱)
github地址:https://github.com/lonngxiang/Knowledge-map-of-family-tree
本项目主要是把数据整理出实体与关系上传neo4j图数据库做知识图谱
数据源:http://xiangshijiapu.com/Family/F5186/V00251867559/WebSxtList.htm
难点:1、重名处理,特别同一代与不同代 2、关系:父子,兄妹,叔伯三种主要
3、py2neo库连接neo4j版本问题,后续更新neo4j-community-3.5.21 版本没问题
下载数据源
import requests
from lxml import etree
# Download the genealogy table and turn it into `lines`: one entry per <tr>,
# each entry a list of 21 cells, each cell a list of text fragments.
url = 'http://xiangshijiapu.com/Family/F5186/V00251867559/WebSxtList.htm'
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as data.
html = requests.get(url, timeout=30)
html.raise_for_status()
html.encoding = 'gb2312'
# NOTE(review): presumably this lower-cases upper-case <TR>/<TD> tags so the
# lower-case XPath below matches; it also rewrites the letters T/R/D inside
# the page text. Confirm it is still needed.
html1 = html.text.replace("T", "t").replace("R", "r").replace("D", "d")
new_html = etree.HTML(html1)

lines = []
for i in range(len(new_html.xpath('//tr'))):
    line = []
    # XPath positions are 1-based, hence the +1 on both indices.
    for j in range(21):
        aa = new_html.xpath('//tr[{}]//td[{}]/div//text()'.format(i + 1, j + 1))
        # Drop ideographic spaces (U+3000) from every text fragment.
        bb = [ii.replace("\u3000", "") for ii in aa]
        # When the cell has a non-empty second fragment, keep only the first
        # two fragments; shorter cells are kept unchanged.
        if len(bb) > 1 and bb[1]:
            bb = bb[:2]
        line.append(bb)
        print(bb)
    lines.append(line)
import numpy as np
# Cache the scraped table on disk and read it back.
# `lines` is ragged (cells hold text lists of different lengths), so it must be
# converted to an object array explicitly: recent NumPy versions refuse to
# build an array from a ragged nested list unless dtype=object is given.
np.save('家谱111.npy', np.array(lines, dtype=object), allow_pickle=True)
# Object arrays are stored with pickle, so loading needs allow_pickle=True.
# Only load files written by this script; unpickling untrusted data is unsafe.
numpy_array = np.load('家谱111.npy', allow_pickle=True)
同一代重名整理和父子关系区分
import pandas as pd
# Put the scraped table into a DataFrame: one row per entry of `lines`,
# one column per cell position; each cell holds that cell's list of text
# fragments.
df66 = pd.DataFrame(lines)
def same1(x, y, z):
    """Tag a two-item cell with its generation and mark repeated names.

    Reads and updates the module-level globals ``iii0`` (a counter bumped on
    every call) and ``sames`` (the list of tagged names seen so far); the
    caller is expected to reset both before each column.

    Args:
        x: The cell value. When it has exactly two items, ``x[1]`` is
            rewritten in place.
        y: Zero-based column index; ``y + 1`` is written into the tag as the
            generation number.
        z: Unused.

    Returns:
        ``x`` itself (mutated) when it has two items, otherwise ``str(x)``.
    """
    global iii0
    global sames
    print(x)
    iii0 += 1

    # Cells that are not a two-item pair are left untouched.
    if len(x) != 2:
        return str(x)

    tagged = x[1] + "<" + str(y + 1) + "世>"
    # A name already seen in this column gets a suffix built from the running
    # call counter, which keeps the tagged name unique.
    if tagged in sames:
        tagged = tagged + "-重名" + str(iii0)
    x[1] = tagged
    sames.append(tagged)
    print(x)
    return x
# Run same1 over every cell, one column at a time. The module-level state that
# same1 reads and mutates is reset before each column, so repeated names are
# only detected within a single column.
for i in range(21):
    sames, iii0 = [], 0
    # same1 rewrites the cell lists in place, so the returned Series is not kept.
    df66[i].apply(same1, args=(i, 1))
# Box-drawing characters that mark a cell as a tree connector.
_CONNECTORS = ("━", "┳", "┣", "┗")

# kkk[j] lists, top to bottom, every connector cell found in column j as
# `cell_contents + [row, column]`.
kkk = []
for j in range(df66.shape[1]):
    kk = []
    # Row 0 is skipped, as before. The upper bound now follows the table's
    # real length instead of a hard-coded row count, so a longer table is not
    # silently truncated and a shorter one no longer relies on swallowed
    # IndexErrors.
    for i in range(1, len(df66)):
        value = df66.iloc[i, j]
        # Only non-empty list cells that start with a connector are kept;
        # this replaces the bare `except` that used to skip everything else.
        if isinstance(value, list) and value and value[0] in _CONNECTORS:
            entry = value + [i, j]
            kk.append(entry)
            print(entry)
    kkk.append(kk)
# mmm[j] holds the relations found in column j as single-key dicts. The key is
# element [1] of the cell one column to the left; the value is either one name
# (a '━' link) or the list of names in a '┳' ... '┗' group.
mmm = []
for kk1 in kkk:
    mma = []
    mm1 = []  # positions in kk1 of the '┳' and '┗' markers, in order
    for num, entry in enumerate(kk1):
        try:
            marker = entry[0]
            if marker == '━':
                # The key comes from the cell on the same row, one column left.
                mm = {df66.iloc[entry[2], entry[3] - 1][1]: entry[1]}
                mma.append(mm)
            elif marker in ('┳', '┗'):
                mm1.append(num)
        except Exception:
            # Best effort, as before: entries that lack a name or a usable
            # left-hand cell are skipped. Catching Exception rather than using
            # a bare `except` keeps Ctrl-C and SystemExit working.
            continue

    # Markers are consumed two at a time as (group start, group end).
    # NOTE(review): this assumes mm1 has an even length; an unpaired marker
    # raises IndexError here, exactly as it did before.
    for k in range(0, len(mm1), 2):
        print(mm1[k])
        group = kk1[mm1[k]:mm1[k + 1] + 1]
        head = group[0]
        # The key is taken from the cell left of the group's first row.
        mm = {df66.iloc[head[2], head[3] - 1][1]: [member[1] for member in group]}
        mma.append(mm)
    mmm.append(mma)
存储图数据库
from py2neo import Graph, Node, NodeMatcher, Relationship
# Connect to the Neo4j server on localhost with the credentials given here.
graph = Graph("http://localhost:7474", auth=('neo4j', "neo4j"))
# The relationship-building code below looks nodes up through `matcher`, which
# was never defined (NameError at runtime); bind a NodeMatcher to this graph.
matcher = NodeMatcher(graph)
# Entities: collect every distinct name that appears in a relation, as a key
# or as a value, then create one 'Person_name' node per name.
name_set = set()
for relations in mmm:
    for relation in relations:
        for parent, children in relation.items():
            name_set.add(parent)
            # The value is a single name (str) or a collection of names.
            if type(children) is str:
                name_set.add(children)
            else:
                name_set.update(children)

for person in name_set:
    graph.create(Node('Person_name', name=person))
# Father-son: for every relation, create a '父子' relationship pointing from
# each value name's node to the key name's node.
for relations in mmm:
    for relation in relations:
        for parent, children in relation.items():
            parent_node = matcher.match("Person_name", name=parent).first()
            if type(children) is str:
                # Single name: one edge, and the matched pair is printed.
                child_node = matcher.match("Person_name", name=children).first()
                print(parent_node, child_node)
                graph.create(Relationship(child_node, '父子', parent_node))
                continue
            # Several names: one edge per name, all to the same key node.
            for child in children:
                child_node = matcher.match("Person_name", name=child).first()
                graph.create(Relationship(child_node, '父子', parent_node))
# Siblings: within every multi-name relation, link each later name to the
# first name of the list with a '兄妹' relationship.
for relations in mmm:
    for relation in relations:
        for children in relation.values():
            # A single-name value has no siblings to link.
            if type(children) is str:
                continue
            first_node = matcher.match("Person_name", name=children[0]).first()
            for sibling in children[1:]:
                sibling_node = matcher.match("Person_name", name=sibling).first()
                graph.create(Relationship(sibling_node, '兄妹', first_node))
MATCH p=()-[*1..20]->() RETURN p LIMIT 1000
match data=(na:Person_name{name:'向智念<17世>'})-[*3..]-(nb:Person_name{name:'向骏<18世>'}) return data
match data=(na:Person_name{name:'向语涵<19世>'})-[*1..20]->(nb:Person_name) return data
另:叔伯关系暂时未加,主要是前后辈关系,大体思路是隔代数据进行关系关联整理:
# Sketch of the uncle relation. `mm` and `mm5` are not defined in the visible
# code; presumably they are key -> name(s) maps for two adjacent generations.
# TODO confirm.
# For every list value `j` in `mm` and every member `m` of it that is a key of
# `mm5`, each name stored under mm5[m] is paired with every other member of
# `j` as {name: other_member}. One such list is appended to `uncles` per `m`.
uncles = []
for i, j in mm.items():
    if type(j) is not list:
        continue
    print(j)
    for m in j:
        print(m)
        if m not in mm5.keys():
            continue
        juniors = mm5[m]
        # A single name is treated like a one-element list.
        if type(juniors) is str:
            juniors = [juniors]
        uncle = [{n: q} for n in juniors for q in j if q != m]
        uncles.append(uncle)