姓氏家谱知识图谱构建

tech2022-12-29  94

Knowledge map of family tree (向氏家谱)

github地址:https://github.com/lonngxiang/Knowledge-map-of-family-tree

本项目主要是从数据中整理出实体与关系,上传到 neo4j 图数据库来构建知识图谱

数据源:http://xiangshijiapu.com/Family/F5186/V00251867559/WebSxtList.htm

难点:1、重名处理,特别是同一代与不同代之间的重名 2、关系:主要有父子、兄妹、叔伯三种

3、py2neo库连接neo4j存在版本兼容问题,后续更新:使用 neo4j-community-3.5.21 版本没有问题

下载数据源

import numpy as np
import requests
from lxml import etree

# Download the genealogy page and parse it as HTML.
url = 'http://xiangshijiapu.com/Family/F5186/V00251867559/WebSxtList.htm'
html = requests.get(url)
html.encoding = 'gb2312'
# Lower-case every "T", "R" and "D" in the raw page before parsing.
# NOTE(review): presumably meant to normalise upper-case <TR>/<TD> tags for
# the lower-case XPath expressions below; it also rewrites those letters in
# the page text itself.
html1 = html.text.replace("T", "t").replace("R", "r").replace("D", "d")
new_html = etree.HTML(html1)

# lines[row][col] is the list of text fragments found in that table cell
# (with full-width spaces removed); 21 columns are read per row.
lines = []
row_count = len(new_html.xpath('//tr'))
for row_idx in range(row_count):
    line = []
    for col_idx in range(21):
        aa = new_html.xpath(
            '//tr[{}]//td[{}]/div//text()'.format(row_idx + 1, col_idx + 1)
        )
        bb = [fragment.replace("\u3000", "") for fragment in aa]
        # Keep only the first two fragments when the second one is non-empty.
        # An explicit length check replaces the former bare
        # ``except: pass`` that hid the IndexError on short cells.
        if len(bb) > 1 and bb[1]:
            bb = bb[:2]
        line.append(bb)
        print(bb)
    lines.append(line)

# The cells have different lengths, so the nested list is ragged. Build the
# object array explicitly: NumPy >= 1.24 raises ValueError when asked to
# infer an array from ragged nested sequences, which np.save would otherwise
# attempt implicitly.
np.save('家谱111.npy', np.array(lines, dtype=object))
numpy_array = np.load('家谱111.npy', allow_pickle=True)

同一代重名整理和父子关系区分

import pandas as pd

# One DataFrame cell per scraped table cell; every cell holds a list.
df66 = pd.DataFrame(lines)


def same1(x, y, z):
    """Tag the name stored in one cell with its generation, in place.

    Only two-element cells are touched; their second element is treated as
    the person's name. The name gets the suffix ``<N世>`` where ``N`` is
    ``y + 1``. If the tagged name was already seen in the current column, a
    ``-重名<counter>`` suffix is appended as well so it stays unique.

    Relies on the module-level ``sames`` (tagged names seen so far) and
    ``iii0`` (number of cells visited so far); the calling loop resets both
    before each column.

    Args:
        x: The cell content (a list); mutated in place when it has exactly
            two elements.
        y: Zero-based column index of the cell.
        z: Unused.

    Returns:
        ``x`` itself when it has two elements, otherwise ``str(x)``.
    """
    global iii0
    global sames
    print(x)
    iii0 += 1
    if len(x) != 2:
        return str(x)
    x[1] = x[1] + "<" + str(y + 1) + "世" + ">"
    if x[1] in sames:
        # Same name already seen in this column: disambiguate it with the
        # running cell counter.
        x[1] = x[1] + "-重名" + str(iii0)
    sames.append(x[1])
    print(x)
    return x


# Tag every column independently. The lists inside df66 are mutated in place,
# so the value returned by ``apply`` is not needed.
for i in range(21):
    sames = []
    iii0 = 0
    df66[i].apply(same1, args=(i, 1))

# Box-drawing characters that start a cell belonging to a parent/child link.
BRANCH_MARKERS = ("━", "┳", "┣", "┗")

# kkk[col] lists every marker cell of that column as ``cell + [row, col]``,
# in row order. Row 0 is skipped. The row range is derived from the frame
# instead of a hard-coded row count, so a page with a different number of
# rows neither overruns the frame nor gets truncated.
kkk = []
for j in range(21):
    kk = []
    for row in range(1, len(df66)):
        value = df66.iloc[row, j]
        try:
            if value[0] not in BRANCH_MARKERS:
                continue
            entry = value + [row, j]
        except (IndexError, TypeError):
            # Empty cell, or a cell that is not a list: nothing to record.
            continue
        kk.append(entry)
        print(entry)
    kkk.append(kk)

# mmm[col] is a list of one-entry dicts:
#   {parent_name: child_name}       for a "━" cell (single child)
#   {parent_name: [child_name...]}  for a "┳" ... "┗" run (sibling group)
# The parent name is read from the cell in the same row, one column to the
# left, of the "━" cell or of the first cell of the run.
# NOTE(review): for column 0 the "one column to the left" index is -1, which
# iloc resolves to the last column - confirm column 0 never holds markers.
mmm = []
for kk1 in kkk:
    mma = []
    mm1 = []  # positions in kk1 of the "┳" / "┗" cells
    for num, entry in enumerate(kk1):
        marker = entry[0]
        if marker == '━':
            try:
                mm = {}
                mm[df66.iloc[entry[2], entry[3] - 1][1]] = entry[1]
                mma.append(mm)
            except (IndexError, TypeError):
                # Marker cell without a name, or parent cell without one:
                # skip this link.
                pass
        elif marker in ('┳', '┗'):
            mm1.append(num)
    # Consecutive positions in mm1 are consumed as (start, end) pairs.
    # NOTE(review): this assumes "┳" and "┗" strictly alternate within a
    # column; an unpaired marker raises IndexError here.
    for k in range(0, len(mm1), 2):
        print(mm1[k])
        group = kk1[mm1[k]:mm1[k + 1] + 1]
        mm = {}
        mm[df66.iloc[group[0][2], group[0][3] - 1][1]] = [
            member[1] for member in group
        ]
        mma.append(mm)
    mmm.append(mma)

存储图数据库

from py2neo import Graph, Node, NodeMatcher, Relationship

graph = Graph("http://localhost:7474", auth=('neo4j', "neo4j"))
# Looks up previously created Person_name nodes by name. The relationship
# loops below depend on it, so it must exist before they run.
matcher = NodeMatcher(graph)

# Entities: collect every distinct name that appears as a parent or a child.
# Each item of ``mmm`` is a list of one-entry dicts mapping a parent name to
# either a single child name (str) or a list of child names.
name_set = set()
for generation in mmm:
    for family in generation:
        for parent, children in family.items():
            name_set.add(parent)
            if isinstance(children, str):
                name_set.add(children)
            else:
                name_set.update(children)

for name in name_set:
    graph.create(Node('Person_name', name=name))

# Father/son: one relationship per child, pointing from the child node to
# the parent node.
for generation in mmm:
    for family in generation:
        for parent, children in family.items():
            node_a = matcher.match("Person_name", name=parent).first()
            if isinstance(children, str):
                node_b = matcher.match("Person_name", name=children).first()
                print(node_a, node_b)
                graph.create(Relationship(node_b, '父子', node_a))
            else:
                for child in children:
                    node_b = matcher.match("Person_name", name=child).first()
                    graph.create(Relationship(node_b, '父子', node_a))

# Siblings: within each child list, link every later child to the first one.
# Single-child (str) entries have no siblings and are skipped.
for generation in mmm:
    for family in generation:
        for parent, children in family.items():
            if isinstance(children, str):
                continue
            node_a = matcher.match("Person_name", name=children[0]).first()
            for sibling in children[1:]:
                node_b = matcher.match("Person_name", name=sibling).first()
                graph.create(Relationship(node_b, '兄妹', node_a))

# Cypher query (run it against Neo4j, not in Python) that returns up to 1000
# directed paths of 1 to 20 hops:
# MATCH p=()-[*1..20]->() RETURN p LIMIT 1000

// Return every path of at least 3 hops, in either direction, between the two named Person_name nodes.
match data=(na:Person_name{name:'向智念<17世>'})-[*3..]-(nb:Person_name{name:'向骏<18世>'}) return data

// Return every directed path of 1 to 20 hops that starts at the named Person_name node.
match data=(na:Person_name{name:'向语涵<19世>'})-[*1..20]->(nb:Person_name) return data

另:叔伯关系暂时未加,主要是前后辈关系,大体思路是隔代数据进行关系关联整理:

# Sketch of the uncle relation. ``mm`` and ``mm5`` are not defined in this
# snippet.
# NOTE(review): presumably ``mm`` maps a parent to its children for one
# generation and ``mm5`` does the same for the next generation - confirm
# before running.
#
# For every sibling list in ``mm`` and every member ``m`` of it that is a key
# of ``mm5``, one list is appended to ``uncles``. That list holds a
# ``{child_of_m: sibling_of_m}`` dict for each child of ``m`` combined with
# each of ``m``'s other siblings.
uncles = []
for siblings in mm.values():
    if not isinstance(siblings, list):
        continue
    print(siblings)
    for m in siblings:
        print(m)
        if m not in mm5:
            continue
        children = mm5[m]
        if isinstance(children, str):
            # A single child is stored as a bare string; treat it as a
            # one-element list so both cases share the same code path.
            children = [children]
        uncle = [
            {child: q}
            for child in children
            for q in siblings
            if q != m
        ]
        uncles.append(uncle)
最新回复(0)