参考文章 文章1 文章2
ConceptNet下载
中文部分 http://openkg.cn/dataset/conceptnet5-chinese 完整下载 https://github.com/commonsense/conceptnet5/wiki/Downloads
中文部分下载到以后是一个csv文件
读取内容
import pandas as pd

# Raw string so the backslashes in the Windows path are never interpreted
# as escape sequences ('\c' triggers a SyntaxWarning on modern Python).
FILE = r'F:\conceptNet_chinese\chineseconceptnet.csv'

# The ConceptNet dump is a tab-separated file with five columns and NO
# header row.  The original code used the default header=0 and then renamed
# the columns, which silently discarded the first edge of the file;
# header=None + names= reads every row as data.
data = pd.read_csv(
    FILE,
    delimiter='\t',
    header=None,
    names=['uri', 'relation', 'start', 'end', 'json'],
)
上边内容的含义:按制表符读取 csv 的五列数据,并依次命名为 uri、relation、start、end、json。
# Keep only edges whose BOTH endpoints are Chinese concepts.
# ConceptNet node URIs have the form '/c/<lang>/<term>', so test the
# language prefix explicitly: the original `row.find('zh') > 0` would also
# match unrelated URIs that merely contain "zh" (e.g. '/c/en/zhang').
# Vectorized .str.startswith also avoids the per-row Python lambdas.
zh_mask = (data['start'].str.startswith('/c/zh/')
           & data['end'].str.startswith('/c/zh/'))
data = data[zh_mask]
data.index = range(data.shape[0])  # renumber rows 0..n-1 after filtering
print(data)

import json

# Each row's 'json' column is a metadata blob that carries the edge weight;
# pull the weight out into its own numeric column and drop the raw JSON.
weights = data['json'].apply(lambda row: json.loads(row)['weight'])
data.pop('json')
data.insert(4, 'weights', weights)
print(data)
中文繁简体转换
# NOTE: the module is named 'langconv' — 'zhtools.longconv' is a typo and
# raises ImportError.  As explained in the note below, download langconv.py
# and zh_wiki.py and place them next to this script, then import directly.
from langconv import *
def cht_to_chs(line):
    """Convert Traditional Chinese text to Simplified Chinese.

    Args:
        line: a str containing Traditional Chinese text.

    Returns:
        The Simplified Chinese equivalent as a str.
    """
    # Python-3 fix: the original called line.decode('utf-8') (a Python 2
    # idiom) which raises AttributeError on a str, and then discarded the
    # result of line.encode('utf-8') — a no-op.  On Python 3 the str can be
    # passed to the converter directly.
    return Converter('zh-hans').convert(line)
def chs_to_cht(line):
    """Convert Simplified Chinese text to Traditional Chinese.

    Args:
        line: a str containing Simplified Chinese text.

    Returns:
        The Traditional Chinese equivalent as a str.
    """
    # Python-3 fix: drop the Python-2 decode('utf-8') call (AttributeError
    # on str) and the discarded encode('utf-8') no-op; pass the str through.
    return Converter('zh-hant').convert(line)
这里需要强调,langconv 这个包 在pycharm里 不会自动安装。需要下载 https://github.com/skydark/nstools/tree/master/zhtools 将下载的 langconv.py 和 zh_wiki.py,放在python代码所在目录即可使用
查询起始节点
def search(words, n=20):
    """Return the top-weighted edges whose start concept contains `words`.

    The query is first converted to Traditional Chinese, since the
    ConceptNet Chinese dump stores its terms in Traditional characters.

    Args:
        words: the query term (Simplified Chinese is accepted).
        n: how many of the highest-weighted matches to return (default 20).

    Returns:
        A DataFrame with the n matching rows of highest 'weights'.
    """
    # regex=False: treat the query as a literal substring so terms that
    # happen to contain regex metacharacters cannot break the match.
    result = data[data['start'].str.contains(chs_to_cht(words), regex=False)]
    topK_result = result.sort_values("weights", ascending=False).head(n)
    return topK_result
格式化输出结果
# Natural-language templates for rendering a ConceptNet relation URI as a
# Chinese sentence fragment: '{}' is filled with the edge's end concept.
# Relations mapped to an empty string have no rendering and are skipped.
template = {
    '/r/RelatedTo': '和{}相关',
    '/r/FormOf': '的形式为{}',
    '/r/IsA': '是{}',
    '/r/PartOf': '是{}的一部分',
    '/r/HasA': '具有{}',
    '/r/UsedFor': '用来{}',
    '/r/CapableOf': '可以{}',
    '/r/AtLocation': '在{}',
    '/r/Causes': '导致{}',
    '/r/HasSubevent': ',接下来,{}',
    '/r/HasFirstSubevent': ',紧接着,{}',
    '/r/HasLastSubevent': '的最后一步是{}',
    '/r/HasPrerequisite': '的前提为{}',
    '/r/HasProperty': '具有{}的属性',
    '/r/MotivatedByGoal': '受到{}的驱动',
    '/r/ObstructedBy': '受到{}的影响',
    '/r/Desires': '想要{}',
    '/r/CreatedBy': '被{}创造',
    '/r/Synonym': '和{}同义',
    '/r/Antonym': '和{}反义',
    '/r/DistinctFrom': '和{}相区别',
    '/r/DerivedFrom': '由{}导致',
    '/r/SymbolOf': '象征着{}',
    '/r/DefinedAs': '定义为{}',
    '/r/MannerOf': '',
    '/r/LocatedNear': '和{}相邻',
    '/r/HasContext': '的背景是{}',
    '/r/SimilarTo': '和{}相似',
    '/r/EtymologicallyRelatedTo': '',
    '/r/EtymologicallyDerivedFrom': '',
    '/r/CausesDesire': '',
    '/r/MadeOf': '由{}制成',
    '/r/ReceivesAction': '',
    '/r/ExternalURL': ''
}
def strip(uri):
    """Extract the term from a ConceptNet node URI.

    E.g. '/c/zh/常识' -> '常识' (the fourth '/'-separated field).

    Args:
        uri: a node URI of the form '/c/<lang>/<term>[/...]'.

    Returns:
        The term component of the URI.
    """
    # Parameter renamed from `str`, which shadowed the builtin.
    return uri.split('/')[3]
# Render the top matches for the query "常识" as Simplified-Chinese sentences.
topK_result = search("常识", 20)
for idx in topK_result.index:
    row = topK_result.loc[idx]
    pattern = template[row['relation']]
    # Relations mapped to an empty template have no rendering — skip them.
    if not pattern:
        continue
    sentence_trad = strip(row['start']) + pattern.format(strip(row['end']))
    print(cht_to_chs(sentence_trad))