## Data Description

1. The data comes from a QQ group chat log.
2. There are 3,123 messages in total; the log was exported directly with QQ's built-in chat-export feature.
3. Libraries used: pyecharts and jieba.
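Based on the regex used in the parsing script below, each exported record is a header line ("timestamp nickname(QQ number)") followed by the message body on the next line. A minimal sketch of the parsing step, using a made-up sample record (the nickname and QQ number here are illustrative):

```python
import re

# A fabricated record in the QQ TXT-export format assumed by the script below
sample = "2021-08-29 12:34:56 闰土(123456789)\n在吗\n"

# Same pattern as the full script: timestamp, nickname, QQ number (or <email>), message body
pattern = re.compile(r'(\d{4}-\d{2}-\d{2} \d{1,2}:\d{2}:\d{2}) (.*)[(<](.*)[)>]\n(.*)\n')
for time_str, name, qq, content in pattern.findall(sample):
    print(time_str, name, qq, content)  # 2021-08-29 12:34:56 闰土 123456789 在吗
```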
## Message Count Ranking
| Rank | Name | Count |
|------|------|-------|
| 1 | 闰土 | 739 |
| 2 | 天天 | 495 |
| 3 | 温柔少女豆瓣酱β | 308 |
| 4 | 7酱 | 296 |
| 5 | 逢考必过 | 271 |
| 6 | 土猹 | 265 |
| 7 | 辞 | 187 |
| 8 | 不吃香菜 | 125 |
| 9 | 予 | 89 |
| 10 | 靓仔飞机 | 66 |

## Time-Period Statistics

(line chart of messages per hour of day, rendered to time.html)

## Word Frequency Analysis
| Rank | Word | Count |
|------|------|-------|
| 1 | ? | 174 |
| 2 | 懂 | 74 |
| 3 | kpdd | 73 |
| 4 | 天天 | 60 |
| 5 | 土狗 | 51 |
| 6 | 逼 | 51 |
| 7 | 哥哥 | 43 |
| 8 | 傻 | 41 |
| 9 | 闰土 | 38 |
| 10 | 排位 | 38 |

(word cloud of the 40 most frequent tokens, rendered to wordcloud.html)
```python
# QQ chat log analysis and charting
from pyecharts.charts import Bar, Line, WordCloud
from pyecharts import options as opts
import jieba
import collections
import re

# Message count per hour of day (0-23)
hour_list = {str(h): 0 for h in range(24)}
content_count = {}   # QQ number -> {'name': ..., 'count': ...}
content_all = ''     # all message text, concatenated


def parse():
    with open("C:/Users/Administrator/Desktop/土狗大队.txt", "r", encoding='utf-8') as f:
        text = f.read()
    # Each record: "YYYY-MM-DD H:MM:SS nickname(QQ number)" (or "nickname<email>"),
    # followed by the message body on the next line
    text_list = re.compile(
        r'(\d{4}-\d{2}-\d{2} \d{1,2}:\d{2}:\d{2}) (.*)[(<](.*)[)>]\n(.*)\n'
    ).findall(text)
    global content_all
    for time_str, name, qq, content in text_list:
        print('date:', time_str)
        print('nickname:', name)
        print('qq:', qq)
        print('message:', content)
        print()
        # Tally the hour of day
        hour = re.compile(r'\s(\d{1,2}):').findall(time_str)[0]
        hour_list[hour] += 1
        # Accumulate message text for the word cloud
        content_all += content + "\t"
        # Per-member message count; fall back to the QQ number if the nickname is empty
        if qq in content_count:
            content_count[qq]['count'] += 1
        else:
            content_count[qq] = {'name': name if name else qq, 'count': 1}
    print(hour_list)
    print(content_all)
    print(content_count)


def top():
    # Sort members by message count, descending, and chart the top 10
    top_list = [(qq, info['name'], info['count']) for qq, info in content_count.items()]
    top_list.sort(key=lambda item: item[2], reverse=True)
    print(top_list)
    xaxis = [item[1] for item in top_list[:10]]
    yaxis = [item[2] for item in top_list[:10]]
    bar = Bar()
    bar.add_xaxis(xaxis)
    bar.add_yaxis('发言top10', yaxis, gap="80%")
    bar.set_global_opts(
        title_opts=opts.TitleOpts(title="土狗大队", subtitle="8月29号-9月4号"),
        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 30}),
    )
    bar.render('top.html')


def time_chart():
    xaxis = list(hour_list.keys())
    yaxis = list(hour_list.values())
    (
        Line(init_opts=opts.InitOpts(width="600px", height="400px"))
        .set_global_opts(title_opts=opts.TitleOpts(title="土狗大队", subtitle="8月29号-9月4号"))
        .add_xaxis(xaxis_data=xaxis)
        .add_yaxis(series_name="发言时间段统计", y_axis=yaxis)
        .render("time.html")
    )


def word_cloud():
    seg_list_exact = jieba.lcut(content_all, cut_all=False)  # precise-mode segmentation
    # Custom stopword list: filler words, punctuation, and "[图片]"/"[表情]" placeholders
    remove_words = [
        '\t', '图片', '[', ']', ' ', '我', '了', '你', '的', '是', '就', '都', ',', '不',
        '吗', '@', '还', '没', '这', '好', '有', '在', '也', '吧', '。', '月', '说', '打',
        '她', '表情',
    ]
    object_list = [word for word in seg_list_exact if word not in remove_words]
    word_counts = collections.Counter(object_list)
    print(word_counts.most_common(40))
    (
        WordCloud()
        .add(series_name="词频", data_pair=word_counts.most_common(40), word_size_range=[18, 198])
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="土狗大队",
                subtitle="8月29号-9月4号",
                title_textstyle_opts=opts.TextStyleOpts(font_size=23),
            ),
            tooltip_opts=opts.TooltipOpts(is_show=True),
        )
        .render("wordcloud.html")
    )


if __name__ == '__main__':
    parse()
    top()
    time_chart()
    word_cloud()
```
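Running the script produces three HTML files in the working directory: top.html (top-10 speakers bar chart), time.html (messages-per-hour line chart), and wordcloud.html (word cloud of the 40 most frequent tokens). Each can be opened directly in a browser. One caveat: the record regex captures only the first line of each message, so multi-line messages are truncated.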