|
发表于 2017-6-5 20:54:24
|
显示全部楼层
# -*- coding: cp936 -*-
因为返回的结果不是中文,
所以改了一下,加了一个 import json。
#! /usr/bin/env python
# encoding:utf-8
import codecs as cs
import json
from collections import Counter

import jieba as jb #jieba是python上非常不错的中文分词插件,有多种模式可以选择
# Word-frequency script: segment every line of bad.txt with jieba and print
# the most frequent tokens, one JSON array ([token, count]) per line.

# Upper bound on how many of the most frequent tokens are printed.
TOP_N = 200

# Read the whole corpus; the context manager closes the file even if
# reading fails part-way.
with cs.open("bad.txt", "r") as f:
    lines = f.readlines()

# Tally every token produced by jieba's search-engine mode. Lines are passed
# to jieba unmodified, so whatever tokens it yields for them are counted too.
dic = Counter()
for each in lines:
    dic.update(jb.cut_for_search(each))

# (token, count) pairs, most frequent first.
sorteddic = dic.most_common()

# Slicing instead of indexing range(TOP_N) avoids an IndexError when the
# corpus has fewer than TOP_N distinct tokens.
# Note from the original author: many of the top entries are uninformative
# particles or punctuation, which were removed by hand in Excel afterwards.
for entry in sorteddic[:TOP_N]:
    # ensure_ascii=False keeps Chinese text readable instead of \uXXXX
    # escapes. The Python 2-only ``encoding`` keyword is omitted: UTF-8 is
    # already the default there, and the keyword does not exist on Python 3.
    print(json.dumps(entry, ensure_ascii=False))
|
|