马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 chunguang 于 2018-8-31 17:15 编辑
python在做分词的时候,为什么下列程序运行不了?
import jieba
import re
import jieba.analyse
from scipy.misc import imread
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
import matplotlib.pyplot as plt
from os import path
def stopwordslist(filepath):
    """Load a stopword list: one stopword per line, stripped of whitespace.

    Args:
        filepath: path to a UTF-8 encoded text file.

    Returns:
        list[str]: stripped lines (blank lines become empty strings,
        matching the original behavior).
    """
    # BUG FIX: the original opened the file without ever closing it
    # (resource leak, called once per input line). `with` guarantees
    # the handle is released even on error.
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
def seg_sentence(sentence):
    """Segment a line of text with jieba and drop stopwords.

    Args:
        sentence: the raw text line to segment.

    Returns:
        str: kept words joined, each followed by a single space.
    """
    # BUG FIX: the original literal r'^[\u4e00-\u9fa5a-zA-Z]) was missing
    # its closing quote, which made the whole file a SyntaxError — this is
    # why the program would not run. The pattern keeps tokens that start
    # with a CJK ideograph or an ASCII letter.
    zhongwen_pat = re.compile(r'^[\u4e00-\u9fa5a-zA-Z]')
    # NOTE(review): `sentence[0]` segments only the FIRST character of the
    # line — this looks like it should be `sentence`; kept as-is pending
    # the author's confirmation since changing it alters output.
    cut_list = [c for c in jieba.cut(sentence[0]) if zhongwen_pat.search(c)]
    cut_set = "".join(cut_list)
    sentence_seged = jieba.cut(cut_set.strip())
    # Stopword file path (re-read on every call; consider hoisting).
    stopwords = stopwordslist('C:/Users/lenovo/Desktop/中文停用词.txt')
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':  # '\t' is the horizontal tab (i.e. whitespace)
                outstr += word
                outstr += " "
    return outstr
# Read the GBK-encoded comment file line by line, segment each line, and
# write the space-joined result to the output file.
# BUG FIX: the original opened both files without context managers, so an
# exception mid-loop would leak both handles and could lose buffered output.
# NOTE(review): the output file uses the platform default encoding (as the
# original did) — consider encoding='utf-8' explicitly on Windows.
with open('C:/Users/lenovo/Desktop/白沙comment.txt', 'r', encoding='gbk') as inputs, \
        open('C:/Users/lenovo/Desktop/白沙comments3.txt', 'w') as outputs:
    for line in inputs:
        line_seg = seg_sentence(line)  # returns a string of space-joined words
        outputs.write(line_seg + '\n')
求大神帮忙看看。
数据如下: