|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 chunguang 于 2018-8-31 17:15 编辑
python在做分词的时候,为什么下列程序运行不了
- import jieba
- import re
- import jieba.analyse
- from scipy.misc import imread
- from wordcloud import WordCloud
- from wordcloud import ImageColorGenerator
- import matplotlib.pyplot as plt
- from os import path
- def stopwordslist(filepath): #停用词list
- stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
- return stopwords
- def seg_sentence(sentence): #分词
- zhongwen_pat = re.compile(r'^[\u4e00-\u9fa5a-zA-Z])
- cut_list = [c for c in jieba.cut(sentence[0]) if zhongwen_pat.search(c)]
- cut_set = "".join(cut_list)
- sentence_seged = jieba.cut(cut_set.strip())
- stopwords = stopwordslist('C:/Users/lenovo/Desktop/中文停用词.txt') #这里加载停用词的路径
- outstr = ''
- for word in sentence_seged:
- if word not in stopwords:
- if word != '\t': #'\t'是横向制表符,可以理解成空白
- outstr += word
- outstr += " "
- return outstr
- inputs = open('C:/Users/lenovo/Desktop/白沙comment.txt', 'r', encoding='gbk')
- outputs = open('C:/Users/lenovo/Desktop/白沙comments3.txt', 'w')
- for line in inputs:
- line_seg = seg_sentence(line) # 这里的返回值是字符串
- outputs.write(line_seg + '\n' )
- outputs.close()
- inputs.close()
复制代码
求大神
数据如下: |
|