# 马上注册,结交更多好友,享用更多功能^_^
# 您需要 登录 才可以下载或查看,没有账号?立即注册
import os
import pandas as pd
import re
import jieba
import jieba.posseg as psg #导入lda分析需要的一系列包
output_path = 'D:/文本挖掘' # directory where analysis results will be written
file_path = 'C:/Users/m/Desktop' # directory containing the input Excel file
os.chdir(file_path)  # switch to the data directory so the relative filename below resolves
data=pd.read_excel("数据2.xlsx")# expects a 'content' column holding the raw text (see the .apply below)
os.chdir(output_path)  # switch to the output directory for any files written later
dic_file = "D:/自然语言处理/dict.txt" # path to the jieba user dictionary
stop_file = "D:/自然语言处理/stopwords.txt" # path to the stopword list (one word per line)
def chinese_word_cut(mytext):
    """Segment a Chinese text with jieba and return noun-like tokens.

    Keeps only tokens whose POS flag is 'n', 'nz' or 'vn', drops
    stopwords and tokens shorter than two characters, and strips any
    non-Chinese character from each token before filtering.

    Args:
        mytext: raw text string to segment.

    Returns:
        The kept tokens joined into a single space-separated string.
    """
    # Load the user dictionary / initialize jieba only once: this function
    # is applied per-row via DataFrame.apply, and re-loading the dictionary
    # for every row is pure overhead.
    if not getattr(chinese_word_cut, "_jieba_ready", False):
        jieba.load_userdict(dic_file)
        jieba.initialize()
        chinese_word_cut._jieba_ready = True
    # Read stopwords into a set for O(1) membership tests; `with` guarantees
    # the file handle is closed (the original leaked it). utf-8 avoids mojibake.
    stop_set = set()
    try:
        with open(stop_file, encoding='utf-8') as f:
            stop_set = {re.sub(u'\n|\\r', '', line) for line in f}
    except OSError:  # narrow: only file-access failures, not a bare except
        print("error in stop_file")
    flag_list = ['n', 'nz', 'vn']  # keep nouns, proper nouns, verbal nouns
    word_list = []
    for seg_word in psg.cut(mytext):
        # Keep CJK characters only. The original computed this filter and
        # then accidentally overwrote it with the raw token (dead code);
        # for Chinese analysis the filtered form is the intended one.
        word = re.sub(u'[^\u4e00-\u9fa5]', '', seg_word.word)
        # word = seg_word.word  # use the raw token instead when analysing English text
        if len(word) < 2 or word in stop_set:
            continue  # stopword or too short
        if seg_word.flag in flag_list:
            word_list.append(word)
    return (" ").join(word_list)
data["content_cutted"] = data.content.apply(chinese_word_cut)
# apply the segmentation function to every row of the 'content' column
data.content_cutted.head()
# preview the first rows to verify the text was segmented correctly