I'm getting this error and can't find the cause:
File "D:\python370\lib\site-packages\jieba\_compat.py", line 79, in strdecode
sentence = sentence.decode('utf-8')
AttributeError: 'int' object has no attribute 'decode'
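From the traceback it looks like jieba's strdecode() is being handed an int rather than a string, i.e. something non-text is reaching psg.cut(). A tiny snippet that reproduces the same message (whether this is really what happens with my data is just my guess):

import jieba.posseg as psg

# feeding a plain int into jieba reproduces the error, because
# strdecode() tries to call .decode('utf-8') on whatever it receives
list(psg.cut(12345))  # AttributeError: 'int' object has no attribute 'decode'

The full script I'm running: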
- """
- ---------------------------------
- test: rzbbzr
- test1: PyCharm
- test2: LDA20220212.py
- test3: 2022-02-12 11:38
- Pyinstaller -F LDA20220212.py
- py转jupyter %load xxxx.py
- pip install -i https://pypi.doubanio.com/simple/ 包名
- pip install -i https://pypi.tuna.tsinghua.edu.cn/simple +包名
- """
import os
import pandas as pd
import re
import jieba
import jieba.posseg as psg

####### Preprocessing
output_path = 'D:/lda/data/result'
file_path = 'D:/lda/data'
os.chdir(file_path)
data = pd.read_excel("D:/lda/data/data.xlsx")
os.chdir(output_path)
dic_file = "D:/lda/data/dict.txt"
stop_file = "D:/lda/data/stopwords.txt"
def chinese_word_cut(mytext):
    jieba.load_userdict(dic_file)
    jieba.initialize()
    try:
        stopword_list = open(stop_file, encoding='utf-8')
    except:
        stopword_list = []
        print("error in stop_file")

    stop_list = []
    flag_list = ['n', 'nz', 'vn']
    for line in stopword_list:
        line = re.sub(u'\n|\\r', '', line)
        stop_list.append(line)

    word_list = []
    # jieba segmentation with POS tags
    seg_list = psg.cut(mytext)
    for seg_word in seg_list:
        # word = re.sub(u'[^\u4e00-\u9fa5]','',seg_word.word)
        # seg_word = str(seg_word).strip()
        print(type(seg_word))
        word = seg_word.word
        find = 0
        print(type(word))
        for stop_word in stop_list:
            if stop_word == word or len(word) < 2:  # this word is a stopword
                find = 1
                break
        if find == 0 and seg_word.flag in flag_list:
            word_list.append(word)
    return (" ").join(word_list)

data["content_cutted"] = data.content.apply(chinese_word_cut)
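# the traceback seems to point at this apply() call: if any cell of data.content
# holds only a number, pandas passes an int into chinese_word_cut and jieba's
# strdecode() then fails on .decode('utf-8')
# (my guess about data.xlsx — I have not confirmed what those cells contain)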
####### LDA analysis
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def print_top_words(model, feature_names, n_top_words):
    tword = []
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        topic_w = " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        tword.append(topic_w)
        print(topic_w)
    return tword

n_features = 1000  # extract 1000 feature words
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english',
                                max_df=0.5,
                                min_df=10)
tf = tf_vectorizer.fit_transform(data.content_cutted)

n_topics = 8
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                learning_method='batch',
                                learning_offset=50,
                                # doc_topic_prior=0.1,
                                # topic_word_prior=0.01,
                                random_state=0)
lda.fit(tf)

########### top words for each topic
n_top_words = 25
tf_feature_names = tf_vectorizer.get_feature_names()
topic_word = print_top_words(lda, tf_feature_names, n_top_words)
########### dominant topic for each document
import numpy as np

topics = lda.transform(tf)
topic = []
for t in topics:
    topic.append(list(t).index(np.max(t)))
data['topic'] = topic
data.to_excel("data_topic.xlsx", index=False)
topics[0]  # 0 1 2

########### visualization
import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.display(pic)
pyLDAvis.save_html(pic, 'lda_pass' + str(n_topics) + '.html')
# look for the saved html file in the working directory
# pyLDAvis.show(pic)
########### perplexity
import matplotlib.pyplot as plt

plexs = []
n_max_topics = 16
for i in range(1, n_max_topics):
    print(i)
    lda = LatentDirichletAllocation(n_components=i, max_iter=50,
                                    learning_method='batch',
                                    learning_offset=50, random_state=0)
    lda.fit(tf)
    plexs.append(lda.perplexity(tf))

n_t = 15  # right end of the plotted range; note: must not exceed n_max_topics
x = list(range(1, n_t))
plt.plot(x, plexs[1:n_t])
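# (side note: plexs[0] is already the perplexity for 1 topic, so plexs[1:n_t]
#  is shifted by one against x; plexs[0:n_t-1] would line up with x = 1..14)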
plt.xlabel("number of topics")
plt.ylabel("perplexity")
plt.show()
I'm using the data below:
Link: https://pan.baidu.com/s/1OkzDW_fXarDOGQg3VOYb3w
Extraction code: 1111
The problem seems to be in the jieba part. Thanks.
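If my guess above is right, the problem is not jieba itself but what it gets fed: read_excel keeps cells that contain only a number as int/float, and empty cells become NaN. A possible fix I'd try (assuming data.xlsx really has such cells, which I have not checked) is to force the column to text before cutting:

# make sure every cell of the content column is a string before segmentation
data['content'] = data['content'].fillna('').astype(str)
data['content_cutted'] = data['content'].apply(chinese_word_cut)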