|
楼主 |
发表于 2023-4-2 17:15:56
|
显示全部楼层
新代码:
import gensim
import os
from gensim.models import Lda2Vec
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import LineSentence
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim_models
import logging
# Emit gensim's progress messages (training passes, perplexity estimates) at INFO level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Input locations: the folder holding the raw text files and the stop-word list.
input_folder = "C:/Users/11564/Desktop/实战/滑雪场"
stopwords_file = "C:/Users/11564/Desktop/实战/scu_stopword.txt"

# Load the stop-word list, one word per line. A set is used because the list
# is probed once per token during preprocessing; a set makes each probe O(1)
# instead of a linear scan of the whole list.
with open(stopwords_file, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())
# Tokenise a line, dropping stop words and short tokens.
def preprocess(text):
    """Return the tokens of *text* that survive filtering.

    The text is tokenised with gensim's ``simple_preprocess``. A token is kept
    only if it is longer than three characters and is not in the module-level
    ``stopwords`` collection.

    NOTE(review): ``simple_preprocess`` splits on non-letter characters and does
    no Chinese word segmentation -- confirm the input files are already
    segmented (space-separated) before relying on the result.
    """
    return [
        word
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]
# Stream the tokenised lines of every file in a folder.
class TextIterator:
    """Re-iterable stream of preprocessed lines from all files in a folder.

    Each call to ``iter()`` re-reads the folder from disk, so the same
    instance can be iterated more than once.
    """

    def __init__(self, folder_path):
        """Remember the folder to read; nothing is opened until iteration."""
        self.folder_path = folder_path

    def __iter__(self):
        """Yield ``preprocess(line)`` for each line of each regular file."""
        # Sorted so the document order is the same on every run and machine;
        # os.listdir() makes no ordering guarantee.
        for file_name in sorted(os.listdir(self.folder_path)):
            file_path = os.path.join(self.folder_path, file_name)
            # Sub-directories cannot be opened as text files; skip them
            # instead of letting open() raise.
            if not os.path.isfile(file_path):
                continue
            # The context manager closes the handle even if the consumer
            # stops iterating early.
            with open(file_path, 'r', encoding='utf-8') as handle:
                for line in handle:
                    yield preprocess(line)
# Train an LDA topic model on the corpus, then report and visualise it.
def main():
    """Build the dictionary and corpus, train LDA, save, score and display it.

    NOTE: gensim does not provide an ``Lda2Vec`` class, so the model is trained
    with ``LdaMulticore`` (already imported by this file), which accepts the
    hyper-parameters used here. The ``from gensim.models import Lda2Vec`` line
    in the import block must be removed for the script to start. The
    ``batch_size`` argument is dropped because LdaMulticore has no such
    parameter.
    """
    # Read and tokenise the input once; the token lists are reused for the
    # dictionary, the bag-of-words corpus and the coherence score instead of
    # re-reading every file for each of them.
    texts = list(TextIterator(input_folder))
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(tokens) for tokens in texts]

    model = LdaMulticore(
        corpus=corpus,
        num_topics=50,
        id2word=dictionary,
        chunksize=5000,
        passes=10,
        alpha=0.5,
        eta=0.5,
        iterations=200,
        random_state=42,
        workers=4,
    )

    # Persist the trained model.
    model.save("lda2vec.model")

    # log_perplexity returns the per-word likelihood bound (gensim logs the
    # derived perplexity 2^(-bound) at INFO level); c_v coherence needs the
    # tokenised texts, not the bag-of-words corpus.
    perplexity = model.log_perplexity(corpus)
    coh_score = CoherenceModel(
        model=model, texts=texts, dictionary=dictionary, coherence='c_v'
    ).get_coherence()

    # Print the five most probable words of every topic. gensim LDA models
    # have no `topic_word_` attribute; show_topic is the supported accessor.
    for topic_id in range(model.num_topics):
        words = ", ".join(word for word, _ in model.show_topic(topic_id, topn=5))
        print(words)
    print("Perplexity: ", perplexity)
    print("Coherence score: ", coh_score)

    # Visualise last: pyLDAvis.show starts a local web server and blocks, so
    # anything placed after it would not run until the server is stopped.
    vis_data = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
    pyLDAvis.show(vis_data)


# LdaMulticore and c_v coherence start worker processes; on Windows those
# workers re-import this module, so training must not run at import time.
if __name__ == "__main__":
    main()
|