鱼C论坛

 找回密码
 立即注册
查看: 530|回复: 1

[已解决]词云

[复制链接]
发表于 2018-8-7 18:53:10 | 显示全部楼层 |阅读模式
10鱼币
  1. import requests
  2. from requests.exceptions import RequestException
  3. from lxml import etree
  4. import time
  5. import jieba
  6. import wordcloud
  7. import re
  8. from collections import Counter

  9. headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
  10.            'Referer': 'https://www.cangqionglongqi.com/wukongzhuan/',
  11.            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'}

  12. def get_page(url):
  13.     """请求网页内容"""
  14.     try:
  15.         response = requests.get(url, headers=headers)
  16.         if response.status_code == 200:
  17.             response.encoding = response.apparent_encoding
  18.             return response.text
  19.         return None
  20.     except RequestException as error:
  21.         print('报错:',error)
  22.         exit()
  23.         
  24. def parser_page_url(html_href):
  25.     """
  26.     解析链接、章节
  27.     :param link_href:链接
  28.     :param list_chapter:章节
  29.     """
  30.     page_content = etree.HTML(html_href)
  31.     link_href = page_content.xpath('//*[@id="list"]/dl/dd/a/@href')
  32.     list_chapter = page_content.xpath('//*[@id="list"]/dl/dd/a/text()')
  33.     return (link_href, list_chapter)

  34. def parser_content(url):
  35.     """解析小说内容"""
  36.     html_content = get_page(url)
  37.     content = ''
  38.     if html_content:
  39.         page_contents = etree.HTML(html_content)
  40.         contents = page_contents.xpath('//*[@id="content"]/text()')
  41.         for i in contents:
  42.             content += (i+'\n')
  43.         return content.replace('\xa0'*5,'\n')

  44. def get_urls():
  45.     """
  46.     拼接url
  47.     :params urls:各个章节的url
  48.     """
  49.     url = 'https://www.cangqionglongqi.com/wukongzhuan/'
  50.     html = get_page(url)
  51.     hrefs,chapters = parser_page_url(html)
  52.     urls = [url + href for href in hrefs]
  53.     return (urls,chapters)

  54. def write_txt(writer, chapt):
  55.     """保存在txt文件"""
  56.     with open('悟空传.txt','a',encoding='utf-8') as file:
  57.         file.write('*'*10+chapt+'*'*10)
  58.         file.write(writer)

  59. def jiebas(dict=dict()):
  60.     """jieba分析小说并提取出现次数较多的词语    """
  61.     file = open('悟空传.txt','r',encoding='utf-8').read()
  62.     replace_text = re.sub('\W','',file)
  63.     words = jieba.lcut(replace_text)
  64.     c = Counter(words)
  65.     common_c = c.most_common(200)
  66.     for key, value in common_c:
  67.         if len(key) >= 2: #提取大于两个词的词语
  68.             dict[key] = value
  69.     return dict

  70. def wordclouds(dict):
  71.     """生成词云"""
  72.     w = wordcloud.WordCloud()
  73.     w.generate_from_frequencies(dict)
  74.     w.to_file('wukongzhuan.jpg')

  75. if __name__ == '__main__':
  76.     page_urls, page_chapter = get_urls()
  77.     for num in range(len(page_urls)-1):
  78.         print('正在抓取%s' % page_chapter[num])
  79.         write = parser_content(page_urls[num])
  80.         write_txt(write, page_chapter[num])
  81.         time.sleep(15)
  82.     print('抓取完成')
  83.     print('生成词云')
  84.     dict1 = jiebas()
  85.     wordclouds(dict1)
  86.     print('完成')
复制代码


为什么生成的词云不含有中文,各位大佬回答下啊
最佳答案
2018-8-7 18:53:11
手动添加一下支持中文的字体。比如:
  # Path to a font file that supports Chinese (a Windows font location).
  1. font = r'C:\Windows\Fonts\simfang.ttf'
  # font_path tells WordCloud which font to render with.
  # NOTE(review): assumes `from wordcloud import WordCloud` and a string
  # variable `text`; neither is shown in this snippet.
  2. wc = WordCloud(collocations=False, font_path=font, width=1400, height=1400, margin=2).generate(text.lower())
复制代码
wukongzhuan.jpg

最佳答案

查看完整内容

手动添加一下支持中文的字体。比如:
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

发表于 2018-8-7 18:53:11 | 显示全部楼层    本楼为最佳答案   
手动添加一下支持中文的字体。比如:
  # Path to a font file that supports Chinese (a Windows font location).
  1. font = r'C:\Windows\Fonts\simfang.ttf'
  # font_path tells WordCloud which font to render with.
  # NOTE(review): assumes `from wordcloud import WordCloud` and a string
  # variable `text`; neither is shown in this snippet.
  2. wc = WordCloud(collocations=False, font_path=font, width=1400, height=1400, margin=2).generate(text.lower())
复制代码
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2024-4-25 09:18

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表