|

楼主 |
发表于 2020-8-27 16:17:24
|
显示全部楼层
- import requests
- import bs4
- import re
- from lxml import etree
- from multiprocessing import Pool
# HTTP request headers shared by every call in this script: a desktop-Edge
# User-Agent so the site serves its normal pages, plus keep-alive / no-cache hints.
headers = {
    'Accept-Language': 'zh-CN',
    'Cache-Control': 'no-cache',
    'Connection': 'Keep-Alive',
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363'),
}
def open_html(url):
    """Fetch *url* with the shared request headers and return the body text."""
    response = requests.get(url=url, headers=headers)
    return response.text
def get_mulu():
    """Ask the user for a novel title and locate its chapter catalog.

    Searches biduo.cc for the title, takes the first search hit, and
    (re)creates an empty '<title>.txt' so later appends start from a
    clean file.

    Returns:
        (catalog_url, title) tuple.

    Raises:
        RuntimeError: if the search page contains no result entry.
    """
    search_prefix = 'https://www.biduo.cc/search.php?q='
    name = input("请输入小说名:")
    # Truncate/create the output file up front; get_text() appends to it later.
    with open(name + '.txt', 'w', encoding='utf-8'):
        pass
    res = requests.get(search_prefix + name, headers=headers)
    soup = bs4.BeautifulSoup(res.text, features="lxml")
    # The first <h3> of this class is the top-ranked search result.
    hit = soup.find("h3", class_="result-item-title result-game-item-title")
    if hit is None:
        # Clear error instead of the opaque AttributeError the original raised.
        raise RuntimeError('未找到小说: ' + name)
    href = hit.find("a").get('href')  # relative catalog path, e.g. /biquge/...
    return ("https://www.biduo.cc" + href, name)
def get_text(dic):
    """Download one chapter page and append it to '<name>.txt'.

    Args:
        dic: {'name': novel title (output file stem), 'url': chapter page URL}
    """
    name = dic['name']
    url = dic['url']
    html = requests.get(url, headers=headers).text
    tree = etree.HTML(html)
    title = tree.xpath('//div[@class="bookname"]/h1/text()')[0]
    text = tree.xpath('//div[@id="content"]//text()')
    # `with` guarantees the file is closed even if a write fails; the original
    # leaked the handle on any exception between open() and close().
    with open(name + '.txt', 'a+', encoding='utf-8') as f:
        f.write(title)
        f.write('\n\n')
        for each in text:
            # BUG FIX: str.replace returns a new string; the original discarded
            # the result, so the NBSP-indent cleanup never took effect.
            each = each.replace('\xa0\xa0\xa0\xa0', '\n')
            f.write(each + '\n')
        f.write('\n\n')
    print(title, '爬取完成!!!')
def get_list(url, name):
    """Scrape the catalog page at *url* and build one work item per chapter.

    Returns a list of {'name': ..., 'url': ...} dicts in catalog order,
    ready to be fed to get_text().
    """
    page = open_html(url)
    # Each chapter link on the catalog page looks like <dd><a href="..."></a></dd>.
    chapter_pattern = re.compile('<dd><a href="(.*?)" >.*?</a></dd>')
    base = 'https://www.biduo.cc'
    return [{'name': name, 'url': base + path}
            for path in chapter_pattern.findall(page)]
def main():
    """Entry point: find the novel, then scrape N chapters with a worker pool."""
    (catalog_address, name) = get_mulu()

    number = int(input('请输入您想获取的章节总数:'))
    content_list = get_list(catalog_address, name)
    print(len(content_list))
    content_list = content_list[0:number]
    for each in content_list:
        print(each)
    # NOTE(review): the workers all append to the same file concurrently, which
    # is why chapters come out unordered.  p.map() itself *preserves* input
    # order in its return value — to get an ordered file, have get_text return
    # the chapter text instead of writing it, then write p.map's results here
    # sequentially.
    p = Pool(9)
    try:
        p.map(get_text, content_list)
    finally:
        # The original never closed/joined the pool, leaking worker processes.
        p.close()
        p.join()


if __name__ == "__main__":
    main()
复制代码
大佬,这是我改完的代码,你看一下有没有什么能改进的地方。还有就是爬完是无序的,你有什么解决的好方法吗 |
|