|
发表于 2021-11-17 12:03:44
|
显示全部楼层
因为进程的返回值要求是可序列化的对象(你的返回值是列表套元组),把它改正就好(字符串列表),为了方便我把它合成一个函数了。
- import requests
- import time
- from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
- from bs4 import BeautifulSoup
- from fake_useragent import UserAgent
class Get_Movie:
    """Scrape Douban Top-250 list pages in parallel and print "rank, title, rating" lines."""

    def __init__(self, url_list):
        # Pages to scrape; each URL is one 25-movie page of the Top-250 list.
        self.url_list = url_list

    # Fetch a page and parse it; generic helpers are prefixed with x_ (original convention).
    def x_get_html(self, url):
        """Download *url*, parse it, and return a list of "rank, title, rating" strings.

        Returns an empty list on any failure so callers can always iterate the result.
        (Process-pool results must be picklable; a list of strings is.)
        """
        try:
            # header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'}
            ua = UserAgent(verify_ssl=False)
            header = {"User-Agent": ua.random}
            requests.packages.urllib3.disable_warnings()
            response = requests.get(url, verify=False, headers=header)
            if response.status_code != 200:
                raise Exception("链接网页服务器不成功")
            else:
                # Use the detected encoding to avoid mojibake in titles.
                response.encoding = response.apparent_encoding
                html = response.text
                soup = BeautifulSoup(html, "lxml")  # parse one page
                return self.get_everypage_info(soup)
        except Exception:
            # BUG FIX: the original bare `except:` printed and fell through,
            # implicitly returning None — many_thread() then crashed with
            # "TypeError: 'NoneType' object is not iterable". Return [] instead.
            print("网页不存在")
            return []

    def get_everypage_info(self, x):
        """Extract "rank, title, rating" strings from one parsed list page *x*."""
        everyone_moive = x.find("ol", class_="grid_view").find_all("div", class_="item")
        return [
            ", ".join(
                (i.find("em").string,
                 i.find("span", class_="title").string,
                 i.find("span", class_="rating_num").string)
            )
            for i in everyone_moive
        ]

    def many_thread(self):
        """Fetch every page in a process pool and print each movie line."""
        with ProcessPoolExecutor() as pool:
            jiexi_html = pool.map(self.x_get_html, self.url_list)  # per-page string lists
            # result = pool.map(self.get_everypage_info, jiexi_html)
            for i in jiexi_html:
                for a in i:
                    print(a, end="\n")
                    # print(f'{i}+\n')
if __name__ == "__main__":
    # perf_counter() is a high-resolution clock measured in seconds.
    t0 = time.perf_counter()
    # One URL per 25-movie page of the Top-250 list (offsets 0, 25, ..., 225).
    page_urls = [
        f"https://movie.douban.com/top250?start={x}&filter="
        for x in range(0, 250, 25)
    ]
    scraper = Get_Movie(page_urls)
    scraper.many_thread()
    t1 = time.perf_counter()
    print(f"一共用时:{t1-t0}秒")
复制代码 |
|