# Note: values returned from worker processes must be picklable (the earlier
# version returned a list of tuples of bs4 objects); returning a plain list of
# strings fixes that, so fetching and parsing were merged into one function.
import requests
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
class Get_Movie:
    """Scrape Douban Top-250 list pages and print 'rank, title, rating' lines."""

    def __init__(self, url_list):
        # Page URLs to scrape; each yields up to 25 movies.
        self.url_list = url_list

    # Fetch a page and parse it; generic helpers are prefixed with "x".
    def x_get_html(self, url):
        """Fetch *url*, parse it, and return this page's movie-info strings.

        Returns an empty list on any failure so callers can iterate the
        result unconditionally (the original returned None on failure, which
        crashed the consumer loop in many_thread with a TypeError).
        """
        try:
            # The verify_ssl kwarg was removed from modern fake_useragent and
            # raises TypeError there; the default behavior is fine.
            ua = UserAgent()
            header = {"User-Agent": ua.random}
            requests.packages.urllib3.disable_warnings()
            # NOTE(review): verify=False disables TLS verification — kept from
            # the original; reconsider for production. A timeout is added so a
            # stuck connection cannot hang the pool forever.
            response = requests.get(url, verify=False, headers=header, timeout=10)
            if response.status_code != 200:
                raise Exception("链接网页服务器不成功")
            # Use the detected encoding to avoid mojibake.
            response.encoding = response.apparent_encoding
            soup = BeautifulSoup(response.text, "lxml")
            return self.get_everypage_info(soup)
        except Exception as e:
            # Narrowed from a bare `except:` (which also swallowed
            # SystemExit/KeyboardInterrupt) and now reports the cause.
            print(f"网页不存在: {e}")
            return []

    def get_everypage_info(self, x):
        """Extract a 'rank, title, rating' string per movie from a parsed page.

        x: BeautifulSoup document of one Top-250 list page.
        Returns a list of comma-joined strings, one per movie.
        """
        everyone_moive = x.find("ol", class_="grid_view").find_all("div", class_="item")
        # get_text(strip=True) is used instead of .string: .string returns
        # None when a tag has mixed children, which would break str.join.
        return [
            ", ".join(
                (
                    i.find("em").get_text(strip=True),
                    i.find("span", class_="title").get_text(strip=True),
                    i.find("span", class_="rating_num").get_text(strip=True),
                )
            )
            for i in everyone_moive
        ]

    def many_thread(self):
        """Fetch every page concurrently and print each movie line."""
        # Scraping is I/O-bound, so a thread pool (matching this method's
        # name) is the right tool: the GIL is released during network waits,
        # and nothing needs to be pickled across process boundaries.
        with ThreadPoolExecutor() as pool:
            pages = pool.map(self.x_get_html, self.url_list)
            for page in pages:
                for line in page:
                    print(line)
if __name__ == "__main__":
start = time.perf_counter() # 用这个才是秒数
getmovie = Get_Movie(
[
f"https://movie.douban.com/top250?start={x}&filter="
for x in range(0, 250, 25)
]
)
getmovie.many_thread()
end = time.perf_counter()
print(f"一共用时:{end-start}秒")