|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 _谪仙 于 2020-2-25 12:42 编辑
异步抓取
代码地址: https://github.com/Crowded-Conditions/Crawlers/tree/master/Novel-qidian
- import aiohttp
- import asyncio
- from aiohttp.client_exceptions import ClientConnectorError, ClientError
- from asyncio import TimeoutError
- import aiofiles
- from tqdm import tqdm
- from re import compile, findall
- from fake_useragent import UserAgent
- import time
- # 异步抓取起点中文网的小说信息
class Novels(object):
    """Asynchronously scrape book listings from qidian.com's "finish" pages.

    Each listing page is downloaded with aiohttp, the book entries are pulled
    out with a regular expression, and one text record per book is appended
    to a local file.
    """

    # Compiled once for the whole class instead of once per page.
    # Captures, in order: link, book title, author, then two category labels.
    _BOOK_PATTERN = compile(
        'li.*?<h4><a href="(.*?)".*?>(.*?)<.*?class="name".*?>(.*?)<.*?/em>'
        '<a.*?>(.*?)</a><i>.*?</i><a.*?>(.*?)<.*?</li>')

    def __init__(self):
        """Set up the listing URL template and the request headers."""
        # Listing page template; the original author noted max_page = 4018.
        self.inital_url = 'https://www.qidian.com/finish?page={}'
        self.browser = UserAgent()
        self.headers = {
            'Host': 'www.qidian.com',
            'Accept-Language': 'zh-CN,zh;q=0.9,ja;q=0.8,en;q=0.7',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'User-Agent': self.browser.chrome,
        }

    async def crawl(self, url: str):
        """Download ``url`` and return the response body as text.

        Raises:
            ValueError: the server answered with a status other than 200.
            ConnectionError: the request failed or timed out; the underlying
                aiohttp/asyncio exception is attached as ``__cause__``.
        """
        conn = aiohttp.TCPConnector(limit=200)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                async with session.get(url, headers=self.headers, timeout=20) as response:
                    if response.status == 200:
                        return await response.text()
                    raise ValueError(f'返回状态码错误<{response.status}>')
            except (ClientConnectorError, ClientError, TimeoutError) as error:
                # Chain the cause so the real network error is not lost.
                raise ConnectionError('连接失败') from error

    @classmethod
    def _format_records(cls, content: str):
        """Return one formatted text record per book found in ``content``."""
        records = []
        for link, book, author, *style in findall(cls._BOOK_PATTERN, content):
            info = f'书名: {book}\n作者: {author}\n类型{"-".join(style)}\n链接: {"https:" + link}\n'
            records.append('*'*40 + '\n' + info)
        return records

    @staticmethod
    def _page_batches(max_page: int, step: int):
        """Yield ranges of page numbers covering 1..max_page, ``step`` at a time."""
        last = max_page + 1  # exclusive bound, so page ``max_page`` is included
        for first in range(1, last, step):
            yield range(first, min(first + step, last))

    async def paser_html(self, page: int, pbar):
        """Fetch listing page ``page``, append its books to the output file.

        Download failures are reported on stdout and otherwise ignored. The
        progress bar ``pbar`` is advanced by one once the page has been
        handled, whether or not it succeeded.
        """
        try:
            content = await self.crawl(self.inital_url.format(page))
        except (ValueError, ConnectionError) as error:
            print(f'起点中文网: 抓取第{page}页失败<{error}>')
        else:
            records = self._format_records(content)
            if records:
                # One open and one write per page keeps a page's records
                # contiguous even though many pages are written concurrently.
                async with aiofiles.open('起点中文网.', 'a+', encoding='utf-8') as fp:
                    await fp.write(''.join(records))
        finally:
            pbar.update(1)

    async def _crawl_batch(self, pages, pbar):
        """Process every page number in ``pages`` concurrently."""
        # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
        # so wrap each one in a task first. ``pages`` is never empty here.
        tasks = [asyncio.ensure_future(self.paser_html(num, pbar)) for num in pages]
        await asyncio.wait(tasks)

    def run(self, max_page: int = 1, step: int = 300):
        """Crawl listing pages 1..max_page (inclusive) in batches of ``step``.

        Pages inside a batch are fetched concurrently; the crawler pauses for
        15 seconds between consecutive batches.
        """
        batches = list(self._page_batches(max_page, step))
        loop = asyncio.new_event_loop()
        try:
            with tqdm(total=max_page, unit_scale=True, desc='起点中文网') as pbar:
                for index, pages in enumerate(batches, start=1):
                    loop.run_until_complete(self._crawl_batch(pages, pbar))
                    if index < len(batches):
                        # Pause between batches; pointless after the last one.
                        time.sleep(15)
        finally:
            loop.close()
if __name__ == '__main__':
    # Script entry point: crawl listing pages up to the page count the
    # author recorded for the site.
    max_page = 4018
    start = Novels()
    start.run(max_page=max_page)
复制代码
Ps:该资源已删除 |
|