|
发表于 2023-5-27 17:50:18
|
显示全部楼层
response = requests.get(url, verify=False)
- import re
- import requests
- import logging #打印日志
- from urllib.parse import urljoin #进行url拼接
# Log timestamp, level and message for every record at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
)

# Base URL of the site being scraped.
url = 'https://ssr1.scrape.center'

# Total number of index pages to crawl.
TOTAL_PAGE = 10
def scrape_page(url, timeout=10):
    """Fetch a page and return its HTML text, or None on failure.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on a stalled server.

    Returns:
        The response body as text when the status code is 200; otherwise
        None, after the failure has been logged.
    """
    logging.info('scraping %s...', url)
    try:
        # verify=False skips TLS certificate verification (kept from the
        # original call); requests emits an InsecureRequestWarning for it.
        response = requests.get(url, verify=False, timeout=timeout)
    except requests.RequestException:
        # Covers connection errors and timeouts alike; log with traceback.
        logging.error('error occurred while scraping %s', url, exc_info=True)
        return None

    if response.status_code == 200:
        return response.text

    logging.error('get invalid status code %s while scrape %s',
                  response.status_code, url)
    return None
def scrape_index(page):
    """Fetch the index (list) page with the given page number.

    Args:
        page: Page number appended to the site URL as ``/page/<page>``.

    Returns:
        Whatever ``scrape_page`` returns for the built index URL.
    """
    return scrape_page(f'{url}/page/{page}')
def parse_index(html):
    """Yield the absolute detail-page URL of each entry on an index page.

    Args:
        html: HTML text of an index page. May be None or empty when the
            page could not be fetched; in that case nothing is yielded.

    Yields:
        Detail-page URLs, each matched href joined against the module-level
        ``url``.
    """
    # re.findall raises TypeError on None, so treat missing HTML as
    # "no entries" instead of crashing the caller.
    if not html:
        return

    # Capture the href of every anchor that carries class="name".
    pattern = re.compile(r'<a .*?href="(.*?)".*?class="name">')
    for item in pattern.findall(html):
        # hrefs are joined against the site URL to make them absolute.
        detail_url = urljoin(url, item)
        logging.info('get detail_url {}'.format(detail_url))
        yield detail_url
-
def main():
    """Crawl the index pages and log the detail URLs found on each one."""
    # NOTE(review): only pages 1-2 are crawled although TOTAL_PAGE is 10 —
    # confirm whether range(1, TOTAL_PAGE + 1) was intended.
    for page in range(1, 3):
        index_html = scrape_index(page)
        if index_html is None:
            # The fetch failed; parsing None would raise TypeError in
            # re.findall, so skip this page and keep going.
            continue
        details_url = parse_index(index_html)
        # list() drains the generator so every URL is actually produced.
        logging.info('detail url {}'.format(list(details_url)))


if __name__ == '__main__':
    main()
复制代码 |
|