| 
 | 
 
 
发表于 2023-5-27 17:50:18
|
显示全部楼层
 
 
 
response = requests.get(url, verify=False) 
 
- import re
 
 - import requests
 
 - import logging #打印日志
 
 - from urllib.parse import urljoin #进行url拼接
 
  
# Log at INFO level and above; each record shows a timestamp and its level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Root address of the site being scraped.
url = 'https://ssr1.scrape.center'

# Total number of list pages to crawl.
TOTAL_PAGE = 10
 
  
# Fetch a single page.
def scrape_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely on a stalled server.

    Returns:
        The page text when the server answers with HTTP 200, otherwise
        ``None``. Failures are logged rather than raised.
    """
    logging.info('scraping %s...', url)
    try:
        # NOTE(review): verify=False turns off TLS certificate validation.
        # It is kept because the original code relies on it; prefer a valid
        # CA bundle when the target site's certificate can be trusted.
        response = requests.get(url, verify=False, timeout=timeout)
    except requests.RequestException:
        # Covers connection errors and timeouts alike.
        logging.error('error occurred while scraping %s', url, exc_info=True)
        return None
    if response.status_code == 200:
        return response.text
    logging.error('get invalid status code %s while scrape %s',
                  response.status_code, url)
    return None
 
  
# Fetch one list page.
def scrape_index(page):
    """Fetch list page number ``page`` of the site.

    The address is built as ``<url>/page/<page>`` from the module-level
    ``url`` and handed to ``scrape_page``; its result is returned unchanged.
    """
    return scrape_page(f'{url}/page/{page}')
 
  
# Parse a list page.
def parse_index(html, base_url=None):
    """Yield the absolute detail-page URL of every entry in a list page.

    Args:
        html: Text of a list page. ``None`` or an empty string yields
            nothing instead of raising.
        base_url: Address that relative links are resolved against.
            Defaults to the module-level ``url``.

    Yields:
        One absolute URL for each ``<a ... href="..." ... class="name">``
        tag found in ``html``, in document order.
    """
    if not html:
        # A failed download arrives here as None, and re.findall would
        # raise TypeError on it.
        return
    if base_url is None:
        base_url = url
    # Capture the href of anchors that carry class="name".
    pattern = re.compile(r'<a .*?href="(.*?)".*?class="name">')
    for href in pattern.findall(html):
        # Turn the (possibly relative) link into an absolute URL.
        detail_url = urljoin(base_url, href)
        logging.info('get detail_url %s', detail_url)
        yield detail_url
 
 -         
 
 - def main():
 
 -     for page in range(1,3):
 
 -         index_html = scrape_index(page)
 
 -         details_url = parse_index(index_html)
 
 -         logging.info('detail url {}'.format(list(details_url)))
 
 -         
 
 - if __name__ == '__main__':
 
 -     main()
 
 
  复制代码 |   
 
 
 
 |