import logging  # progress / error reporting
import re
from urllib.parse import urljoin  # join relative detail links onto the base URL

import requests

# NOTE(review): the original file had a stray
# `response = requests.get(url, verify=False)` line here, executed before
# `requests` was imported and before `url` was defined — it raised NameError
# on import and duplicated the call inside scrape_page(); removed.

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

# Base URL of the site being crawled.
url = 'https://ssr1.scrape.center'
# Total number of listing pages to crawl.
TOTAL_PAGE = 10
# Fetch a single page.
def scrape_page(url):
    """Fetch *url* and return its HTML text, or None on failure.

    Logs an error and returns None on a non-200 status code or on any
    requests-level exception.
    """
    logging.info('scraping %s...', url)
    try:
        # NOTE(review): verify=False disables TLS certificate verification —
        # tolerable for this practice site, never for real targets.
        # timeout prevents the crawler from hanging forever on a stalled
        # connection (requests has no default timeout).
        response = requests.get(url, verify=False, timeout=10)
        if response.status_code == 200:
            return response.text
        logging.error('get invalid status code %s while scrape %s',
                      response.status_code, url)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)
# Fetch one listing page.
def scrape_index(page):
    """Return the HTML of listing page number *page* (None on failure)."""
    return scrape_page(f'{url}/page/{page}')
# Parse a listing page.
def parse_index(html):
    """Yield absolute detail-page URLs extracted from a listing page.

    Yields nothing when *html* is falsy (e.g. the fetch returned None) or
    when no anchors match — previously a failed fetch propagated None into
    re.findall and raised TypeError.
    """
    if not html:
        # scrape_index()/scrape_page() return None on failure.
        return
    # Anchors whose class is "name" carry the relative detail-page href.
    pattern = re.compile('<a .*?href="(.*?)".*?class="name">')
    items = re.findall(pattern, html)
    if not items:
        return None
    for item in items:
        # Join the relative href onto the site base URL.
        detail_url = urljoin(url, item)
        logging.info('get detail_url {}'.format(detail_url))
        yield detail_url
def main():
    """Crawl every listing page and log the detail URLs found on each.

    NOTE(review): the original hardcoded range(1, 3), leaving the declared
    TOTAL_PAGE constant unused; now crawls pages 1..TOTAL_PAGE as intended.
    """
    for page in range(1, TOTAL_PAGE + 1):
        index_html = scrape_index(page)
        details_url = parse_index(index_html)
        logging.info('detail url {}'.format(list(details_url)))
# Run the crawler only when executed as a script (stray trailing '|'
# truncation artifact removed — it was a SyntaxError).
if __name__ == '__main__':
    main()