|
发表于 2020-8-15 11:22:56
|
显示全部楼层
本楼为最佳答案
本帖最后由 suchocolate 于 2020-8-15 11:53 编辑
不使用 XPath 或 BeautifulSoup 的话,效率太低了。
import os
from urllib.parse import urljoin

import requests
from lxml import etree
def ck_dir(pic_dir='mm'):
    """Ensure the image directory exists and make it the working directory.

    Args:
        pic_dir: Path of the directory where the images are saved. Defaults
            to ``'mm'``, resolved against the current working directory.

    Raises:
        OSError: If the directory cannot be created or entered.
    """
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race and also
    # creates missing parent directories when a nested path is given.
    os.makedirs(pic_dir, exist_ok=True)
    os.chdir(pic_dir)
def get_pic_url(act_url):
    """Fetch one listing page and extract its image URLs and next-page link.

    The request headers are read from the module-level ``headers`` dict.

    Args:
        act_url: URL of the page to scrape.

    Returns:
        A ``(pic_urls, next_href)`` tuple. ``pic_urls`` is the list of every
        ``<img src>`` value on the page except the last one; ``next_href`` is
        the ``href`` of the first ``<a>`` whose text contains ``"99"``.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
        IndexError: If the page has no link whose text contains ``"99"``.
    """
    # Without a timeout, requests may block forever on a stalled connection.
    r = requests.get(act_url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    r.raise_for_status()
    html = etree.HTML(r.text)
    # pic list
    result = html.xpath('//img/@src')
    # last pic is not mm pic; guarded so a page without images does not crash
    if result:
        result.pop()
    # next page url
    next_links = html.xpath('//a[contains(text(),"99")]/@href')
    if not next_links:
        # Same exception type as the bare [0] lookup, but with context.
        raise IndexError(f'no next-page link found on {act_url}')
    return result, next_links[0]
def main():
    """Collect image URLs from two consecutive pages and download them.

    Reads the module-level ``url`` (start page) and ``headers`` (request
    headers). Switches into the image directory via ``ck_dir()``, scrapes the
    start page and the page it links to via ``get_pic_url()``, then saves
    every image into the current working directory under the last path
    segment of its URL, printing a progress line per saved file.
    """
    # check pic directory
    ck_dir()
    # pic url list
    pic_list = []
    # get pics urls: the start page first, then the page it links to
    page_url = url
    for _ in range(2):
        pic_url, nx_pg = get_pic_url(page_url)
        pic_list.extend(pic_url)
        # urljoin resolves relative, root-relative, protocol-relative and
        # absolute hrefs alike; plain concatenation only handled the first.
        page_url = urljoin(f'{url}/', nx_pg)
    # download pics
    n = 0
    for item in pic_list:
        # Handles both '//host/path' and fully qualified image sources.
        r = requests.get(urljoin(url, item), headers=headers, timeout=30)
        if not r.ok:
            # Do not save an HTTP error body as if it were an image.
            print(f'{item} skipped: HTTP {r.status_code}')
            continue
        pic_name = item.split('/')[-1]
        with open(pic_name, 'wb') as f:
            f.write(r.content)
        n += 1
        print(f'{pic_name} has been downloaded. total number: {n}')
# global variables
# Defined at module level (not under the __main__ guard) so the functions
# that read them also work when this file is imported as a module.
url = 'http://jandan.net/ooxx'
headers = {'User-agent': 'firefox'}

if __name__ == '__main__':
    # main func
    main()
复制代码 |
|