hhad 发表于 2022-4-8 10:45:36

想要爬取任意页的图片,代码该怎么加?

import requests
import parsel
import os

# Directory that receives the downloaded wallpapers.  The literal keeps its
# trailing backslash so the name stays what the original script used.
filename = '壁纸\\'

# exist_ok=True replaces the exists()/mkdir() pair, which could fail if the
# directory appeared between the check and the creation.
os.makedirs(filename, exist_ok=True)

base_url = 'http://www.netbian.com/'
url = 'http://www.netbian.com/index.htm'
headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36'
}
# Seconds to wait per request; without a timeout requests can block forever.
timeout = 10
# Titles become file names, so replace the characters Windows forbids there.
invalid_chars = str.maketrans({char: '_' for char in '\\/:*?"<>|'})

response = requests.get(url=url, headers=headers, timeout=timeout)
# Let requests guess the charset from the body so the titles decode properly.
response.encoding = response.apparent_encoding
selector = parsel.Selector(response.text)

for li in selector.css('.list li'):
    title = li.css('b::text').get()
    href = li.css('a::attr(href)').get()
    # An entry without a title or a link cannot be named or followed.
    if not title or not href:
        continue

    # Keep absolute links as they are; for site-relative ones drop the
    # leading slash so the joined URL does not contain a double slash.
    if href.startswith(('http://', 'https://')):
        li_url = href
    else:
        li_url = base_url + href.lstrip('/')

    response_2 = requests.get(url=li_url, headers=headers, timeout=timeout)
    selector_2 = parsel.Selector(response_2.text)
    img_url = selector_2.css('.pic img::attr(src)').get()
    if not img_url:
        # The detail page had no '.pic img' element; nothing to download.
        print(title, 'no image found, skipped')
        continue

    # Send the same headers as for the pages; the original omitted them here.
    img_response = requests.get(url=img_url, headers=headers, timeout=timeout)
    if not img_response.ok:
        # Do not store an error page under a .jpg name.
        print(title, img_url, 'download failed:', img_response.status_code)
        continue

    img_path = os.path.join(filename, title.translate(invalid_chars) + '.jpg')
    with open(img_path, mode='wb') as f:
        f.write(img_response.content)
    print(title, img_url)

isdkz 发表于 2022-4-8 17:18:37

import requests
import parsel
import os

# Directory prefix for the downloaded wallpapers; the trailing backslash is
# kept because main() builds file paths from this value.
filename = '壁纸\\'

# exist_ok=True replaces the exists()/mkdir() pair, which could fail if the
# directory appeared between the check and the creation.
os.makedirs(filename, exist_ok=True)


def main(page=1, timeout=10):
    """Download the wallpapers listed on one index page of www.netbian.com.

    Page 1 is fetched from ``index.htm``; any other page number is fetched
    from ``index_<page>.htm``.  Every list entry that has a title is followed
    to its detail page, the ``.pic img`` image found there is downloaded, and
    the bytes are written to ``<title>.jpg`` inside the module-level
    ``filename`` directory.  The title and image URL of each saved file are
    printed.

    Args:
        page: Number of the listing page to crawl.
        timeout: Seconds to wait for each HTTP request before requests
            raises a timeout error instead of blocking indefinitely.
    """
    # Local import keeps the function self-contained within this script.
    from urllib.parse import urljoin

    base_url = 'http://www.netbian.com/'
    if page == 1:
        url = 'http://www.netbian.com/index.htm'
    else:
        url = f'http://www.netbian.com/index_{page}.htm'
    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36'
    }
    # Titles become file names, so replace the characters Windows forbids.
    invalid_chars = str.maketrans({char: '_' for char in '\\/:*?"<>|'})

    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Let requests guess the charset from the body so titles decode properly.
    response.encoding = response.apparent_encoding
    selector = parsel.Selector(response.text)

    for li in selector.css('.list li'):
        title = li.css('b::text').get()
        href = li.css('a::attr(href)').get()
        # An entry without a title or a link cannot be named or followed.
        if not title or not href:
            continue

        # urljoin copes with root-relative, relative and absolute links,
        # where plain concatenation produced '//' or a malformed URL.
        li_url = urljoin(base_url, href)
        response_2 = requests.get(url=li_url, headers=headers, timeout=timeout)
        selector_2 = parsel.Selector(response_2.text)
        img_src = selector_2.css('.pic img::attr(src)').get()
        if not img_src:
            # The detail page had no '.pic img' element; nothing to fetch.
            print(title, 'no image found, skipped')
            continue

        img_url = urljoin(li_url, img_src)
        # Send the same headers as for the pages; the original omitted them.
        img_response = requests.get(url=img_url, headers=headers, timeout=timeout)
        if not img_response.ok:
            # Do not store an error page under a .jpg name.
            print(title, img_url, 'download failed:', img_response.status_code)
            continue

        img_path = os.path.join(filename, title.translate(invalid_chars) + '.jpg')
        with open(img_path, mode='wb') as f:
            f.write(img_response.content)
        print(title, img_url)

if __name__ == '__main__':
    # Crawl the second listing page; pass another number for a different page.
    main(page=2)
页: [1]
查看完整版本: 想要爬取任意页的图片代码该怎么加