|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 弈秋呜呜呜 于 2021-5-22 13:06 编辑
大佬们对于这个代码请发表你们的意见。帮忙看看这个代码有没有问题
由于爬取的网站老是连接失败,所以我就没有做分页爬取了。
- import requests
- from lxml.html import etree
- from threading import Thread
- import os
- from multiprocessing import Queue
- # 需求:运用多线程爬取免费模板
def detail_url(q, url, headers):
    """Scrape the listing page at *url* and enqueue every template's
    detail-page href onto *q*.

    q       -- queue the detail-page URLs are pushed onto
    url     -- listing-page URL to scrape
    headers -- HTTP headers (User-Agent) forwarded to requests.get
    """
    # A timeout is essential here: without one a stalled connection hangs
    # this thread forever, and the main thread's join() never returns.
    html = requests.get(url=url, headers=headers, timeout=10).text
    tree = etree.HTML(html)
    a_list = tree.xpath('//div[@class="main_list jl_main masonry"]/div/a')
    for a in a_list:
        # First (only) href attribute of each anchor is the detail page.
        q.put(a.xpath('./@href')[0])
def download_url(q1, q2, headers):
    """Consume detail-page URLs from *q1*, scrape each page's first
    download link, and push it onto *q2*.

    The original body called q1.get() exactly once, so at most one of the
    many URLs produced by detail_url was ever processed; this loops until
    the queue stays empty for a grace period.
    """
    from queue import Empty  # multiprocessing.Queue raises queue.Empty on timeout

    while True:
        try:
            # Wait briefly for the producer; stop once nothing arrives.
            url = q1.get(timeout=5)
        except Empty:
            break
        html = requests.get(url=url, headers=headers, timeout=10).text
        tree = etree.HTML(html)
        a_list = tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li[1]/a')
        for a in a_list:
            q2.put(a.xpath('./@href')[0])
def download_file(q, headers):
    """Consume download URLs from *q* and save each archive as
    ./模板/<n>.rar, numbering files with the module-level ``count``.

    Fixes two defects: the original fetched only ONE item from the queue,
    and it incremented ``count`` before the download, so a failed fetch
    still consumed a file number.
    """
    global count
    from queue import Empty  # multiprocessing.Queue raises queue.Empty on timeout

    while True:
        try:
            url = q.get(timeout=5)
        except Empty:
            break
        data = requests.get(url=url, headers=headers, timeout=10).content
        # Increment only after the fetch succeeded, so numbering is gapless.
        count += 1
        with open(f'./模板/{count}.rar', 'wb') as f:
            f.write(data)
if __name__ == '__main__':
    # Ensure the output directory exists before any worker writes into it.
    if not os.path.exists('./模板'):
        os.mkdir('./模板')
    count = 0  # file counter shared with download_file via `global`
    url = 'https://sc.chinaz.com/jianli/free.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'
    }
    # Unbounded queues: the original bounded Queue(20) deadlocks as soon as
    # the listing page yields more than 20 links — the producer blocks in
    # q.put() and t1.join() below never returns.
    q1 = Queue()  # detail-page URLs
    q2 = Queue()  # direct download URLs
    # One thread per pipeline stage: listing page -> detail page -> file.
    t1 = Thread(target=detail_url, args=(q1, url, headers))
    t2 = Thread(target=download_url, args=(q1, q2, headers))
    t3 = Thread(target=download_file, args=(q2, headers))
    for t in (t1, t2, t3):
        t.start()
    for t in (t1, t2, t3):
        t.join()
    print('爬取完毕')
复制代码
你试下这段代码看能不能获取到url,如果可以获取到,那要么就是你代码问题,要么是多线程请求太频繁被网站限制了。
import requests
from lxml import etree

# Quick connectivity probe: fetch the free-template listing, follow the
# first detail page, and print its first download URL.
LISTING = 'https://sc.chinaz.com/jianli/free.html'
HEADERS = {
    'User-Agent': 'Mozilla/5.0'
}

listing_resp = requests.get(LISTING, headers=HEADERS)
listing_page = etree.HTML(listing_resp.text)
detail_links = listing_page.xpath('//div[@id="main"]/div/div/a/@href')

for link in detail_links:
    # Links are protocol-relative, so prepend the scheme.
    detail_resp = requests.get('https:' + link, headers=HEADERS)
    detail_page = etree.HTML(detail_resp.text)
    download_link = detail_page.xpath('//div[@id="down"]/div[2]/ul/li[1]/a/@href')[0]
    print(download_link)
    break  # one page is enough to prove the site is reachable
复制代码
|
|