I checked, and the XPath results come back empty starting from the second page. The same expression, used on its own to scrape page 2, returns the data just fine, but as soon as it runs inside the for loop it no longer gets anything. How do I fix this?
import requests
import os
from lxml import etree

class Get():
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
        self.url = "https://www.kuaidaili.com/free/inha/%d/"
        self.ip_list = []
        self.port_list = []
        self.type_list = []
        self.time_list = []
        self.daili_list = []
        self.detection_list = []

    def start(self):
        for e in range(1, 5):
            response = requests.get(self.url % e, headers=self.headers)
            html = etree.HTML(response.text)
            # every row of the free-proxy table has 7 <td> cells,
            # so the flat text list repeats in blocks of 7
            result = html.xpath("//tbody/tr//td/text()")
            for i in range(len(result)):
                if i % 7 == 0:        # IP
                    self.ip_list.append(result[i])
                elif i % 7 == 1:      # port
                    self.port_list.append(result[i])
                elif i % 7 == 3:      # protocol type
                    self.type_list.append(result[i])
                elif i % 7 == 6:      # last-check time
                    self.time_list.append(result[i])

        for i in range(len(self.ip_list)):
            # format as TYPE:ip:port
            self.daili_list.append(self.type_list[i] + ":" +
                                   self.ip_list[i] + ":" + self.port_list[i])
            self.detection_list.append(" last checked " + self.time_list[i])
        with open("iplist.txt", "w") as f:
            for i in range(len(self.daili_list)):
                f.write(self.daili_list[i] + self.detection_list[i] + "\n")
        return self.daili_list

if __name__ == "__main__":
    get = Get()
    get.start()
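When the XPath list suddenly comes back empty on later pages, the expression itself is usually fine; the response simply no longer contains the proxy table, typically because the requests arrived too quickly and kuaidaili throttled them. One way to see this is to print what each iteration actually received. A minimal diagnostic sketch, assuming the same URL pattern, headers and XPath as in the code above:

import time
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}

for page in range(1, 5):
    response = requests.get("https://www.kuaidaili.com/free/inha/%d/" % page,
                            headers=headers)
    html = etree.HTML(response.text)
    cells = html.xpath("//tbody/tr//td/text()")
    # a 200 status with zero cells means the page came back without the table,
    # i.e. the requests were sent too fast, not that the XPath is wrong
    print(page, response.status_code, len(cells))
    time.sleep(5)  # pause between pages, as suggested in the reply below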
Just wait a bit between requests and you can scrape it:
#_*_coding:utf-8_*_
# @Time   : 2019/3/4--23:33
# @Author : Stubbron
# @Email  : 1263270345@qq.com
# @File   : test.py
import requests
import os, time
from lxml import etree
from fake_useragent import UserAgent

ua = UserAgent()
Headers = {
    'User-Agent': ua.random
}
agent_pool = []

def get2_proxy(page):
    """Scrape one page of kuaidaili free proxies."""
    global agent_pool
    url = "https://www.kuaidaili.com/free/inha/{}/".format(page)
    try:
        html = etree.HTML(requests.get(url, headers=Headers).text)
        # skip the header row, then read IP and port from the first two cells
        for each in html.xpath('//table[@class="table table-bordered table-striped"]//tr')[1:]:
            ip = each.xpath('./td[1]/text()')[0] + ":" + each.xpath('./td[2]/text()')[0]
            if ip:
                agent_pool.append(ip)
    except:
        print("get2_proxy fetch failed")
    return agent_pool

for i in range(1, 3):
    time.sleep(5)  # wait 5 seconds before each request
    get2_proxy(i)
print(len(agent_pool))
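If you want something a bit sturdier than a fixed delay, you can also retry a page whenever the table comes back empty, instead of assuming one pause is always enough. A minimal sketch along the lines of the code above; fetch_page, max_retries and delay are names invented for this example:

import time
import requests
from lxml import etree

HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}

def fetch_page(page, max_retries=3, delay=5):
    """Fetch one kuaidaili free-proxy page and return the flat list of <td> texts.

    Retries with a pause whenever the proxy table is missing from the response.
    """
    url = "https://www.kuaidaili.com/free/inha/%d/" % page
    for attempt in range(max_retries):
        response = requests.get(url, headers=HEADERS)
        html = etree.HTML(response.text)
        cells = html.xpath("//tbody/tr//td/text()")
        if cells:              # table present, we are done
            return cells
        time.sleep(delay)      # throttled page: wait, then try again
    return []                  # still empty after max_retries attempts

if __name__ == "__main__":
    for page in range(1, 5):
        print(page, len(fetch_page(page)))
        time.sleep(5)          # keep a pause between pages anyway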