[已解决]爬虫问题

伏惜寒 · 发表于 2019-3-3 20:10:59

您需要登录才可以下载或查看，没有账号？立即注册

x

我检查了一下，是Xpath语法从第二页开始爬取为空，但是同样的语法单独用在爬取第二页可以正常爬取到数据，
一旦用在for循环之中就不能正常运行爬取，求怎么解决

import os
import time

import requests
from lxml import etree
class Get():
    """Collect free proxies from kuaidaili.com and save them to ``iplist.txt``.

    Attributes:
        headers: HTTP headers sent with every request.
        url: Listing URL template; ``%d`` is replaced by the page number.
        ip_list: IP address of every proxy collected so far.
        port_list: Port of every proxy collected so far.
        type_list: Protocol column of every proxy collected so far.
        time_list: Last-verified column of every proxy collected so far.
        daili_list: Formatted ``"TYPE":"IP:PORT"`` entries.
        detection_list: Last-verified suffix written after each entry.
    """

    # Number of text cells a complete table row is expected to contain.
    COLUMNS_PER_ROW = 7

    def __init__(self):
        self.headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
        self.url = "https://www.kuaidaili.com/free/inha/%d/"
        self.ip_list = []
        self.port_list = []
        self.type_list = []
        self.time_list = []
        self.daili_list = []
        self.detection_list = []

    def _add_row(self, cells):
        """Record one table row; return False if the row is incomplete.

        Args:
            cells: Text of the row's cells, in column order.  Columns 0, 1,
                3 and 6 are read as IP, port, type and last-verified time.
        """
        # A row with a missing cell would shift every later column, so it is
        # skipped instead of being recorded with the wrong fields.
        if len(cells) < self.COLUMNS_PER_ROW:
            return False
        ip = cells[0].strip()
        port = cells[1].strip()
        proxy_type = cells[3].strip()
        checked = cells[6].strip()
        self.ip_list.append(ip)
        self.port_list.append(port)
        self.type_list.append(proxy_type)
        self.time_list.append(checked)
        # Each part is wrapped in literal double quotes: "TYPE":"IP:PORT".
        self.daili_list.append(
            '"' + proxy_type + '"' + ":" + '"' + ip + ":" + port + '"')
        self.detection_list.append(" 最近检测时间" + checked)
        return True

    def start(self, pages=range(1, 5), delay=5):
        """Fetch the listing pages, collect the proxies and write the file.

        Args:
            pages: Iterable of page numbers to fetch (defaults to pages 1-4).
            delay: Seconds to wait between two consecutive requests.

        Returns:
            ``self.daili_list``, the formatted proxy entries.

        Raises:
            requests.RequestException: If a request fails or times out.
        """
        for index, page in enumerate(pages):
            # Requesting pages back-to-back reportedly gets an HTTP 503 from
            # the site, so pause before every request except the first.
            if index and delay > 0:
                time.sleep(delay)
            response = requests.get(self.url % page, headers=self.headers,
                                    timeout=10)
            if response.status_code != 200:
                # An error page has no proxy table; report it instead of
                # silently collecting nothing.
                print("page %s skipped: HTTP %s" % (page, response.status_code))
                continue
            html = etree.HTML(response.text)
            if html is None:
                continue
            for row in html.xpath("//tbody/tr"):
                self._add_row(row.xpath(".//td/text()"))
        # Written once, after all pages, so the file holds every entry exactly
        # once; UTF-8 is explicit because the lines contain Chinese text.
        with open("iplist.txt", "w", encoding="utf-8") as f:
            f.writelines(
                entry + checked + "\n"
                for entry, checked in zip(self.daili_list, self.detection_list))
        return self.daili_list
if __name__ == "__main__":
    # Script entry point: build the crawler and run it once.
    crawler = Get()
    crawler.start()

复制代码

最佳答案

Stubborn

2019-3-5 02:23:00

等待下就可以抓取到

#_*_coding:utf-8_*_
# @Time :2019/3/4--23:33
# @Author : Stubbron
# @Email : 1263270345@qq.com
# @File :test.py
import requests
import os,time
from lxml import etree
from fake_useragent import UserAgent
# User-Agent generator from fake_useragent.
ua = UserAgent()
# Request headers; ua.random is read once here and the value is stored.
Headers = {'User-Agent': ua.random}
# Shared list that get2_proxy() appends "ip:port" strings to.
agent_pool = []
def get2_proxy(page):
    """Scrape one page of the Kuaidaili (快代理) free-proxy list.

    Every ``ip:port`` pair found on the page is appended to the module-level
    ``agent_pool`` list.

    Args:
        page: Page number substituted into the listing URL.

    Returns:
        The shared ``agent_pool`` list, whether or not this page added to it.
    """
    url = "https://www.kuaidaili.com/free/inha/{}/".format(page)
    try:
        response = requests.get(url, headers=Headers, timeout=10)
        # An HTTP error status (e.g. 503) carries no proxy table, so it is
        # reported as a failed fetch instead of being parsed as an empty page.
        response.raise_for_status()
        html = etree.HTML(response.text)
    except (requests.RequestException, ValueError, etree.LxmlError):
        # Only network and parse failures are handled here; anything else
        # (including KeyboardInterrupt) propagates.
        print("get2_proxy抓取失败")
        return agent_pool
    if html is None:
        print("get2_proxy抓取失败")
        return agent_pool
    rows = html.xpath('//table[@class="table table-bordered table-striped"]//tr')
    # The first row is skipped, as in the original (presumably the header).
    for each in rows[1:]:
        ip_cells = each.xpath('./td[1]/text()')
        port_cells = each.xpath('./td[2]/text()')
        # Skip an incomplete row instead of abandoning the rest of the page.
        if not ip_cells or not port_cells:
            continue
        agent_pool.append(ip_cells[0] + ":" + port_cells[0])
    return agent_pool
# Fetch pages 1 and 2, pausing before each request.
for page_number in range(1, 3):
    time.sleep(5)  # wait 5 seconds
    get2_proxy(page_number)
# Report how many proxies were collected in total.
print(len(agent_pool))

复制代码

zero_sunshine · 发表于 2019-3-3 23:24:59

你的访问速度太快了，这个网站的服务器使用了反爬虫的机制，高频访问直接返回了503状态码。
你可以使用time模块的time.sleep()函数，在每次请求网页以后暂停几秒钟。

zero_sunshine · 发表于 2019-3-3 23:26:08

所以这不是你xpath查找的问题，是你第二次请求网页的时候就没有收到数据

伏惜寒 · 发表于 2019-3-4 22:49:32

zero_sunshine 发表于 2019-3-3 23:26
所以这不是你xpath查找的问题，是你第二次请求网页的时候就没有收到数据

那该怎么解决？换ip可以吗？

Stubborn · 发表于 2019-3-5 02:23:00

等待下就可以抓取到

#_*_coding:utf-8_*_
# @Time :2019/3/4--23:33
# @Author : Stubbron
# @Email : 1263270345@qq.com
# @File :test.py
import requests
import os,time
from lxml import etree
from fake_useragent import UserAgent
# User-Agent generator from fake_useragent.
ua = UserAgent()
# Request headers; ua.random is read once here and the value is stored.
Headers = {'User-Agent': ua.random}
# Shared list that get2_proxy() appends "ip:port" strings to.
agent_pool = []
def get2_proxy(page):
    """Scrape one page of the Kuaidaili (快代理) free-proxy list.

    Every ``ip:port`` pair found on the page is appended to the module-level
    ``agent_pool`` list.

    Args:
        page: Page number substituted into the listing URL.

    Returns:
        The shared ``agent_pool`` list, whether or not this page added to it.
    """
    url = "https://www.kuaidaili.com/free/inha/{}/".format(page)
    try:
        response = requests.get(url, headers=Headers, timeout=10)
        # An HTTP error status (e.g. 503) carries no proxy table, so it is
        # reported as a failed fetch instead of being parsed as an empty page.
        response.raise_for_status()
        html = etree.HTML(response.text)
    except (requests.RequestException, ValueError, etree.LxmlError):
        # Only network and parse failures are handled here; anything else
        # (including KeyboardInterrupt) propagates.
        print("get2_proxy抓取失败")
        return agent_pool
    if html is None:
        print("get2_proxy抓取失败")
        return agent_pool
    rows = html.xpath('//table[@class="table table-bordered table-striped"]//tr')
    # The first row is skipped, as in the original (presumably the header).
    for each in rows[1:]:
        ip_cells = each.xpath('./td[1]/text()')
        port_cells = each.xpath('./td[2]/text()')
        # Skip an incomplete row instead of abandoning the rest of the page.
        if not ip_cells or not port_cells:
            continue
        agent_pool.append(ip_cells[0] + ":" + port_cells[0])
    return agent_pool
# Fetch pages 1 and 2, pausing before each request.
for page_number in range(1, 3):
    time.sleep(5)  # wait 5 seconds
    get2_proxy(page_number)
# Report how many proxies were collected in total.
print(len(agent_pool))

复制代码

账号		自动登录	找回密码
密码			立即注册