Help with a web scraper
I've recently been learning web scraping. A few days ago I wrote a simple scraper that fetched data fine, but when I ran it today it returned empty data, and the HTML contained this message: "<h1><strong>请开启JavaScript并刷新该页.</strong></h1>" ("Please enable JavaScript and refresh this page."). Is this the site's anti-scraping kicking in?

Not necessarily. Post your code.
import requests
import csv
import time
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like GeCKO) '
                         'Chrome/45.0.2454.85 Safari/537.36 115Broswer/6.0.3'}
pre_url = 'https://shenzhen.qfang.com/sale/f'

def download(url):
    # Fetch a page and return the parsed lxml tree.
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    print(html.status_code)
    time.sleep(2)
    print(html.text)
    return etree.HTML(html.text)

def data_writer(item):
    # Append one listing as a row of qfang1.csv.
    with open('qfang1.csv', 'a', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(item)

def spider(list_url):
    selector = download(list_url)
    house_list = selector.xpath('/html/body/div/div/div/div/ul/li')
    for house in house_list:
        apartment = house.xpath('div/div/a/text()')
        house_layout = house.xpath('div/div/p/text()')
        area = house.xpath('div/div/p/text()')
        region = house.xpath('div/div/p/text()')
        total_price = house.xpath('div/p/span/text()')
        # xpath() returns a list, so take the first match before concatenating.
        house_url = 'http://shenzhen.qfang.com' + house.xpath('div/div/a/@href')[0]
        sel = download(house_url)
        time.sleep(1)
        house_years = sel.xpath('//*[@id="scrollto-1"]/div/ul/li/div/text()')
        mortgage_info = sel.xpath('//*[@id="scrollto-1"]/div/ul/li/div/text()')
        item = [apartment, house_layout, area, region,
                total_price, house_years, mortgage_info]
        print('scraping', apartment)
        data_writer(item)

if __name__ == '__main__':
    for i in range(1, 2):
        spider(pre_url + str(i))
If you don't post your code
or the URL you're scraping,
it's very hard to tell what's going on.
I took a quick look and your code itself should be fine. I analyzed the site's requests and confirmed that the site uses cookie-based anti-scraping.

The browser's request flow is: first it requests https://shenzhen.qfang.com/sale/f1 ==> the server checks for a cookie named wzws_cid; if it is missing or expired, the response body is the "请开启JavaScript并刷新该页." ("Please enable JavaScript and refresh this page.") block page ==> the browser then automatically requests https://shenzhen.qfang.com/WZWSREL3NhbGUvZjE=? (a URL that carries an encrypted request parameter), which sets the cookie ==> finally it is redirected back to the original URL, which now shows the normal content.
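You can reproduce the first step of that flow yourself and confirm you're getting the block page rather than listings. A minimal sketch (the User-Agent here is just an example, not the one from the script above):

import requests

# First request with no cookies, the way a fresh client would send it.
r = requests.get('https://shenzhen.qfang.com/sale/f1',
                 headers={'User-Agent': 'Mozilla/5.0'})
print(r.status_code)
# Without a valid wzws_cid cookie, the body is the block page, not listings.
print('blocked:', '请开启JavaScript并刷新该页' in r.text)
print('cookies the server set:', r.cookies.get_dict())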
You can copy the cookie directly from your browser. Its lifetime is fairly short, so that's fine for a short scraping run; for long-running scraping you would need to reverse-engineer the site's JavaScript to find out how the encrypted parameter of the https://shenzhen.qfang.com/WZWSREL3NhbGUvZjE=? request is generated.
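For a short run, then, it should be enough to send that copied cookie with every request. A minimal sketch of a patched download(), where the wzws_cid value is a placeholder you'd replace with the one from your browser's developer tools:

import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/45.0.2454.85 Safari/537.36'}
# Placeholder -- paste the real wzws_cid value copied from your browser:
cookies = {'wzws_cid': 'PASTE_VALUE_FROM_BROWSER_HERE'}

def download(url):
    # Same as the original download(), but sends the anti-bot cookie too.
    html = requests.get(url, headers=headers, cookies=cookies)
    html.encoding = 'utf-8'
    return etree.HTML(html.text)

# Quick check: with a valid cookie the <title> should no longer be the block page.
selector = download('https://shenzhen.qfang.com/sale/f1')
print(selector.xpath('//title/text()'))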