我为学渣代言 发表于 2020-11-6 16:13:54

爬虫求助

最近在学爬虫,前几天做了一个简单爬虫能正常爬取数据,但今天运行后返回的数据为空,Html文本中显示这么一条信息“<h1><strong>请开启JavaScript并刷新该页.</strong></h1>”,大佬们这是被反爬虫了吗?{:5_105:}

suchocolate 发表于 2020-11-6 17:44:35

不一定,把代码发上来吧。

我为学渣代言 发表于 2020-11-6 17:53:15

import requests
import csv
import time
from lxml import etree

# Browser-like User-Agent.  Implicit string concatenation is used instead of
# the original backslash continuation, which glued the fragments together
# with no spaces and produced a malformed UA string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like GeCKO) '
                         'Chrome/45.0.2454.85 Safari/537.36 115Broswer/6.0.3'}
pre_url = 'https://shenzhen.qfang.com/sale/f'


def download(url):
    """Fetch *url* and return the parsed lxml HTML tree.

    NOTE(review): per the thread, the site sets an anti-bot cookie
    (wzws_cid) via JavaScript; without a valid cookie the body is just a
    "please enable JavaScript" page, so a Cookie header copied from a real
    browser session may be required -- confirm against the site.
    """
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    print(resp.status_code)
    time.sleep(2)  # be polite: throttle consecutive requests
    return etree.HTML(resp.text)


def _first(node, path):
    """Return the first result of an XPath query, stripped, or '' if none."""
    found = node.xpath(path)
    return found[0].strip() if found else ''


def data_writer(item):
    """Append one row to qfang1.csv.

    newline='' is required by the csv module; without it every row is
    followed by a blank line on Windows.
    """
    with open('qfang1.csv', 'a', encoding='utf-8', newline='') as fp:
        csv.writer(fp).writerow(item)


def spider(list_url):
    """Scrape one listing page and each house detail page it links to."""
    selector = download(list_url)
    house_list = selector.xpath('/html/body/div/div/div/div/ul/li')

    for house in house_list:
        apartment = _first(house, 'div/div/a/text()')
        # NOTE(review): layout/area/region used one identical XPath in the
        # original, so all three capture the same <p> text; the three paths
        # probably need distinct indices -- verify against the live page.
        house_layout = _first(house, 'div/div/p/text()')
        area = _first(house, 'div/div/p/text()')
        region = _first(house, 'div/div/p/text()')
        total_price = _first(house, 'div/p/span/text()')

        # The original concatenated a str with the *list* xpath() returns,
        # raising TypeError; take the first href and skip entries without
        # a link.
        hrefs = house.xpath('div/div/a/@href')
        if not hrefs:
            continue
        house_url = 'http://shenzhen.qfang.com' + hrefs[0]

        sel = download(house_url)
        time.sleep(1)
        house_years = sel.xpath('//*[@id="scrollto-1"]/div/ul/li/div/text()')
        mortgage_info = sel.xpath('//*[@id="scrollto-1"]/div/ul/li/div/text()')
        item = [apartment, house_layout, area, region,
                total_price, house_years, mortgage_info]
        print("正在爬取", apartment)
        data_writer(item)


if __name__ == '__main__':
    # range(1, 2) crawls page 1 only; widen the range for more pages.
    for page in range(1, 2):
        spider(pre_url + str(page))

我为学渣代言 发表于 2020-11-6 17:54:00

import requests
import csv
import time
from lxml import etree

# Browser-like User-Agent.  Implicit string concatenation is used instead of
# the original backslash continuation, which glued the fragments together
# with no spaces and produced a malformed UA string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like GeCKO) '
                         'Chrome/45.0.2454.85 Safari/537.36 115Broswer/6.0.3'}
pre_url = 'https://shenzhen.qfang.com/sale/f'


def download(url):
    """Fetch *url* and return the parsed lxml HTML tree.

    NOTE(review): per the thread, the site sets an anti-bot cookie
    (wzws_cid) via JavaScript; without a valid cookie the body is just a
    "please enable JavaScript" page, so a Cookie header copied from a real
    browser session may be required -- confirm against the site.
    """
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    print(resp.status_code)
    time.sleep(2)  # be polite: throttle consecutive requests
    return etree.HTML(resp.text)


def _first(node, path):
    """Return the first result of an XPath query, stripped, or '' if none."""
    found = node.xpath(path)
    return found[0].strip() if found else ''


def data_writer(item):
    """Append one row to qfang1.csv.

    newline='' is required by the csv module; without it every row is
    followed by a blank line on Windows.
    """
    with open('qfang1.csv', 'a', encoding='utf-8', newline='') as fp:
        csv.writer(fp).writerow(item)


def spider(list_url):
    """Scrape one listing page and each house detail page it links to."""
    selector = download(list_url)
    house_list = selector.xpath('/html/body/div/div/div/div/ul/li')

    for house in house_list:
        apartment = _first(house, 'div/div/a/text()')
        # NOTE(review): layout/area/region used one identical XPath in the
        # original, so all three capture the same <p> text; the three paths
        # probably need distinct indices -- verify against the live page.
        house_layout = _first(house, 'div/div/p/text()')
        area = _first(house, 'div/div/p/text()')
        region = _first(house, 'div/div/p/text()')
        total_price = _first(house, 'div/p/span/text()')

        # The original concatenated a str with the *list* xpath() returns,
        # raising TypeError; take the first href and skip entries without
        # a link.
        hrefs = house.xpath('div/div/a/@href')
        if not hrefs:
            continue
        house_url = 'http://shenzhen.qfang.com' + hrefs[0]

        sel = download(house_url)
        time.sleep(1)
        house_years = sel.xpath('//*[@id="scrollto-1"]/div/ul/li/div/text()')
        mortgage_info = sel.xpath('//*[@id="scrollto-1"]/div/ul/li/div/text()')
        item = [apartment, house_layout, area, region,
                total_price, house_years, mortgage_info]
        print("正在爬取", apartment)
        data_writer(item)


if __name__ == '__main__':
    # range(1, 2) crawls page 1 only; widen the range for more pages.
    for page in range(1, 2):
        spider(pre_url + str(page))

我为学渣代言 发表于 2020-11-6 17:55:54

import requests
import csv
import time
from lxml import etree

# Browser-like User-Agent.  Implicit string concatenation is used instead of
# the original backslash continuation, which glued the fragments together
# with no spaces and produced a malformed UA string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like GeCKO) '
                         'Chrome/45.0.2454.85 Safari/537.36 115Broswer/6.0.3'}
pre_url = 'https://shenzhen.qfang.com/sale/f'


def download(url):
    """Fetch *url* and return the parsed lxml HTML tree.

    NOTE(review): per the thread, the site sets an anti-bot cookie
    (wzws_cid) via JavaScript; without a valid cookie the body is just a
    "please enable JavaScript" page, so a Cookie header copied from a real
    browser session may be required -- confirm against the site.
    """
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    print(resp.status_code)
    time.sleep(2)  # be polite: throttle consecutive requests
    return etree.HTML(resp.text)


def _first(node, path):
    """Return the first result of an XPath query, stripped, or '' if none."""
    found = node.xpath(path)
    return found[0].strip() if found else ''


def data_writer(item):
    """Append one row to qfang1.csv.

    newline='' is required by the csv module; without it every row is
    followed by a blank line on Windows.
    """
    with open('qfang1.csv', 'a', encoding='utf-8', newline='') as fp:
        csv.writer(fp).writerow(item)


def spider(list_url):
    """Scrape one listing page and each house detail page it links to."""
    selector = download(list_url)
    house_list = selector.xpath('/html/body/div/div/div/div/ul/li')

    for house in house_list:
        apartment = _first(house, 'div/div/a/text()')
        # NOTE(review): layout/area/region used one identical XPath in the
        # original, so all three capture the same <p> text; the three paths
        # probably need distinct indices -- verify against the live page.
        house_layout = _first(house, 'div/div/p/text()')
        area = _first(house, 'div/div/p/text()')
        region = _first(house, 'div/div/p/text()')
        total_price = _first(house, 'div/p/span/text()')

        # The original concatenated a str with the *list* xpath() returns,
        # raising TypeError; take the first href and skip entries without
        # a link.
        hrefs = house.xpath('div/div/a/@href')
        if not hrefs:
            continue
        house_url = 'http://shenzhen.qfang.com' + hrefs[0]

        sel = download(house_url)
        time.sleep(1)
        house_years = sel.xpath('//*[@id="scrollto-1"]/div/ul/li/div/text()')
        mortgage_info = sel.xpath('//*[@id="scrollto-1"]/div/ul/li/div/text()')
        item = [apartment, house_layout, area, region,
                total_price, house_years, mortgage_info]
        print("正在爬取", apartment)
        data_writer(item)


if __name__ == '__main__':
    # range(1, 2) crawls page 1 only; widen the range for more pages.
    for page in range(1, 2):
        spider(pre_url + str(page))

liuzhengyuan 发表于 2020-11-6 18:05:16

不发代码
不发要爬取的网址
很难了解你的情况

我为学渣代言 发表于 2020-11-6 18:18:59

import requests
import csv
import time
from lxml import etree

# Browser-like User-Agent.  Implicit string concatenation is used instead of
# the original backslash continuation, which glued the fragments together
# with no spaces and produced a malformed UA string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like GeCKO) '
                         'Chrome/45.0.2454.85 Safari/537.36 115Broswer/6.0.3'}
pre_url = 'https://shenzhen.qfang.com/sale/f'


def download(url):
    """Fetch *url* and return the parsed lxml HTML tree.

    NOTE(review): per the thread, the site sets an anti-bot cookie
    (wzws_cid) via JavaScript; without a valid cookie the body is just a
    "please enable JavaScript" page, so a Cookie header copied from a real
    browser session may be required -- confirm against the site.
    """
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    print(resp.status_code)
    time.sleep(2)  # be polite: throttle consecutive requests
    return etree.HTML(resp.text)


def _first(node, path):
    """Return the first result of an XPath query, stripped, or '' if none."""
    found = node.xpath(path)
    return found[0].strip() if found else ''


def data_writer(item):
    """Append one row to qfang1.csv.

    newline='' is required by the csv module; without it every row is
    followed by a blank line on Windows.
    """
    with open('qfang1.csv', 'a', encoding='utf-8', newline='') as fp:
        csv.writer(fp).writerow(item)


def spider(list_url):
    """Scrape one listing page and each house detail page it links to."""
    selector = download(list_url)
    house_list = selector.xpath('/html/body/div/div/div/div/ul/li')

    for house in house_list:
        apartment = _first(house, 'div/div/a/text()')
        # NOTE(review): layout/area/region used one identical XPath in the
        # original, so all three capture the same <p> text; the three paths
        # probably need distinct indices -- verify against the live page.
        house_layout = _first(house, 'div/div/p/text()')
        area = _first(house, 'div/div/p/text()')
        region = _first(house, 'div/div/p/text()')
        total_price = _first(house, 'div/p/span/text()')

        # The original concatenated a str with the *list* xpath() returns,
        # raising TypeError; take the first href and skip entries without
        # a link.
        hrefs = house.xpath('div/div/a/@href')
        if not hrefs:
            continue
        house_url = 'http://shenzhen.qfang.com' + hrefs[0]

        sel = download(house_url)
        time.sleep(1)
        house_years = sel.xpath('//*[@id="scrollto-1"]/div/ul/li/div/text()')
        mortgage_info = sel.xpath('//*[@id="scrollto-1"]/div/ul/li/div/text()')
        item = [apartment, house_layout, area, region,
                total_price, house_years, mortgage_info]
        print("正在爬取", apartment)
        data_writer(item)


if __name__ == '__main__':
    # range(1, 2) crawls page 1 only; widen the range for more pages.
    for page in range(1, 2):
        spider(pre_url + str(page))

我为学渣代言 发表于 2020-11-6 18:38:03

suchocolate 发表于 2020-11-6 17:44
不一定,把代码发上来吧。

import requests
import csv
import time
from lxml import etree

# Browser-like User-Agent.  Implicit string concatenation is used instead of
# the original backslash continuation, which glued the fragments together
# with no spaces and produced a malformed UA string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like GeCKO) '
                         'Chrome/45.0.2454.85 Safari/537.36 115Broswer/6.0.3'}
pre_url = 'https://shenzhen.qfang.com/sale/f'


def download(url):
    """Fetch *url* and return the parsed lxml HTML tree.

    NOTE(review): per the thread, the site sets an anti-bot cookie
    (wzws_cid) via JavaScript; without a valid cookie the body is just a
    "please enable JavaScript" page, so a Cookie header copied from a real
    browser session may be required -- confirm against the site.
    """
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    print(resp.status_code)
    time.sleep(2)  # be polite: throttle consecutive requests
    return etree.HTML(resp.text)


def _first(node, path):
    """Return the first result of an XPath query, stripped, or '' if none."""
    found = node.xpath(path)
    return found[0].strip() if found else ''


def data_writer(item):
    """Append one row to qfang1.csv.

    newline='' is required by the csv module; without it every row is
    followed by a blank line on Windows.
    """
    with open('qfang1.csv', 'a', encoding='utf-8', newline='') as fp:
        csv.writer(fp).writerow(item)


def spider(list_url):
    """Scrape one listing page and each house detail page it links to."""
    selector = download(list_url)
    house_list = selector.xpath('/html/body/div/div/div/div/ul/li')

    for house in house_list:
        apartment = _first(house, 'div/div/a/text()')
        # NOTE(review): layout/area/region used one identical XPath in the
        # original, so all three capture the same <p> text; the three paths
        # probably need distinct indices -- verify against the live page.
        house_layout = _first(house, 'div/div/p/text()')
        area = _first(house, 'div/div/p/text()')
        region = _first(house, 'div/div/p/text()')
        total_price = _first(house, 'div/p/span/text()')

        # The original concatenated a str with the *list* xpath() returns,
        # raising TypeError; take the first href and skip entries without
        # a link.
        hrefs = house.xpath('div/div/a/@href')
        if not hrefs:
            continue
        house_url = 'http://shenzhen.qfang.com' + hrefs[0]

        sel = download(house_url)
        time.sleep(1)
        house_years = sel.xpath('//*[@id="scrollto-1"]/div/ul/li/div/text()')
        mortgage_info = sel.xpath('//*[@id="scrollto-1"]/div/ul/li/div/text()')
        item = [apartment, house_layout, area, region,
                total_price, house_years, mortgage_info]
        print("正在爬取", apartment)
        data_writer(item)


if __name__ == '__main__':
    # range(1, 2) crawls page 1 only; widen the range for more pages.
    for page in range(1, 2):
        spider(pre_url + str(page))

YunGuo 发表于 2020-11-10 14:58:14

大致看了一下,你的代码应该没问题,分析了一下网站请求,确认是网站做了cookie反爬。
浏览器的请求过程是:先请求https://shenzhen.qfang.com/sale/f1   ==>   判断是否有一个叫wzws_cid的cookie,如果没有或者失效,得到的响应内容就是“请开启JavaScript并刷新该页.”这个页面   ==>然后浏览器自动请求https://shenzhen.qfang.com/WZWSREL3NhbGUvZjE=?这个url为浏览器设置cookie(这个url有个加密请求参数)   ==>   最后重定向回来原请求网址就是显示正常内容。
cookie可以直接从浏览器的cookie中复制,不过cookie有效期比较短,短时间爬取应该没问题,如果要长时间爬取就需要逆向js找到https://shenzhen.qfang.com/WZWSREL3NhbGUvZjE=?这个请求的加密参数生成方式。
页: [1]
查看完整版本: 爬虫求助