Wrote a script to scrape second-hand housing listings
I wrote a script to scrape second-hand housing listings and ran into a problem; hoping someone more experienced can point me in the right direction: the content my script fetches from a page seems to differ from what I see when I open the same page in a browser.
import requests
from lxml import etree
import time
from ua_info import ua_list   # poster's local module with a list of User-Agent strings (imported but not used below)

class EsfSpider:
    def __init__(self):
        self.url = 'https://bj.lianjia.com/ershoufang/pg{}/'
        # Spoof the headers so the request looks like it comes from a browser
        # (note the header name must be 'User-Agent', not 'UserAgent')
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38'}
        self.blog = 1
    # Fetch the page content
    def get_html(self, url):
        # Retry a failed page up to three times
        if self.blog <= 3:
            try:
                res = requests.get(url, headers=self.headers, timeout=3)
                res.encoding = 'utf-8'
                self.html = res.text
                return self.html
            except Exception as e:
                print(e)
                self.blog += 1
                return self.get_html(url)   # return the retry's result, otherwise the caller gets None
        else:
            print('Failed to fetch the page')
            exit()
    # Parse the page with XPath and extract the fields we need
    def xpath_html(self, url):
        html = self.get_html(url)
        p = etree.HTML(html)
        # Listing nodes
        # (an alternative node expression: '//ul[@class="sellListContent"]/li[@class="clear LOGCLICKDATA"]')
        info_clear = '//div[@class="info clear"]'
        info_clear_list = p.xpath(info_clear)
        fangyuan = []
        for i in info_clear_list:
            item = {}
            # Location: community name and area
            flood = i.xpath('.//div[@class="flood"]/div/a[@target="_blank"]/text()')
            item['flood'] = flood[0] + ' - ' + flood[1]
            # Basic info
            address_list = i.xpath('.//div[@class="address"]/div/text()')
            item['address'] = address_list[0].strip()
            # Selling points
            tag_list = i.xpath('.//div[@class="tag"]/span[@class="subway"]/text()')
            item['tag'] = tag_list[0] if tag_list else None
            # Total price and unit price
            # the node() expression yields e.g. [' ', '548', '万', '48,977元/平']
            priceinfo_list = i.xpath('.//div[@class="priceInfo"]/node()/*/text()')
            totalprice = priceinfo_list[1] + priceinfo_list[2]
            item['price'] = 'total price: %s, unit price: %s' % (totalprice, priceinfo_list[3])
            fangyuan.append(item)
        return fangyuan
    def run(self):
        ye = input('How many pages of listings (30 listings per page): ')
        fangyuan = []
        for i in range(0, int(ye)):
            url = self.url.format(i)
            print(url)
            fangyuan.extend(self.xpath_html(url))   # collect every page, not just the last one
            self.blog = 1   # reset the retry counter for the next page
            time.sleep(1)   # brief pause between pages
        return fangyuan

if __name__ == '__main__':
    spider = EsfSpider()
    f = spider.run()
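A side note on the retry logic: requests can also retry at the transport level through urllib3's Retry, which avoids the recursive call and the manual self.blog counter. A minimal sketch, assuming the same kind of headers dict; the backoff values and status list are illustrative:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# retry up to 3 times on connection errors and common 5xx responses,
# with exponential backoff between attempts
retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retry))

res = session.get('https://bj.lianjia.com/ershoufang/pg1/',
                  headers={'User-Agent': 'Mozilla/5.0'}, timeout=3)
print(res.status_code)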
My script prints each URL it crawls, but when I paste that URL into a browser, the listings on the page don't match what the script scraped. Why is that?
For example, with the link
https://bj.lianjia.com/ershoufang/pg0/
the first listing my script scrapes is: 信和嘉园
But when I open the link in a browser, I can't find 信和嘉园 on the page.
What the browser shows is the result after CSS and JS rendering; what requests gets back with GET has not been rendered, so this is normal. When writing a crawler, go by what the GET request returns.
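An easy way to confirm this is to save the raw response to a file and search it for the listing name; a minimal sketch (the file name and the pg1 URL are just examples):

import requests

url = 'https://bj.lianjia.com/ershoufang/pg1/'
res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=3)
res.encoding = 'utf-8'

# dump the un-rendered HTML so it can be inspected directly
with open('raw.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

# does the listing the script reported actually appear in the raw HTML?
print('信和嘉园' in res.text)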
I opened the link and the first listing was 信和嘉园, the second was 【西南四环】【南北通透两居室】. Could it be that you mixed something up?

嘉岳呀 posted on 2021-10-4 19:45
Could it be that you mixed something up?
why???

suchocolate posted on 2021-10-4 18:55
What the browser shows is the result after CSS and JS rendering; what requests gets back with GET has not been rendered ...
Thanks~~~
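For pages that really are filled in by JavaScript, a headless browser can return the rendered DOM instead. A minimal sketch using Selenium, assuming Chrome and a matching chromedriver are installed; not something the script above needs if the data is already present in the raw HTML:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')   # run Chrome without opening a window
driver = webdriver.Chrome(options=opts)
driver.get('https://bj.lianjia.com/ershoufang/pg1/')
html = driver.page_source         # the HTML after JavaScript has run
driver.quit()
print(len(html))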