I was originally using a scrapy crawler to scrape a site's listings and store them in a MongoDB database, but my IP kept getting banned. So I switched to the requests module to try to work out exactly how the site is detecting me.
The code is as follows:
import requests
import pymongo
import re
import time
import random
from lxml import etree
base_url = 'http://bj.maitian.cn/zfall/PG'
client = pymongo.MongoClient("mongodb://AAA:BBB@localhost:27017")
db = client['麦田']
collection = db['租房']
# the page loop starts here
for i in range(1, 101):  # choose how many pages to crawl (here pages 1-100)
    print('Crawling page %d' % i)
    two_many = []
    url = base_url + str(i)
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
        'Referer': url,
        'Upgrade-Insecure-Requests': '1'
    }
    response = requests.get(url, headers=head)
    data = response.text
    parse_data = etree.HTML(data)
    element_list = parse_data.xpath('//div[@class="list_title"]')
    for zufang_item in element_list:
        title = zufang_item.xpath('./h1/a/text()')[0].strip()
        price = zufang_item.xpath('./div[@class="the_price"]/ol/strong/span/text()')[0].strip()
        area = zufang_item.xpath('./p/span/text()')[0].replace('㎡', '').strip()
        content = zufang_item.xpath('./p[@class="house_hot"]/span/text()[2]')[0]
        pattern = r'昌平|朝阳|东城|大兴|丰台|海淀|石景山|顺义|通州|西城'
        district = re.findall(pattern, content)[0]
        item = {'标题': title, '价格': price, '面积': area, '区域': district}
        two_many.append(item)
    # insert this page's records into the database
    collection.insert_many(two_many)
    # sleep a random 3-10 seconds before the next page
    t = random.uniform(3, 10)
    time.sleep(t)
client.close()
But after crawling only 8 pages my IP got banned again, and the site started returning 403.
I really don't know what to do. Clicking through the pages one by one in a browser definitely doesn't get banned, so could someone please take a look?
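Would something along these lines be the right direction? This is just a rough sketch I put together, not code I have actually run: it reuses one requests.Session so cookies persist like in a browser, picks a User-Agent per page, and backs off when a 403 comes back. The User-Agent list is only a placeholder example.

import random
import time
import requests

# placeholder UA list, just for illustration
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
]

session = requests.Session()  # one session so cookies are kept between requests
base_url = 'http://bj.maitian.cn/zfall/PG'

for i in range(1, 101):
    url = base_url + str(i)
    head = {
        'User-Agent': random.choice(USER_AGENTS),
        'Referer': url,
        'Upgrade-Insecure-Requests': '1',
    }
    response = session.get(url, headers=head)
    if response.status_code == 403:
        # back off for a while instead of hammering the site after a ban
        print('Got 403 on page %d, sleeping before trying the next page' % i)
        time.sleep(60)
        continue
    # ... parse response.text and insert into MongoDB as before ...
    time.sleep(random.uniform(3, 10))

I have no idea whether the cookies/session part actually matters for this site, so please correct me if that is not the reason.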