# Source: forum post by the thread starter (楼主), posted 2020-11-6 17:53:15.
import requests
import csv
import time
from lxml import etree
# Request headers passed to every requests.get() call in this script: a
# browser-style User-Agent string.  The adjacent literals are concatenated
# with no separator between them.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1;WOW64)'
        'AppleWebKit/537.36 (KHTML,like GeCKO) Chrome/45.0.2454.85 Safari/'
        '537.36 115Broswer/6.0.3'
    ),
}

# Listing-page URL prefix; the page number is appended to it to form the
# full address of one listing page.
pre_url = 'https://shenzhen.qfang.com/sale/f'
def download(url, timeout=10):
    """Fetch *url* and return the page parsed as an lxml HTML tree.

    Args:
        url: Address of the page to request.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The result of ``etree.HTML`` applied to the response body, decoded
        as UTF-8.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 instead of relying on the encoding requests guesses.
    response.encoding = 'utf-8'
    print(response.status_code)
    # Fixed pause after every request (presumably to rate-limit the crawl).
    time.sleep(2)
    return etree.HTML(response.text)
def data_writer(item):
    """Append one row to ``qfang1.csv`` in the current working directory.

    Args:
        item: Sequence of field values; written as a single CSV row.
    """
    # newline='' is required by the csv module: without it the writer's own
    # '\r\n' row terminator is translated again on Windows, producing an
    # empty line after every row.
    with open('qfang1.csv', 'a', encoding='utf-8', newline='') as fp:
        csv.writer(fp).writerow(item)
def _first_text(node, path, default=''):
    """Return the first XPath match of *path* under *node*, stripped.

    Returns *default* when the expression matches nothing, so a listing
    with a missing field does not raise IndexError.
    """
    matches = node.xpath(path)
    return matches[0].strip() if matches else default


def spider(list_url):
    """Scrape one listing page and append a CSV row per house.

    For every ``<li>`` found under the listing container, reads the summary
    fields from the listing page, downloads the house's detail page for two
    more fields, and passes the combined row to ``data_writer``.

    Args:
        list_url: URL of the listing page to scrape.
    """
    selector = download(list_url)
    house_list = selector.xpath('/html/body/div[5]/div/div[1]/div[4]/ul/li')
    for house in house_list:
        apartment = _first_text(house, "div[2]/div[1]/a/text()")
        house_layout = _first_text(house, "div[2]/div[2]/p[1]/text()")
        area = _first_text(house, "div[2]/div[2]/p[2]/text()")
        region = _first_text(house, "div[2]/div[2]/p[6]/text()")
        total_price = _first_text(house, "div[3]/p[1]/span[1]/text()")

        # The detail-page fields stay empty when the listing has no link.
        house_years = ''
        mortgage_info = ''
        href = _first_text(house, 'div[2]/div[1]/a/@href')
        if href:
            sel = download('http://shenzhen.qfang.com' + href)
            time.sleep(1)
            # Take the first matched text rather than the raw XPath result
            # list, which the csv writer would serialise as "['...']".
            house_years = _first_text(
                sel, '//*[@id="scrollto-1"]/div[3]/ul/li[3]/div[2]/text()')
            mortgage_info = _first_text(
                sel, '//*[@id="scrollto-1"]/div[3]/ul/li[5]/div[2]/text()')

        item = [apartment, house_layout, area, region,
                total_price, house_years, mortgage_info]
        print("正在爬取", apartment)
        data_writer(item)
# Script entry point: scrape listing pages 1 .. N-1 (currently page 1 only).
if __name__ == '__main__':
    for i in range(1, 2):
        spider(pre_url + str(i))