|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 1 19:12:53 2020
@author: acliu
"""
import xlwt
import requests
from bs4 import BeautifulSoup
# Request headers sent with every page fetch.
# Only a browser-like User-Agent is set. The Host header is deliberately left
# for `requests` to derive from the URL: a hard-coded "bj.ke.com" disagreed
# with the cd.ke.com pages this script actually requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
}
# Save the scraped listing rows to an Excel (.xls) workbook.
def save_info(content):
    """Write the scraped rows to ``./house_info.xls``.

    The first sheet row holds the five column captions; each following sheet
    row is one entry of ``content``.

    Args:
        content: Iterable of rows. Each row is a sequence whose leading
            entries are, in order: title, position, house info, total price
            and unit price. Entries beyond the fifth are ignored; a row with
            fewer than five entries simply leaves the remaining cells empty.
    """
    column_titles = ('名称', '位置', '房屋信息', '总价(万)', '单价(元/平方米)')

    # utf-8 rather than ascii: the captions (and the scraped text) are Chinese.
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('house info')

    for col, caption in enumerate(column_titles):
        worksheet.write(0, col, caption)

    # Data starts on sheet row 1, directly below the captions.
    for row_no, row in enumerate(content, start=1):
        # Slice instead of indexing 0..4 so a short row cannot raise IndexError.
        for col, value in enumerate(row[:len(column_titles)]):
            worksheet.write(row_no, col, value)

    workbook.save('./house_info.xls')  # written relative to the working directory
# Scrape the listing pages and collect, for every listing:
# title, position, house info, total price and unit price.
def _node_text(node):
    """Return the stripped text of a parsed tag, or '' when the tag is None."""
    return node.text.strip() if node is not None else ''


def get_info():
    """Download listing pages 1-3 and return one row per listing title.

    Each column is collected from its own ``div`` class ('info',
    'positionInfo', 'houseInfo', 'totalPrice', 'unitPrice') and the columns
    are then combined by position.

    Returns:
        A list of ``[title, position, house_info, total_price, unit_price]``
        lists, one per title found. A value that is missing from the page
        (e.g. the page has no 'unitPrice' div at all) is the empty string.
    """
    title_list = []
    position_list = []
    house_list = []
    totalPrice_list = []
    unitPrice_list = []

    # Page numbers in the URL start at 1; range(3) would request pg0..pg2.
    for page in range(1, 4):
        link = 'https://cd.ke.com/xiaoqu/damian/pg%dl2/' % page
        r = requests.get(link, headers=headers, timeout=10)
        print(str(page), 'status_code: ', r.status_code)
        soup = BeautifulSoup(r.text, 'lxml')

        for item in soup.find_all('div', {'class': 'info'}):
            # The title is the text of the link inside the first nested div.
            wrapper = item.div
            title_list.append(_node_text(wrapper.a if wrapper is not None else None))

        for item in soup.find_all('div', {'class': 'positionInfo'}):
            position_list.append(_node_text(item.a))

        for item in soup.find_all('div', {'class': 'houseInfo'}):
            # Drop every newline and space so the description is one compact token.
            house_list.append(item.text.strip().replace('\n', ' ').replace(' ', ''))

        for item in soup.find_all('div', {'class': 'totalPrice'}):
            totalPrice_list.append(_node_text(item.span))

        for item in soup.find_all('div', {'class': 'unitPrice'}):
            # Keep only the number: strip the "单价" label and the unit suffix.
            unit_price = _node_text(item.span).replace('单价', '').replace('元/平米', '')
            unitPrice_list.append(unit_price)

    # Column sizes are printed so a selector that matched nothing is easy to spot.
    print(len(title_list))
    print(len(position_list))
    print(len(house_list))
    print(len(totalPrice_list))
    print(len(unitPrice_list))

    # One row per title. A column that came back shorter than the title list
    # is padded with '' instead of raising IndexError.
    # NOTE(review): rows are paired purely by position; if a single listing
    # lacks one field, later rows in that column shift up - confirm against
    # the page markup.
    other_columns = (position_list, house_list, totalPrice_list, unitPrice_list)
    all_info = []
    for idx, title in enumerate(title_list):
        row = [title]
        for column in other_columns:
            row.append(column[idx] if idx < len(column) else '')
        all_info.append(row)
    return all_info
# Script entry point: scrape the listings, then write them to the workbook.
if __name__ == "__main__":
    save_info(get_info())
源码中根本找不到 unitPrice 这个节点属性,所以这个提取的数据为空:
|
|