python爬虫，代码问题！,Python交流,编程语言专区,鱼C论坛

澍梵. 发表于 2020-9-14 23:51:50

python爬虫，代码问题！

代码可以运行但是无法爬出数据，运行完后csv表为空。
问题应该在select的定位上，但是总是解决不了。
感谢帮助！
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

#爬取数据
def request_Data(url):

#创建requests对象
req = urllib.request.Request(url)

page_data_list = []

with urllib.request.urlopen(req) as response:
   data = response.read()
   htmlstr = data.decode()
   L = parse_HTMLData(htmlstr)
   page_data_list.extend(L)

return page_data_list

#解析数据
def parse_HTMLData(htmlstr):

sp = BeautifulSoup(htmlstr,'html.parser')

#获得房子信息列表
house_list = sp.select('body > div.main-wrap > div.content-wrap > div.content-side-left> li:nth-child')


#当前页中的记录列表
page_list = []
for house in house_list:
   #每一行数据
   rows_list = []

   #获得房子标题
   title = house.select('body > div.main-wrap > div.content-wrap > div.content-side-left> li')


   title = (title.text).strip()
   rows_list.append(title)

   #获得房子信息
   infos = house.select('body > div.main-wrap > div.content-wrap > div.content-side-left > li > div.list-info > p')

   # 获得房子户型
   house_type = (infos.text).strip()
   rows_list.append(house_type)
   # 获得房子面积
   house_area = (infos.text).strip()
   rows_list.append(house_area)
   # 获得房子朝向
   house_face = (infos.text).strip()
   rows_list.append(house_face)
   # 获得房子楼层
   house_floor = (infos.text).strip()
   rows_list.append(house_floor)
   #获得房子所在城区
   addr_dist = house.select('body > div.main-wrap > div.content-wrap > div.content-side-left > li > div.list-info > p:nth-child(3) > span > a:nth-child(2)')
                           body > div.main-wrap > div.content-wrap > div.content-side-left > ul > li:nth-child(1) > div.list-info > p:nth-child(3) > span > a:nth-child(2)
   rows_list.append(addr_dist)
   #获得房子所在小区
   addr_name = house.select('body > div.main-wrap > div.content-wrap > div.content-side-left > li > div.list-info > p:nth-child(3) > span > a:nth-child(1)')

   addr_name = (addr_name.text).strip()
   rows_list.append(addr_name)
   #获得房子总价
   total_price = house.select('body > div.main-wrap > div.content-wrap > div.content-side-left > li.sendsoj.hove > div.price > p.sum > b')

   total_price = (total_price.text).strip()
   rows_list.append(total_price)
   #获得房子单价
   price = house.select('body > div.main-wrap > div.content-wrap > div.content-side-left > li.sendsoj.hove > div.price > p.unit')


   price = (price.text).strip()
   rows_list.append(price)

   page_list.append(rows_list)

return page_list


url_temp = 'http://sh.ganji.com/ershoufang/pn{}/'

data_list = []

for i in range(1,11):#总共70页
url = url_temp.format(i)
print(url)
print('+++++第{}页++++++'.format(i))

try:
   L = request_Data(url)
   data_list.extend(L)
except Exception as e:

   #不再循环
   print('不再有数据，结束循环')
   break

print(data_list)

#保存数据
#列名
colsname = ['标题', '户型', '面积', '朝向', '楼层', '城区', '小区名', '总价', '单价']

df = pd.DataFrame(data_list, columns = colsname)
df.to_csv('house_data.csv',index = False,encoding='gbk')

澍梵. 发表于 2020-9-14 23:54:55

爬取58同城二手房

彩虹七号 发表于 2020-9-15 09:20:28

这个网站有反爬，加header试一试，

jackcoden 发表于 2020-9-15 14:30:33

需要使用代理IP吧

页: [1]

鱼C论坛's Archiver

python爬虫，代码问题！