import requests
import re
from bs4 import BeautifulSoup as soup
import time
import pandas as pd

url1 = 'https://www.muniao.com/'
url2 = '/null-0-0-0-0-0-0-0-'
url3 = '.html?tn=mn19091015'
city1 = ['beijing', 'shanghai', 'qinhuangdao', 'qingdao', 'xiamen',
         'chengdu', 'hangzhou', 'dalian', 'chongqing', 'guangzhou',
         'nanjing', 'xian', 'sanya', 'shenzhen', 'weihai',
         'wuhan', 'yantai', 'suzhou', 'tianjin', 'changsha']  # URL slugs of the popular cities
citychinese = ['北京', '上海', '秦皇岛', '青岛', '厦门',
               '成都', '杭州', '大连', '重庆', '广州',
               '南京', '西安', '三亚', '深圳', '威海',
               '武汉', '烟台', '苏州', '天津', '长沙']  # matching Chinese city names

def Qingxi(na, ca, ad):  # clean the scraped tags
    na1 = ' '.join('%s' % item for item in na)        # flatten the tag list into one string
    name = re.findall(r'>(.*?)</a>', na1)             # pull out the listing names
    ca1 = ' '.join('%s' % item for item in ca)
    case = re.findall(r'<span>(.*?)</', ca1)          # pull out the condition fields
    case2 = []
    for i in case:
        j = re.findall(r'.*?评论', str(i))            # drop the review-count entries mixed into case
        if j == []:
            case2.append(i)
    ad1 = ' '.join('%s' % item for item in ad)
    address = re.findall(r'地址:(.*?)\r', ad1, re.S)  # pull out the address part
    Chucun(name, case2, address)

def Gethtml(city, k):  # crawl one city's listing pages
    for x in range(1, 11):  # pages 1-10 of each city
        time.sleep(1)  # throttle the requests
        url = url1 + city + url2 + str(x) + url3  # build the page URL
        print(url)
        print('正在爬取' + citychinese[k] + '市的第' + str(x) + '页租房信息')
        r = requests.get(url)
        s = soup(r.text, 'lxml')  # parse the page
        name = s.select("a[class=s_mn_house_t1]")      # listing-name tags
        case = s.select('p span')                      # condition tags
        address = s.select("div[class=list_address]")  # address tags
        Qingxi(name, case, address)

def Chengshi():  # walk through all the cities
    k = 0
    for i in city1:
        Gethtml(i, k)
        k += 1

def Chucun(na, ca, ad):  # save one page's rows
    ca1 = []  # rooms
    ca2 = []  # rental status
    ca3 = []  # occupancy
    for i in ca[::3]:  # split case into three columns: rooms, rental status, occupancy
        ca1.append(i)
    for i in ca[1::3]:
        ca2.append(i)
    for i in ca[2::3]:
        ca3.append(i)
    data1 = {'名称': na, '居室': ca1, '出租情况': ca2, '居住人数': ca3, '地址': ad}
    df1 = pd.DataFrame(data1)
    # NOTE: to_csv defaults to write mode, so this overwrites the file on every call
    df1.to_csv('E:/木鸟短租热门城市爬取.csv', index=None)

Chengshi()
The saved CSV only ends up containing the first three pages of results. Could someone take a look? Thanks.
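The most likely culprit is the last line of Chucun: DataFrame.to_csv defaults to write mode (mode='w'), so every page's call replaces the file written by the previous call, and the CSV never accumulates anything. Below is a minimal sketch of an append-mode version; it keeps the original path and column names, and uses os.path.exists so the header row is written only once (encoding='utf-8-sig' is an extra assumption, added only so Excel displays the Chinese headers correctly).

import os
import pandas as pd

CSV_PATH = 'E:/木鸟短租热门城市爬取.csv'  # same output file as the original script

def Chucun(na, ca, ad):
    ca1 = list(ca[::3])   # rooms
    ca2 = list(ca[1::3])  # rental status
    ca3 = list(ca[2::3])  # occupancy
    df1 = pd.DataFrame({'名称': na, '居室': ca1, '出租情况': ca2,
                        '居住人数': ca3, '地址': ad})
    # mode='a' appends this page's rows instead of overwriting the file;
    # the header is written only on the first call, when the file does not exist yet
    df1.to_csv(CSV_PATH, index=False, mode='a', encoding='utf-8-sig',
               header=not os.path.exists(CSV_PATH))

Note that with append mode you should delete the old CSV before each fresh run, otherwise new rows keep piling onto the previous run's output.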
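Separately, since the file stops at page 3 rather than page 1, it is worth checking whether the run itself dies partway: pd.DataFrame raises a ValueError the first time the column lists arrive with unequal lengths, which can happen when a page comes back empty or with slightly different markup, and an unhandled exception there ends the whole crawl. Here is a hedged sanity check, with fetch_page as a hypothetical helper rather than anything in the original script.

import requests

def fetch_page(url):
    """Hypothetical helper: fetch one listing page, or return None when it
    contains no listing markup, so a bad page can be skipped instead of
    feeding mismatched lists into pd.DataFrame."""
    headers = {'User-Agent': 'Mozilla/5.0'}  # assumption: bare requests may be served an empty shell
    r = requests.get(url, timeout=10, headers=headers)
    if r.status_code != 200 or 's_mn_house_t1' not in r.text:
        print('no listings on', url)
        return None
    return r.text

In Gethtml this would replace the bare requests.get(url): skip the page when fetch_page returns None, and print len(name), len(case), len(address) before calling Qingxi to see exactly where the lists stop lining up.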