Web scraping: print outputs all the content, but when saving to CSV only the first three pages make it into the file; everything after that is missing.
import requests
import re
import bs4
from bs4 import BeautifulSoup as soup
import time
import pandas as pd
url1 = 'https://www.muniao.com/'
url2 = '/null-0-0-0-0-0-0-0-'
url3 = '.html?tn=mn19091015'
city1 = ['beijing', 'shanghai', 'qinhuangdao', 'qingdao', 'xiamen',
         'chengdu', 'hangzhou', 'dalian', 'chongqing', 'guangzhou',
         'nanjing', 'xian', 'sanya', 'shenzhen', 'weihai',
         'wuhan', 'yantai', 'suzhou', 'tianjin', 'changsha']  # hot cities, pinyin names used in the URLs
citychinese = ['北京', '上海', '秦皇岛', '青岛', '厦门',
               '成都', '杭州', '大连', '重庆', '广州',
               '南京', '西安', '三亚', '深圳', '威海',
               '武汉', '烟台', '苏州', '天津', '长沙']  # matching Chinese names for the progress message
def Qingxi(na, ca, ad):  # data cleaning
    na1 = ' '.join('%s' % id for id in na)  # join the tag list into one string
    name = re.findall(r'>(.*?)</a>', na1)  # pull the listing names out of the string
    ca1 = ' '.join('%s' % id for id in ca)
    case = re.findall(r'<span>(.*?)</', ca1)  # pull out the condition fields
    case2 = []
    for i in case:
        j = re.findall(r'.*?评论', str(i))  # drop the entries that are comment counts
        if j == []:
            case2.append(i)
    ad1 = ' '.join('%s' % id for id in ad)
    address = re.findall(r'地址:(.*?)\r', ad1, re.S)  # pull out the address part
    Chucun(name, case2, address)
def Gethtml(city, k):  # crawl the listing pages for one city
    for x in range(1, 11):  # loop over result pages 1-10
        time.sleep(1)  # throttle the requests
        url = url1 + city + url2 + str(x) + url3  # build the page URL
        print(url)
        print('Crawling page ' + str(x) + ' of rentals in ' + citychinese[k])  # index the list; concatenating the list itself raises TypeError
        r = requests.get(url)
        s = soup(r.text, 'lxml')  # parse the page
        name = s.select('a')  # name tags
        case = s.select('p span')  # condition tags
        address = s.select('div')  # address tags
        Qingxi(name, case, address)
def Chengshi():  # iterate over the cities
    k = 0
    for i in city1:
        Gethtml(i, k)
        k += 1
def Chucun(na, ca, ad):  # store the data
    ca1 = []  # rooms
    ca2 = []  # rental status
    ca3 = []  # number of occupants
    for i in ca[::3]:  # case interleaves three fields: rooms, rental status, occupants
        ca1.append(i)
    for i in ca[1::3]:
        ca2.append(i)
    for i in ca[2::3]:
        ca3.append(i)
    data1 = {'名称': na, '居室': ca1, '出租情况': ca2, '居住人数': ca3, '地址': ad}
    df1 = pd.DataFrame(data1)
    df1.to_csv('E:/木鸟短租热门城市爬取.csv', index=None)  # where the CSV is written
Chengshi()
The saved spreadsheet only contains the first three pages of results. Could someone take a look? Thanks.

路神 replied (2021-6-7 22:10): to_csv() does not append; it overwrites the file on every call.

Thanks, I just looked it up. Adding mode='a' fixes it, thanks!
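For reference, a minimal sketch of the corrected write, using the same path as Chucun() above (the helper name append_rows is hypothetical). Plain mode='a' would also re-write the column headers before every city's rows, so the header argument guards against that by emitting the header only when the file does not exist yet:

import os
import pandas as pd

def append_rows(df1, csv_path='E:/木鸟短租热门城市爬取.csv'):
    # mode='a' appends instead of overwriting the file on each call;
    # write the header row only when the file does not exist yet, so
    # the column names appear once at the top rather than before
    # every batch of rows.
    df1.to_csv(csv_path, mode='a', index=False,
               header=not os.path.exists(csv_path))

Note that with append mode, re-running the script keeps adding to any existing file, so delete the old CSV before a fresh run. An alternative is to collect each page's DataFrame in a list and write once at the end with pd.concat(frames).to_csv(...), which avoids the append bookkeeping entirely.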