马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
爬取2017年房价排行榜数据:
import requests
import bs4
import re
import openpyxl
def open_url(url, timeout=10):
    """Download ``url`` and return the HTTP response.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` produced by the GET request.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Send a browser-like User-Agent instead of the requests default.
    headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/55.0.2883.87 Safari/537.36'
        )
    }
    # requests has no default timeout; without one a stalled connection
    # would block the script forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than handing an error page to the parser.
    response.raise_for_status()
    return response
def get_data(html):
    """Extract the ranking rows from the downloaded page.

    Every ``<li>`` whose class matches ``^clearfix.+`` is read as one row:
    its stripped text is split on newlines, and the first field is reduced
    to the text found between ``2017年`` and ``房价``.

    Args:
        html: Response-like object exposing the page markup as ``.text``.

    Returns:
        A list of rows, each a list of strings with the extracted name
        first. Items whose first field does not contain the
        ``2017年…房价`` pattern are skipped.
    """
    soup = bs4.BeautifulSoup(html.text, 'html.parser')
    # Compiled once so the loop does not re-resolve the pattern per item.
    name_pattern = re.compile(r'2017年(.+)房价')

    rows = []
    for item in soup.find_all('li', class_=re.compile('^clearfix.+')):
        fields = item.text.strip().split('\n')
        match = name_pattern.search(fields[0])
        if match is None:
            # A matching <li> without the expected caption used to abort the
            # whole run with AttributeError on ``None.group``; skip it.
            continue
        fields[0] = match.group(1)
        rows.append(fields)
    return rows
def save_as_excel(mylist):
    """Write the rows to ``2017全国房价排行.xlsx`` under a three-column header.

    Args:
        mylist: Iterable of rows; each one is appended as a worksheet row
            beneath the header.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active

    # Header occupies row 1, columns A-C.
    for column, title in enumerate(('地区', '房价', '同比'), start=1):
        sheet.cell(row=1, column=column, value=title)

    for row in mylist:
        sheet.append(row)

    workbook.save('2017全国房价排行.xlsx')
def main():
    """Fetch the ranking page, parse its rows, and save them as a workbook."""
    url = 'https://www.anjuke.com/fangjia/quanguo2017/'
    save_as_excel(get_data(open_url(url)))


if __name__ == '__main__':
    main()
|