每日一爬(不定期更新,嘿嘿嘿~)练手系列
爬取2017年房价排行榜数据:
import requests
import bs4
import re
import openpyxl
def open_url(url, timeout=10):
    """Download *url* using a desktop-browser User-Agent header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to ``requests.get``; without it the call may
            block indefinitely on an unresponsive host.

    Returns:
        The ``requests.Response`` for the GET request. The HTTP status is
        not checked here.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    return requests.get(url, headers=headers, timeout=timeout)
def get_data(html):
    """Turn the ranking page into a list of rows.

    Every ``<li>`` found with ``class_`` matching the regular expression
    ``^clearfix.+`` contributes one row: its stripped text split on
    newlines. When the first field looks like ``2017年<region>房价`` it is
    reduced to the region name alone.

    Args:
        html: Response-like object exposing the page markup as ``html.text``.

    Returns:
        A list of rows, each row being a list of strings.
    """
    soup = bs4.BeautifulSoup(html.text, 'html.parser')
    region_pattern = re.compile(r'2017年(.+)房价')

    rows = []
    for item in soup.find_all('li', class_=re.compile('^clearfix.+')):
        fields = item.text.strip().split('\n')
        # re.search needs a string, so match a single field rather than the
        # whole list. Presumably the title is the first field -- confirm
        # against the page markup.
        match = region_pattern.search(fields[0])
        # Keep the field untouched when it does not follow the expected
        # pattern instead of failing on a None match.
        if match is not None:
            fields[0] = match.group(1)
        rows.append(fields)
    return rows
def save_as_excel(mylist):
    """Write the rows to ``2017全国房价排行.xlsx`` in the working directory.

    Cells A1, B1 and C1 receive the column titles; each item of *mylist*
    is then appended to the active worksheet as one row.

    Args:
        mylist: Iterable of rows accepted by ``Worksheet.append``.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active

    titles = {'A1': '地区', 'B1': '房价', 'C1': '同比'}
    for cell_ref, title in titles.items():
        sheet[cell_ref] = title

    for row in mylist:
        sheet.append(row)

    workbook.save('2017全国房价排行.xlsx')
def main():
    """Fetch the 2017 ranking page, parse it and save it as a workbook."""
    page = open_url('https://www.anjuke.com/fangjia/quanguo2017/')
    rows = get_data(page)
    save_as_excel(rows)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
可以可以~~
可不可以不用bs4?
本帖最后由 snail:) 于 2018-3-2 23:14 编辑
小甲鱼 发表于 2018-3-2 16:20
可以可以~~
谢谢坛主,继续努力
qiuyouzhi 发表于 2018-3-2 20:13
可不可以不用bs4?
可以的,你直接用正则也行,bs4只是把网页的结构重新梳理出来,便于分析。
页:
[1]