|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import json
import requests
from requests.exceptions import RequestException
import re
import time
def get_one_page(url):
    """Fetch *url* and return the decoded HTML text, or None on any failure.

    Returns:
        The page body as ``str``, or ``None`` if the request raised or the
        server answered with a non-200 status.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        # timeout prevents the script from hanging forever on a stalled server.
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code == 200:
            # The original used decode('ANSI'), a codec alias that exists only
            # on Windows (where it maps to cp936/GBK for Chinese locales).
            # Decoding as 'gbk' explicitly keeps the same result on the
            # author's setup while also working on Linux/macOS.
            return response.content.decode('gbk', errors='replace')
        return None
    except RequestException:
        return None
url = 'http://www.stats.gov.cn/tjsj/pcsj/rkpc/6rp/left.htm'
html = get_one_page(url)
from bs4 import BeautifulSoup

# get_one_page returns None on any request failure; fail with a clear
# message instead of letting BeautifulSoup raise an opaque TypeError.
if html is None:
    raise SystemExit('Failed to download index page: ' + url)

soup = BeautifulSoup(html, 'lxml')
data_name_list = []   # human-readable link texts, used as file names
data_xls_list = []    # absolute URLs of the .xls files
pre_url = 'http://www.stats.gov.cn/tjsj/pcsj/rkpc/6rp/'
for ul in soup.find_all('ul'):
    for li in ul.find_all(name='li'):
        a = li.a
        # 数据格式为xls,去掉其他的 — keep only .xls links; hrefs ending in
        # 'm' are the '.htm' navigation pages and are skipped.
        if a is not None and a.attrs['href'][-1] != 'm':
            data_name_list.append(li.get_text())
            data_xls_list.append(pre_url + a.attrs['href'])
# 'import urllib' alone does not reliably expose the 'urllib.request'
# submodule (AttributeError unless something else imported it first);
# import the submodule explicitly.
import urllib.request
import os

# Destination directory for the downloaded spreadsheets.
# NOTE(review): hard-coded Windows path — adjust for your machine.
path = 'C:\\Users\\'
# zip pairs each link text with its URL, replacing the manual i counter.
for name, url in zip(data_name_list, data_xls_list):
    print(url)
    filename = os.path.join(path, name + '.xls')
    urllib.request.urlretrieve(url, filename)
- def get_one_page(url):
- try:
- headers = {
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
- }
- response = requests.get(url, headers=headers)
- if response.status_code == 200:
- return response.content.decode('ANSI')
- return None
- except RequestException:
- return None
- url = 'http://www.stats.gov.cn/tjsj/pcsj/rkpc/6rp/left.htm'
- html = get_one_page(url)
- soup = BeautifulSoup(html, 'lxml')
- soup.prettify()
- data_name_list = []
- data_xls_list = []
- pre_url = 'http://www.stats.gov.cn/tjsj/pcsj/rkpc/6rp/'
- count = 0
- for ul in soup.find_all('ul'):
- # print(type(ul))
- for li in ul.find_all(name='li'):
- try:
- a = li.a['href'] # 获取href str
- # print(li.a['href'])
- # print(li.a['href'][-1])
- # print(a, li.get_text())
- except:
- pass
- if li.a != None and a[-1] == 'm':
- data_name_list.append(li.get_text())
- data_xls_list.append(pre_url + a)
- break
- path = 'C:\\Users\\'
- i = 0
- for url in data_xls_list:
- print(url)
- filename = os.path.join(path, data_name_list[i] + '.xls')
- urllib.request.urlretrieve(url, filename)
- i += 1
复制代码
|
|