|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- import requests
- from requests.exceptions import RequestException
- import time
- from bs4 import BeautifulSoup
- import pandas as pd
def get_one_page(url):
    """Fetch one index page and return its decoded HTML text, or None on failure.

    A desktop-browser User-Agent is sent because the site may reject default
    client strings. The page is served as gb2312, so the encoding is forced
    before reading ``response.text``.

    :param url: absolute URL of the page to download
    :return: the page HTML as ``str``, or ``None`` on any HTTP/network error
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
    }
    try:
        # timeout keeps the crawler from hanging forever on a stalled server
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            # only bother decoding on success
            response.encoding = 'gb2312'
            return response.text
        return None
    except RequestException:
        # network-level failure: signal the caller with None, don't crash
        return None
def parse_one_page(html):
    """Extract the news entries from one index page.

    :param html: page HTML as returned by ``get_one_page`` (may be None)
    :return: list of dicts with keys ``title``, ``time`` and ``url``;
             an empty list when the page is missing or has no news list
    """
    if not html:
        # get_one_page returns None on failure — don't crash the whole run
        return []
    soup = BeautifulSoup(html, 'html.parser')
    fList = soup.select('.fixList')
    if not fList:
        # page layout changed or empty page: nothing to extract
        return []
    temp = []
    for each in fList[0].select('li'):
        times = each.select('.time')
        links = each.select('a')
        if not times or not links:
            # skip malformed <li> entries instead of raising IndexError
            continue
        temp.append({
            'title': each.text,
            'time': times[0].text,
            'url': links[0]['href'],
        })
    return temp
def main(offset):
    """Fetch and parse index page number *offset*, appending its entries to
    the module-level ``text`` list.

    Bug fix: the original built the URL with ``str('offset')`` — stringifying
    the *literal* word — so all 10 iterations fetched the identical
    ``index_offset.shtml`` page. It must interpolate the ``offset`` argument.

    :param offset: zero-based page number of the rolling-news index
    """
    url = 'http://roll.mil.news.sina.com.cn/col/zgjq/index_' + str(offset) + '.shtml'
    html = get_one_page(url)
    text.extend(parse_one_page(html))
if __name__ == '__main__':
    # Crawl the first 10 index pages (index_0 .. index_9) of the Sina
    # military rolling-news column, pausing one second between requests to
    # stay polite to the server.
    text = []
    for page in range(10):
        main(offset=page)
        time.sleep(1)
    # utf_8_sig writes a BOM so Excel opens the Chinese text correctly
    pd.DataFrame(text).to_csv("sina.csv", index=False, encoding='utf_8_sig')
复制代码
回复有爬虫书籍《》PDF以及配套视频 |
|