抓取新浪军事“中国军情”栏目的标题、时间和链接——回复有彩蛋
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
def get_one_page(url, timeout=10):
    """Download *url* and return its body decoded as GB2312 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        The response text when the server answers with status 200.
        ``None`` for any other status code, or when the request raises a
        ``RequestException`` (which includes timeouts).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
    }
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # server that accepts the connection but never responds.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    # Force the decoding used for ``response.text`` instead of relying on
    # whatever charset requests would guess.
    response.encoding = 'gb2312'
    return response.text
def parse_one_page(html):
    """Extract the news entries from one listing page.

    Args:
        html: Page markup as a string. ``None`` or an empty string is
            accepted and yields no entries.

    Returns:
        A list of dicts with the keys ``'title'``, ``'time'`` and ``'url'``,
        one per ``<li>`` found under the first element with class
        ``fixList``. ``<li>`` items without a link carrying an ``href`` are
        skipped. Returns an empty list when there is no markup or no
        ``fixList`` container.
    """
    if not html:
        return []
    soup = BeautifulSoup(html, 'html.parser')
    # select() returns a list of matches, so it cannot be chained or
    # subscripted like a single tag; select_one() returns the first match
    # (or None).
    container = soup.select_one('.fixList')
    if container is None:
        return []
    records = []
    for item in container.select('li'):
        link = item.select_one('a')
        if link is None or not link.get('href'):
            continue
        stamp = item.select_one('.time')
        records.append({
            # Use the link text rather than the whole <li> text, which would
            # also include the text of the nested .time element.
            'title': link.get_text(strip=True),
            'time': stamp.get_text(strip=True) if stamp is not None else '',
            'url': link['href'],
        })
    return records
def main(offset):
    """Fetch one listing page and append its entries to the global ``text``.

    Args:
        offset: Page number inserted into the listing URL.

    Relies on a module-level list named ``text`` existing before the call;
    the parsed entries are appended to it in place. Nothing is appended when
    the page could not be downloaded.
    """
    # Interpolate the page number itself; quoting the name would produce the
    # same literal "index_offset.shtml" URL on every call.
    url = 'http://roll.mil.news.sina.com.cn/col/zgjq/index_' + str(offset) + '.shtml'
    html = get_one_page(url)
    if html is None:
        # get_one_page signals a failed download with None; skip this page.
        return
    text.extend(parse_one_page(html))
if __name__ == '__main__':
    # Module-level accumulator that main() extends with each page's entries.
    text = []
    for page_number in range(10):
        main(offset=page_number)
        # Pause one second between successive page requests.
        time.sleep(1)
    # utf_8_sig writes a BOM at the start of the CSV file.
    pd.DataFrame(text).to_csv("sina.csv", index=False, encoding='utf_8_sig')
回复有爬虫书籍**** Hidden Message *****《》PDF以及配套视频 谢谢分享 谢谢分享 谢谢分享 谢谢大佬 {:5_109:} 看看 0.0.0.0.0. 谢谢分享 大佬啊 谢谢分享
学习 谢谢分享 不是吧 看看
要 {:5_94:} 111
页:
[1]