Help, experts!!! My Sina news keyword crawler keeps writing the same records over and over in the output, and a lot of the results never get crawled at all. Please help!!!

import requests
from bs4 import BeautifulSoup
import time
import json
import re
import pandas
import sys
import xlwt
def getnewcontent(url, v):
    # fetch one article page; v is the search record, used as a fallback
    result = {}
    info = requests.get(url)
    info.encoding = 'utf-8'
    html = BeautifulSoup(info.text, 'html.parser')
    article = []
    try:
        # select() returns a list, so take the first match before reading .text
        result['title'] = html.select('.second-title')[0].text
    except:
        result['title'] = v["origin_title"]
    try:
        result['date'] = html.select('.date')[0].text
    except:
        result['date'] = v["datetime"]
    try:
        result['source'] = html.select('.source')[0].text
    except:
        result['source'] = v["media"]
    try:
        # use a separate loop variable so the search record v is not shadowed
        for p in html.select('.article p')[:-1]:
            article.append(p.text.strip())
        result['content'] = '\n'.join(article)
    except:
        result['content'] = v["origin_title"]
    # earlier attempt with findAll (kept for reference)
    # result['title'] = html.findAll(name="div", attrs={"class": "second-title"}).text
    # result['date'] = html.findAll(name="div", attrs={"class": "date"}).text
    # result['source'] = html.findAll(name="div", attrs={"class": "source"}).text
    # for v in html.findAll(name="div", attrs={"class": "article p"})[:-1]:
    #     article.append(v.text.strip())
    # author_info = '\n'.join(article)
    # result['content'] = author_info
    # end of earlier attempt
    # result['author'] = html.select('.show_author').text.lstrip('责任编辑:')
    # newsid = url.split('/')[-1].rstrip('.shtml').lstrip('doc-i')
    # commenturl = 'http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=gj&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1&callback=jsonp_1536041889769&_=1536041889769'
    # comments = requests.get(commenturl.format(newsid))
    # regex = re.compile(r'(.*?)\(')  # strip the leading characters before '('
    # tmp = comments.text.lstrip(regex.search(comments.text).group())
    # jd = json.loads(tmp.rstrip(')'))
    # result['comment'] = jd['result']['count']['total']  # get the comment count
    return result
def getnullcontent(v):
    # build a record straight from the search result when there is no article URL
    result = {}
    result['title'] = v["origin_title"]
    result['date'] = v["datetime"]
    result['source'] = v["media"]
    result['content'] = v["origin_title"]
    return result
def getnewslink(news):
    # test = requests.get(url)
    # test2 = test.text.lstrip('newsloadercallback(')
    # jd = json.loads(test2.rstrip(')\n'))
    # content = []
    # for v in jd['result']['data']:
    #     content.append(getnewcontent(v['url']))
    content = []
    for v in news:
        if len(v['url']) == 0:
            content.append(getnullcontent(v))
        else:
            content.append(getnewcontent(v['url'], v))
    return content
def getdata(news):
    # url = 'https://interface.sina.cn/news/get_news_by_channel_new_v2018.d.html?cat_1=51923&show_num=27&level=1,2&page={}&callback=newsloadercallback&_=1536044408917'
    weibo_info = []
    # for i in range(1, 3):
    #     newsurl = url.format(i)  # string formatting: replace {} with i
    weibo_info.extend(getnewslink(news))
    return weibo_info
def getnews(page, news):
    headers = {"Host": "interface.sina.cn",
               "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
               "Accept": "*/*", "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
               "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive",
               "Referer": r"https://search.sina.com.cn/?q=%E4%B8%AD%E5%9B%BD%E7%96%AB%E6%83%85&c=news&from=channel&ie=utf-8",
               "Cookie": "UOR=,news.sina.com.cn,; SINAGLOBAL=110.183.38.179_1641435958.806869; Apache=110.183.38.179_1641435958.806870; ULV=1641435964876:2:2:2:110.183.38.179_1641435958.806870:1641435959002; SEARCH-SINA-COM-CN=; UM_distinctid=17e2d3a83ee95-06cc0b77977a27-4303066-1fa400-17e2d3a83efa91; __gads=ID=7f46c33619471efc-22035f09a6cf00ec:T=1641436317:RT=1641436317:S=ALNI_Mb1QROC5p67VTvD3iPkYlapHKLRMg; beegosessionID=16a193444f911fefb8f7e1cdd8e945c2",
               "TE": "Trailers"}
    params = {"t": "", "q": "中国疫情", "pf": "0", "ps": "0", "page": page, "stime": "2019-03-30", "etime": "2021-01-10",
              "sort": "rel", "highlight": "1", "num": "10", "ie": "utf-8"}
    response = requests.get("https://interface.sina.cn/homepage/search.d.json?", params=params, headers=headers)
    dic = json.loads(response.text)
    news += dic["result"]["list"]
    return news
news = []
for i in range(1, 801):
    news = getnews(i, news)
new_info = getdata(news)
print(news)
print(new_info)
workbook = xlwt.Workbook(encoding = 'utf-8')
worksheet = workbook.add_sheet('MySheet')
worksheet.write(0, 0, "标题")
worksheet.write(0, 1, "时间")
worksheet.write(0, 2, "媒体")
worksheet.write(0, 3, "网址")
worksheet.write(0, 4, "副标题")
worksheet.write(0, 5, "内容")
for i in range(len(news)):
    print(news[i])
    print(new_info[i])
    worksheet.write(i + 1, 0, news[i]["origin_title"])
    worksheet.write(i + 1, 1, news[i]["datetime"])
    worksheet.write(i + 1, 2, news[i]["media"])
    worksheet.write(i + 1, 3, news[i]["url"])
    worksheet.write(i + 1, 4, new_info[i]["title"])
    worksheet.write(i + 1, 5, new_info[i]["content"])
workbook.save('data/data(content).xls')
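
For what it's worth, the repeated rows look like they could come from the search interface returning overlapping pages once it runs out of fresh results, so a minimal sketch of dropping duplicate records by URL before writing them out might look like the following (dedupe_news and the seen set are illustrative names, not part of the script above):

def dedupe_news(records):
    # keep only the first occurrence of each record, keyed by URL
    # (fall back to the title for records whose URL is empty)
    seen = set()
    unique = []
    for item in records:
        key = item["url"] or item["origin_title"]
        if key in seen:
            continue
        seen.add(key)
        unique.append(item)
    return unique

# usage: news = dedupe_news(news) right after the paging loop,
# before calling getdata(news) and before writing the worksheet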