
Help! The data my Sina (xinlang) news keyword crawler scrapes keeps repeating in a loop, and...

Posted 2022-1-13 17:59:44

Help, everyone! The data my Sina (xinlang) news keyword crawler scrapes keeps repeating in a loop, and a lot of the data is never scraped at all. Please help!!!
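Some of the missing data may just be silent failures: requests.get is called with no timeout and the HTTP status is never checked, so a stalled or failed request quietly turns into an empty or fallback row. A minimal guarded fetch, only as a sketch (fetch_html is a made-up helper, not part of the script below):

import requests

def fetch_html(url, timeout=10):
    # Fetch a page and fail loudly instead of silently.
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()  # surface HTTP errors instead of hiding them
    resp.encoding = 'utf-8'
    return resp.text

The full script: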
import requests
from bs4 import BeautifulSoup
import json
import xlwt

def getnewcontent(url,v):
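    # Scrape one article page; whenever a selector finds nothing, fall back
    # to the fields the search interface returned in v.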
    result = {}
    info = requests.get(url)
    info.encoding = 'utf-8'
    html = BeautifulSoup(info.text, 'html.parser')
    article = []
    try:
        result['title'] = html.select('.second-title')[0].text
    except IndexError:
        result['title'] = v["origin_title"]


    try:
        result['date'] = html.select('.date')[0].text
    except IndexError:
        result['date'] = v["datetime"]


    try:
        result['source'] = html.select('.source')[0].text
    except IndexError:
        result['source'] = v["media"]


    try:
        # The loop variable is p, not v: reusing v here shadowed the
        # search-result dict that the fallback below still needs.
        for p in html.select('.article p')[:-1]:
            article.append(p.text.strip())
        result['content'] = '\n'.join(article)
    except Exception:
        result['content'] = v["origin_title"]


    return result
def getnullcontent(v):
    # Build a row for a search hit that has no URL, reusing the fields the
    # search interface already returned.
    result = {}
    result['title'] = v["origin_title"]
    result['date'] = v["datetime"]
    result['source'] = v["media"]
    result['content'] = v["origin_title"]
    return result
def getnewslink(news):
    # Scrape the article page for every hit that has a URL; hits without
    # one fall back to the search-result fields.
    content = []
    for v in news:
        if len(v['url']) == 0:
            content.append(getnullcontent(v))
        else:
            content.append(getnewcontent(v['url'], v))
    return content
def getdata(news):
    weibo_info = []
    weibo_info.extend(getnewslink(news))
    return weibo_info
def getnews(page, news):
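    # Fetch one page of results from Sina's news search interface for the
    # keyword and date range in params, and append them to news.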

    headers = {"Host": "interface.sina.cn",
               "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
               "Accept": "*/*", "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
               "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive",
               "Referer": r"https://search.sina.com.cn/?q=%E4%B8%AD%E5%9B%BD%E7%96%AB%E6%83%85&c=news&from=channel&ie=utf-8",
               "Cookie": "UOR=,news.sina.com.cn,; SINAGLOBAL=110.183.38.179_1641435958.806869; Apache=110.183.38.179_1641435958.806870; ULV=1641435964876:2:2:2:110.183.38.179_1641435958.806870:1641435959002; SEARCH-SINA-COM-CN=; UM_distinctid=17e2d3a83ee95-06cc0b77977a27-4303066-1fa400-17e2d3a83efa91; __gads=ID=7f46c33619471efc-22035f09a6cf00ec:T=1641436317:RT=1641436317:S=ALNI_Mb1QROC5p67VTvD3iPkYlapHKLRMg; beegosessionID=16a193444f911fefb8f7e1cdd8e945c2",
               "TE": "Trailers"}

    params = {"t": "", "q": "中国疫情", "pf": "0", "ps": "0", "page": page, "stime": "2019-03-30", "etime": "2021-01-10",
              "sort": "rel", "highlight": "1", "num": "10", "ie": "utf-8"}

    response = requests.get("https://interface.sina.cn/homepage/search.d.json?", params=params, headers=headers)
    dic = json.loads(response.text)
    news += dic["result"]["list"]
    return news
news = []
for i in range(1, 801):
    news = getnews(i, news)
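
# A guess at the repeating data (not verified against the live interface):
# 800 pages x num=10 asks for 8,000 hits, and search APIs often stop
# serving new results long before that, so later pages may simply repeat
# earlier ones. Deduplicating by URL (falling back to the title for hits
# with an empty url) at least keeps the repeats out of the sheet:
seen = set()
deduped = []
for item in news:
    key = item["url"] or item["origin_title"]
    if key not in seen:
        seen.add(key)
        deduped.append(item)
news = deduped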

new_info = getdata(news)
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('MySheet')

# Header row; the column labels stay in Chinese as written to the sheet.
worksheet.write(0, 0, "标题")   # title from the search result
worksheet.write(0, 1, "时间")   # datetime
worksheet.write(0, 2, "媒体")   # media/source
worksheet.write(0, 3, "网址")   # URL
worksheet.write(0, 4, "副标题")  # title scraped from the article page
worksheet.write(0, 5, "内容")   # article content


for i in range(len(news)):
    # news and new_info are lists of dicts, so each row must be indexed
    # with [i]; getdata maps news to new_info one-to-one.
    worksheet.write(i + 1, 0, news[i]["origin_title"])
    worksheet.write(i + 1, 1, news[i]["datetime"])
    worksheet.write(i + 1, 2, news[i]["media"])
    worksheet.write(i + 1, 3, news[i]["url"])
    worksheet.write(i + 1, 4, new_info[i]["title"])
    worksheet.write(i + 1, 5, new_info[i]["content"])

workbook.save('data/data(content).xls')