|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
爬的时新浪网的新闻,但存储到csv格式中出错了吗,求大神- import requests
- import re
- import json
- import pandas
- from requests.exceptions import RequestException
- from bs4 import BeautifulSoup
- def parse_comments_news(url):
- commentsurl='http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3'#尝试
- pattern=re.search('doc-i(.*?).shtml',url)
- newsid=pattern.group(1)
- headers={
- 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
- }
- try:
- response=requests.get(commentsurl.format(newsid),headers=headers)
- if response.status_code==200:
- return json.loads(response.text)['result']['count']['total']#是一个字典的格式
- return None
- except RequestException:
- print('请求评论错误')
- return None
- def parse_page_news(url):
- headers={
- 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
- }
- try:
- response=requests.get(url,headers=headers)
- response.encoding='utf-8'
- if response.status_code==200:
- soup = BeautifulSoup(response.text, 'lxml')
- result = {}
- result['title'] = soup.select('.main-title')[0].text
- result['source'] = soup.select('.source')[0].text
- result['date'] = soup.select('.date')[0].text
- if soup.select('.show_author')[0].text.strip()[5:]:
- result['author'] = soup.select('.show_author')[0].text.strip()[5:]
- items = soup.select('#article p')
- del items[-1] # 删除列表中最后一个元素
- result['content'] = '\n'.join([item.text.replace('\u3000\u3000', '').replace('\xa0 ', '\n') for item in items])
- result['comments'] = parse_comments_news(url)
- return result
- return None
- except RequestException:
- print('请求网页错误')
- return None
- def parse_List_Links(list_url):
- newsdetail=[]
- response=requests.get(list_url)
- if response.status_code==200:
- jd=json.loads(response.text.lstrip(' newsloadercallback(').rstrip(');'))#json类型的字符串变成字典的格式
- for item in jd['result']['data']:
- newsdetail.append(parse_page_news(item['url']))
- return newsdetail
- return None
- def write_to_csv(content):
- df = pandas.DataFrame(content)
- print(df)
- df.to_csv(r'C:\Users\lenovo\Desktop\爬虫\新闻\国内新闻.csv', 'r',encoding='gbk')
- def main(page):
- url='http://news.sina.com.cn/o/2018-08-10/doc-ihhnunsq7748786.shtml'
- list_url='http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page='+str(page)+'&callback=newsloadercallback'
- news_total = []
- newsurl = list_url.format(page)
- newary = parse_List_Links(newsurl)
- news_total.extend(newary)
- write_to_csv(news_total)
- if __name__=='__main__':
- for i in range(10):
- main(page=i)
复制代码
出现的错误是 UnicodeEncodeError: 'gbk' codec can't encode character '\xa0' in position 95: illegal multibyte sequence |
|