I'm scraping comments with Python, but I only get the first comment on each page. Could someone take a look?

import requests
import re
import csv
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from urllib.parse import urlencode
from multiprocessing import Pool

def get_comments(url):
    # Fetch the comments page; the site serves GBK-encoded HTML.
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        response.encoding = 'gbk'
        if response.status_code == 200:
            return response.text
        print('Error requesting the comments page')
        return None
    except RequestException:
        return None

def parse_comments(html):
    # Parse the page and yield one dict per comment.
    soup = BeautifulSoup(html, 'lxml')
    items = soup.find_all('body')
    for item in items:
        name = item.select('.username ')[1].text
        rank = item.select('#commentnormalinfo a')[0].text
        comments = item.select('.commenttext')[0].text
        province = item.select('.commentinfo span')[3].text
        price = item.select('.price_num')[0].text
        data = {
            'name': name,
            'rank': rank,
            'comments': comments,
            'province': province,
            'price': price
        }
        yield data

def main():
    url = 'http://www.yanyue.cn/product/comments/148?paramsend=postget&productid=148&paramsend=postget&page_offset=1'
    html = get_comments(url)
    for item in parse_comments(html):
        print(item)

if __name__ == '__main__':
    main()
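
For reference, the posted parse_comments can only ever yield one item per page: soup.find_all('body') matches the single <body> element, so the loop runs exactly once, and the fixed indexes ([0], [1], [3]) always read the first comment's fields. A minimal sketch of the usual fix is below: iterate over one container element per comment and select each field relative to that container. The '.commentitem' class name is an assumption (I don't have the page markup in front of me), so it has to be swapped for whatever element actually wraps an individual comment on www.yanyue.cn, and the field selectors rechecked against that container.

from bs4 import BeautifulSoup

def parse_comments(html):
    soup = BeautifulSoup(html, 'lxml')
    # Select one node per comment instead of the single <body> tag.
    # '.commentitem' is a hypothetical class name: inspect the real page
    # and replace it with whatever element wraps an individual comment.
    for item in soup.select('.commentitem'):
        yield {
            # Field selectors are taken from the original post; the indexes
            # may change once they are evaluated relative to one comment.
            'name': item.select('.username')[0].text.strip(),
            'rank': item.select('a')[0].text.strip(),
            'comments': item.select('.commenttext')[0].text.strip(),
            'province': item.select('.commentinfo span')[0].text.strip(),
            'price': item.select('.price_num')[0].text.strip(),
        }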