马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 chunguang 于 2018-8-27 17:41 编辑
python爬取评论,但只能爬取每一页的第一条评论,求大神
import requests
import re
import csv
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from urllib.parse import urlencode
from multiprocessing import Pool
def get_comments(url):
    """Download the comment page at *url* and return its body as text.

    The response is decoded as GBK (the encoding is forced explicitly
    rather than taken from the response headers).

    Args:
        url: Address of the comment page to fetch.

    Returns:
        The decoded page text when the server answers with HTTP 200;
        ``None`` when the status is anything else or the request fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    try:
        # Without a timeout requests waits indefinitely, so a stalled server
        # would hang the scraper. A timeout error is a RequestException
        # subclass and is therefore handled by the clause below.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException as exc:
        # Report the failure instead of swallowing it silently.
        print('请求评论错误', exc)
        return None

    if response.status_code != 200:
        print('请求评论错误')
        return None

    response.encoding = 'gbk'
    return response.text
def parse_comments(html):
    """Yield a dict of comment fields extracted from a comment page.

    Args:
        html: Page markup as a string. ``None`` or an empty string yields
            nothing.

    Yields:
        A dict with the keys ``name``, ``rank``, ``comments``, ``province``
        and ``price``, each holding the text of the matched element.

    Raises:
        IndexError: If one of the selectors matches fewer elements than the
            fixed index used below.
    """
    # get_comments() returns None on a failed request, and handing None to
    # BeautifulSoup raises TypeError, so bail out early.
    if not html:
        return

    soup = BeautifulSoup(html, 'lxml')
    # NOTE(review): the loop runs over <body> elements, of which a page
    # normally has just one, and every field is read at a fixed index. This is
    # why only a single record per page comes out. To collect every comment,
    # iterate over the element that wraps one comment instead -- its selector
    # is not visible here, so confirm it against the page markup.
    for item in soup.find_all('body'):
        yield {
            'name': item.select('.username ')[1].text,
            'rank': item.select('#commentnormalinfo a')[0].text,
            'comments': item.select('.commenttext')[0].text,
            'province': item.select('.commentinfo span')[3].text,
            'price': item.select('.price_num')[0].text,
        }
def main():
    """Fetch the first comment page of product 148 and print each record."""
    # The second parameter separator had been mangled into a pilcrow sign
    # ("&para" + "msend" rendered as an HTML entity); restored to "&paramsend".
    url = 'http://www.yanyue.cn/product/comments/148?paramsend=postget&productid=148&paramsend=postget&page_offset=1'
    html = get_comments(url)
    if html is None:
        # The download failed and get_comments() signalled it with None;
        # there is nothing to parse.
        return
    for item in parse_comments(html):
        print(item)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|