采集豆瓣评价问题请指教

swanseabrian · 发表于 2022-5-1 22:15:09

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

我有个代码采集豆瓣评价

fake useraget 设置了

代理ip也买了

可以是还是采集几页就

这是啥情况呢请问
下面是代码

请帮我看下,

谢谢

#!/usr/bin/env python
# coding: utf-8
# In[135]:
import json
import random
import time
import copyheaders
import pandas
import requests
from fake_useragent import UserAgent
# In[136]:
#pip install -i https://pypi.doubanio.com/simple/ fake_useragent
import tempfile
a = tempfile.gettempdir()
print(a)
ua = UserAgent(verify_ssl=False)
print(ua.random)
# In[137]:
headers = copyheaders.headers_raw_to_dict(b"""
Accept: application/json
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6
Cache-Control: no-cache
Connection: keep-alive
Host: m.douban.com
Pragma: no-cache
Referer: https://m.douban.com/movie/subject/27662747/comments?sort=time&start=25
Sec-Fetch-Dest: empty
Sec-Fetch-Mode: cors
Sec-Fetch-Site: same-origin
User-Agent: ua.random
X-Requested-With: XMLHttpRequest
cookie: bid=wy6mZvf0dsM;ll="118271";
""")
# In[138]:
golols = {
"dl": {
'https': '16150'
}
}
# In[139]:
end_time = time.mktime(time.strptime('2015-01-01 00:00:00','%Y-%m-%d %H:%M:%S'))
# In[140]:
def getrate(text):
if '很差' in text:
return '1'
elif '较差' in text:
return '2'
elif '还行' in text:
return '3'
elif '推荐' in text:
return '4'
elif '力荐' in text:
return '5'
else:
return '-'
# In[141]:
def setproxy():
try:
time.sleep(random.randint(1, 10))
res = requests.get('http://http.tiqu.alibabaapi.com/getip?num=1&type=2&neek=563834&port=11&lb=1&pb=4&regions=')
ip = (res.json().get("data")[0].get("ip") + ":" + res.json().get("data")[0].get("port"))
golols['dl']["https"] = "https://" + ip
print("https://" + ip)
except Exception as e:
print(e, "设置代理错误！")
time.sleep(random.randint(1, 10))
setproxy()
# In[142]:
def get_proxy(headers):
#proxy_url为您在网站上的API
proxy_url = 'http://http.9vps.com/getip.asp?username=17844629386&pwd=4dea396a2a6519e5817632c7552c2d33&geshi=1&fenge=1&fengefu=&getnum=1'
aaa=requests.get(proxy_url, headers=headers).text
proxy_host = aaa.splitlines()[0]
print('代理IP为：'+proxy_host)
proxy = {
'http': proxy_host,
'https': proxy_host,
}
return proxy
# In[143]:
def download_start_1(icode, iname): #改动过
count = 0 #控制评论start =
result = []
pids = []
for i in range(9999):
base_api = f'https://m.douban.com/rexxar/api/v2/movie/{icode}/interests?count=50&order_by=latest&anony=0&start={count}&ck=&for_mobile=1'
print(f'正在请求地址：{base_api}')
time.sleep(1)
while True:
proxy = get_proxy(headers)
try:
res = requests.get(base_api,headers=headers,proxies=proxy, verify=False,timeout=10)
time.sleep(1)
break
except Exception as e:
print(f"网络异常：{e}")
get_proxy(headers)
time.sleep(5)
coms_list = res.json().get("interests") #解析json数据
print(f'当前页码请求数量',len(coms_list))
for icom in coms_list:
if icom.get("id") in pids:
print(f"重复id暂不采集： {icom.get('id')}")
continue
else:
pids.append(icom.get("id"))
count+=1
try:
saveitem = {}
saveitem["pid"] = icom.get("id")
saveitem["username"] = icom.get("user").get("id")
saveitem["rating_level"] = icom.get("rating").get("value") if icom.get("rating") != None else ''
saveitem["create_time"] = icom.get("create_time")
saveitem["vote_count"] = icom.get("vote_count")
saveitem["comment"] = icom.get("comment")
if time.mktime(time.strptime(icom.get("create_time"), '%Y-%m-%d %H:%M:%S')) <= end_time:
print(f"超出时间 ==》exit")
data_p = pandas.DataFrame(result)
data_p.to_excel(f"C:\\Users\\31051\\Desktop\\wait\\评论\\data_{icode}({iname}).xlsx",index=False)
return
result.append(saveitem)
print(f'页码：{i} 数量：{count} {saveitem}')
except Exception as e:
print(f"网络异常：{e}")
continue
if len(coms_list) != 0:
flag_count = 0
if len(coms_list) == 0:
flag_count = flag_count + 1
if flag_count >= 5:
print("NO.1")
print(flag_count)
break
if len(coms_list) <=40 and len(coms_list) % 100 != 0:
print(f"当前页码无数据 exit")
print("NO.2")
break
if count >= res.json().get("total"):
print(f"超出最大限制 exit")
print("NO.3")
break
print(len(result))
data_p = pandas.DataFrame(result)
# data_p.to_excel(f"C:\\Users\\31051\\Desktop\\wait\\评论\\data_{icode}({iname}).xlsx",index=False)
# In[144]:
import pandas as pd
# In[145]:
df_empty = pd.DataFrame(columns = ['name','id'])
# In[146]:
#df_empty = df_empty.append({'name': '一秒钟', 'id': '30257787'}, ignore_index=True)
df_empty = df_empty.append({'name': '寻汉计', 'id': '30464901'}, ignore_index=True)
df_empty = df_empty.append({'name': '日常幻想指南', 'id': '26823520'}, ignore_index=True)
#df_empty = df_empty.append({'name': '寂静之地2', 'id': '30206311'}, ignore_index=True)
#df_empty = df_empty.append({'name': '荞麦疯长', 'id': '30170833'}, ignore_index=True)
#df_empty = df_empty.append({'name': '蜜熊的音乐奇旅', 'id': '26935358'}, ignore_index=True)
#df_empty = df_empty.append({'name': '大红包', 'id': '33457717'}, ignore_index=True)
#df_empty = df_empty.append({'name': '五个扑水的少年', 'id': '35030151'}, ignore_index=True)
#df_empty = df_empty.append({'name': '中国医生', 'id': '35087699'}, ignore_index=True)
#df_empty = df_empty.append({'name': '花木兰', 'id': '26357307'}, ignore_index=True)
#df_empty = df_empty.append({'name': '送你一朵小红花', 'id': '35096844'}, ignore_index=True)
#df_empty = df_empty.append({'name': '宝可梦：超梦的逆袭进化', 'id': '30272753'}, ignore_index=True)
#df_empty = df_empty.append({'name': '小妇人', 'id': '26348103'}, ignore_index=True)
#df_empty = df_empty.append({'name': '第一次的离别', 'id': '30337172'}, ignore_index=True)
#df_empty = df_empty.append({'name': '哆啦A梦：大雄的新恐龙', 'id': '34454004'}, ignore_index=True)
# In[147]:
df_empty
# In[148]:
for i in range(len(df_empty)):
flag_count = 0
icode = df_empty['id'].iloc[i]
iname = df_empty['name'].iloc[i]
download_start_1(icode, iname)
# In[ ]:

复制代码

isdkz · 发表于 2022-5-1 22:34:44

本帖最后由 isdkz 于 2022-5-1 22:37 编辑

你确定你能爬取几页？

第一：requests 设置 proxy 的时候 url 是要带上协议的，

你提取代理的时候返回的代理只有 ip 地址和端口，所以你的 proxy 字典里面要自己加上去

def get_proxy(headers):

#proxy_url为您在网站上的API

proxy_url = 'http://http.9vps.com/getip.asp?username=17844629386&pwd=4dea396a2a6519e5817632c7552c2d33&geshi=1&fenge=1&fengefu=&getnum=1'

aaa=requests.get(proxy_url, headers=headers).text

proxy_host = aaa.splitlines()[0]

print('代理IP为：'+proxy_host)

proxy = {

'http': 'http://' + proxy_host, # 得在 ip 前面加上协议（代理的类型）

'https': 'http://' + proxy_host, # 得在 ip 前面加上协议（代理的类型）

}

return proxy
复制代码

第二：好像你那个代理接口提取出来的代理很多都不能用的

第三：不要随随便便把自己的接口暴露出来

swanseabrian · 发表于 2022-5-1 22:42:08

isdkz 发表于 2022-5-1 22:34
你确定你能爬取几页？

第一：requests 设置 proxy 的时候 url 是要带上协议的，

这代码是买ip的网站给你实例代码,我没改就,应该没问题吧

采的数据跟不设置是一样的,都是600条数据
但我一个朋友测试他可以采集1800条
所以我觉得还是被限制了

账号		自动登录	找回密码
密码			立即注册

采集豆瓣评价问题 请指教

马上注册，结交更多好友，享用更多功能^_^

浏览过的版块

采集豆瓣评价问题请指教