|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
我有个代码采集豆瓣评价
fake useraget 设置了
代理ip也买了
可以是还是采集几页就
这是啥情况 呢请问
下面是代码
请帮我看下,
谢谢
- #!/usr/bin/env python
- # coding: utf-8
- # In[135]:
- import json
- import random
- import time
- import copyheaders
- import pandas
- import requests
- from fake_useragent import UserAgent
- # In[136]:
- #pip install -i https://pypi.doubanio.com/simple/ fake_useragent
- import tempfile
- a = tempfile.gettempdir()
- print(a)
- ua = UserAgent(verify_ssl=False)
- print(ua.random)
- # In[137]:
- headers = copyheaders.headers_raw_to_dict(b"""
- Accept: application/json
- Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6
- Cache-Control: no-cache
- Connection: keep-alive
- Host: m.douban.com
- Pragma: no-cache
- Referer: https://m.douban.com/movie/subject/27662747/comments?sort=time&start=25
- Sec-Fetch-Dest: empty
- Sec-Fetch-Mode: cors
- Sec-Fetch-Site: same-origin
- User-Agent: ua.random
- X-Requested-With: XMLHttpRequest
- cookie: bid=wy6mZvf0dsM;ll="118271";
- """)
- # In[138]:
- golols = {
- "dl": {
- 'https': '16150'
- }
- }
- # In[139]:
- end_time = time.mktime(time.strptime('2015-01-01 00:00:00','%Y-%m-%d %H:%M:%S'))
- # In[140]:
- def getrate(text):
- if '很差' in text:
- return '1'
- elif '较差' in text:
- return '2'
- elif '还行' in text:
- return '3'
- elif '推荐' in text:
- return '4'
- elif '力荐' in text:
- return '5'
- else:
- return '-'
- # In[141]:
- def setproxy():
- try:
- time.sleep(random.randint(1, 10))
- res = requests.get('http://http.tiqu.alibabaapi.com/getip?num=1&type=2&neek=563834&port=11&lb=1&pb=4®ions=')
-
- ip = (res.json().get("data")[0].get("ip") + ":" + res.json().get("data")[0].get("port"))
- golols['dl']["https"] = "https://" + ip
- print("https://" + ip)
- except Exception as e:
- print(e, "设置代理错误!")
- time.sleep(random.randint(1, 10))
- setproxy()
- # In[142]:
- def get_proxy(headers):
- #proxy_url为您在网站上的API
- proxy_url = 'http://http.9vps.com/getip.asp?username=17844629386&pwd=4dea396a2a6519e5817632c7552c2d33&geshi=1&fenge=1&fengefu=&getnum=1'
- aaa=requests.get(proxy_url, headers=headers).text
- proxy_host = aaa.splitlines()[0]
- print('代理IP为:'+proxy_host)
- proxy = {
- 'http': proxy_host,
- 'https': proxy_host,
- }
- return proxy
- # In[143]:
- def download_start_1(icode, iname): #改动过
- count = 0 #控制评论start =
- result = []
- pids = []
- for i in range(9999):
- base_api = f'https://m.douban.com/rexxar/api/v2/movie/{icode}/interests?count=50&order_by=latest&anony=0&start={count}&ck=&for_mobile=1'
- print(f'正在请求地址:{base_api}')
- time.sleep(1)
- while True:
- proxy = get_proxy(headers)
- try:
-
-
- res = requests.get(base_api,headers=headers,proxies=proxy, verify=False,timeout=10)
- time.sleep(1)
- break
- except Exception as e:
- print(f"网络异常:{e}")
- get_proxy(headers)
- time.sleep(5)
- coms_list = res.json().get("interests") #解析json数据
- print(f'当前页码请求数量',len(coms_list))
- for icom in coms_list:
- if icom.get("id") in pids:
- print(f"重复id暂不采集: {icom.get('id')}")
- continue
- else:
- pids.append(icom.get("id"))
- count+=1
- try:
- saveitem = {}
- saveitem["pid"] = icom.get("id")
- saveitem["username"] = icom.get("user").get("id")
- saveitem["rating_level"] = icom.get("rating").get("value") if icom.get("rating") != None else ''
- saveitem["create_time"] = icom.get("create_time")
- saveitem["vote_count"] = icom.get("vote_count")
- saveitem["comment"] = icom.get("comment")
- if time.mktime(time.strptime(icom.get("create_time"), '%Y-%m-%d %H:%M:%S')) <= end_time:
- print(f"超出时间 ==》exit")
- data_p = pandas.DataFrame(result)
- data_p.to_excel(f"C:\\Users\\31051\\Desktop\\wait\\评论\\data_{icode}({iname}).xlsx",index=False)
- return
- result.append(saveitem)
- print(f'页码:{i} 数量:{count} {saveitem}')
- except Exception as e:
- print(f"网络异常:{e}")
- continue
- if len(coms_list) != 0:
- flag_count = 0
-
- if len(coms_list) == 0:
- flag_count = flag_count + 1
-
- if flag_count >= 5:
- print("NO.1")
- print(flag_count)
- break
-
- if len(coms_list) <=40 and len(coms_list) % 100 != 0:
- print(f"当前页码无数据 exit")
- print("NO.2")
- break
- if count >= res.json().get("total"):
- print(f"超出最大限制 exit")
- print("NO.3")
- break
- print(len(result))
- data_p = pandas.DataFrame(result)
- # data_p.to_excel(f"C:\\Users\\31051\\Desktop\\wait\\评论\\data_{icode}({iname}).xlsx",index=False)
- # In[144]:
- import pandas as pd
- # In[145]:
- df_empty = pd.DataFrame(columns = ['name','id'])
- # In[146]:
- #df_empty = df_empty.append({'name': '一秒钟', 'id': '30257787'}, ignore_index=True)
- df_empty = df_empty.append({'name': '寻汉计', 'id': '30464901'}, ignore_index=True)
- df_empty = df_empty.append({'name': '日常幻想指南', 'id': '26823520'}, ignore_index=True)
- #df_empty = df_empty.append({'name': '寂静之地2', 'id': '30206311'}, ignore_index=True)
- #df_empty = df_empty.append({'name': '荞麦疯长', 'id': '30170833'}, ignore_index=True)
- #df_empty = df_empty.append({'name': '蜜熊的音乐奇旅', 'id': '26935358'}, ignore_index=True)
- #df_empty = df_empty.append({'name': '大红包', 'id': '33457717'}, ignore_index=True)
- #df_empty = df_empty.append({'name': '五个扑水的少年', 'id': '35030151'}, ignore_index=True)
- #df_empty = df_empty.append({'name': '中国医生', 'id': '35087699'}, ignore_index=True)
- #df_empty = df_empty.append({'name': '花木兰', 'id': '26357307'}, ignore_index=True)
- #df_empty = df_empty.append({'name': '送你一朵小红花', 'id': '35096844'}, ignore_index=True)
- #df_empty = df_empty.append({'name': '宝可梦:超梦的逆袭 进化', 'id': '30272753'}, ignore_index=True)
- #df_empty = df_empty.append({'name': '小妇人', 'id': '26348103'}, ignore_index=True)
- #df_empty = df_empty.append({'name': '第一次的离别', 'id': '30337172'}, ignore_index=True)
- #df_empty = df_empty.append({'name': '哆啦A梦:大雄的新恐龙', 'id': '34454004'}, ignore_index=True)
- # In[147]:
- df_empty
- # In[148]:
- for i in range(len(df_empty)):
- flag_count = 0
- icode = df_empty['id'].iloc[i]
- iname = df_empty['name'].iloc[i]
- download_start_1(icode, iname)
- # In[ ]:
复制代码
|
|