# coding=utf-8
import json

import requests


class DoubanSpider:
    def __init__(self):
        self.url_temp_list = [
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?start={}&count=18&loc_id=108288",
                "country": "US"
            },
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?start={}&count=18&loc_id=108288",
                "country": "UK"
            },
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_animation_hot/items?start={}&count=18&loc_id=108288",
                "country": "AC"
            }
        ]
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"}

    def parse_url(self, url):  # send the request and return the response body
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, json_str):  # extract the data
        dict_ret = json.loads(json_str)  # parse the JSON string into a Python dict
        content_list = dict_ret["subject_collection_items"]
        total = dict_ret["total"]
        return content_list, total

    def save_content_list(self, content_list, country):
        with open("douban.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                content["country"] = country
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("saved successfully")

    def run(self):  # main logic
        for url_temp in self.url_temp_list:
            num = 0  # current offset (start parameter)
            total = 100  # assume at least one page exists; updated from the response
            while num < total + 18:
                # 1. build the start URL
                url = url_temp["url_temp"].format(num)
                # 2. send the request and get the response
                json_str = self.parse_url(url)  # response string
                # 3. extract the data
                content_list, total = self.get_content_list(json_str)
                # 4. save
                self.save_content_list(content_list, url_temp["country"])
                # if len(content_list) < 18:
                #     break
                # 5. build the URL of the next page and loop again
                num += 18


if __name__ == '__main__':
    douban_spider = DoubanSpider()
    douban_spider.run()
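Because save_content_list appends one JSON object per line (JSON Lines format), the scraped results can be loaded back for later processing. A minimal sketch, assuming douban.txt was produced by the spider above; the "title" field is an assumption about the API payload, only "country" is guaranteed to be present since the spider adds it itself:

# coding=utf-8
import json

# Read the JSON-Lines file written by save_content_list: one dict per line.
with open("douban.txt", encoding="utf-8") as f:
    items = [json.loads(line) for line in f if line.strip()]

print(len(items))
for item in items[:3]:
    # .get() is used because field names other than "country" are assumed.
    print(item.get("country"), item.get("title"))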