|
楼主 |
发表于 2018-5-4 21:09:27
|
显示全部楼层
- import requests
- import bs4
def open_url(url, timeout=10):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``bs4.BeautifulSoup`` object built with the ``"html.parser"``
        backend from the response text.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Browser-like headers are sent with every request.
    # NOTE(review): presumably needed so the site does not reject the
    # default client identity -- confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        "Referer": "https://movie.douban.com/top250",
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it
    # were the movie list.
    res.raise_for_status()
    return bs4.BeautifulSoup(res.text, "html.parser")
-
def movices_name_information_score_hotcomment(soup, count):
    """Build one formatted text entry per movie found in a parsed page.

    Args:
        soup: Parsed page. Only ``soup.find_all(tag, class_=...)`` is used.
        count: Zero-based page index. It selects the historical position
            at which a placeholder comment is inserted (index 14 for
            ``count == 7``, index 22 for ``count == 9``).

    Returns:
        A list of strings, one per movie, each laid out as
        ``name + information + score + "\\n" + comment + "\\n\\n"``.
        Joining the list yields the same text the previous implementation
        produced as a list of single characters. An empty list is returned
        when the page contains no movie entries.
    """
    placeholder = "无简评"

    name = [
        each.a.span.text.strip()
        for each in soup.find_all("div", class_="hd")
    ]

    information = [
        each.p.text.strip()
        for each in soup.find_all("div", class_="bd")
    ]
    # The first "bd" match is discarded, as before.
    # NOTE(review): presumably it is page chrome rather than a movie --
    # confirm against the live page. The slice form is a no-op on an
    # empty list, so a page without matches no longer raises IndexError.
    del information[:1]

    score = [
        each.text.strip()
        for each in soup.find_all("span", class_="rating_num")
    ]

    hotcomment = [
        each.text.strip()
        for each in soup.find_all("span", class_="inq")
    ]

    # Historical positions of movies that had no short comment. Only apply
    # them when a comment is actually missing; inserting unconditionally
    # would shift every later comment onto the wrong movie.
    legacy_gaps = {7: 14, 9: 22}
    if count in legacy_gaps and len(hotcomment) < len(name):
        hotcomment.insert(legacy_gaps[count], placeholder)
    # Any remaining shortfall is padded at the end so indexing cannot fail.
    # NOTE(review): padding cannot know which movie lacked the comment, so
    # comments may still be attached to the wrong entry in that case.
    hotcomment.extend([placeholder] * (len(name) - len(hotcomment)))

    # zip stops at the shortest list; a whole entry is appended per movie
    # (extend() on a string would add it character by character).
    return [
        title + info + rating + "\n" + comment + "\n\n"
        for title, info, rating, comment in zip(
            name, information, score, hotcomment
        )
    ]
def download_top(result):
    """Append the strings in *result* to the output text file.

    The file "豆瓣TOP250电影一览.txt" is opened in append mode with UTF-8
    encoding, so repeated calls accumulate content. It is created in the
    current working directory if it does not exist.

    Args:
        result: Iterable of strings to write, in order, with nothing
            added between them.
    """
    with open("豆瓣TOP250电影一览.txt", mode="a", encoding="utf-8") as out_file:
        out_file.writelines(result)
-
-
def main():
    """Fetch ten consecutive list pages and save the entries of each.

    Page ``k`` (zero-based) is requested with ``?start=25*k``. Each page is
    parsed and written out before the next one is requested, and progress
    is printed before and after every page.
    """
    base_url = "https://movie.douban.com/top250"
    for count in range(10):
        page_number = str(count + 1)
        print("正在爬取第%s页" % page_number)

        url = base_url + "?start=" + str(25 * count)
        page = open_url(url)
        entries = movices_name_information_score_hotcomment(page, count)
        download_top(entries)

        print("已经爬取第%s页" % page_number)


if __name__ == "__main__":
    # Run the crawl only when executed as a script, not on import.
    main()
复制代码 |
|