[Original] A Python crawler that downloads PDF books from a certain site


1. The crawler: crawl.py
The flow is: walk the listing pages, open each book's detail page, follow its download link to the ctfile share page, pull the free_down() arguments out of the onclick handler, call the AJAX endpoint for the real file URL, and save the result into ./book.
# -*- encoding: utf-8 -*-
import ast
import json
import logging
import os
import random
import re

import requests

from settings import HEADERS, AJAX_HEADERS, BASE_URL, AJAX_URL, DOWNLOAD_HEADERS


class PdfBook(object):
    def get_save_list(self):
        # Names (minus extension) of books already on disk, so reruns skip them.
        os.makedirs('./book', exist_ok=True)
        return [name.split('.')[0] for name in os.listdir('./book')]

    def get_request(self, url, headers=None, params=None, retries=3):
        # Try a few times before giving up; callers must handle a None return.
        for _ in range(retries):
            try:
                response = requests.get(url, headers=headers, params=params, timeout=10)
                if response.status_code in [200, 302]:
                    return response
                logging.info(response.status_code)
            except Exception:
                logging.error('Error', exc_info=True)
        return None

    def save(self, data, name, filetype='.pdf'):
        # savemore/savepdf from the original merged into one method; files go
        # into ./book, the same directory get_save_list() scans for dedup.
        logging.info(f"{name}--> saving")
        with open(os.path.join('./book', name + filetype), 'wb') as f:
            f.write(data)
        logging.info(f"{name}--> saved")

    def download(self, url, name):
        logging.info(f"{name}--> downloading...")
        # DOWNLOAD_HEADERS["Referer"] = refer
        response = self.get_request(url, headers=DOWNLOAD_HEADERS)
        if not response:
            logging.warning(f"Request failed, skipping download: {name}")
            return
        logging.info(f"{name}--> downloaded")
        self.save(response.content, name)

    def get_download_url(self, url, name):
        # The AJAX endpoint answers with JSON holding the real file URL.
        response = self.get_request(url, headers=AJAX_HEADERS)
        if not response:
            logging.warning(f"Request failed, skipping download: {name}")
            return
        text = json.loads(response.text)
        logging.info(text)
        download_url = text["downurl"]
        self.download(download_url, name)

    def get_json_url(self, url, name):
        # A link that already ends in a file extension is a direct download.
        filetype = os.path.splitext(url)[1]
        if filetype:
            response = self.get_request(url)
            if not response:
                logging.warning(f"Request failed, skipping download: {name}")
                return
            self.save(response.content, name, filetype)
            return
        # Otherwise it is a ctfile share page: grab the free_down() arguments
        # from the onclick handler and build the AJAX URL out of them.
        response = self.get_request(url, headers=AJAX_HEADERS)
        if not response:
            logging.warning(f"Request failed, skipping download: {name}")
            return
        text = response.text
        infos = re.findall(r'<a href="javascript:void\(0\)" class="button light" onclick="free_down(.*?)" id="free_down_link">', text)
        logging.info(infos)
        if not infos:
            logging.warning(f"free_down link not found, skipping: {name}")
            return
        # The capture looks like ('fid', 'name', 'file_chk', ...);
        # ast.literal_eval parses it safely, unlike the original eval().
        params = ast.literal_eval(infos[0])
        uid = '1163421'
        fid = params[0]
        file_chk = params[2]
        rd = str(random.random())
        ajax_url = AJAX_URL % (uid, fid, fid, file_chk, rd)
        logging.info(ajax_url)
        self.get_download_url(ajax_url, name)

    def get_detail_url(self, url, name):
        response = self.get_request(url, headers=HEADERS)
        if not response:
            logging.warning(f"Request failed, skipping detail page: {name}")
            return
        text = response.text
        infos = re.findall(r'>点击下载:.*?<a.*? href="(.*?)"\starget', text, re.S)
        logging.info(infos)
        if not infos:
            logging.warning(f"Download link not found, skipping: {name}")
            return
        book_download_url = infos[0]
        self.get_json_url(book_download_url, name)

    def get_url(self, url):
        response = self.get_request(url, headers=HEADERS)
        if not response:
            return
        text = response.text
        li_list = re.findall(r'<a href="(.*?)" rel="bookmark" title="(.*?)"', text)
        logging.info(str(li_list))
        save_name_list = self.get_save_list()
        for li_url, book_name in li_list:
            if book_name in save_name_list:
                logging.info(f"{book_name}--> already downloaded")
                continue
            logging.info(f"Crawling page: {li_url}")
            self.get_detail_url(li_url, book_name)
            # break

    def run(self):
        start_urls = [BASE_URL % p for p in range(1, 143)]
        for url in start_urls:
            logging.info(f'Crawling listing page: {url}')
            self.get_url(url)
            # break
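For a quick smoke test you can drive a single listing page instead of all 142 (a minimal sketch; it assumes the site is still reachable and the markup still matches the regexes above):

from crawl import PdfBook
from settings import BASE_URL

book = PdfBook()
book.get_url(BASE_URL % 1)  # crawl only the first listing page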


2. Entry point and logging setup: pbook.py
# -*- encoding:utf-8 -*-
import logging
import sys

import crawl

logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Both handlers share one format: timestamp, file, line number, level, message.
fmt = '%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s %(message)s'
formatter = logging.Formatter(fmt=fmt, datefmt='%Y/%m/%d %H:%M:%S')

# StreamHandler: INFO and above to the console
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# FileHandler: WARNING and above to output.log
file_handler = logging.FileHandler('output.log', encoding='utf-8')
file_handler.setLevel(logging.WARNING)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

app = crawl.PdfBook()


if __name__ == '__main__':
    app.run()
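With that formatter, a console line looks roughly like this (the timestamp, line number, and message are illustrative):

2020/01/10 15:11:13 crawl.py [line:42] INFO xxx--> downloading...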


3. Parameters: settings.py (crawl.py imports it as settings, so the file must be named settings.py, not setting.py)
from fake_useragent import UserAgent

BASE_URL = 'http://www.pdfbook.cn/page/%d'
AJAX_URL = 'https://stockbook.ctfile.com/get_file_url.php?uid=%s&fid=%s&folder_id=0&fid=%s&file_chk=%s&mb=0&app=0&acheck=1&verifycode=&rd=%s'
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "www.pdfbook.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": UserAgent().random
}
AJAX_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    # "Cookie": "clicktopay=1566976785407; PHPSESSID=uqf94r0l7a8f1ff7pb6q2ehb56; checkadb=1; Hm_lvt_74590c71164d9fba556697bee04ad65c=1567157793,1567157799,1567157996,1567175582; protected_uid=197284123830; Hm_lpvt_74590c71164d9fba556697bee04ad65c=1567175719",
    "Host": "stockbook.ctfile.com",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": UserAgent().random
}
DOWNLOAD_HEADERS = {
    # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    # "Accept-Encoding": "gzip, deflate",
    # "Accept-Language": "zh-CN,zh;q=0.9",
    # "Cache-Control": "max-age=0",
    # "Connection": "keep-alive",
    # "Host": "1163421.170.ctc.data.tv002.com",
    # "Upgrade-Insecure-Requests": "1",
    "User-Agent": UserAgent().random
}
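One caveat here: UserAgent().random is evaluated once, when settings.py is imported, so every request in a run sends the same User-Agent string. If you wanted a fresh one per request, a small sketch (fresh_headers is a hypothetical helper, not part of the code above):

from fake_useragent import UserAgent

_ua = UserAgent()

def fresh_headers(base):
    # Copy the base headers and swap in a newly sampled User-Agent.
    headers = dict(base)
    headers["User-Agent"] = _ua.random
    return headers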



Error logs are written to output.log in the current directory, and the downloaded books end up in the book directory under the current directory. To run it, install requests and fake_useragent with pip, then start pbook.py.
The site doesn't have many books; it's a small crawler, good for getting some practice!