Posted by 快速收敛 on 2020-1-10 15:11:13

Downloading PDF books from a site with a Python crawler


1. The crawler: crawl.py
# -*- encoding: utf-8 -*-
import requests
import re
import os
import json
from settings import HEADERS, AJAX_HEADERS, BASE_URL, AJAX_URL, DOWNLOAD_HEADERS
import logging
import random  # stdlib randomness is enough for the rd cache-buster parameter


class PdfBook(object):
    def get_save_list(self):
        # names of books already saved under ./book, extensions stripped
        return [os.path.splitext(name)[0] for name in os.listdir('./book')]

    def get_request(self, url, headers=None, params=None, retries=3):
        # fetch with a timeout and a bounded number of retries; None on failure
        for _ in range(retries):
            try:
                response = requests.get(url, headers=headers, params=params, timeout=10)
                if response.status_code in [200]:
                    return response
                logging.info(response.status_code)
                raise AssertionError(f'bad status: {response.status_code}')
            except Exception:
                logging.error('Error', exc_info=True)
        return None

    def savemore(self, data, name, filetype):
        logging.info(f"{name}-->saving")
        # save under ./book, matching get_save_list and the note at the end
        with open(os.path.join('./book', name + filetype), 'wb') as f:
            f.write(data)
        logging.info(f"{name}-->saved")

    def savepdf(self, data, name):
        self.savemore(data, name, '.pdf')

    def download(self, url, name):
        logging.info(f"{name}-->downloading...")
        # DOWNLOAD_HEADERS["Referer"] = refer
        response = self.get_request(url, headers=DOWNLOAD_HEADERS)
        if not response:
            logging.warning(f"request failed, skipping download of {name}")
            return
        logging.info(f"{name}-->downloaded")
        self.savepdf(response.content, name)

    def get_download_url(self, url, name):
        response = self.get_request(url, headers=AJAX_HEADERS)
        if not response:
            logging.warning(f"request failed, skipping download of {name}")
            return
        text = json.loads(response.text)
        logging.info(text)
        self.download(text["downurl"], name)

    def get_json_url(self, url, name):
        # direct file link (URL ends with an extension): download it as-is
        filetype = os.path.splitext(url)[1]
        if filetype:
            response = self.get_request(url)
            if not response:
                logging.warning(f"request failed, skipping download of {name}")
                return
            self.savemore(response.content, name, filetype)
            return
        # otherwise it is a ctfile share page: pull the free_down() arguments
        response = self.get_request(url, headers=AJAX_HEADERS)
        if not response:
            logging.warning(f"request failed, skipping download of {name}")
            return
        text = response.text
        infos = re.findall(r'<a href="javascript:void\(0\)" class="button light" onclick="free_down(.*?)" id="free_down_link">', text)
        logging.info(infos)
        if not infos:
            logging.warning(f"free_down link not found, skipping {name}")
            return
        params = eval(infos[0])  # the captured argument list is a tuple literal
        uid = '1163421'
        # NOTE: the tuple positions of fid and file_chk below are assumptions
        # (the original indices were lost); check them against a real onclick value
        fid = params[1]
        file_chk = params[3]
        rd = str(random.random())
        ajax_url = AJAX_URL % (uid, fid, fid, file_chk, rd)
        logging.info(ajax_url)
        self.get_download_url(ajax_url, name)

    def get_detail_url(self, url, name):
        response = self.get_request(url, headers=HEADERS)
        if not response:
            logging.warning(f"request failed, skipping download of {name}")
            return
        text = response.text
        # '点击下载' is the literal anchor text on the book page, keep it as-is
        infos = re.findall(r'>点击下载:.*?<a.*? href="(.*?)"\starget', text, re.S)
        logging.info(infos)
        if not infos:
            logging.warning(f"download link not found, skipping {name}")
            return
        self.get_json_url(infos[0], name)

    def get_url(self, url):
        response = self.get_request(url, headers=HEADERS)
        if not response:
            return
        text = response.text
        li_list = re.findall(r'<a href="(.*?)" rel="bookmark" title="(.*?)"', text)
        logging.info(str(li_list))
        save_name_list = self.get_save_list()
        for li_url, book_name in li_list:
            if book_name in save_name_list:
                logging.info(f"{book_name}-->already downloaded")
                continue
            logging.info(f"crawling page: {li_url}")
            self.get_detail_url(li_url, book_name)
            # break

    def run(self):
        # the page range is an assumption; the original list literal was lost,
        # so adjust the upper bound to the site's actual page count
        start_urls = [BASE_URL % i for i in range(1, 11)]
        for url in start_urls:
            logging.info(f'crawling list page: {url}')
            self.get_url(url)
            # break
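
To make the free_down handling above concrete: the regex captures the literal argument list of the onclick attribute, which happens to be a valid Python tuple literal, so eval() turns it straight into a tuple. A minimal sketch with a made-up onclick value (the real fid/file_chk come from the share page, and the tuple positions are the same assumption as in get_json_url):

# -*- encoding: utf-8 -*-
import re

# hypothetical snippet of a ctfile share page, values invented for illustration
html = ('<a href="javascript:void(0)" class="button light" '
        'onclick="free_down(\'1163421\', \'12345678\', \'0\', \'abcdef\')" id="free_down_link">')

infos = re.findall(r'onclick="free_down(.*?)" id="free_down_link">', html)
print(infos)             # ["('1163421', '12345678', '0', 'abcdef')"]
params = eval(infos[0])  # the captured text is a tuple literal
print(params[1], params[3])  # 12345678 abcdef -> assumed positions of fid and file_chk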


2. The entry point with logging setup: pbook.py
# -*- encoding:utf-8 -*-
import logging
import sys
import crawl

logger = logging.getLogger()
logger.setLevel(logging.INFO)

fmt = '%(asctime)s %(filename)s %(levelname)s %(message)s'
formatter = logging.Formatter(fmt=fmt, datefmt='%Y/%m/%d %H:%M:%S')

# StreamHandler: everything at INFO and above goes to stdout
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# FileHandler: only WARNING and above also lands in output.log
file_handler = logging.FileHandler('output.log', encoding='utf-8')
file_handler.setLevel(logging.WARN)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

app = crawl.PdfBook()


if __name__ == '__main__':
    app.run()
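
With this setup the two handlers filter independently: the root logger passes everything at INFO and above, the stream handler prints it all to stdout, and only WARNING and above reaches output.log. A quick sketch of the routing (assuming the handlers above are already attached):

import logging

logging.info('console only')               # below the file handler's WARN threshold
logging.warning('console and output.log')  # reaches both handlers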

3. Some parameters: settings.py
from fake_useragent import UserAgent

BASE_URL = 'http://www.pdfbook.cn/page/%d'
AJAX_URL = 'https://stockbook.ctfile.com/get_file_url.php?uid=%s&fid=%s&folder_id=0&fid=%s&file_chk=%s&mb=0&app=0&acheck=1&verifycode=&rd=%s'
HEADERS = {
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
      "Accept-Encoding": "gzip, deflate",
      "Accept-Language": "zh-CN,zh;q=0.9",
      "Cache-Control": "max-age=0",
      "Connection": "keep-alive",
      "Host": "www.pdfbook.cn",
      "Upgrade-Insecure-Requests": "1",
      "User-Agent": UserAgent().random
}
AJAX_HEADERS = {
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
      "Accept-Encoding": "gzip, deflate",
      "Accept-Language": "zh-CN,zh;q=0.9",
      "Cache-Control": "max-age=0",
      "Connection": "keep-alive",
      # "Cookie": "clicktopay=1566976785407; PHPSESSID=uqf94r0l7a8f1ff7pb6q2ehb56; checkadb=1; Hm_lvt_74590c71164d9fba556697bee04ad65c=1567157793,1567157799,1567157996,1567175582; protected_uid=197284123830; Hm_lpvt_74590c71164d9fba556697bee04ad65c=1567175719",
      "Host": "stockbook.ctfile.com",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-User": "?1",
      "Upgrade-Insecure-Requests": "1",
      "User-Agent": UserAgent().random
}
DOWNLOAD_HEADERS = {
      # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
      # "Accept-Encoding": "gzip, deflate",
      # "Accept-Language": "zh-CN,zh;q=0.9",
      # "Cache-Control": "max-age=0",
      # "Connection": "keep-alive",
      # "Host": "1163421.170.ctc.data.tv002.com",
      # "Upgrade-Insecure-Requests": "1",
      "User-Agent": UserAgent().random
}
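
One caveat with settings.py: fake_useragent downloads its browser list on first use and can raise if that fetch fails. A defensive variant (the fallback UA string is just an example; any real browser UA works) would be:

try:
    from fake_useragent import UserAgent
    UA = UserAgent().random
except Exception:
    # fake_useragent could not load its data; fall back to a fixed UA
    UA = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36')

and then use "User-Agent": UA in the three header dicts.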


Error logs are saved to output.log in the current directory, and downloaded books land in the book directory under the current directory.
The site does not have many books; this is a small crawler, just for practice!
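
One thing to watch: get_save_list and the save methods assume the book directory already exists. A one-line guard, e.g. at the top of run() (the placement is just a suggestion):

import os

os.makedirs('./book', exist_ok=True)  # create the download directory if it is missing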