鱼C论坛

 找回密码
 立即注册
查看: 1389|回复: 0

[原创] python爬虫下载某网的pdf书

[复制链接]
发表于 2020-1-10 15:11:13 | 显示全部楼层 |阅读模式

马上注册,结交更多好友,享用更多功能^_^

您需要 登录 才可以下载或查看,没有账号?立即注册

x
本帖最后由 快速收敛 于 2020-1-10 15:11 编辑

1.爬取程序crawl.py
# -*- encoding: utf-8 -*-
import requests
import re
import os
import json
from settings import HEADERS, AJAX_HEADERS, BASE_URL, AJAX_URL, DOWNLOAD_HEADERS
import logging
import numpy as np


class PdfBook(object):
    """Crawler that walks the site's listing pages and saves each book file.

    Call chain for one listing page:
    ``run`` -> ``get_url`` -> ``get_detail_url`` -> ``get_json_url`` ->
    ``get_download_url`` -> ``download`` -> ``savepdf``.

    Every step is best-effort: when a request fails or a page cannot be
    parsed, a warning is logged and that single book is skipped so the rest
    of the crawl continues.
    """

    # Directory (relative to the current working directory) where books are
    # written; the same directory is scanned to detect finished downloads.
    BOOK_DIR = './book'

    def get_save_list(self):
        """Return the names (without extension) of the files already saved.

        Returns:
            list[str]: one entry per file in ``BOOK_DIR``; empty when the
            directory does not exist yet.
        """
        if not os.path.isdir(self.BOOK_DIR):
            return []
        # splitext() strips only the last suffix, so titles that contain
        # dots themselves ("Python 3.7 ...") are still recognised.
        return [os.path.splitext(name)[0] for name in os.listdir(self.BOOK_DIR)]

    def get_request(self, url, headers=None, params=None, retries=3):
        """GET ``url`` and return the response, or ``None`` on failure.

        Args:
            url: address to fetch.
            headers: optional request headers.
            params: optional query parameters.
            retries: maximum number of attempts when the request itself
                raises (timeout, connection error, ...).

        Returns:
            The ``requests`` response when the status code is 200 or 302,
            otherwise ``None``. An unexpected status code is not retried.
        """
        for attempt in range(1, max(1, retries) + 1):
            try:
                response = requests.get(url, headers=headers, params=params, timeout=10)
            except requests.RequestException:
                logging.error('Error', exc_info=True)
                continue
            if response.status_code in (200, 302):
                return response
            logging.error(f"Error: status {response.status_code} -> {url}")
            return None
        return None

    def savemore(self, data, name, filetype):
        """Write ``data`` (bytes) to ``BOOK_DIR/<name><filetype>``.

        Args:
            data: raw file content.
            name: book name used as the file name stem.
            filetype: extension including the leading dot, e.g. ``'.epub'``.
        """
        logging.info(f"{name}-->开始保存")
        os.makedirs(self.BOOK_DIR, exist_ok=True)
        with open(os.path.join(self.BOOK_DIR, name + filetype), 'wb') as f:
            f.write(data)
        logging.info(f"{name}-->保存完毕")

    def savepdf(self, data, name):
        """Write ``data`` (bytes) to ``BOOK_DIR/<name>.pdf``."""
        self.savemore(data, name, '.pdf')

    def download(self, url, name):
        """Fetch the file behind ``url`` and store it as ``<name>.pdf``."""
        logging.info(f"{name}-->加载中...")
        response = self.get_request(url, headers=DOWNLOAD_HEADERS)
        if not response:
            logging.warning(f"访问异常,跳过:{name}下载")
            return
        content = response.content
        logging.info(f"{name}-->加载完毕")
        self.savepdf(content, name)

    def get_download_url(self, url, name):
        """Query the AJAX endpoint ``url`` and download the file it points to.

        The endpoint is expected to answer with JSON containing a
        ``"downurl"`` key; anything else is logged and the book is skipped.
        """
        response = self.get_request(url, headers=AJAX_HEADERS)
        if not response:
            logging.warning(f"访问异常,跳过:{name}下载")
            return
        try:
            text = json.loads(response.text)
        except ValueError:
            logging.warning(f"解析失败,跳过:{name}下载")
            return
        logging.info(text)
        download_url = text.get("downurl") if isinstance(text, dict) else None
        if not download_url:
            logging.warning(f"解析失败,跳过:{name}下载")
            return
        self.download(download_url, name)

    def get_json_url(self, url, name):
        """Resolve the download link ``url`` found on a book's detail page.

        If ``url`` ends in a file extension it is fetched and saved directly
        with that extension. Otherwise the page is fetched, the arguments of
        its ``free_down(...)`` call are extracted and used to build the AJAX
        URL that yields the real download address.
        """
        # Local import: only this method needs it.
        import ast

        filetype = os.path.splitext(url)[1]
        if filetype:
            response = self.get_request(url)
            if not response:
                logging.warning(f"访问异常,跳过:{name}下载")
                return
            self.savemore(response.content, name, filetype)
            return

        response = self.get_request(url, headers=AJAX_HEADERS)
        if not response:
            logging.warning(f"访问异常,跳过:{name}下载")
            return
        text = response.text
        infos = re.findall(r'<a href="javascript:void\(0\)" class="button light" onclick="free_down(.*?)" id="free_down_link">', text)
        logging.info(infos)
        if not infos:
            logging.warning(f"解析失败,跳过:{name}下载")
            return
        # SECURITY: this text comes from a remote page. It used to be passed
        # to eval(); literal_eval only accepts plain literals, so the page
        # cannot make the crawler execute code.
        try:
            params = ast.literal_eval(infos[0])
            fid = params[0]
            file_chk = params[2]
        except (ValueError, SyntaxError, TypeError, IndexError):
            logging.warning(f"解析失败,跳过:{name}下载")
            return
        uid = '1163421'  # hard-coded uid query value; presumably the uploader's account id
        rd = str(np.random.random())
        ajax_url = AJAX_URL % (uid, fid, fid, file_chk, rd)
        logging.info(ajax_url)
        self.get_download_url(ajax_url, name)

    def get_detail_url(self, url, name):
        """Open a book's detail page ``url`` and follow its download link."""
        response = self.get_request(url, headers=HEADERS)
        if not response:
            logging.warning(f"访问异常,跳过:{name}下载")
            return
        infos = re.findall(r'>点击下载:.*?<a.*? href="(.*?)"\starget', response.text, re.S)
        logging.info(infos)
        if not infos:
            logging.warning(f"解析失败,跳过:{name}下载")
            return
        self.get_json_url(infos[0], name)

    def get_url(self, url):
        """Process one listing page: download every book not saved yet."""
        response = self.get_request(url, headers=HEADERS)
        if not response:
            logging.warning(f"访问异常,跳过页面:{url}")
            return
        li_list = re.findall(r'<a href="(.*?)" rel="bookmark" title="(.*?)"', response.text)
        logging.info(str(li_list))
        # A set makes the per-book "already saved" check O(1).
        saved_names = set(self.get_save_list())
        for li_url, book_name in li_list:
            if book_name in saved_names:
                logging.info(f"{book_name}-->已爬取过")
                continue
            logging.info(f"爬取页面:{li_url}")
            self.get_detail_url(li_url, book_name)

    def run(self):
        """Crawl listing pages 1 through 142 in order."""
        for page in range(1, 143):
            url = BASE_URL % page
            logging.info(f'开始爬取页面:{url}')
            self.get_url(url)

2. 运行日志程序pbook.py
# -*- encoding:utf-8 -*-
import logging
import sys
import crawl

# Root logger: records at INFO and above are handed to the handlers below.
logger = logging.getLogger()
logger.setLevel(level='INFO')

# Both handlers use the same line layout and timestamp format.
fmt = '%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s %(message)s'
formatter = logging.Formatter(fmt=fmt, datefmt='%Y/%m/%d %H:%M:%S')


def _attach(handler, level):
    """Set threshold and formatter on ``handler``, register it on the root logger, return it."""
    handler.setLevel(level=level)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return handler


# Console output: INFO and above go to stdout.
stream_handler = _attach(logging.StreamHandler(sys.stdout), logging.INFO)

# File output: only warnings and errors are written to output.log.
file_handler = _attach(logging.FileHandler('output.log', encoding='utf-8'), logging.WARN)

app = crawl.PdfBook()


if __name__ == '__main__':
    app.run()

3. settings.py一些参数
from fake_useragent import UserAgent

# Listing page of the book site; %d is filled with the page number.
BASE_URL = 'http://www.pdfbook.cn/page/%d'

# Endpoint that returns the real download address.
# Placeholders, in order: uid, fid, fid (sent twice), file_chk, rd.
AJAX_URL = 'https://stockbook.ctfile.com/get_file_url.php?uid=%s&fid=%s&folder_id=0&fid=%s&file_chk=%s&mb=0&app=0&acheck=1&verifycode=&rd=%s'

# One generator instance; each `.random` access yields a User-Agent string.
_user_agent = UserAgent()

# Leading header fields shared by the page and AJAX requests.
_BROWSER_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
}

# Headers for the listing and detail pages on www.pdfbook.cn.
HEADERS = {
    **_BROWSER_HEADERS,
    "Host": "www.pdfbook.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": _user_agent.random,
}

# Headers for requests to stockbook.ctfile.com; no Cookie header is sent.
AJAX_HEADERS = {
    **_BROWSER_HEADERS,
    "Host": "stockbook.ctfile.com",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": _user_agent.random,
}

# Headers for the file download itself: only a User-Agent is sent.
DOWNLOAD_HEADERS = {
    "User-Agent": _user_agent.random,
}


错误日志会保存在当前目录下的output.log文件中,书下载的位置在当前目录下的book目录中
该网站书不多,小爬虫,练练手!
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2024-6-14 04:47

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表