python爬虫下载某网的pdf书
本帖最后由 快速收敛 于 2020-1-10 15:11 编辑
1. 爬取程序 crawl.py
# -*- encoding: utf-8 -*-
import requests
import re
import os
import json
from settings import HEADERS, AJAX_HEADERS, BASE_URL, AJAX_URL, DOWNLOAD_HEADERS
import logging
import numpy as np
class PdfBook(object):
    """Crawl the listing pages at BASE_URL and save every linked book locally.

    Call chain for one listing page:
    get_url -> get_detail_url -> get_json_url -> get_download_url -> download.
    Every network step goes through get_request, which returns None on
    failure; each caller logs a warning and skips the current book instead
    of crashing.
    """

    # Directory books are written to and read back from when checking what
    # was already downloaded. The original wrote to a hard-coded absolute
    # Windows path while get_save_list read './book'; both now agree.
    BOOK_DIR = './book'

    # Matches the argument list of the free_down(...) onclick handler.
    FREE_DOWN_PATTERN = (
        r'<a href="javascript:void\(0\)" class="button light" '
        r'onclick="free_down(.*?)" id="free_down_link">'
    )

    # Positions of the file id and file check code inside free_down(...).
    # NOTE(review): the index expressions were lost in the published
    # listing; 0 and 2 assume free_down(file_id, folder_id, file_chk, ...)
    # -- confirm against a live page.
    FID_INDEX = 0
    FILE_CHK_INDEX = 2

    def get_save_list(self):
        """Return the names (without extension) of the files in BOOK_DIR.

        The directory is created when missing so a first run starts with an
        empty list instead of raising FileNotFoundError.
        """
        os.makedirs(self.BOOK_DIR, exist_ok=True)
        # NOTE(review): the stem expression was lost in the listing;
        # splitext(...)[0] is assumed because get_url compares book titles.
        return [os.path.splitext(name)[0] for name in os.listdir(self.BOOK_DIR)]

    def get_request(self, url, headers=None, params=None):
        """GET *url* and return the response, or None on any failure.

        A failure is either a requests-level exception (timeout, connection
        error, bad URL) or a non-200 status code; both are logged at error
        level so they reach a WARNING-level log handler.
        """
        try:
            response = requests.get(url, headers=headers, params=params, timeout=10)
        except requests.RequestException:
            logging.error('Error', exc_info=True)
            return None
        # NOTE(review): the accepted status list was lost in the listing;
        # only 200 is assumed.
        if response.status_code == 200:
            return response
        logging.error('Error: status %s for %s', response.status_code, url)
        return None

    def savemore(self, data, name, filetype):
        """Write *data* (bytes) to BOOK_DIR/<name><filetype>.

        *filetype* includes the leading dot, e.g. '.epub'.
        """
        logging.info(f"{name}-->开始保存")
        os.makedirs(self.BOOK_DIR, exist_ok=True)
        with open(os.path.join(self.BOOK_DIR, name + filetype), 'wb') as f:
            f.write(data)
        logging.info(f"{name}-->保存完毕")

    def savepdf(self, data, name):
        """Write *data* (bytes) to BOOK_DIR/<name>.pdf."""
        self.savemore(data, name, '.pdf')

    def download(self, url, name):
        """Fetch the file at *url* and store it as <name>.pdf."""
        logging.info(f"{name}-->加载中...")
        response = self.get_request(url, headers=DOWNLOAD_HEADERS)
        if response is None:
            logging.warning(f"访问异常,跳过:{name}下载")
            return
        logging.info(f"{name}-->加载完毕")
        self.savepdf(response.content, name)

    def get_download_url(self, url, name):
        """Call the AJAX endpoint *url* and download the 'downurl' it returns."""
        response = self.get_request(url, headers=AJAX_HEADERS)
        if response is None:
            logging.warning(f"访问异常,跳过:{name}下载")
            return
        try:
            payload = json.loads(response.text)
        except ValueError:
            # The endpoint answered with something that is not JSON.
            logging.warning(f"访问异常,跳过:{name}下载", exc_info=True)
            return
        logging.info(payload)
        download_url = payload.get("downurl") if isinstance(payload, dict) else None
        if not download_url:
            logging.warning(f"未找到下载链接,跳过:{name}下载")
            return
        self.download(download_url, name)

    @classmethod
    def _parse_free_down(cls, arg_text):
        """Parse the free_down(...) argument text into (fid, file_chk).

        Returns None when the text is not a plain literal tuple or is too
        short. ast.literal_eval is used instead of eval because the text
        comes from a remote page and must never be executed.
        """
        import ast

        try:
            params = ast.literal_eval(arg_text)
        except (ValueError, SyntaxError, TypeError):
            return None
        if not isinstance(params, tuple):
            return None
        if len(params) <= max(cls.FID_INDEX, cls.FILE_CHK_INDEX):
            return None
        return params[cls.FID_INDEX], params[cls.FILE_CHK_INDEX]

    def get_json_url(self, url, name):
        """Download the book behind *url*.

        If the URL path carries a file extension it is treated as a direct
        file link and saved with that extension. Otherwise the page is
        fetched, the free_down(...) arguments are extracted from it, and
        the AJAX URL that yields the real download link is requested.
        """
        from urllib.parse import urlparse

        # Use only the path component so query strings and host names
        # (e.g. 'http://a.com') are not mistaken for an extension.
        filetype = os.path.splitext(urlparse(url).path)[1]
        if filetype:
            response = self.get_request(url)
            if response is None:
                logging.warning(f"访问异常,跳过:{name}下载")
                return
            self.savemore(response.content, name, filetype)
            return

        response = self.get_request(url, headers=AJAX_HEADERS)
        if response is None:
            logging.warning(f"访问异常,跳过:{name}下载")
            return
        infos = re.findall(self.FREE_DOWN_PATTERN, response.text)
        logging.info(infos)
        ids = self._parse_free_down(infos[0]) if infos else None
        if ids is None:
            logging.warning(f"未找到下载链接,跳过:{name}下载")
            return
        fid, file_chk = ids
        uid = '1163421'
        rd = str(np.random.random())
        ajax_url = AJAX_URL % (uid, fid, fid, file_chk, rd)
        logging.info(ajax_url)
        self.get_download_url(ajax_url, name)

    def get_detail_url(self, url, name):
        """Open a book's detail page and follow its first download link."""
        response = self.get_request(url, headers=HEADERS)
        if response is None:
            logging.warning(f"访问异常,跳过:{name}下载")
            return
        infos = re.findall(r'>点击下载:.*?<a.*? href="(.*?)"\starget', response.text, re.S)
        logging.info(infos)
        if not infos:
            logging.warning(f"未找到下载链接,跳过:{name}下载")
            return
        self.get_json_url(infos[0], name)

    def get_url(self, url):
        """Process one listing page and download every book not yet saved.

        Returns True when the page was fetched and listed at least one
        book, False otherwise (run uses this to know when to stop).
        """
        response = self.get_request(url, headers=HEADERS)
        if response is None:
            logging.warning(f"访问异常,跳过:{url}")
            return False
        li_list = re.findall(r'<a href="(.*?)" rel="bookmark" title="(.*?)"', response.text)
        logging.info(str(li_list))
        saved_names = set(self.get_save_list())
        for li_url, book_name in li_list:
            if book_name in saved_names:
                logging.info(f"{book_name}-->已爬取过")
                continue
            logging.info(f"爬取页面:{li_url}")
            try:
                self.get_detail_url(li_url, book_name)
            except Exception:
                # One bad book (e.g. a title that is not a valid file name)
                # must not abort the rest of the crawl.
                logging.error(f"{book_name}-->下载失败", exc_info=True)
        return bool(li_list)

    def run(self, first_page=1, last_page=None):
        """Crawl listing pages BASE_URL % first_page .. BASE_URL % last_page.

        NOTE(review): the original start_urls expression was lost in the
        published listing. With last_page=None the crawl continues until a
        listing page cannot be fetched or lists no books; pass last_page to
        bound it explicitly.
        """
        page = first_page
        while last_page is None or page <= last_page:
            url = BASE_URL % page
            logging.info(f'开始爬取页面:{url}')
            if not self.get_url(url):
                break
            page += 1
2. 运行日志程序pbook.py
# -*- encoding:utf-8 -*-
import logging
import sys
import crawl
# Configure the root logger so the logging.* calls made in crawl.py are
# emitted through the two handlers below.
logger = logging.getLogger()
logger.setLevel(level='INFO')

# Both handlers share one record layout.
fmt = '%(asctime)s %(filename)s %(levelname)s %(message)s'
formatter = logging.Formatter(fmt=fmt, datefmt='%Y/%m/%d %H:%M:%S')

# StreamHandler: INFO and above go to stdout.
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setLevel(level=logging.INFO)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# FileHandler: only WARNING and above are kept in output.log.
file_handler = logging.FileHandler('output.log', encoding='utf-8')
file_handler.setLevel(level=logging.WARNING)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

app = crawl.PdfBook()

if __name__ == '__main__':
    app.run()
3. settings.py 一些参数
from fake_useragent import UserAgent
# URL templates: BASE_URL takes the listing page number; AJAX_URL takes
# (uid, fid, fid, file_chk, rd) in that order.
BASE_URL = 'http://www.pdfbook.cn/page/%d'
AJAX_URL = 'https://stockbook.ctfile.com/get_file_url.php?uid=%s&fid=%s&folder_id=0&fid=%s&file_chk=%s&mb=0&app=0&acheck=1&verifycode=&rd=%s'

# Header fields that HEADERS and AJAX_HEADERS have in common.
_SHARED_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
}

# Headers for the www.pdfbook.cn listing and detail pages.
HEADERS = {
    **_SHARED_HEADERS,
    "Host": "www.pdfbook.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": UserAgent().random,
}

# Headers for stockbook.ctfile.com requests. No Cookie header is sent.
AJAX_HEADERS = {
    **_SHARED_HEADERS,
    "Host": "stockbook.ctfile.com",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": UserAgent().random,
}

# Headers for the final file download: only a User-Agent is set.
DOWNLOAD_HEADERS = {
    "User-Agent": UserAgent().random,
}
错误日志会保存在当前目录下的 output.log 文件中,书下载的位置在当前目录下的 book 目录中
该网站书不多,小爬虫,练练手!
页:
[1]