|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 玄夜Python之路 于 2017-3-17 16:59 编辑
解析到615,634,697,1062,1076,1305,1335,1346,1355章节时编码错误,百度说是Python爬取gzip压缩网页问题,可根据这个网页(Python爬取网页Utf-8解码错误及gzip压缩问题的解决办法 - 知乎专栏
https://zhuanlan.zhihu.com/p/25095566?refer=zjying2000)提供的信息并没有得到解决,不知道还有谁曾遇到过此问题!!!望大神解惑一下。
我仔细分析了一下这些网页其头文件中都是Accept-Encoding:gzip, deflate,如图:Accept-Encoding(此图暂时上传不了)我将headers复制粘贴吧:
Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Encoding:gzip, deflate
Accept-Language:zh-CN,zh;q=0.8
Cache-Control:no-cache
Connection:keep-alive
DNT:1
Host:www.23us.com
Pragma:no-cache
User-Agent:Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 BIDUBrowser/7.6 Safari/537.36
X-DevTools-Emulate-Network-Conditions-Client-Id:86C74E7B-700F-405A-96FD-DC2B68C49A9D
1489125733(1).png (166.1 KB, 下载次数: 0)
源码:
#!/usr/bin/env python3
# -*- coding=utf-8 -*-
# 2017.3.8 斗破苍穹 顶点小说 url:http://www.23us.com/html/0/298/
# http://www.23us.com/html/0/298/1962332.html
import requests
from bs4 import BeautifulSoup
import time
home_url = 'http://www.23us.com/html/0/298/' # index page of the novel
def get(home_url):
    """Download *home_url* and return it parsed as a BeautifulSoup tree.

    Bug fix for the encoding errors reported in the post: the original did
    ``requests.get(...).text.encode('ISO-8859-1')``.  When a server omits a
    charset, requests falls back to guessing ISO-8859-1 for ``.text``, and
    round-tripping the body through that guess corrupts pages whose real
    encoding differs (the failing chapters).  Passing the raw bytes
    (``.content``) lets BeautifulSoup sniff the true encoding from the
    page's <meta> tag / byte patterns itself.  requests also transparently
    decompresses gzip, so no manual gzip handling is needed.

    :param home_url: absolute URL of the page to fetch
    :return: BeautifulSoup tree of the page
    """
    # Empty Accept-Encoding asks the server to skip compression entirely —
    # belt and braces on top of requests' automatic gzip decompression.
    header = {'Accept-Encoding': '',
              'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 BIDUBrowser/7.6 Safari/537.36'}
    html = requests.get(home_url, headers=header).content  # raw bytes, no .text guess
    soup = BeautifulSoup(html, 'html.parser')
    return soup
def zhangjie(home_url):
    """Return the list of relative chapter hrefs from the novel's index page.

    Also prints the novel title and the chapter count as a progress report.

    :param home_url: URL of the novel's chapter-index page
    :return: list of href strings, one per chapter
    """
    soup = get(home_url)
    # First <h1> holds the title; [:4] keeps the 4-character novel name
    # (hard-coded for this novel — NOTE(review): adjust for other titles).
    novel = ''.join(h.text for h in soup.select('h1'))[:4]
    print('正在下载的小说是:%s' % novel)
    # Select the anchor list once (the original ran the same query twice);
    # the last two anchors on the page are navigation links, not chapters.
    anchors = soup.select('td.L a')[:-2]
    url = [a.get('href') for a in anchors]   # chapter hrefs
    name = [a.text for a in anchors]         # chapter titles
    print('%s共计%d章' % (novel, len(name)))
    print('现在开始获取%s的内容' % novel, '\n\n\n')
    return url
def content(home_url):
    """Fetch every chapter of the novel and save each as a UTF-8 .txt file.

    Chapters that hit an encoding or OS error are reported and skipped so
    one bad page does not abort the whole crawl.

    :param home_url: URL of the novel's chapter-index page
    :return: None (side effect: writes one file per chapter under *path*)
    """
    # Loop-invariant output directory, hoisted out of the loop.
    # NOTE(review): the directory must already exist — no makedirs here.
    path = 'D:\\Python之窗\\Spider\\xiaoshuo\\斗破苍穹\\'
    for href in zhangjie(home_url):
        url = home_url + href  # chapter page URL
        soup = get(url)
        # <h1> text carries a 3-character site prefix before the real title.
        title = ''.join(t.text for t in soup.select('h1'))[3:]
        print(title)
        # Chapter body lives in <dd id="contents">; strip the 4-NBSP
        # paragraph indents the site uses.
        contents = ''.join(c.text for c in soup.select('dd#contents'))
        contents = contents.replace('\xa0\xa0\xa0\xa0', '')
        print('%s的内容已获取完毕!' % title)
        try:
            # `with` closes the file on exit — the original's explicit
            # f.close() inside the block was redundant and is removed.
            with open(path + title + '.txt', 'wt', encoding='utf-8') as f:
                f.write('\t' + title + '\n' + contents)
            print('%s的内容已下载完毕!' % title)
        except UnicodeEncodeError:
            print('%s编码错误' % title)
            time.sleep(2)
        except UnicodeDecodeError:
            print('%s编码错误' % title)
        except OSError:
            print('%s不知道啥意思' % title)
            time.sleep(2)
if __name__ == '__main__':
    # Start the crawl only when run as a script, not when imported.
    content(home_url)
|
|