|
10鱼币
本帖最后由 Mr.Consummate 于 2020-3-7 14:44 编辑
python入门选手,刚写了一个爬取网站小说的爬虫,刚开始还不知道还有gzip压缩这回事,
先是经历了 UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte 的错误,
百度一番后知道了需要对接收到的数据进行解码,故参照 https://zhuanlan.zhihu.com/p/25095566 使用了gzip库解压网页再解码,本以为问题解决了,可谁知在爬取部分页面后又报了一个新的错误:
raise BadGzipFile('Not a gzipped file (%r)' % magic)
gzip.BadGzipFile: Not a gzipped file (b'<!')
目前还未找到错误原因以及解决办法,希望大佬帮忙看看
代码如下:
import urllib.request
import re
import gzip
import time
#首先爬取小说每一章节的链接
def get_url(url):
    """Fetch the novel's table-of-contents page and return chapter URL paths.

    Args:
        url: URL of the novel's index page.

    Returns:
        List of chapter path strings captured from <dd><a href='...'> links
        (everything before the first '.', i.e. without the '.html' suffix).
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")
    page = urllib.request.urlopen(req)
    content = page.read()
    # Only gunzip when the server actually sent gzip. Decompressing
    # unconditionally raised BadGzipFile ("Not a gzipped file (b'<!')")
    # whenever the server returned a plain, uncompressed HTML body.
    if page.headers.get('Content-Encoding') == 'gzip':
        content = gzip.decompress(content)
    content = content.decode('utf-8')
    str2_ = r"<dd><a href='([^.]*)"
    str2 = re.findall(str2_, content)
    return str2
#打开链接
def open_page(url):
    """Download one chapter page and return its HTML decoded as UTF-8.

    Args:
        url: URL of a single chapter page.

    Returns:
        The page body as a str, gunzipped first if the server compressed it.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")
    page = urllib.request.urlopen(req)
    content = page.read()
    # Use the public headers API instead of poking at the private
    # `headers._headers` list: `.get()` is case-insensitive and stable
    # across Python versions, while exact-tuple membership is neither.
    if page.headers.get('Content-Encoding') == 'gzip':
        content = gzip.decompress(content)
    return content.decode('utf-8')
#抓取并保存小说
def save_content(strs):
    """Extract the chapter title and body text from chapter HTML and append
    them to the file 元尊.txt.

    Args:
        strs: The chapter page's HTML as a str.

    Side effects:
        Appends "title + paragraphs" to 元尊.txt in the working directory.
    """
    content_ = r' ([^<]*)'
    title_ = r'<title>([^_]*)'
    content = re.findall(content_, strs)
    title = re.findall(title_, strs)
    # Guard against pages that don't match the expected layout: skip the
    # chapter instead of crashing the whole crawl with an IndexError.
    if not content or not title:
        return
    content[0] = title[0] + '\n\n\n ' + content[0]
    # Explicit UTF-8 so Chinese text is written correctly regardless of the
    # platform default encoding (Windows defaults to GBK and would fail).
    with open("元尊.txt", "a", encoding="utf-8") as f:
        f.write('\n\n '.join(content))
        f.write('\n\n')
if __name__ == "__main__":
    # Crawl every chapter of the novel at xbiquge.la and append each one
    # to 元尊.txt, pausing between requests to avoid hammering the server.
    base_url = 'http://www.xbiquge.la/'
    suffix = '.html'
    chapter_paths = get_url('http://www.xbiquge.la/14/14930/')
    # Iterate the paths directly instead of indexing with range(len(...)).
    for path in chapter_paths:
        url = base_url + path + suffix
        print(url)
        save_content(open_page(url))
        time.sleep(1)  # be polite: one request per second
|
|