|
|
发表于 2019-3-26 15:08:42
|
显示全部楼层
改了个保存方式就好了
from urllib.request import Request, urlopen
from urllib.parse import urlencode
from fake_useragent import UserAgent
import re
def get_html(url):
    """Fetch *url* and return the response body decoded to text.

    The request is sent with a fixed desktop-Chrome ``User-Agent`` header.

    Args:
        url: The URL to request.

    Returns:
        The response body as ``str``. The charset declared in the
        response's ``Content-Type`` header is used when present;
        otherwise UTF-8 is assumed.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    headers = {
        "User-Agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    request = Request(url, headers=headers)
    # The context manager closes the connection even if reading fails.
    with urlopen(request) as response:
        body = response.read()
        charset = response.headers.get_content_charset() or "utf-8"
    return body.decode(charset)
def save_html(filename, html_bytes):
    """Write a downloaded page to *filename*, replacing any existing file.

    Args:
        filename: Path of the file to create or overwrite.
        html_bytes: The page content. Text (``str``) is written encoded as
            UTF-8; raw ``bytes``/``bytearray`` are written unchanged.
    """
    if isinstance(html_bytes, (bytes, bytearray)):
        with open(filename, "wb") as f:
            f.write(html_bytes)
        return
    # An explicit encoding avoids the locale default (e.g. GBK on Windows),
    # which can fail on characters it cannot represent.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_bytes)
def main():
    """Prompt for a search keyword and a page count, then save each page.

    Reads two lines from standard input: the keyword to look up and how
    many result pages to download. Each page is fetched with ``get_html``
    and written by ``save_html`` to ``第N页.html`` in the current
    directory, where N starts at 1.

    Raises:
        ValueError: If the page count entered is not an integer.
    """
    content = input("请输入要下载的内容:")
    num = input("请输入要下载多少页:")
    base_url = "http://tieba.baidu.com/f?ie=utf-8&{}"
    for pn in range(int(num)):
        # The "pn" query parameter advances in steps of 50 per page.
        args = urlencode({
            "pn": pn * 50,
            "kw": content,
        })
        filename = "第" + str(pn + 1) + "页.html"
        print("正在保存" + filename)
        # Substitute the encoded query into the template; requesting the
        # bare template would ignore the keyword and the page offset.
        html = get_html(base_url.format(args))
        save_html(filename, html)
# Start the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|