先说会 你这发的是代码???
我把这些程序粘贴到pycharm上改格式都改了半天,另外我用的python2,有些不兼容,又花时间改了些![](static/image/smiley/ARU/aru-1x-1_039.png) ![](static/image/smiley/ARU/aru-1x-1_039.png) ![](static/image/smiley/ARU/aru-1x-1_039.png)
基本改完了,也不报错了,只是下载的都是乱码,明天再看吧……

#!/usr/bin/env python2
# -*- coding=utf-8 -*-
# 2017.3.8 斗破苍穹 顶点小说 url:http://www.23us.com/html/0/298/
# http://www.23us.com/html/0/298/1962332.html
import requests
from bs4 import BeautifulSoup
import time
home_url = 'http://www.23us.com/html/0/298/' # home page of the novel; chapter hrefs are appended to this URL
def get(home_url):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        home_url: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393'}
    response = requests.get(home_url, headers=header)
    # Hand BeautifulSoup the raw bytes instead of ``response.text``.  When the
    # HTTP headers carry no charset, ``.text`` may be decoded with the wrong
    # codec, and re-encoding that text as UTF-8 bakes the garbling in.  With
    # bytes, BeautifulSoup detects the encoding itself (e.g. from the page's
    # <meta> declaration).
    return BeautifulSoup(response.content, 'html.parser')
def zhangjie(home_url):
    """Collect the chapter links listed on the novel's index page.

    Args:
        home_url: URL of the index page to scan.

    Returns:
        A list of ``href`` attribute values taken from the ``td.L a`` anchors
        in page order.  The last two anchors are excluded, and anchors that
        have no ``href`` are skipped.
    """
    soup = get(home_url)
    # The final two matches are dropped; presumably they are not regular
    # chapter links -- TODO confirm against the live page.
    anchors = soup.select('td.L a')[:-2]
    hrefs = (anchor.get('href') for anchor in anchors)
    # A missing href comes back as None, which would break URL concatenation
    # in the caller, so such anchors are filtered out here.
    return [href for href in hrefs if href]
def content(home_url, path='D:\\Python2\\Spider\\xiaoshuo\\'):
    """Download every chapter and save each one as a UTF-8 text file.

    For each link returned by ``zhangjie(home_url)`` the chapter page is
    fetched, its title is read from the ``h1`` elements (the first three
    characters are dropped) and its body from ``dd#contents``.  The result is
    written to ``<path><title>.txt``.

    Args:
        home_url: URL of the index page; chapter hrefs are appended to it.
        path: Directory prefix (including the trailing separator) where the
            chapter files are written.  Defaults to the original hard-coded
            location.

    Returns:
        None.  Encoding and file-system errors for a chapter are handled
        (printed and/or followed by a short pause) and the loop moves on to
        the next chapter.
    """
    # Local import: io.open accepts an explicit encoding on both Python 2
    # and Python 3, unlike the Python 2 builtin open().
    import io

    for href in zhangjie(home_url):
        soup = get(home_url + href)
        title = ''.join(heading.text for heading in soup.select('h1'))[3:]
        contents = ''.join(node.text for node in soup.select('dd#contents'))
        try:
            # Write explicitly as UTF-8 so non-ASCII text is not pushed
            # through the platform/ASCII default codec.  The ``with`` block
            # closes the file, so no explicit close() is needed.
            with io.open(path + title + '.txt', 'w', encoding='utf-8') as f:
                f.write(u'\t' + title + u'\n' + contents)
        except UnicodeEncodeError:
            time.sleep(2)
        except UnicodeDecodeError as e:
            print(e)
        except (IOError, OSError) as e:
            # Python 2 raises IOError for failed opens/writes; on Python 3
            # IOError is an alias of OSError, so this covers both.
            print(e)
            # NOTE(review): the pasted source lost its indentation; this pause
            # is assumed to belong to the error handler -- confirm.
            time.sleep(2)
# Script entry point: fetch and save every chapter linked from home_url.
content(home_url)
这才是发代码的正确格式。