|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
爬小说网站,但是爬一段时间就会出现“远程主机没响应”之类的错误,然后又得要重头开始爬,这样的异常该怎么写?求大神指点!
新手,代码写的比较乱,见笑了~!
import re
import time

import pymysql
import requests
# Module-level MySQL connection shared by main(); credentials and DB name
# are hard-coded for this one-off crawler script.
conn = pymysql.connect(
    host = '127.0.0.1',
    port = 3306,
    user = 'root',
    passwd = 'root',
    db = 'xiaoshu',
    charset = 'utf8'  # NOTE(review): MySQL 'utf8' is 3-byte; utf8mb4 would cover full Unicode
)
# 'curson' is a kept typo for 'cursor'; main() refers to it by this name.
curson = conn.cursor()
def get_next(url, retries=3, timeout=10):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The original version had no timeout and no error handling, so one
    "远程主机没响应"-style network hiccup aborted the whole crawl.  This
    version retries up to *retries* times with exponential back-off and
    re-raises the last error only after all attempts fail.

    :param url: page URL to download
    :param retries: number of attempts before giving up (default 3)
    :param timeout: per-request timeout in seconds (default 10)
    :raises requests.exceptions.RequestException: if every attempt fails
    """
    last_err = None
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.encoding = 'utf-8'
            return response.text
        except requests.exceptions.RequestException as e:
            last_err = e
            time.sleep(2 ** attempt)  # back off 1s, 2s, 4s, ...
    raise last_err
def re_next(next_html):
    """Extract the total page count from a listing page's pager HTML.

    Returns the captured string (e.g. '25' from '1/25 每页...').
    Raises IndexError when the pager markup is not present.
    """
    pager_re = re.compile(r'class="tspage">.*?1/(.*?) 每页.*?<a', re.S)
    matches = pager_re.findall(next_html)
    return matches[0]
def re_page_html(page_html):
    """Parse one book-listing page.

    Returns a list of (relative_url, book_title) tuples, one per <li> entry.
    """
    entry_re = re.compile(
        r'<li>.*?class="s">.*?href="(.*?)"><img.*?">(.*?)</a>.*?class="u">.*?</li>',
        re.S,
    )
    return entry_re.findall(page_html)
def get_l(url):
    """Download *url* and return its body as UTF-8 text.

    Byte-for-byte the same behaviour as get_next(); delegate to it
    instead of repeating the three-line fetch.
    """
    return get_next(url)
def get_arc_list(url):
    """Download a book's chapter-index page and return it as UTF-8 text.

    Identical behaviour to get_next(); reuse it rather than duplicating
    the request/decode logic.
    """
    return get_next(url)
def re_arc_list(arc_list_url):
    """Parse chapter links out of a book's index-page HTML.

    Returns the href values of all <li><a ...> entries, minus the first
    24 matches (presumably site navigation links — confirm against the
    live page layout).
    """
    hrefs = re.findall(r'<li><a href="(.*?)">.*?</a></li>', arc_list_url, re.S)
    return hrefs[24:]
def get_arc_html(uu):
    """Download one chapter page and return its body as UTF-8 text.

    Same behaviour as get_next(); delegate instead of repeating it.
    """
    return get_next(uu)
def re_arc(arc_html):
    """Parse a chapter page.

    Returns a list of (chapter_title, name_before_TXT, body_html) tuples;
    in practice at most one per page.
    """
    chapter_re = re.compile(
        r'class="txt_cont">.*?<h1>(.*?)</h1>.*?html">(.*?)TXT.*?id="content1">(.*?)</div>',
        re.S,
    )
    return chapter_re.findall(arc_html)
def main():
    """Crawl every listing page of category 1, then every chapter of every
    book found, storing book titles in `book` and chapter text in `con`.

    Fixes over the original:
    - SQL is parameterized: the old ``.format()``-built INSERTs broke on
      any title/content containing a quote and were SQL-injectable.
    - The bare ``requests.get`` (no timeout, no decoding) is replaced with
      the existing get_next() helper for consistency.
    """
    for i in range(1, 2):  # only category 1 for now; widen the range to crawl more
        url = 'http://www.sjtxt.la/soft/{}/Soft_00{}_1.html'.format(i, i)
        page_nums = re_next(get_next(url))
        for j in range(1, int(page_nums) + 1):
            urls = 'http://www.sjtxt.la/soft/{}/Soft_00{}_{}.html'.format(i, i, j)
            page_html = get_next(urls)
            for data in re_page_html(page_html):
                # data[0] looks like '.../NNNNN.html'; slice out the numeric id
                page_url = 'http://www.sjtxt.la/book/' + data[0][-10:-5]
                txtname = data[1]
                print(txtname)
                curson.execute("insert into book(txtname) values(%s)", (txtname,))
                idtxtname = curson.lastrowid  # FK for the chapter rows below
                conn.commit()
                arc_list_url = get_arc_list(page_url)
                for u in re_arc_list(arc_list_url):
                    uu = page_url + '/' + u
                    arc_html = get_arc_html(uu)
                    for arc_data in re_arc(arc_html):
                        title = arc_data[0]
                        # collapse all whitespace runs inside the chapter body
                        content = ''.join(arc_data[2].split())
                        print(title, content)
                        curson.execute(
                            "insert into con(idtxtname,title,content) values(%s,%s,%s)",
                            (idtxtname, title, content))
                        conn.commit()


if __name__ == '__main__':
    main()
复制代码
注:如果HTTPError 和 URLError 同时使用,HTTPError 必须写在前面。
import urllib.request
from urllib.error import *
写法1:(推荐)
# Style 1 (recommended): a single URLError handler that inspects the
# exception's attributes.  HTTPError is a subclass of URLError, so this
# one except clause catches both.
try:
    # NOTE(review): 'urlopen' and 'req' are not defined by the imports shown
    # above (urlopen lives in urllib.request) — this is an illustrative
    # fragment, not runnable as-is.
    response = urlopen(req)
except URLError as e:
    if hasattr(e, 'reason'):
        # connection-level failure (DNS, refused, timeout, ...)
        print(e.reason)
    elif hasattr(e, 'code'):
        # NOTE(review): HTTPError instances also carry .reason, so this
        # branch is rarely reached in practice — verify on your Python version.
        print(e.code)
    else:
        print(e.read())
写法2:
# Style 2: separate handlers.  HTTPError must be listed before URLError
# because it is the more specific subclass; in the reverse order the
# URLError clause would swallow every HTTPError first.
req = urllib.request.Request("http://www.fishc.com/ooxx.html")
try:
    urllib.request.urlopen(req)
except HTTPError as e:
    print(e.code)    # HTTP status, e.g. 404
    print(e.reason)
    print(e.read())  # the error page body
except URLError as e:
    print(e.reason)  # connection-level failure
|
|