|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
请大佬帮忙看下,是那里的问题
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- # @Time : ${DATE} ${TIME}
- # @Author : Aries
- # @Site : ${SITE}
- # @File : ${NAME}.py
- # @Software: ${PRODUCT_NAME}
- import urllib2
- import re
- domain = 'http://www.quanshuwang.com'
- headers = { 'User-Agent:Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
- def getTypeList(pn=1):
- req = urllib2.Request('http://www.quanshuwang.com/map/%s.html'%pn)
- req.headers = headers
- res = urllib2.urlopen(req)
- html =res.read().decode('gbk').encode('utf-8')
- reg =r'<a href="(/book/.*?)" target="_blank">(.*?)</a>'
- reg = re.compile(reg)
- return re.findall(reg,html)
- def getNovelList(url):
- req = urllib2.Request(domain + url)
- req.headers = headers
- res = urllib2.urlopen(req)
- html = res.read().decode('gkb')
- reg =r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>'
- reg = re.compile(reg)
- return re.findall(reg,html)
- def getNovelContent(url):
- req = urllib2.Request(domain + url)
- req.headers = headers
- res = urllib2.urlopen(req)
- html = res.read().decode('gbk')
- reg =r'style5\(\);</script>(.*?)<script type="text/javascript">style6\(\)'
- return re.findall(reg,html)[0]
- if __name__== '__main__ ':
- for type in range(1,10):
- for url,title in getTypeList(type):
- for zurl,ztitle in getNovelList(url):
- print u'正在爬取------%s' %ztitle
- content = getNovelContent(url.replace('index.html'),zurl)
复制代码
为什么在pycham运行不了@! |
|