|
|

楼主 |
发表于 2016-9-4 17:12:00
|
显示全部楼层
不是很理解:一定要传两个参数吗?但我的二级链接是一个列表。
# Regex-match the content of a second-level page
def getcontent_2(url):
    """Fetch ``url`` and print each matched post body that contains a long number.

    The page is requested with a Firefox User-Agent and a 5 second timeout,
    decoded as UTF-8, and searched for the text that follows a
    ``<div class="postContent">`` marker.  Every captured fragment in which a
    digit 1-9 is followed by 5 to 11 more digits is printed (presumably a QQ
    number, judging by the original variable name -- confirm).

    Args:
        url: Address of the second-level page to download.

    Returns:
        None.  Results are written to standard output.

    ``socket.timeout`` and ``urllib2.HTTPError`` are reported on standard
    output instead of being raised; any other exception propagates.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.04'
    headers = {'User-Agent': user_agent}

    # Raw strings keep the regex escapes (\), \s) intact without relying on
    # Python passing unknown string escapes through unchanged.
    post_pattern = re.compile(
        r'''<a href=".*=u8396862907">.*</a>'\)</script>\s+</p>\s+</div>\s+<div class="postContent">\s+(.*)\s*<[span]*''',
        re.M)
    # Compiled once here rather than on every loop iteration.
    number_pattern = re.compile(r'[1-9]([0-9]{5,11})')

    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request, timeout=5)
        try:
            content_2 = response.read().decode('utf-8')
        finally:
            # Release the connection even when reading or decoding fails.
            response.close()

        for post in post_pattern.findall(content_2):
            if number_pattern.search(post):
                print(post)
    # NOTE(review): a connect-time failure surfaces as urllib2.URLError, which
    # is not caught here and will propagate -- confirm whether that is wanted.
    except socket.timeout as e:
        print(type(e))
    except urllib2.HTTPError as e:
        print(e.code)
        print(e.reason)
-
# Second-level links
def geturl_2(contenturl):
    """Call ``getcontent_2`` on every second-level link in ``contenturl``.

    Args:
        contenturl: Iterable of links.  Each element may be either the URL
            itself or a tuple/list whose first item is the URL (for example
            the ``(url, title)`` pairs produced by ``re.findall`` with two
            groups); any remaining items are ignored.

    Returns:
        None.
    """
    for entry in contenturl:
        # The original loop unpacked every element into exactly two names,
        # which fails for a plain list of URL strings.  Accept both shapes.
        if isinstance(entry, (tuple, list)):
            url = entry[0]
        else:
            url = entry
        getcontent_2(url)
复制代码 |
|