|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
我用程序爬取网页新闻需要翻页,爬取的网站的特点是如果翻到的页数大于最大页数就会一直显示最后一页,于是我想通过设置一个计数变量temp,如果一个网页爬取的新闻超过20条已经爬过,就判定已经至最后一页。
但写的代码一直有问题。。。
代码如下:- #!/usr/bin/env python
- #--*-- coding: utf-8--*--
- from bs4 import BeautifulSoup
- import urllib2
- import urllib
- import re
- def open_url(url,page_num,viewstate):
- print url
- headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
- values={'__VIEWSTATE':viewstate,'__EVENTTARGET':'List1$AspNetPager1','__EVENTARGUMENT':page_num}
- data = urllib.urlencode(values)
- req=urllib2.Request(url,data,headers)
- page=urllib2.urlopen(req)
- html=page.read().decode('utf-8')
- #f=open('a.txt','wb')
- #f.write(html.encode('utf-8'))
- html=html.encode('utf-8')
- return html
-
- def get_urls(html,page_num):
- temp=0
- soup0 = BeautifulSoup(html, 'html.parser')
- if page_num==1:
- soup_v=soup0.find('input',id='__VIEWSTATE')
- viewstate=soup_v.get('value')
- soup1=soup0.find('div',id='mright1')
- soup=soup1.find_all('a')
- for each in soup:
- add=each.get('href')
- try:
- if add in adds or'/skhtmlnews/' not in add:
- temp=temp+1
- continue
- else:
- adds.append(add)
- title=each.get_text(strip=True)
- titles.append(title)
- print add
- print title
- except:
- continue
- print temp
- return viewstate,temp
- adds=[]
- titles=[]
- url0='http://www.science-weekly.cn/MoreList.aspx?id='
- viewstate='/wEPDwUKMTUzMTEwMzM5Mg9kFgICAw9kFgICBQ9kFgRmDzwrAAsBAA8WCh4MRGF0YUtleUZpZWxkBQJpZB4IRGF0YUtleXMWHgK5FQKUFQLWFAKxFALcEgK5EgKSEgLrEQLqEQKkEQL5EALTEAKZDwKFDwLyDgLWDgK5DgKgDgKLDgLXDQK3DQKnDQKIDQLaDAKpDAKiDALSCwLCCwK4CwKnCx4LXyFJdGVtQ291bnQCHh4JUGFnZUNvdW50AgEeFV8hRGF0YVNvdXJjZUl0ZW1Db3VudAIeZBYCZg9kFjwCAQ9kFgJmD2QWAmYPFQMcL3NraHRtbG5ld3MvMjAxNS8xLzI3NDUuaHRtbC3lrabmnK/mnJ/liIrku5jotLnmqKHlvI/pnIDmioDmnK/luILlnLrlgJLpgLwKMjAxNS0wMS0wNWQCAg9kFgJmD2QWAmYPFQMdL3NraHRtbG5ld3MvMjAxNC8xMi8yNzA4Lmh0bWwk56CU56m25omA5YiG57G75pS56Z2p5Li65Yib5paw562R6LevCjIwMTQtMTItMDNkAgMPZBYCZg9kFgJmDxUDHS9za2h0bWxuZXdzLzIwMTQvMTAvMjY0Ni5odG1sHuenkeWtpueahOagh+WHhuS4jeiDveS4luS/l+WMlgoyMDE0LTEwLTA4ZAIED2QWAmYPZBYCZg8VAxwvc2todG1sbmV3cy8yMDE0LzkvMjYwOS5odG1sJ+i9rOWfuuWboOKAnOS4iei+k+KAneagvOWxgOS6n+W+heegtOinowoyMDE0LTA5LTAxZAIFD2QWAmYPZBYCZg8VAxwvc2todG1sbmV3cy8yMDE0LzQvMjM5Ni5odG1sJOaJk+mAmuenkeaKgOS9k+WItuKAnOS7u+edo+S6jOiEieKAnQoyMDE0LTA0LTAzZAIGD2QWAmYPZBYCZg8VAxwvc2todG1sbmV3cy8yMDE0LzMvMjM2MS5odG1sHumbvumcvumUgeWfju+8jOivpeWQrOiwgeeahO+8nwoyMDE0LTAzLTExZAIHD2QWAmYPZBYCZg8VAxwvc2todG1sbmV3cy8yMDE0LzIvMjMyMi5odG1sHueUqOWIq+S6uueahOecvOedm+WuoeinhuiHquW3sQoyMDE0LTAyLTEyZAIID2QWAmYPZBYCZg8VAxwvc2todG1sbmV3cy8yMDE0LzEvMjI4My5odG1sFeenkeWtpueahOWbveWutuWxnuaApwoyMDE0LTAxLTA2ZAIJD2QWAmYPZBYCZg8VAx0vc2todG1sbmV3cy8yMDEzLzExLzIyODIuaHRtbCfni6znibnnmoTngbXprYLmiY3og73pgKDlsLHkuIDmtYHlpKflraYKMjAxMy0xMS0yN2QCCg9kFgJmD2QWAmYPFQMcL3NraHRtbG5ld3MvMjAxMy85LzIyMTIuaHRtbDDlj5bmtojmlofnkIbliIbnp5HmmK/mj5DljYfnp5HlrabntKDlhbvnmoTlhbPplK4KMjAxMy0wOS0yOWQCCw9kFgJmD2QWAmYPFQMcL3NraHRtbG5ld3MvMjAxMy85LzIxNjkuaHRtbCHku5bku6zkuLrku4DkuYjkuI3nm7jkv6Hnp5HlrabvvJ8KMjAxMy0wOS0xMmQCDA9kFgJmD2QWAmYPFQMcL3NraHRtbG5ld3MvMjAxMy84LzIxMzEuaHRtbCLkvZXku6XnoLTop6PigJzmioDmnK/mgZDmg6fnl4figJ0gCjIwMTMtMDgtMDFkAg0PZBYCZg9kFgJmDxUDHC9za2h0bWxuZXdzLzIwMTMvMi8xOTQ1Lmh0bWwV6L+O5paw5LiO56eR5a2m55uY54K5CjIwMTMtMDItMDZkAg4PZBYCZg9kFgJmDxUDHS9za2h0bWxuZXdzLzIwMTIvMTIvMTkyNS5odG1sG+KAnOWNg+S6uuKAneeahOWOhuWPsuWdkOaghwoyMDEyLTEyLTI1ZAIPD2QWAmYPZBYCZg8VAx0vc2todG1sbmV3cy8yMDEyLzExLzE5MDYuaHRtbB7pobXlsqnmsJTpnanlkb3nmoTnp5HlrabpgLvovpEKMjAxMi0xMS0xNmQCEA9kFgJmD2QWAmYPFQMdL3NraHRtbG5ld3MvMjAxMi8xMC8xODc4Lmh0bWwb6LWw5ZCR5rex5rW36aG756eR5oqA57uZ5YqbCjIwMTItMTAtMTVkAhEPZBYCZg9kFgJmDxUDHC9za2h0bWxuZXdzLzIwMTIvOS8xODQ5Lmh0bWwb4oCc5Zu956eR5aSn4oCd55qE5paw5L2/5ZG9CjIwMTItMDktMjRkAhIPZBYCZg9kFgJmDxUDHC9za2h0bWxuZXdzLzIwMTIvOC8xODI0Lmh0bWwn56eR5a2m5piv6Leo6LaK5YiG5q2n55qE5pyA5aSn5YWs57qm5pWwCjIwMTItMDgtMTVkAhMPZBYCZg9kFgJmDxUDHC9za2h0bWxuZXdzLzIwMTIvNy8xODAzLmh0bWwb6LWw5ZCR5byA5pS+5LiO5Y2P5ZCM5Yib5pawCjIwMTItMDctMTNkAhQPZBYCZg9kFgJmDxUDHC9za2h0bWxuZXdzLzIwMTIvNS8xNzUxLmh0bWwS6YeN5aGR56eR5oqA5Lym55CGCjIwMTItMDUtMTVkAhUPZBYCZg9kFgJmDxUDHC9za2h0bWxuZXdzLzIwMTIvNC8xNzE5Lmh0bWwY54Of6I2J44CB56eR5a2m5LiO5pS/5rK7CjIwMTItMDQtMTdkAhYPZBYCZg9kFgJmDxUDHC9za2h0bWxuZXdzLzIwMTIvMy8xNzAzLmh0bWwV5LiN6Ieq55Sx77yM5peg5Yib5pawCjIwMTItMDMtMTlkAhcPZBYCZg9kFgJmDxUDHC9za2h0bWxuZXdzLzIwMTIvMi8xNjcyLmh0bWwP56eR5a2m55qE5Lu35YC8CjIwMTItMDItMTRkAhgPZBYCZg9kFgJmDxUDHC9za2h0bWxuZXdzLzIwMTIvMS8xNjI2Lmh0bWwb6YeN6L+U56eR5a2m5Ye65Y+R55qE5Zyw5pa5CjIwMTItMDEtMTVkAhkPZBYCZg9kFgJmDxUDHS9za2h0bWxuZXdzLzIwMTEvMTIvMTU3Ny5odG1sHueOr+Wig+mihuWvvOWKm+S4juekvuS8muWPkeWxlQoyMDExLTEyLTA2ZAIaD2QWAmYPZBYCZg8VAx0vc2todG1sbmV3cy8yMDExLzExLzE1NzAuaHRtbCHorqnkv6Hmga/lhazlvIDmuKDpgZPmm7TliqDpgJrnlYUKMjAxMS0xMS0xMGQCGw9kFgJmD2QWAmYPFQMdL3NraHRtbG5ld3MvMjAxMS8xMC8xNDkwLmh0bWwb6LCB5p2l55uR566h5a2m5pyv5LiN56uv77yfCjIwMTEtMTAtMTBkAhwPZBYCZg9kFgJmDxUDHS9za2h0bWxuZXdzLzIwMTEvMTAvMTQ3NC5odG1sKuS6jOWFg+e7j+a1juWvvOiHtOKAnOWvkumXqOmavuWHuui0teWtkOKAnQoyMDExLTEwLTEwZAIdD2QWAmYPZBYCZg8VAxwvc2todG1sbmV3cy8yMDExLzgvMTQ2NC5odG1sG+WKqOi9pui/veWwvuS4juWPkeWxlemAn+W6pgoyMDExLTA4LTA4ZAIeD2QWAmYPZBYCZg8VAxwvc2todG1sbmV3cy8yMDExLzcvMTQ0Ny5odG1sJOenkeeglOivmuS/oeS9k+ezu+W/hemhu+eLrOeri+e7n+S4gAoyMDExLTA3LTA4ZAIBDw8WBh4OQ3VzdG9tSW5mb1RleHQFX+iusOW9leaAu+aVsO+8mjxiPjExNjwvYj4g5oC76aG15pWw77yaPGI+NDwvYj4g5b2T5YmN5Li656ysPGZvbnQgY29sb3I9InJlZCI+PGI+MjwvYj48L2ZvbnQ+6aG1HhBDdXJyZW50UGFnZUluZGV4AgIeC1JlY29yZGNvdW50AnRkZGSqrrri7McHqkS66vhhSAuchQ9hsw=='
- for i in range(1,100):
- try:
- url=url0+str(i)
- for j in range(1,10000):
- try:
- html=open_url(url,j,viewstate)
- viewstate,temp=get_urls(html,j)
- print '======================='#此处只能输出一次?
- if temp>20:
- break
- except:
- continue
- except:
- continue
复制代码
问题:我检查下来,语句print '======================='只输出了一次,但它的上一句执行了很多次,这是为什么?
在你的geturl 函数加上global viewstate如下
def get_urls(html,page_num):
global viewstate
也可以将viewstate换个名字(如:viewstate1),并给变量一个值
因为如果你的viewstate1没有赋值,第二次时,下面语句没有执行 ,就会直return viewstate1,就出出错
if page_num==1:
soup_v=soup0.find('input',id='__VIEWSTATE')
viewstate1=soup_v.get('value')
|
|