|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
用爬虫爬取网站的内容但是爬不到。不知道是代码出问题还是哪里出问题了,请大神指点。(代码是按书本上python2然后自己根据3修改了一部分,不知道还有没有哪里没修改到的)
以下是代码:
import urllib.request
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
import re, sys
def download(url, user_agent='wawp', num_retries=2):
print ('Downlosding:', url)
headers = {'User-agent': user_agent}
req = urllib.request.Request(url, headers = headers)
try:
html = urllib.request.urlopen(req).read()
except urllib.request.URLError as e:
print('Downlosd error:' , e.reason)
html = None
if num_retries > 0:
if hasattr(e, 'code') and 500 <= e.code < 600:
#recursively retry 5XX HTTP errors
return download(url, user_agent, num_retries)
return html
def crawl_sitemap(url):
#download the sitemap file
sitemap = download(url)
#extract the sitemap links
links = re.findall(r'<loc>(.*?)</loc>', 'sitemap')
#download each link
for link in links:
html = download(link)
# scrape html here
#...
#download('http://example.webscraping.com/sitemap.xml')
crawl_sitemap('http://example.webscraping.com/sitemap.xml')
以下是爬取的网页的内容
This XML file does not appear to have any style information associated with it. The document tree is shown below.
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>
http://example.webscraping.com/places/default/view/Afghanistan-1
</loc>
</url>
<url>
<loc>
http://example.webscraping.com/p ... iew/Aland-Islands-2
</loc>
</url>
<url>
<loc>
http://example.webscraping.com/places/default/view/Albania-3
</loc>
</url>
<url>
<loc>
http://example.webscraping.com/places/default/view/Algeria-4
</loc>
</url>
<url>
<loc>
http://example.webscraping.com/p ... ew/American-Samoa-5
</loc>
</url>
<url>
<loc>
http://example.webscraping.com/places/default/view/Andorra-6
</loc>
</url>
<url>
<loc>
http://example.webscraping.com/places/default/view/Angola-7
</loc>
|
|