|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
下面是我写的一段爬取京东手机图片的代码 程序没有报错 但是没有爬出东西 求大大帮忙看看 具体是正则用法的错误 还是其他的错误
- import re
- import urllib.request
- def craw(url,page):
- html1 = urllib.request.urlopen(url).read()
- html1 = str(html1)
- pat1 = '<div id="J_goodsList".+?"page clearfix"'
- result1 = re.compile(pat1).findall(html1)
- result1 = result1[0]
- pat2 = '<img width="220" height="220" class="err-product" data-img="1" src="//img(.+?\.jpg)">'
- photolist = re.compile(pat2).findall(result1)
- x = 1
- for photourl in photolist:
- photoname = 'E:\\京东手机图片\\img1' + str(page) + str(x) + '.jpg'
- photourl = 'http://' + photourl
- try:
- urllib.request.urlretrieve(photourl,filename = photoname)
- except urllib.error.URLError as e:
- if hasattr(e,'code'):
- x+=1
- if hasattr(e,'reason'):
- x+=1
- x+=1
- for i in range(1,50,2):
- url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=' + str(i)
- craw(url,i)
复制代码
- import re
- import urllib.request
- def craw(url,page):
- html1 = urllib.request.urlopen(url).read().decode('utf-8')
- p = r'<img width="220" height="220" class="err-product" data-img="1" src="(//.+\.jpg)" />'
- photolist = re.compile(p).findall(html1)
- x = 1
- for photourl in photolist:
- photoname = 'E:\\京东手机图片\\img1\\' + str(page) + str(x) + '.jpg'
- photourl = 'http:' + photourl
- print(photourl)
- try:
- urllib.request.urlretrieve(photourl,filename = photoname)
- except urllib.error.URLError as e:
- if hasattr(e,'code'):
- x+=1
- if hasattr(e,'reason'):
- x+=1
- x+=1
- url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=1'
- craw(url,1)
复制代码
|
|