# Jandan.net "ooxx" board image crawler (forum post, dated 2015-08-26).
import os
import random
import urllib.request
 
 def url_open(url):
 data=None
 #'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'#
 headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'}
 req = urllib.request.Request(url,data,headers)
 #proxies = ['117.177.243.43:8086']
 #proxy = random.choice(proxies)
 #'124.93.222.95:8080'
 #proxy_support = urllib.request.ProxyHandler({"http":proxy})
 #opener = urllib.request.build_opener(proxy_support)
 #urllib.request.install_opener(opener)
 response = urllib.request.urlopen(req)
 html = response.read()
 
 return html
 
 
 def get_page(url):
 html = url_open(url).decode('utf-8')
 
 
 a = html.find('current-comment-page') + 23
 b = html.find(']',a)
 
 return html[a:b]
 
 
 
 def find_imgs(url):
 html = url_open(url).decode('utf-8')
 img_addrs = []
 a = html.find('img src=')
 while a != -1:
 b = html.find('.jpg',a,a+255)
 if b != -1:
 img_addrs.append(html[a+9:b+4])
 else:
 b = a+9
 a = html.find('img src',b)
 return img_addrs
 
 
 
 def save_imgs(folder,img_addrs):
 for each in img_addrs:
 filename = each.split('/')[-1]
 with open(filename,'wb') as f:
 img =url_open(each)
 f.write(img)
 
 def download_mm(floder='煎蛋网爬虫',pages=10):
 os.mkdir(floder)
 os.chdir(floder)
 
 url ="http://jandan.net/ooxx/"
 page_num = int(get_page(url))
 
 for i in range(pages):
 page_num -= i
 page_url = url + 'page-' + str(page_num) +'#comments'
 img_addrs = find_imgs(page_url)
 save_imgs(floder,img_addrs)
 
 if __name__ == '__main__':
 download_mm()
 
 