|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 晓屁屁 于 2018-9-18 15:51 编辑
import urllib.request
import urllib.parse
import bs4
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import re
import os
import requests
import random
# Browser-like User-Agent shared by the urllib opener and the `headers` dict.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

# Candidate proxies; one is chosen at random each time the module is loaded.
ip_list = [
    {'https': '182.88.128.170:8123'},
    {'http': '113.101.252.226:61234'},
    {'http': '122.96.93.158:49435'},
    {'http': '120.5.173.106:38495'},
]

# Build a urllib opener that routes through the chosen proxy and install it
# as the process-wide default used by urllib.request.urlopen().
porx = urllib.request.ProxyHandler(random.choice(ip_list))
openr = urllib.request.build_opener(porx)
# Default request headers live in `addheaders`. Assigning to `add_handler`
# would only overwrite that method with a list and send no User-Agent at all.
openr.addheaders = [('User-Agent', USER_AGENT)]
urllib.request.install_opener(openr)

# Header dict in the form expected by `requests`.
headers = {'User-Agent': USER_AGENT}

# Landing page of the image board; crawling starts here.
url = 'http://jandan.net/ooxx'
def open_url(url):
    """Render *url* in a headless PhantomJS browser and return the page HTML.

    Requires the PhantomJS binary to be installed at
    ``D:/phantomjs/bin/phantomjs``.

    Args:
        url: Address of the page to load.

    Returns:
        The page source as reported by the browser after loading.

    Raises:
        selenium.common.exceptions.WebDriverException: if the browser cannot
            be started or the page fails to load (a load exceeding the
            30-second limit is expected to surface as a timeout error).
    """
    dacp = dict(DesiredCapabilities.PHANTOMJS)
    # PhantomJS (GhostDriver) reads the user agent from this capability key;
    # a plain 'User-Agent' entry is not a recognised capability.
    dacp['phantomjs.page.settings.userAgent'] = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')
    text = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs', desired_capabilities=dacp)
    try:
        # The timeout must be set before get() for it to bound this load.
        text.set_page_load_timeout(30)
        text.get(url)
        return text.page_source
    finally:
        # Always shut the browser down; otherwise every call leaks a
        # PhantomJS process.
        text.quit()
def get_page(soup):
    """Yield the ``src`` value of every tag in *soup* that points at sinaimg.

    A tag qualifies when its ``src`` attribute matches the regular expression
    ``sinaimg.cn`` (the dot is unescaped, so it matches any character).

    Args:
        soup: Parsed document exposing ``find_all(src=<compiled pattern>)``.

    Yields:
        The ``src`` attribute of each matching tag, in document order.
    """
    sina_pattern = re.compile("sinaimg.cn")
    yield from (tag['src'] for tag in soup.find_all(src=sina_pattern))
def down_img(url):
    """Download every sinaimg-hosted image reachable from *url*.

    Loads *url*, reads the current comment-page number from the element with
    class ``current-comment-page``, then visits the pages from that number
    down to 1 and saves each image found by ``get_page``. Files are written
    to ``e:/img`` and named after the last path segment of the image URL.

    Args:
        url: Landing page whose current comment-page number sets where the
            crawl starts.

    Raises:
        ValueError: if no page number can be found on the landing page.
        requests.RequestException: if an image request fails, times out or
            returns an error status.
    """
    save_dir = 'e:/img'  # image save directory
    os.makedirs(save_dir, exist_ok=True)
    os.chdir(save_dir)

    html = open_url(url)
    soup = bs4.BeautifulSoup(html, 'html.parser')

    # Take the first run of digits in the pager text. Matching any number of
    # digits keeps this correct for 1-digit and 3+-digit page counts.
    max_num = None
    for tag in soup.find_all(class_="current-comment-page"):
        found = re.search(r'\d+', tag.get_text())
        if found:
            max_num = int(found.group())
            break
    if max_num is None:
        raise ValueError('no current-comment-page number found at {0}'.format(url))

    for page in range(max_num, 0, -1):
        page_url = 'http://jandan.net/ooxx/page-{0}#comments'.format(page)
        soup = bs4.BeautifulSoup(open_url(page_url), 'html.parser')
        for each in get_page(soup):
            file_name = each.split('/')[-1]
            if not file_name:
                continue  # src ends with '/', so there is nothing to name the file
            # Scheme-less "//host/path" sources need an explicit scheme
            # before requests can fetch them.
            img_url = 'http:' + each if each.startswith('//') else each
            # Fetch first so a failed request does not leave an empty file.
            req = requests.get(img_url, headers=headers, timeout=30)
            req.raise_for_status()
            with open(file_name, 'wb') as f:
                for x in req.iter_content(100000):
                    f.write(x)
            time.sleep(2)  # pause between downloads to avoid hammering the host
# Script entry: runs the crawl as soon as the module is executed, starting
# from the module-level `url`.
down_img(url)
和甲鱼兄写的有点区别 |
|