| 
 | 
 
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册  
 
x
 
- from selenium import webdriver
 
 - from selenium.webdriver.common.by import By
 
 - from selenium.webdriver.support import expected_conditions as EC
 
 - from selenium.webdriver.support.wait import WebDriverWait
 
 - from selenium.common.exceptions import TimeoutException
 
 - import time
 
 - import requests
 
 - import os
 
  
# Run Chrome headless so that no browser window is shown while crawling.
URL = 'http://jandan.net/ooxx'
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# `options=` replaces the deprecated `chrome_options=` keyword, which newer
# Selenium releases no longer accept.
browser = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(browser, 10)  # explicit waits give up after 10 seconds
browser.get(URL)
 
  
 
def next_page():
    """Return the link that leads to the next page to crawl.

    Waits until the element with class ``previous-comment-page`` is
    clickable. The element is only returned; clicking it is left to the
    caller.
    """
    locator = (By.CLASS_NAME, 'previous-comment-page')
    return wait.until(EC.element_to_be_clickable(locator))
 
  
 
def cur_page():
    """Return the text of the element that shows the current page number.

    Waits until the element with class ``current-comment-page`` is present
    in the DOM, then returns its visible text.
    """
    locator = (By.CLASS_NAME, 'current-comment-page')
    marker = wait.until(EC.presence_of_element_located(locator))
    return marker.text
 
  
 
def parse_html(lyst):
    """Collect the JPEG image links of the currently loaded page.

    :param lyst: list that accumulates image URLs; it is appended to in place
    """
    # find_elements(By.XPATH, ...) is used instead of find_elements_by_xpath,
    # which newer Selenium releases have removed.
    images = browser.find_elements(By.XPATH, '//*[@id="comments"]/ol/li//p/img')
    for image in images:
        src = image.get_attribute('src')
        # get_attribute returns None when the attribute is absent, so guard
        # before inspecting the suffix. Keeping only 'jpg' links drops the
        # advertising '.gif' images.
        if src and src.endswith('jpg'):
            lyst.append(src)
 
  
 
def dowmloader(url, timeout=10):
    """Download one image and return its raw bytes.

    :param url: address of the image to fetch
    :param timeout: seconds to wait for the server before giving up
    :return: the response body as bytes when the server answers 200,
             otherwise None (also None on any network-level failure)
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best effort: a failed download is reported to the caller as None.
        return None
    if response.status_code == 200:
        return response.content
    return None
 
  
 
def save_img(img_content, num):
    """Write one downloaded image to ``<num>.jpg`` in the working directory.

    :param img_content: binary image data, or None when the download failed
    :param num: sequence number of the image, used as the file name
    """
    if img_content is None:
        # Nothing was downloaded; do not leave an empty file behind and do
        # not crash on f.write(None).
        return
    with open(str(num) + '.jpg', 'wb') as f:
        f.write(img_content)
 
  
 
def jandan_crawlers(lyst):
    """Walk from the currently loaded page down to page 1, collecting links.

    The page indicator text is expected to wrap the number in one character
    on each side (for example ``[42]``); that number is how many pages are
    visited.

    :param lyst: list that accumulates image URLs; it is filled in place
    :return: ``lyst``, also when a wait times out part-way through, so the
             caller always receives the links gathered so far
    """
    try:
        current_page = cur_page()
        page = int(current_page[1:-1])  # strip the surrounding characters
        while page >= 1:
            print('正在抓取煎蛋网第%d页图片' % page)
            parse_html(lyst)
            if page == 1:
                # Last page: there is no further page to navigate to.
                break
            time.sleep(3)  # pause between page loads
            next_page().click()
            page -= 1
    except TimeoutException:
        # An element did not appear in time; stop crawling and keep what has
        # been collected instead of returning None.
        pass
    return lyst
 
  
 
if __name__ == '__main__':
    try:
        # Images are saved relative to the working directory; create the
        # target directory first so chdir cannot fail on a fresh checkout.
        os.makedirs('picture', exist_ok=True)
        os.chdir('picture')
        lyst = []
        # Fall back to the in-place filled list if the crawler returns None.
        url_info = jandan_crawlers(lyst) or lyst
        for num, url in enumerate(url_info, start=1):
            img_content = dowmloader(url)
            if img_content is None:
                # Skip images that could not be downloaded instead of crashing.
                print('图片下载失败,已跳过:' + url)
                continue
            print('正在保存图片:' + str(num) + '.jpg')
            save_img(img_content, num)
        print('煎蛋网图片抓取完成')
    finally:
        # quit() ends the whole driver session (close() only closes the
        # window); doing it in finally releases the browser on errors too.
        browser.quit()
 
 
  复制代码 |   
 
 
 
 |