|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
以下是前半部分代码,后半部分代码在回复中。
from selenium import webdriver
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import urllib.request
from selenium.webdriver.support import expected_conditions as EC
import re
import socket
# Give every socket created from now on a default timeout of 10 seconds.
socket.setdefaulttimeout(10.0)
# PhantomJS browser instance, started with the --disk-cache=true service
# argument so that fetched resources are cached on disk.
browser = webdriver.PhantomJS(service_args=['--disk-cache=true'])
# Explicit wait bound to `browser` with a 10-second limit.
wait = WebDriverWait(browser, 10)
# Simulate a click on the "previous comment page" link.
def search():
    """Click the ``a.previous-comment-page`` link in the module-level ``browser``.

    Waits up to 10 seconds for the link to become clickable and then clicks
    it.  The step is best-effort: if the link never becomes clickable or the
    click itself fails, the function returns quietly.

    :return: None
    """
    try:
        submit = WebDriverWait(browser, 10).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#comments > div:nth-child(4) > div > a.previous-comment-page')))
        submit.click()
    except Exception:
        # Still deliberately best-effort, but unlike a bare ``except`` this
        # lets KeyboardInterrupt and SystemExit propagate.
        pass
# Page number shown on the initial page.
def get_pagenum(url):
    """Return the current comment-page number found at *url*.

    The page is fetched with ``open_url``, decoded as UTF-8 and searched for
    the ``current-comment-page">[N]</span>`` marker.

    :param url: address of the page to inspect.
    :return: the text between the square brackets of the first marker, as a
        string (it is not converted to ``int``).
    :raises ValueError: if ``open_url`` returned no data for *url*.
    :raises IndexError: if the page contains no page-number marker.
    """
    raw = open_url(url)
    if raw is None:
        # open_url returns None when the download fails; fail with a clear
        # message instead of an AttributeError on ``None.decode``.
        raise ValueError('could not download %s' % url)
    html = raw.decode('UTF-8')
    # No leading '.*?' is needed: findall already scans the whole text, and
    # the lazy prefix only made a failed search slower.
    matches = re.findall(r'current-comment-page">\[(.*?)]</span>', html, re.S)
    if not matches:
        raise IndexError('no current-comment-page marker found in %s' % url)
    return matches[0]
# Save the images.
def saveimage(floder, imageattr):
    """Download every image URL in *imageattr* into the current directory.

    Each file is named after the last ``/``-separated segment of its URL and
    progress is reported through ``schedule``.  A failed download is reported
    and skipped so that the remaining images are still fetched.

    :param floder: not used by this function; kept so existing calls keep
        working.  NOTE(review): presumably the caller has already changed
        into the target directory -- confirm against the caller.
    :param imageattr: iterable of image URLs.
    :return: None
    """
    for each in imageattr:
        filename = each.split('/')[-1]
        print('正在保存图片%s' % filename)
        try:
            urllib.request.urlretrieve(each, filename, schedule)
        except Exception as err:
            # Best-effort per image: say what went wrong instead of
            # swallowing it silently, then carry on with the next URL.
            print('保存图片%s失败: %s' % (filename, err))
# Show the download progress as "xx%".
def schedule(a, b, c):
    """Print download progress; used as the ``urlretrieve`` report hook.

    :param a: number of data blocks transferred so far
    :param b: size of one data block, in bytes
    :param c: total size of the remote file, in bytes; zero or negative when
        the size is not known
    :return: the percentage that was printed (a float capped at 100.0), or
        None when the total size is unknown and nothing was printed
    """
    if c <= 0:
        # Without a usable total there is no meaningful percentage; the old
        # code divided by zero or printed a negative value here.
        return None
    per = min(100.0, 100.0 * a * b / c)
    print('%.2f%%' % per)
    return per
# Use a regular expression to collect the image addresses from the fetched HTML.
def find_images(html):
    """Return the ``.jpg`` image URLs found inside ``<p>`` elements of *html*.

    Protocol-relative addresses (``//host/a.jpg``) get an ``http:`` prefix;
    addresses that already start with ``http://`` or ``https://`` are
    returned unchanged.

    :param html: page source as a string.
    :return: list of image URLs in document order; an empty list when there
        are no matches or *html* is not a string (for example None after a
        failed download).
    """
    if not isinstance(html, str):
        return []
    # Raw string: '\.' is an invalid escape sequence in a normal literal.
    sources = re.findall(r'<p>.*?<img src="(.*?\.jpg)".*?</p>', html, re.S)
    return [
        src if src.startswith(('http://', 'https://')) else 'http:' + src
        for src in sources
    ]
# Open a URL and return its source.
def open_url(url):
    """Fetch *url* with a browser-like User-Agent and return the raw body.

    :param url: address to fetch.
    :return: the response body as bytes, or None if the request or the read
        fails.
    :raises ValueError: if *url* is not a valid URL (raised while the request
        object is built, before any network access).
    """
    req = urllib.request.Request(url)
    # The header name is 'User-Agent' (hyphen); with an underscore the server
    # never sees a browser user agent.
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
    try:
        # Pass the prepared request (not the bare url) so the header is
        # actually sent, and close the response once it has been read.
        with urllib.request.urlopen(req) as response:
            return response.read()
    except Exception:
        # Best-effort fetch: callers receive None on any failure.
        return None
|
|