|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import urllib.request
import os
import base64
import re
def url_open(url):
req = urllib.request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36')
response = urllib.request.urlopen(req)
html = response.read()
print(html)
return html
def get_page(url):
html = url_open(url).decode('utf-8')
a = r'<span class="current-comment-page">\[(\d+)\]</span>'
page_list = re.findall(a,html)
for each in page_list:
t = each
return t
def base_num(page_num):
times = '20200811-' + str(page_num)
s = times.encode('utf-8')
sw = base64.b64encode(s)
str_sw = str(sw)
string = str_sw.split("'")[-1]
print (string)
return string
def find_pic_url(url):
html = url_open(url).decode('utf-8')
pic_url_list = []
a = r'<img src="([^"]+\.jpg)"'
b = re.findall(a,html)
if not len(b):
pic_url_list.append(b)
for each in pic_url_list:
print(each)
def save_pic(folder, pic_url):
pass
def download_mm(folder = 'mm', pages = 10):
#创建一个文件夹
os.mkdir(folder)
os.chdir(folder)
url = 'http://jandan.net/ooxx/'
#获取页码
page_num = int(get_page(url))
for i in range(pages):
page_num -=i
#base64加密
x = base_num(page_num)
#获取页码地址
page_url = url + x + '#comments'
#获取图片具体地址并保存成列表
pic_url = find_pic_url(page_url)
save_pic(folder, pic_url)
if __name__ == '__main__':
download_mm()
我这个到底哪里出错我也拿不准,求大神
本帖最后由 suchocolate 于 2020-8-15 11:53 编辑
不用xpath或bs太低效了
- import requests
- from lxml import etree
- import os
- def ck_dir():
- # Please fill in the name of the path where you want to save the images
- pic_dir = 'mm'
- if not os.path.exists(pic_dir):
- os.mkdir(pic_dir)
- os.chdir(pic_dir)
- def get_pic_url(act_url):
- r = requests.get(act_url, headers=headers)
- html = etree.HTML(r.text)
- # pic list
- result = html.xpath('//img/@src')
- # last pic is not mm pic
- result.pop()
- # next page url
- act_nx_pg = html.xpath('//a[contains(text(),"99")]/@href')[0]
- return result, act_nx_pg
- def main():
- # pic counter
- n = 1
- # check pic directory
- ck_dir()
- # pic url list
- pic_list = []
- # next page url
- nx_pg_url = ''
- # get pics urls
- for item in range(100, 98, -1):
- if item == 100:
- pic_url, nx_pg = get_pic_url(url)
- else:
- pic_url, nx_pg = get_pic_url(nx_pg_url)
- nx_pg_url = f'{url}/{nx_pg}#comments'
- pic_list.extend(pic_url)
- # download pics
- for item in pic_list:
- r = requests.get('http:' + item, headers=headers)
- pic_name = item.split('/')[-1]
- with open(pic_name, 'wb') as f:
- f.write(r.content)
- print(f'{pic_name} has been downloaded. total number: {n}')
- n = n + 1
- if __name__ == '__main__':
- # global variables
- url = 'http://jandan.net/ooxx'
- headers = {'User-agent': 'firefox'}
- # main func
- main()
复制代码
|
|