爬取图片（没有加密）

Cool_Breeze · 发表于 2020-6-10 08:59:13

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

本帖最后由 Cool_Breeze 于 2020-6-12 14:20 编辑

#!/usr/bin/env python3
#coding=utf-8
from special_str_replace import special_str_replace
import urllib.request,urllib.error
from bs4 import BeautifulSoup as bfs
import threading
import os
def main():
    """Fetch the index page and pass every page it links to on to get_photo_url_list."""
    site_root = 'https://www.woyaogexing.com'
    index_url = 'https://www.woyaogexing.com/touxiang/z/qlshouhui/'
    index_page = gethtml(index_url)
    # Each collected href is appended to the site root before being fetched.
    for page_path in get_page_list(index_page):
        detail_page = gethtml(site_root + page_path)
        get_photo_url_list(detail_page)
def gethtml(url):
    """Fetch *url* and return the response parsed with BeautifulSoup.

    The request is sent with a fixed ``Accept-Language`` header and a
    Chrome-style ``User-Agent`` header. Any error raised by
    ``urllib.request.urlopen`` propagates to the caller.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The document parsed with the ``'html.parser'`` backend.
    """
    head = {
        'Accept-Language': 'zh-CN,zh;q=0.9',
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.30 Safari/537.36"
    }
    req = urllib.request.Request(url=url, headers=head)
    # Use the response as a context manager so the connection is closed
    # once parsing is done instead of being leaked on every call.
    with urllib.request.urlopen(req) as response:
        html = bfs(response, 'html.parser')  # parse the HTML
    return html
def get_page_list(html):
    """Collect the ``href`` of every ``<a class="img">`` inside ``<div class="pMain">``.

    Args:
        html: Parsed document exposing ``find`` / ``find_all``.

    Returns:
        A list of href strings in document order. The list is empty when
        the ``pMain`` container is not present in the document.
    """
    subject = html.find('div', class_="pMain")
    if subject is None:
        # No listing container on this page: report "no links" rather
        # than failing with AttributeError on None.
        return []
    # href=True skips anchors that carry no href attribute at all.
    return [
        link.attrs['href']
        for link in subject.find_all('a', class_="img", href=True)
    ]
def get_photo_url_list(html):
    """Download every photo linked from one page into a folder named after its ``<h1>``.

    The folder name is the text of the first ``<h1>`` with characters that
    are unsafe in folder names replaced by ``'_'``. The process working
    directory is switched to that folder while the downloads run and is
    always restored afterwards, even if a download raises.

    Args:
        html: Parsed document exposing ``find`` / ``find_all``.
    """
    # <h1>....</h1>
    title = str(html.find('h1').string).replace(':', '_')
    # Replace special characters with '_' so the title can be used as a folder name.
    title = special_str_replace(title)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(title, exist_ok=True)
    start_dir = os.getcwd()
    os.chdir(title)
    try:
        # <ul class="artCont cl"> holds the photo links.
        filterurl = html.find('ul', class_="artCont cl")
        if filterurl is None:
            # Page has no photo list: nothing to download.
            ph_url = []
        else:
            # href=True skips anchors without an href attribute.
            ph_url = [attr['href'] for attr in filterurl.find_all('a', href=True)]
        thread_photo(ph_url)
    finally:
        # Go back to the starting folder even when a step above fails, so
        # later pages are not saved inside this page's folder.
        os.chdir(start_dir)
def thread_photo(url):
    """Run get_ptoto for every entry of *url* in its own thread and wait for all of them.

    Entries are numbered from 1 in iteration order; that number is passed
    to get_ptoto together with the entry.
    """
    workers = [
        threading.Thread(target=get_ptoto, args=(link, number))
        for number, link in enumerate(url, start=1)
    ]
    # Start every worker before joining any, so they all run concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
def get_ptoto(u, count):
    """Download one image to ``'<count>.jpeg'`` in the current working directory.

    Args:
        u: Image address. An address that does not already start with
            ``http://`` or ``https://`` gets ``'https:'`` prepended.
        count: Number used as the base name of the saved file.
    """
    print(u, '===>', count, '.jpeg')
    # Only add the scheme when it is missing; prepending it to an already
    # absolute URL would produce an invalid 'https:https://...' address.
    if u.startswith(('http://', 'https://')):
        full_url = u
    else:
        full_url = 'https:' + u
    urllib.request.urlretrieve(full_url, str(count) + '.jpeg')
# Script entry point: run the crawl only when executed directly, not on import.
if __name__ == '__main__':
    main()

复制代码

#!/usr/bin/env python3
#coding=utf-8
def special_str_replace(special, replacement='_'):
    r"""Return *special* with each of the characters ``\ / : * ? " < > |`` replaced.

    Args:
        special: Text to sanitise.
        replacement: Text substituted for every listed character.
            Defaults to ``'_'``.

    Returns:
        A new string; characters outside the listed set are kept as-is.
    """
    limitstr = r'\/:*?"<>|'
    # Single pass over the input; no index bookkeeping needed.
    return ''.join(replacement if ch in limitstr else ch for ch in special)

复制代码

Mike_python小 · 发表于 2020-6-10 16:48:00

不好看

Cool_Breeze · 发表于 2020-6-10 17:22:25

Mike_python小发表于 2020-6-10 16:48
不好看

拿来练手的！图片都是一个表情，没有什么好看的！

Cool_Breeze · 发表于 2020-6-10 17:24:14

至今还不会自动登录网页后再进行爬取，不知道有没有现成的教程？

Cool_Breeze · 发表于 2020-6-12 19:12:59

#!/usr/bin/env python3
#coding=utf-8
from special_str_replace import special_str_replace
import urllib.request,urllib.error
from bs4 import BeautifulSoup as bfs
import threading
import os
def main(page):
    """Fetch one index page and pass every page it links to on to get_photo_url_list.

    Args:
        page: File name appended to the category URL to select the index page.
    """
    site_root = 'https://www.woyaogexing.com'
    index_url = 'https://www.woyaogexing.com/touxiang/z/qlshouhui/' + page
    index_page = gethtml(index_url)
    # Each collected href is appended to the site root before being fetched.
    for page_path in get_page_list(index_page):
        detail_page = gethtml(site_root + page_path)
        get_photo_url_list(detail_page)
def gethtml(url):
    """Fetch *url* and return the response parsed with BeautifulSoup.

    The request is sent with a fixed ``Accept-Language`` header and a
    Chrome-style ``User-Agent`` header. Any error raised by
    ``urllib.request.urlopen`` propagates to the caller.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The document parsed with the ``'html.parser'`` backend.
    """
    head = {
        'Accept-Language': 'zh-CN,zh;q=0.9',
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.30 Safari/537.36"
    }
    req = urllib.request.Request(url=url, headers=head)
    # Use the response as a context manager so the connection is closed
    # once parsing is done instead of being leaked on every call.
    with urllib.request.urlopen(req) as response:
        html = bfs(response, 'html.parser')  # parse the HTML
    return html
def get_page_list(html):
    """Collect the ``href`` of every ``<a class="img">`` inside ``<div class="pMain">``.

    Args:
        html: Parsed document exposing ``find`` / ``find_all``.

    Returns:
        A list of href strings in document order. The list is empty when
        the ``pMain`` container is not present in the document.
    """
    subject = html.find('div', class_="pMain")
    if subject is None:
        # No listing container on this page: report "no links" rather
        # than failing with AttributeError on None.
        return []
    # href=True skips anchors that carry no href attribute at all.
    return [
        link.attrs['href']
        for link in subject.find_all('a', class_="img", href=True)
    ]
def get_photo_url_list(html):
    """Download every photo linked from one page into a folder named after its ``<h1>``.

    The folder name is the text of the first ``<h1>`` with characters that
    are unsafe in folder names replaced by ``'_'``. The process working
    directory is switched to that folder while the downloads run and is
    always restored afterwards, even if a download raises.

    Args:
        html: Parsed document exposing ``find`` / ``find_all``.
    """
    # <h1>....</h1>
    title = str(html.find('h1').string).replace(':', '_')
    # Replace special characters with '_' so the title can be used as a folder name.
    title = special_str_replace(title)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(title, exist_ok=True)
    start_dir = os.getcwd()
    os.chdir(title)
    try:
        # <ul class="artCont cl"> holds the photo links.
        filterurl = html.find('ul', class_="artCont cl")
        if filterurl is None:
            # Page has no photo list: nothing to download.
            ph_url = []
        else:
            # href=True skips anchors without an href attribute.
            ph_url = [attr['href'] for attr in filterurl.find_all('a', href=True)]
        thread_photo(ph_url)
    finally:
        # Go back to the starting folder even when a step above fails, so
        # later pages are not saved inside this page's folder.
        os.chdir(start_dir)
def thread_photo(url):
    """Run get_ptoto for every entry of *url* in its own thread and wait for all of them.

    Entries are numbered from 1 in iteration order; that number is passed
    to get_ptoto together with the entry.
    """
    workers = [
        threading.Thread(target=get_ptoto, args=(link, number))
        for number, link in enumerate(url, start=1)
    ]
    # Start every worker before joining any, so they all run concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
def get_ptoto(u, count):
    """Download one image to ``'<count>.jpeg'`` in the current working directory.

    Args:
        u: Image address. An address that does not already start with
            ``http://`` or ``https://`` gets ``'https:'`` prepended.
        count: Number used as the base name of the saved file.
    """
    print(u, '===>', count, '.jpeg')
    # Only add the scheme when it is missing; prepending it to an already
    # absolute URL would produce an invalid 'https:https://...' address.
    if u.startswith(('http://', 'https://')):
        full_url = u
    else:
        full_url = 'https:' + u
    urllib.request.urlretrieve(full_url, str(count) + '.jpeg')
# Script entry point: crawl index pages 2 through 8 when executed directly.
if __name__ == '__main__':
    for page_number in range(2, 9):
        page_name = 'index_{}.html'.format(page_number)
        main(page_name)

复制代码

形单影只的鱼 · 发表于 2021-7-22 10:25:46

如何获取一个网页的所有图片呢？

鸬鹚鸟 · 发表于 2021-7-31 07:35:38

Cool_Breeze 发表于 2020-6-12 19:12

666

[技术交流] 爬取图片（没有加密）

马上注册，结交更多好友，享用更多功能^_^

浏览过的版块

账号		自动登录	找回密码
密码			立即注册