Cool_Breeze posted on 2020-6-10 08:59:13

Scraping images (no encryption)

This post was last edited by Cool_Breeze on 2020-6-12 14:20

#!/usr/bin/env python3
# coding=utf-8

from special_str_replace import special_str_replace
import urllib.request
from bs4 import BeautifulSoup as bfs
import threading
import os


def main():
    url = 'https://www.woyaogexing.com/touxiang/z/qlshouhui/'
    home = 'https://www.woyaogexing.com'
    html = gethtml(url)
    for page_nu in get_page_list(html):
        get_photo_url_list(gethtml(home + page_nu))


def gethtml(url):
    head = {
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.30 Safari/537.36'
    }

    req = urllib.request.Request(url=url, headers=head)
    response = urllib.request.urlopen(req)
    html = bfs(response, 'html.parser')  # parse the HTML
    return html


def get_page_list(html):
    # collect the link to every album listed on the index page
    data = []
    subject = html.find('div', class_="pMain")
    for i in subject.find_all('a', class_="img"):
        data.append(i.attrs['href'])
    return data


def get_photo_url_list(html):
    # the album title is in <h1>...</h1>
    title = str(html.find('h1').string).replace(':', '_')
    # replace special characters with '_' so the title can be used as a folder name
    title = special_str_replace(title)
    if not os.path.exists(title):
        os.mkdir(title)
    os.chdir(title)
    # the image links sit inside <ul class="artCont cl">
    filterurl = html.find('ul', class_="artCont cl")
    ph_url = []
    for attr in filterurl.find_all('a'):
        ph_url.append(attr['href'])

    thread_photo(ph_url)
    os.chdir('../')  # back to the parent directory


def thread_photo(url):
    # start one download thread per image, then wait for all of them
    thread = []
    count = 0
    for i in url:
        count += 1
        thread.append(threading.Thread(target=get_photo, args=(i, count)))
    for i in thread:
        i.start()
    for i in thread:
        i.join()


def get_photo(u, count):
    print(u, '===>', count, '.jpeg')
    # the hrefs are protocol-relative (//...), so prepend 'https:'
    urllib.request.urlretrieve(
        'https:' + u,
        str(count) + '.jpeg')


if __name__ == '__main__':
    main()
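
A note on thread_photo: it starts one thread per image, so a large album spawns a lot of threads at once. A bounded pool from the standard library's concurrent.futures is a common alternative; a minimal sketch, reusing get_photo from the script above:

from concurrent.futures import ThreadPoolExecutor

def thread_photo(urls):
    # at most 8 downloads run concurrently; the with-block waits for all of them
    with ThreadPoolExecutor(max_workers=8) as pool:
        for count, u in enumerate(urls, start=1):
            pool.submit(get_photo, u, count)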
The helper module special_str_replace.py, imported at the top of the script:

#!/usr/bin/env python3
# coding=utf-8


def special_str_replace(special):
    # characters that Windows forbids in file and folder names
    limitstr = r'\/:*?"<>|'
    test = list(special)
    for index in range(len(test)):
        if test[index] in limitstr:
            test[index] = '_'

    return ''.join(test)
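
The same sanitization can also be done in one line with the standard library's re module; an equivalent sketch:

import re

def special_str_replace(special):
    # replace every character Windows forbids in file names with '_'
    return re.sub(r'[\\/:*?"<>|]', '_', special)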

Mike_python小 posted on 2020-6-10 16:48:00

Not good-looking

Cool_Breeze posted on 2020-6-10 17:22:25

Mike_python小 posted on 2020-6-10 16:48
Not good-looking

It's just for practice! The images are all the same expression, nothing nice to look at!

Cool_Breeze posted on 2020-6-10 17:24:14

I still haven't figured out how to log in to a site automatically before scraping it. I wonder if there's a ready-made tutorial for that.
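
For simple form-based logins, one common approach is the third-party requests library's Session object, which keeps cookies across requests. A minimal sketch; the login URL and form field names here are hypothetical and depend entirely on the target site:

import requests

session = requests.Session()

# hypothetical endpoint and field names; inspect the site's login form for the real ones
login_url = 'https://example.com/login'
payload = {'username': 'my_name', 'password': 'my_password'}

resp = session.post(login_url, data=payload)
resp.raise_for_status()

# the session now carries the login cookies, so later requests are authenticated
page = session.get('https://example.com/members-only/')
print(page.status_code)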

Cool_Breeze posted on 2020-6-12 19:12:59

Updated version with pagination. Only main() and the entry point changed; the imports and the other functions are identical to the first version above:

#!/usr/bin/env python3
# coding=utf-8

def main(page):
    url = 'https://www.woyaogexing.com/touxiang/z/qlshouhui/' + page
    home = 'https://www.woyaogexing.com'
    html = gethtml(url)
    for page_nu in get_page_list(html):
        get_photo_url_list(gethtml(home + page_nu))

if __name__ == '__main__':
    # index pages 2 through 8 are named index_2.html ... index_8.html
    for i in range(2, 9):
        main('index_' + str(i) + '.html')

形单影只的鱼 posted on 2021-7-22 10:25:46

How can I get all the images on a webpage?
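
One common approach, in the same spirit as the script above: parse the page with BeautifulSoup, collect the src of every <img> tag, and resolve relative URLs before downloading. A minimal sketch; download_all_images is a hypothetical helper, not part of the script above:

import os
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def download_all_images(page_url, folder='images'):
    # fetch the page with a browser-like User-Agent and parse it
    req = urllib.request.Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(urllib.request.urlopen(req), 'html.parser')
    os.makedirs(folder, exist_ok=True)
    for n, img in enumerate(soup.find_all('img')):
        src = img.get('src')
        if not src:
            continue
        # urljoin resolves relative and protocol-relative src values against the page URL
        urllib.request.urlretrieve(urljoin(page_url, src),
                                   os.path.join(folder, str(n) + '.jpg'))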

鸬鹚鸟 posted on 2021-7-31 07:35:38

Cool_Breeze posted on 2020-6-12 19:12

666