Scraping images (no encryption)
Last edited by Cool_Breeze on 2020-6-12 14:20.

#!/usr/bin/env python3
#coding=utf-8
from special_str_replace import special_str_replace
import urllib.request,urllib.error
from bs4 import BeautifulSoup as bfs
import threading
import os
def main():
    url = 'https://www.woyaogexing.com/touxiang/z/qlshouhui/'
    home = 'https://www.woyaogexing.com'
    html = gethtml(url)
    for page_nu in get_page_list(html):
        get_photo_url_list(gethtml(home + page_nu))
def gethtml(url):
    head = {
        'Accept-Language': 'zh-CN,zh;q=0.9',
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.30 Safari/537.36"
    }
    req = urllib.request.Request(url=url, headers=head)
    response = urllib.request.urlopen(req)
    html = bfs(response, 'html.parser')  # parse the HTML
    # print(html)
    return html
def get_page_list(html):
    data = []
    subject = html.find('div', class_="pMain")
    for i in subject.find_all('a', class_="img"):
        data.append(i.attrs['href'])
    # print(data)
    return data
def get_photo_url_list(html):
    # <h1>....</h1>
    title = str(html.find('h1').string).replace(':', '_')
    # replace special characters in the title with '_' so it can be used as a directory name
    title = special_str_replace(title)
    if not os.path.exists('./' + title):
        os.mkdir(title)
    os.chdir(title)
    # <ul class="artCont cl">
    filterurl = html.find('ul', class_="artCont cl")
    ph_url = []
    for attr in filterurl.find_all('a'):
        # print(attr.attrs)
        ph_url.append(attr['href'])
    thread_photo(ph_url)
    os.chdir('../')  # back to the parent directory
def thread_photo(url):
    thread = []
    count = 0
    for i in url:
        count += 1
        thread.append(threading.Thread(target=get_photo, args=(i, count)))
    for i in thread:
        i.start()
    for i in thread:
        i.join()
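# A common alternative (sketch, not part of the original script): one thread
# per image is fine for small albums, but concurrent.futures can cap the
# number of worker threads for larger batches:
#   from concurrent.futures import ThreadPoolExecutor
#   with ThreadPoolExecutor(max_workers=8) as pool:
#       for n, u in enumerate(url, 1):
#           pool.submit(get_photo, u, n)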
def get_photo(u, count):
    print(u, '===>', str(count) + '.jpeg')
    urllib.request.urlretrieve(
        'https:' + u,
        str(count) + '.jpeg')
if __name__ == '__main__':
    main()
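One caveat: urllib.request.urlretrieve goes through urllib's default opener, so it does not send the custom User-Agent that gethtml sets. If the image host ever rejects the default Python user agent, installing a global opener with the same header is a small fix (a sketch; not needed if the downloads already work):

import urllib.request

# only needed if the server blocks the default "Python-urllib" User-Agent
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent',
                      'Mozilla/5.0 (Windows NT 6.1; Win64; x64)')]
urllib.request.install_opener(opener)  # urlretrieve now sends this header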
The helper module, special_str_replace.py:

#!/usr/bin/env python3
#coding=utf-8
def special_str_replace(special):
    # characters that are not allowed in Windows file names
    limitstr = r'\/:*?"<>|'
    test = list(special)
    for index in range(len(test)):
        if test[index] in limitstr:
            test[index] = '_'
    return ''.join(test)
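For reference, the same substitution can be written with the standard re module (a sketch covering the same character set):

import re

def special_str_replace(special):
    # replace Windows-illegal file name characters with '_', same set as above
    return re.sub(r'[\\/:*?"<>|]', '_', special)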
Mike_python小 posted on 2020-6-10 16:48:
They don't look good.

They're just for practice! The pictures are all the same emoji, nothing good-looking about them. I still haven't learned how to log in to a site automatically before scraping it; does anyone know of a ready-made tutorial?
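On the auto-login question: for sites that use a plain form POST and cookie sessions, the standard library alone can do it. A minimal sketch (the URL, form field names, and credentials below are placeholders; sites with captchas, CSRF tokens, or JavaScript logins need more work):

import http.cookiejar
import urllib.parse
import urllib.request

# keep cookies across requests so the session survives the login
cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))

# placeholder login endpoint and form fields
form = urllib.parse.urlencode({'username': 'me', 'password': 'secret'}).encode()
opener.open('https://example.com/login', form)

# subsequent requests send the session cookie automatically
page = opener.open('https://example.com/members-only').read()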
The updated script (edited 2020-6-12): main() now takes a listing page name, and the entry point walks pages 2 through 8.

#!/usr/bin/env python3
#coding=utf-8
from special_str_replace import special_str_replace
import urllib.request,urllib.error
from bs4 import BeautifulSoup as bfs
import threading
import os
def main(page):
    url = 'https://www.woyaogexing.com/touxiang/z/qlshouhui/' + page
    home = 'https://www.woyaogexing.com'
    html = gethtml(url)
    for page_nu in get_page_list(html):
        get_photo_url_list(gethtml(home + page_nu))
# gethtml, get_page_list, get_photo_url_list, thread_photo and get_photo
# are identical to the first version above.
if __name__ == '__main__':
    # listing pages 2 through 8
    for i in range(2, 9):
        main('index_' + str(i) + '.html')

How do you get all the images on a web page?

Cool_Breeze posted on 2020-6-12 19:12:
666
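On grabbing every image from one page: a minimal sketch using the same urllib + BeautifulSoup stack as the scripts above (the data-src fallback is an assumption for lazy-loaded pages):

import urllib.request
from bs4 import BeautifulSoup as bfs

req = urllib.request.Request('https://www.woyaogexing.com',
                             headers={'User-Agent': 'Mozilla/5.0'})
soup = bfs(urllib.request.urlopen(req), 'html.parser')

srcs = []
for img in soup.find_all('img'):
    src = img.get('src') or img.get('data-src')  # some pages lazy-load images
    if src:
        srcs.append(src)
print(srcs)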