马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 Cool_Breeze 于 2020-6-12 14:20 编辑
#!/usr/bin/env python3
#coding=utf-8
from special_str_replace import special_str_replace
import urllib.request,urllib.error
from bs4 import BeautifulSoup as bfs
import threading
import os
def main():
    """Crawl the sketch-avatar index page and download every album it links to."""
    index_url = 'https://www.woyaogexing.com/touxiang/z/qlshouhui/'
    site_root = 'https://www.woyaogexing.com'
    index_html = gethtml(index_url)
    # Each entry is a site-relative album path; fetch and process it in turn.
    for album_path in get_page_list(index_html):
        get_photo_url_list(gethtml(site_root + album_path))
def gethtml(url):
    """Fetch *url* with browser-like headers and return the parsed soup.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A BeautifulSoup tree built with the stdlib 'html.parser'.
    """
    head = {
        # Chinese-language preference matches the target site.
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # A desktop-browser UA string so the site serves the normal page.
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.30 Safari/537.36"
    }
    req = urllib.request.Request(url=url, headers=head)
    # Fix: the original never closed the HTTP response. Use a context
    # manager so the connection is released once parsing is done.
    with urllib.request.urlopen(req) as response:
        html = bfs(response, 'html.parser')  # parse the HTML
    return html
def get_page_list(html):
    """Return the href of every album link inside the page's 'pMain' container."""
    container = html.find('div', class_="pMain")
    # One href per <a class="img"> anchor, in document order.
    return [anchor.attrs['href'] for anchor in container.find_all('a', class_="img")]
def get_photo_url_list(html):
    """Download all photos of one album page into a folder named after it.

    Creates (if needed) a directory named after the sanitized <h1> title,
    chdirs into it, downloads every photo linked in the gallery, and always
    returns to the parent directory.

    Args:
        html: Parsed soup of a single album page.
    """
    # <h1>...</h1> holds the album title; ':' is invalid in folder names.
    title = str(html.find('h1').string).replace(':', '_')
    # Replace the remaining special characters with '_' so the title is a
    # valid directory name.
    title = special_str_replace(title)
    # Fix: the original exists()/mkdir() pair is race-prone; makedirs with
    # exist_ok=True creates the folder atomically-enough and never raises
    # when it already exists.
    os.makedirs(title, exist_ok=True)
    os.chdir(title)
    try:
        # <ul class="artCont cl"> wraps the photo anchors.
        gallery = html.find('ul', class_="artCont cl")
        ph_url = [anchor['href'] for anchor in gallery.find_all('a')]
        thread_photo(ph_url)
    finally:
        # Fix: always return to the parent folder, even if parsing or a
        # download thread setup raises — otherwise later albums would be
        # nested inside this one.
        os.chdir('../')
def thread_photo(url):
    """Download every photo URL in *url* concurrently, one thread per image."""
    # Number the images from 1 so filenames are 1.jpeg, 2.jpeg, ...
    workers = [
        threading.Thread(target=get_ptoto, args=(photo_url, number))
        for number, photo_url in enumerate(url, start=1)
    ]
    for worker in workers:
        worker.start()
    # Wait for all downloads before returning (the caller chdirs afterwards).
    for worker in workers:
        worker.join()
def get_ptoto(u, count):
    """Save one protocol-relative image URL *u* as '<count>.jpeg' in the cwd."""
    print(u, '===>', count, '.jpeg')
    # The site serves protocol-relative links ("//..."); prefix the scheme.
    target_name = str(count) + '.jpeg'
    urllib.request.urlretrieve('https:' + u, target_name)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
#!/usr/bin/env python3
#coding=utf-8
def special_str_replace(special):
    """Replace characters that are invalid in Windows file names with '_'.

    Args:
        special: The candidate folder/file name.

    Returns:
        A copy of *special* where every character from ``\\/:*?"<>|`` has
        been replaced by an underscore; the input string is not modified.
    """
    limitstr = r'\/:*?"<>|'  # characters Windows forbids in file names
    # Single pass over the string instead of the original
    # range(len(...)) index loop that mutated a list copy.
    return ''.join('_' if ch in limitstr else ch for ch in special)
|