|

楼主 |
发表于 2020-6-12 19:12:59
|
显示全部楼层
- #!/usr/bin/env python3
- #coding=utf-8
- from special_str_replace import special_str_replace
- import urllib.request,urllib.error
- from bs4 import BeautifulSoup as bfs
- import threading
- import os
def main(page):
    """Download every album linked from one listing page of the category.

    ``page`` is the trailing file name appended to the category URL (the
    script entry point passes names such as ``index_2.html``).  The listing
    page is fetched, each album path found on it is resolved against the
    site root, and the resulting album page is handed to
    ``get_photo_url_list`` for downloading.
    """
    site_root = 'https://www.woyaogexing.com'
    listing_url = 'https://www.woyaogexing.com/touxiang/z/qlshouhui/' + page
    listing_doc = gethtml(listing_url)
    for album_path in get_page_list(listing_doc):
        # Album links on the listing page are site-relative paths.
        album_doc = gethtml(site_root + album_path)
        get_photo_url_list(album_doc)
def gethtml(url):
    """Fetch ``url`` and return the page parsed by BeautifulSoup.

    The request is sent with a desktop-Chrome ``User-Agent`` and a zh-CN
    ``Accept-Language`` header.  The body is parsed with the built-in
    ``html.parser`` backend.

    Any exception raised by ``urllib.request.urlopen`` (for example
    ``urllib.error.URLError``) propagates to the caller.
    """
    head = {
        'Accept-Language': 'zh-CN,zh;q=0.9',
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.30 Safari/537.36"
    }

    req = urllib.request.Request(url=url, headers=head)
    # Parse while the response is still open; the context manager then
    # closes the connection instead of leaving it to garbage collection.
    with urllib.request.urlopen(req) as response:
        return bfs(response, 'html.parser')
def get_page_list(html):
    """Return the ``href`` of every album link on a listing page.

    Album links are the ``<a class="img">`` elements inside
    ``<div class="pMain">``.

    Args:
        html: parsed listing page exposing ``find`` (as returned by
            ``gethtml``).

    Returns:
        A list of href strings in document order.  The list is empty when
        the page has no ``div.pMain`` container; anchors that carry no
        ``href`` attribute are skipped.
    """
    container = html.find('div', class_="pMain")
    if container is None:
        # Unexpected page layout (or an error page): nothing to crawl.
        return []

    return [
        anchor.attrs['href']
        for anchor in container.find_all('a', class_="img")
        if 'href' in anchor.attrs
    ]
-
def get_photo_url_list(html):
    """Download every image linked from one album page into its own folder.

    The folder is named after the page's ``<h1>`` text and is created in the
    current working directory if it does not exist yet.  The process changes
    into that folder while ``thread_photo`` downloads the images, because the
    download step writes files relative to the working directory.

    Args:
        html: parsed album page exposing ``find`` (as returned by
            ``gethtml``).

    The working directory in effect on entry is always restored, even when
    the download step raises.
    """
    # The <h1> text is the album title.  ':' is replaced here; per the
    # original author's note, special_str_replace then turns the remaining
    # special characters into '_' so the title can be used as a folder name.
    title = str(html.find('h1').string).replace(':', '_')
    title = special_str_replace(title)

    # Image links live in <ul class="artCont cl">.  Collect them before
    # touching the file system so a parse failure leaves nothing behind.
    gallery = html.find('ul', class_="artCont cl")
    ph_url = [anchor['href'] for anchor in gallery.find_all('a')]

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(title, exist_ok=True)
    previous_dir = os.getcwd()
    os.chdir(title)
    try:
        thread_photo(ph_url)
    finally:
        # Return to the exact starting directory, not a relative '../'.
        os.chdir(previous_dir)
def thread_photo(url):
    """Download all images in ``url`` concurrently, one thread per image.

    ``url`` is an iterable of image addresses.  Each address is passed to
    ``get_ptoto`` together with its 1-based position in the iterable.  All
    threads are created first, then started in order, then joined in order,
    so the function returns only after every download thread has finished.
    """
    workers = [
        threading.Thread(target=get_ptoto, args=(address, position))
        for position, address in enumerate(url, start=1)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
def get_ptoto(u, count):
    """Download one image and save it as ``<count>.jpeg`` in the current directory.

    ``u`` is the image address without a scheme; ``https:`` is prepended
    before the request is made.  A progress line is printed first.
    """
    print(u, '===>', count, '.jpeg')
    source = 'https:' + u
    destination = str(count) + '.jpeg'
    urllib.request.urlretrieve(source, destination)
if __name__ == '__main__':
    # Crawl listing pages index_2.html through index_8.html.
    for page_no in range(2, 9):
        main('index_{}.html'.format(page_no))
复制代码 |
-
|