Cool_Breeze posted on 2020-6-10 08:59:13

Scraping images (no encryption)

This post was last edited by Cool_Breeze on 2020-6-12 14:20

#!/usr/bin/env python3
# coding=utf-8

from special_str_replace import special_str_replace
import urllib.request
from bs4 import BeautifulSoup as bfs
import threading
import os


def main():
    url = 'https://www.woyaogexing.com/touxiang/z/qlshouhui/'
    home = 'https://www.woyaogexing.com'
    html = gethtml(url)
    for page_nu in get_page_list(html):
        get_photo_url_list(gethtml(home + page_nu))


def gethtml(url):
    head = {
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.30 Safari/537.36'
    }

    req = urllib.request.Request(url=url, headers=head)
    response = urllib.request.urlopen(req)
    html = bfs(response, 'html.parser')  # parse the HTML
    return html


def get_page_list(html):
    # collect the link to every album listed on the index page
    data = []
    subject = html.find('div', class_="pMain")
    for i in subject.find_all('a', class_="img"):
        data.append(i.attrs['href'])
    return data


def get_photo_url_list(html):
    # the album title is in <h1>...</h1>
    title = str(html.find('h1').string).replace(':', '_')
    # replace special characters with '_' so the title can be used as a folder name
    title = special_str_replace(title)
    if not os.path.exists(title):
        os.mkdir(title)
    os.chdir(title)
    # the image links sit inside <ul class="artCont cl">
    filterurl = html.find('ul', class_="artCont cl")
    ph_url = []
    for attr in filterurl.find_all('a'):
        ph_url.append(attr['href'])

    thread_photo(ph_url)
    os.chdir('../')  # back to the parent directory


def thread_photo(url):
    # start one download thread per image, then wait for all of them
    thread = []
    count = 0
    for i in url:
        count += 1
        thread.append(threading.Thread(target=get_photo, args=(i, count)))
    for i in thread:
        i.start()
    for i in thread:
        i.join()


def get_photo(u, count):
    print(u, '===>', count, '.jpeg')
    # the hrefs are protocol-relative (//...), so prepend 'https:'
    urllib.request.urlretrieve(
        'https:' + u,
        str(count) + '.jpeg')


if __name__ == '__main__':
    main()
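
A note on thread_photo: it starts one thread per image, so a large album spawns a lot of threads at once. A bounded pool from the standard library's concurrent.futures is a common alternative; a minimal sketch, reusing get_photo from the script above:

from concurrent.futures import ThreadPoolExecutor

def thread_photo(urls):
    # at most 8 downloads run concurrently; the with-block waits for all of them
    with ThreadPoolExecutor(max_workers=8) as pool:
        for count, u in enumerate(urls, start=1):
            pool.submit(get_photo, u, count)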
The helper module special_str_replace.py, imported at the top of the script:

#!/usr/bin/env python3
# coding=utf-8


def special_str_replace(special):
    # characters that Windows forbids in file and folder names
    limitstr = r'\/:*?"<>|'
    test = list(special)
    for index in range(len(test)):
        if test[index] in limitstr:
            test[index] = '_'

    return ''.join(test)
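
The same sanitization can also be done in one line with the standard library's re module; an equivalent sketch:

import re

def special_str_replace(special):
    # replace every character Windows forbids in file names with '_'
    return re.sub(r'[\\/:*?"<>|]', '_', special)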

Mike_python小 posted on 2020-6-10 16:48:00

Not good-looking

Cool_Breeze posted on 2020-6-10 17:22:25

Mike_python小 posted on 2020-6-10 16:48
Not good-looking

It's just for practice! The images are all the same expression, nothing nice to look at!

Cool_Breeze posted on 2020-6-10 17:24:14

I still haven't figured out how to log in to a site automatically before scraping it. I wonder if there's a ready-made tutorial for that.
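
For simple form-based logins, one common approach is the third-party requests library's Session object, which keeps cookies across requests. A minimal sketch; the login URL and form field names here are hypothetical and depend entirely on the target site:

import requests

session = requests.Session()

# hypothetical endpoint and field names; inspect the site's login form for the real ones
login_url = 'https://example.com/login'
payload = {'username': 'my_name', 'password': 'my_password'}

resp = session.post(login_url, data=payload)
resp.raise_for_status()

# the session now carries the login cookies, so later requests are authenticated
page = session.get('https://example.com/members-only/')
print(page.status_code)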

Cool_Breeze posted on 2020-6-12 19:12:59

Updated version with pagination. Only main() and the entry point changed; the imports and the other functions are identical to the first version above:

#!/usr/bin/env python3
# coding=utf-8

def main(page):
    url = 'https://www.woyaogexing.com/touxiang/z/qlshouhui/' + page
    home = 'https://www.woyaogexing.com'
    html = gethtml(url)
    for page_nu in get_page_list(html):
        get_photo_url_list(gethtml(home + page_nu))

if __name__ == '__main__':
    # index pages 2 through 8 are named index_2.html ... index_8.html
    for i in range(2, 9):
        main('index_' + str(i) + '.html')

形单影只的鱼 posted on 2021-7-22 10:25:46

How can I get all the images on a webpage?
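
One common approach, in the same spirit as the script above: parse the page with BeautifulSoup, collect the src of every <img> tag, and resolve relative URLs before downloading. A minimal sketch; download_all_images is a hypothetical helper, not part of the script above:

import os
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def download_all_images(page_url, folder='images'):
    # fetch the page with a browser-like User-Agent and parse it
    req = urllib.request.Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(urllib.request.urlopen(req), 'html.parser')
    os.makedirs(folder, exist_ok=True)
    for n, img in enumerate(soup.find_all('img')):
        src = img.get('src')
        if not src:
            continue
        # urljoin resolves relative and protocol-relative src values against the page URL
        urllib.request.urlretrieve(urljoin(page_url, src),
                                   os.path.join(folder, str(n) + '.jpg'))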

鸬鹚鸟 posted on 2021-7-31 07:35:38

Cool_Breeze posted on 2020-6-12 19:12

666