noah-py 发表于 2021-4-15 10:24:01

{:10_279:}

愷龍 发表于 2021-4-15 17:02:48

爬虫学的好,牢饭吃到饱 https://cdn.jsdelivr.net/gh/hishis/forum-master/public/images/patch.gif

愷龍 发表于 2021-4-15 17:04:05

文件损坏了啊 https://cdn.jsdelivr.net/gh/hishis/forum-master/public/images/patch.gif

小白_Pythong 发表于 2021-4-16 17:06:44

文件损坏,下载不了!!!

embededarm 发表于 2021-4-17 12:31:11

学习中!!

embededarm 发表于 2021-4-17 13:47:53

压缩包打不开。

淡淡凉 发表于 2021-4-17 15:47:47

Lesl1e 发表于 2021-4-14 13:21
这lxml也安装了

import requests
# from lxml import etree  # original author's note (unverified): this import form cannot be used on Python 3.5 and later
from lxml import html
etree = html.etree
import os
import time

def geturl(url, timeout=10):
    """Download *url* and return the response body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The page source as a ``str``.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force GBK decoding instead of the encoding requests guessed
    # (presumably the target site serves GBK pages -- confirm if it changes).
    response.encoding = 'gbk'
    return response.text

def get_mggs(url):
    """Collect the galleries linked from the listing page at *url*.

    Args:
        url: Address of a category listing page.

    Returns:
        A list of ``(href, title)`` tuples, one for every ``<li>`` under
        ``ul.list_con_box_ul`` whose ``<a>`` carries both attributes.
    """
    page = etree.HTML(geturl(url))
    if page is None:
        # Nothing parseable came back, so there are no galleries to report.
        return []

    mggs = []
    for li in page.xpath('//ul[@class="list_con_box_ul"]/li'):
        hrefs = li.xpath("./a/@href")
        titles = li.xpath("./a/@title")
        # xpath() always returns a list; skip items missing either attribute
        # instead of failing on an empty result.
        if hrefs and titles:
            mggs.append((hrefs[0], titles[0]))
    return mggs


def getmgg(list_url=None, max_pages=15, timeout=10):
    """Download the images of every gallery found on a listing page.

    For each ``(href, title)`` pair returned by ``get_mggs`` a directory
    named *title* is entered (created first when missing) and up to
    *max_pages* gallery pages are requested. The image referenced under
    ``div#bigpic`` on each page is saved in that directory under the last
    path segment of its URL.

    Args:
        list_url: Listing page to process. Defaults to the module-level
            ``url`` assigned by the script entry point.
        max_pages: Number of pages tried per gallery.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        None. The working directory is restored before returning, even if
        a download raises.
    """
    if list_url is None:
        list_url = url  # module-level value set by the script entry point

    # Gallery hrefs are combined with the part of the URL before "/meinv".
    site_root = list_url.split("/meinv")[0]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"
    }
    start_dir = os.getcwd()

    for href, title in get_mggs(list_url):
        first_page = site_root + href
        try:
            if not os.path.exists(title):
                dils(title)  # creates the directory and enters it
            else:
                os.chdir(title)

            for page_no in range(1, max_pages + 1):
                if page_no == 1:
                    page_url = first_page
                else:
                    # Follow-up pages are named "<stem>_<n>.html".
                    page_url = (first_page.split(".html")[0]
                                + "_" + str(page_no) + ".html")

                resp = requests.get(url=page_url, headers=headers,
                                    timeout=timeout)
                if resp.status_code != 200:
                    continue

                tree = etree.HTML(resp.text)
                if tree is None:
                    continue
                sources = tree.xpath('//div[@id="bigpic"]/a/img/@src')
                if not sources:
                    # Page has no main picture; nothing to download.
                    continue
                link = sources[0]

                mm_jpg = requests.get(link, headers=headers, timeout=timeout)
                if mm_jpg.status_code != 200:
                    # Do not store an error page under an image file name.
                    continue

                with open(link.split("/")[-1], 'wb') as f:
                    f.write(mm_jpg.content)
                print("成功下载一张图片")
                # Pause between downloads; done after the file is closed.
                time.sleep(1)
        finally:
            # Portable replacement for chdir(".\\.."), and it also runs
            # when a download fails part-way through a gallery.
            os.chdir(start_dir)


def dils(name="ooxx"):
    """Ensure directory *name* exists and make it the working directory.

    Args:
        name: Directory path, relative to the current working directory
            unless absolute. Missing parent directories are created too.

    Returns:
        None.

    Raises:
        OSError: If the directory cannot be created or entered.
    """
    # exist_ok avoids FileExistsError when the directory is already there.
    os.makedirs(name, exist_ok=True)
    os.chdir(name)


if __name__ == "__main__":
    # Base address; the slug of the chosen category is appended to it.
    url1 = "https://www.tupianzj.com/meinv/"
    # Category name typed by the user -> URL slug.
    lis = {"清纯美女":"xiezhen/","性感":"xinggan/",
         "古装":"guzhuang/","人体艺术":"yishu/",
         "香车美女":"chemo/","丝袜美女":"siwa/"
      }
    prompt = "选择要下载的图片分类(清纯美女,性感,古装,人体艺术,香车美女,丝袜美女)"
    name = input(prompt).strip()
    # Ask again until the answer is a known category, so the lookup below
    # cannot fail with KeyError.
    while name not in lis:
        name = input(prompt).strip()

    # getmgg() reads this module-level name to locate the listing page.
    url = url1 + lis[name]

    getmgg()

淡淡凉 发表于 2021-4-17 15:48:38

import requests
# from lxml import etree  # original author's note (unverified): this import form cannot be used on Python 3.5 and later
from lxml import html
etree = html.etree
import os
import time

def geturl(url, timeout=10):
    """Download *url* and return the response body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The page source as a ``str``.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force GBK decoding instead of the encoding requests guessed
    # (presumably the target site serves GBK pages -- confirm if it changes).
    response.encoding = 'gbk'
    return response.text

def get_mggs(url):
    """Collect the galleries linked from the listing page at *url*.

    Args:
        url: Address of a category listing page.

    Returns:
        A list of ``(href, title)`` tuples, one for every ``<li>`` under
        ``ul.list_con_box_ul`` whose ``<a>`` carries both attributes.
    """
    page = etree.HTML(geturl(url))
    if page is None:
        # Nothing parseable came back, so there are no galleries to report.
        return []

    mggs = []
    for li in page.xpath('//ul[@class="list_con_box_ul"]/li'):
        hrefs = li.xpath("./a/@href")
        titles = li.xpath("./a/@title")
        # xpath() always returns a list; skip items missing either attribute
        # instead of failing on an empty result.
        if hrefs and titles:
            mggs.append((hrefs[0], titles[0]))
    return mggs


def getmgg(list_url=None, max_pages=15, timeout=10):
    """Download the images of every gallery found on a listing page.

    For each ``(href, title)`` pair returned by ``get_mggs`` a directory
    named *title* is entered (created first when missing) and up to
    *max_pages* gallery pages are requested. The image referenced under
    ``div#bigpic`` on each page is saved in that directory under the last
    path segment of its URL.

    Args:
        list_url: Listing page to process. Defaults to the module-level
            ``url`` assigned by the script entry point.
        max_pages: Number of pages tried per gallery.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        None. The working directory is restored before returning, even if
        a download raises.
    """
    if list_url is None:
        list_url = url  # module-level value set by the script entry point

    # Gallery hrefs are combined with the part of the URL before "/meinv".
    site_root = list_url.split("/meinv")[0]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"
    }
    start_dir = os.getcwd()

    for href, title in get_mggs(list_url):
        first_page = site_root + href
        try:
            if not os.path.exists(title):
                dils(title)  # creates the directory and enters it
            else:
                os.chdir(title)

            for page_no in range(1, max_pages + 1):
                if page_no == 1:
                    page_url = first_page
                else:
                    # Follow-up pages are named "<stem>_<n>.html".
                    page_url = (first_page.split(".html")[0]
                                + "_" + str(page_no) + ".html")

                resp = requests.get(url=page_url, headers=headers,
                                    timeout=timeout)
                if resp.status_code != 200:
                    continue

                tree = etree.HTML(resp.text)
                if tree is None:
                    continue
                sources = tree.xpath('//div[@id="bigpic"]/a/img/@src')
                if not sources:
                    # Page has no main picture; nothing to download.
                    continue
                link = sources[0]

                mm_jpg = requests.get(link, headers=headers, timeout=timeout)
                if mm_jpg.status_code != 200:
                    # Do not store an error page under an image file name.
                    continue

                with open(link.split("/")[-1], 'wb') as f:
                    f.write(mm_jpg.content)
                print("成功下载一张图片")
                # Pause between downloads; done after the file is closed.
                time.sleep(1)
        finally:
            # Portable replacement for chdir(".\\.."), and it also runs
            # when a download fails part-way through a gallery.
            os.chdir(start_dir)


def dils(name="ooxx"):
    """Ensure directory *name* exists and make it the working directory.

    Args:
        name: Directory path, relative to the current working directory
            unless absolute. Missing parent directories are created too.

    Returns:
        None.

    Raises:
        OSError: If the directory cannot be created or entered.
    """
    # exist_ok avoids FileExistsError when the directory is already there.
    os.makedirs(name, exist_ok=True)
    os.chdir(name)


if __name__ == "__main__":
    # Base address; the slug of the chosen category is appended to it.
    url1 = "https://www.tupianzj.com/meinv/"
    # Category name typed by the user -> URL slug.
    lis = {"清纯美女":"xiezhen/","性感":"xinggan/",
         "古装":"guzhuang/","人体艺术":"yishu/",
         "香车美女":"chemo/","丝袜美女":"siwa/"
      }
    prompt = "选择要下载的图片分类(清纯美女,性感,古装,人体艺术,香车美女,丝袜美女)"
    name = input(prompt).strip()
    # Ask again until the answer is a known category, so the lookup below
    # cannot fail with KeyError.
    while name not in lis:
        name = input(prompt).strip()

    # getmgg() reads this module-level name to locate the listing page.
    url = url1 + lis[name]

    getmgg()

花哗哗~ 发表于 2021-4-20 11:39:33

我刚开始学Python你就给我看这个?

1589895304 发表于 2021-4-24 10:49:28

FengYang.X 发表于 2021-4-14 17:22
文件损坏了啊

直接改后缀名就行了

Puppet16 发表于 2021-5-29 14:13:12

文件损坏 无法解压
页: 1 [2]
查看完整版本: 新手爬虫——爬妹子