|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 xule1111 于 2019-1-13 01:39 编辑
import requests
from lxml import etree
from bs4 import BeautifulSoup  # NOTE(review): unused in this script — candidate for removal
import random  # NOTE(review): unused (a commented-out random-filename idea remains below)
import string  # NOTE(review): unused
import os
import time

# The site rejects requests without a browser User-Agent and a same-site
# Referer (hotlink protection), so every request sends these headers.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Referer': "http://www.mmjpg.com",
}


def _save_page_image(page_url):
    """Fetch one album page and save its photo under ./<album title>/.

    The photo URL is in the first <img>'s ``data-img`` attribute and the
    album title is the first whitespace-separated token of its ``alt`` text.
    Skips the page silently if either attribute is missing (e.g. the site
    served an anti-scraping/verification page instead of a gallery page).
    """
    response = requests.get(page_url, headers=HEADERS)
    tree = etree.HTML(response.content)
    for img in tree.xpath('//img')[0:1]:  # only the first <img> is the photo
        img_src = img.get('data-img')
        alt_text = img.get('alt')
        if not img_src or not alt_text:
            # Original code crashed with AttributeError here; skip instead.
            continue
        album = alt_text.split()[0]
        # exist_ok avoids the exists()/makedirs() race of the original.
        os.makedirs(album, exist_ok=True)
        # Keep the server-side filename, e.g. ./<album>/<basename>.jpg
        # (a random-name scheme was considered: random.choice(string digits) etc.)
        jpg_name = ('./') + album + '/' + img_src.split('/')[-1]
        picture = requests.get(img_src, headers=HEADERS)
        with open(jpg_name, 'wb') as fh:  # binary mode: raw JPEG bytes
            fh.write(picture.content)
        time.sleep(0.1)  # throttle so we don't trip the rate limiter


def _crawl_album(album_url):
    """Download pages 2..49 of one album (page 1 was skipped by the original too)."""
    for page_no in range(2, 50):
        _save_page_image(album_url + '/' + str(page_no))


def main():
    """Walk listing pages 2..101 of mmjpg.com and download every linked album."""
    for listing_no in range(2, 102):
        listing_url = 'http://www.mmjpg.com/home/' + str(listing_no)
        response = requests.get(listing_url, headers=HEADERS)
        tree = etree.HTML(response.content)
        for anchor in tree.xpath('//li/a'):
            album_url = anchor.get('href')
            if album_url:
                _crawl_album(album_url)


if __name__ == '__main__':
    main()

# NOTE(review), translated from the original Chinese trailing comments:
# - Scraping errors appear after a while; the site may demand verification
#   (still to be confirmed).
# - With a small tweak this can also fetch page 1 of each listing/album.
复制代码 |
|