This post was last edited by wongyusing on 2017-11-26 17:36.
My initial code is as follows:
import re

import requests


def url_open(url):
    # fetch a page with a browser User-Agent so the site doesn't block us
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'}
    req = requests.get(url, headers=headers)
    req.encoding = 'utf-8'
    return req.text


def get_folder(url):
    # names of the category folders
    ret = url_open(url)
    reg = r'<a href="/.*?/">(.*?)</a>'
    return re.findall(reg, ret)


def get_folder_url(url):
    # URL suffix of each category
    ret = url_open(url)
    reg = r'<a href="/(.*?)/">.*?</a>'
    return re.findall(reg, ret)


def mmp_down():
    # main entry: assemble and print the category URLs
    url = 'http://www.2meinv.com/'
    folder_url = get_folder_url(url)  # category URL suffixes
    for i in folder_url:
        folder_url_a = url + i  # assemble the category URL
        print(folder_url_a)


if __name__ == "__main__":
    mmp_down()
The goal of this crawler is to scrape all the girl photos on the whole site, download them into 4 folders by category, and then save each photo set into a folder named after the girl, but I ran into some problems with the categories.
The main issue is that when you open the first page of a category, the URL has no suffix; the suffix only appears from the second page on.
The same goes for a photo set: the first page has no suffix, and the second page onwards does.
If I force the suffix to 1, I land on a broken page.
Should I start by downloading from the first page of each category, or should I first walk all of a category's suffixed pages, collect them into a list, prepend the first-page URL, and only then download? (See the sketch below.)
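Either order can work; the key is just to generate the full page list correctly. Here is a minimal sketch of the collect-first approach, assuming the pattern described above (page 1 is the bare category URL, pages 2 onward use index_N.html; build_page_urls is a hypothetical helper, not part of the code above):

def build_page_urls(base_url, path, last_page):
    # page 1 has no suffix; pages 2..last_page follow the index_N.html pattern
    urls = [base_url + path]
    urls += [base_url + path + 'index_%d.html' % n for n in range(2, last_page + 1)]
    return urls

# build_page_urls('http://www.2meinv.com', '/siwameitui/', 54)
# -> ['http://www.2meinv.com/siwameitui/',
#     'http://www.2meinv.com/siwameitui/index_2.html',
#     ...
#     'http://www.2meinv.com/siwameitui/index_54.html']

With such a list, the download loop no longer has to special-case the first page.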
Update: the problem has now turned into being unable to get the page numbers of a photo set at all. The browser can see them, but Python can't (see the debugging sketch after the code below).
import os
import re

import requests


def request_url(url):
    # fetch a page with a browser User-Agent so the site doesn't block us
    mock_header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Ubuntu Chromium/62.0.3202.94 '
                      'Chrome/62.0.3202.94 Safari/537.36'}
    resp = requests.get(url, headers=mock_header)
    resp.encoding = 'utf-8'
    return resp.text


def get_dir(url):
    # (path, name) pairs of the category folders on the front page
    resp = request_url(url)
    resp_reg = r'<a href="(/.*?/)">(.*?)</a>'
    return re.findall(resp_reg, resp)


def get_dir_pic(base_url, path):
    resp = request_url(base_url + path)
    # e.g. <img src="http://20170901.sina678.com/uploads/170930/1506739524-VNHZB.jpg" width="636" alt="极品诱惑丰乳肥臀美女邝凯欣性感撩人" />
    resp_reg_node = r'<img src="(http://.*?)".*?alt="(.*?)" />'
    resp_result = re.findall(resp_reg_node, resp)
    # pagination links look like:
    #   <a href="/siwameitui/index_54.html" target="_self">54</a>
    #   <a href="/siwameitui/index_2.html" target="_self">2</a>
    if path.endswith('/'):  # only the bare category page carries pagination
        resp_reg_last_page = r'<a href="' + path + r'index_\d+\.html" target="_self">(\d+)</a>'
        resp_page = re.findall(resp_reg_last_page, resp)
        # NOTE: this is where it currently fails -- resp_page comes back empty,
        # even though the browser shows the pagination links
        return resp_result, max(int(p) for p in resp_page)
    return resp_result


def download_pic(save_dir, pic_tuple):
    for pic_url, pic_name in pic_tuple:
        try:
            resp = requests.get(pic_url)
        except BaseException:
            continue
        # name the file after the alt text, keeping the original extension
        pic_path = os.path.join(save_dir, pic_name + pic_url[pic_url.rindex('.'):])
        with open(pic_path, 'wb') as f:  # open('wb') creates the file; os.mknod was redundant
            f.write(resp.content)


def down_all_pic():
    url = 'http://www.2meinv.com'
    for path, dir_name in get_dir(url):
        save_dir = os.path.join(os.getcwd(), '2meinv', dir_name)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        first_pics, last_page = get_dir_pic(url, path)
        download_pic(save_dir, first_pics)
        # page 1 is already done; fetch index_2.html .. index_<last_page>.html
        # (the old range(last_page) loop overshot the last page by one)
        for page in range(2, last_page + 1):
            download_pic(save_dir, get_dir_pic(url, path + 'index_%d.html' % page))


if __name__ == '__main__':
    down_all_pic()
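To narrow down why the page numbers are invisible to Python, a first step is to dump exactly what requests receives and search it for the pagination markup. If the links are missing from the dump, they are almost certainly inserted by JavaScript in the browser (or only served with certain request headers), so no regex over resp.text can find them. A minimal check under those assumptions (the category URL and the Referer header here are illustrative guesses, not known requirements of the site):

import re

import requests

url = 'http://www.2meinv.com/siwameitui/'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
    'Referer': 'http://www.2meinv.com/',  # guess: some sites gate markup on Referer
}
resp = requests.get(url, headers=headers)
resp.encoding = 'utf-8'

# save the raw response so it can be diffed against the browser's "view source"
with open('dump.html', 'w', encoding='utf-8') as f:
    f.write(resp.text)

# if this prints an empty list, the pagination is not in the server response at all
print(re.findall(r'index_\d+\.html', resp.text))

If the links do show up in dump.html, then the problem is only the regex (for example, the target="_self" attribute or the quoting differs from what the pattern assumes), not the fetch itself.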