python教程中OOXX中的问题。

浮云骑士 · 发表于 2016-7-23 16:38:39

def get_page(url):
    """Return the text that follows the 'current-comment-page' marker.

    The page at *url* is fetched with ``url_open`` and decoded as UTF-8.
    The returned slice starts 23 characters after the start of the marker
    and stops just before the next ``]``.
    """
    html = url_open(url).decode('utf-8')

    # Poster's question: how is the offset 23 in the tutorial worked out?
    # (23 == len('current-comment-page">['), i.e. the marker plus '">['.)
    a = html.find('current-comment-page') + 23
    b = html.find(']', a)

    return html[a:b]

def find_imgs(url):
    """Collect the ``.jpg`` addresses referenced by ``img src=`` in a page.

    The page at *url* is fetched with ``url_open`` and decoded as UTF-8.
    Returns a list of the address strings found, in page order.
    """
    html = url_open(url).decode('utf-8')
    img_addrs = []

    a = html.find('img src=')

    while a != -1:
        # Only look for '.jpg' within the 255 characters after the tag start.
        b = html.find('.jpg', a, a + 255)
        if b != -1:
            # Poster's question: and what about a+9 and b+4 here?
            # (9 == len('img src="') skips past the opening quote;
            #  4 == len('.jpg') keeps the extension in the slice.)
            img_addrs.append(html[a + 9:b + 4])
        else:
            # No .jpg nearby: resume the search just past this tag.
            b = a + 9

        a = html.find('img src=', b)

    return img_addrs

帮忙举个例子讲一下偏移，图片如下：

无符号整形 · 发表于 2016-7-23 16:38:40

一.‘current-comment-page">[’正好是23个字符，后面紧跟的数字就是页码，页码之后才是‘]’。
二:
1.b是“.jpg”，而且.jpg这个字符正好4个字节
2.a是“img src=”，但是取出的网址不能有双引号，故必须加上一个双引号。所以这个字符串正好9字节。

SixPy · 发表于 2016-7-23 16:56:28

1、偏移23是怎么算的？
len('current-comment-page">[')

SixPy · 发表于 2016-7-23 17:00:01

同理：
len('img src="') #a+9

len('.jpg') #b+4

竞技山 · 发表于 2016-7-24 02:14:53

SixPy 发表于 2016-7-23 17:00
同理：
len('img src="') #a+9

学习了。很有帮助

yintotti · 发表于 2016-7-26 22:08:09

最幸福是答案是

浮云骑士 · 发表于 2016-7-27 18:54:19

yintotti 发表于 2016-7-26 22:08
最幸福是答案是

What does that mean?

codgear · 发表于 2016-7-28 18:19:41

OOXX视频教程里面的代码一运行就是这个结果，求助！！！
Traceback (most recent call last):
  File "D:/Bit德沽/Codgear/Codes/FISHC/download_mm.py", line 67, in <module>
download_mm()
  File "D:/Bit德沽/Codgear/Codes/FISHC/download_mm.py", line 58, in download_mm
page_num = int(get_page(url))
  File "D:/Bit德沽/Codgear/Codes/FISHC/download_mm.py", line 22, in get_page
html = url_open(url).decode('utf-8')
  File "D:/Bit德沽/Codgear/Codes/FISHC/download_mm.py", line 15, in url_open
response = urllib.request.urlopen(url)
  File "D:\应用软件\python\lib\urllib\request.py", line 162, in urlopen
return opener.open(url, data, timeout)
  File "D:\应用软件\python\lib\urllib\request.py", line 471, in open
response = meth(req, response)
  File "D:\应用软件\python\lib\urllib\request.py", line 581, in http_response
'http', request, response, code, msg, hdrs)
  File "D:\应用软件\python\lib\urllib\request.py", line 503, in error
result = self._call_chain(*args)
  File "D:\应用软件\python\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
  File "D:\应用软件\python\lib\urllib\request.py", line 686, in http_error_302
return self.parent.open(new, timeout=req.timeout)
  File "D:\应用软件\python\lib\urllib\request.py", line 471, in open
response = meth(req, response)
  File "D:\应用软件\python\lib\urllib\request.py", line 581, in http_response
'http', request, response, code, msg, hdrs)
  File "D:\应用软件\python\lib\urllib\request.py", line 509, in error
return self._call_chain(*args)
  File "D:\应用软件\python\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
  File "D:\应用软件\python\lib\urllib\request.py", line 589, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 503: Service Temporarily Unavailable

Process finished with exit code 1

浮云骑士 · 发表于 2016-7-28 20:48:32

codgear 发表于 2016-7-28 18:19
OOXX视频教程里面的代码一运行就是这个结果，求助！！！
Traceback (most recent call last):
File "D: ...

煎蛋网加了反爬虫机制，原来的代码已经不行了，这是我找的其他的，你看看吧：

import urllib.request
import urllib.error
import os
import sys
import http.server
import http.client
import time
import re
import random
import math
# Passed as the `data` argument of urllib.request.Request when pages are requested.
data = None
# Request headers (a browser-like User-Agent) sent with page requests.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'}
# Encoding used to decode fetched pages.
enctype = 'utf-8'
# Proxy pool: 'host:port' strings; a None entry stands for the local machine (no proxy).
proxies = []
max_error_times = 5 # allow at most 5 failed attempts, then give up on that download
def create_localhost():
    """Append ``None`` ("use the local machine") entries to the proxy pool.

    The number of entries added is the golden-ratio fraction
    ``(sqrt(5) - 1) / 2`` (about 0.618) of the current pool size, rounded
    down. If the pool is empty, a single ``None`` is added so that a
    direct connection is always available to choose from.
    """
    # The ratio must be multiplied by the pool size *before* truncating;
    # int() of the bare ratio is always 0, which added no entries at all.
    number = int((math.sqrt(5) - 1) / 2 * len(proxies))
    if not proxies:
        number = 1
    proxies.extend([None] * number)
def get_result(req_or_url, is_retrieve=False, filename=None):
    """Fetch *req_or_url*, switching proxy and retrying after each failure.

    Args:
        req_or_url: URL string or ``urllib.request.Request`` to fetch.
        is_retrieve: When true, download with ``urllib.request.urlretrieve``
            into *filename* instead of opening a response.
        filename: Destination path handed to ``urlretrieve``.

    Returns:
        Whatever ``urlopen`` / ``urlretrieve`` returns, or ``None`` once
        ``max_error_times`` attempts have failed.
    """
    for _ in range(max_error_times):
        try:
            if is_retrieve:
                return urllib.request.urlretrieve(req_or_url, filename)
            return urllib.request.urlopen(req_or_url)
        except urllib.error.URLError as e:
            # HTTP errors carry a status code; other URL errors only a reason.
            if hasattr(e, 'code'):
                print(e.code, e.reason)
            else:
                print(e)
        except (ConnectionResetError, http.client.BadStatusLine) as e:
            print(e)
        except TimeoutError as e:
            print(e)
            print('服务器长时间无响应，自动切换代理.....')
        # Every handled failure ends the same way: pick another proxy, retry.
        change_proxy()
    print('失败次数达%d次......放弃操作' % max_error_times)
    return None
def get_proxy():
    """Fill the global ``proxies`` pool from the proxy listing page.

    The page is fetched through ``get_result``. Each table row matched by
    the pattern yields six captured cells; rows whose fifth cell equals
    ``'HTTP'`` are added as ``'<first cell>:<second cell>'`` (presumably
    IP address and port; verify against the page layout). The first
    matched row is skipped.

    If the page cannot be fetched, the pool is left unchanged.
    """
    req = urllib.request.Request('http://www.xici.net.co', None, headers)
    response = get_result(req)
    if response is None:
        # get_result() already reported the failure; nothing to parse.
        return
    with response:  # make sure the connection is released
        html = response.read().decode('utf-8')
    p = re.compile(r'''<tr\sclass[^>]*>\s+
<td>.+</td>\s+
<td>(.*)?</td>\s+
<td>(.*)?</td>\s+
<td>(.*)?</td>\s+
<td>(.*)?</td>\s+
<td>(.*)?</td>\s+
<td>(.*)?</td>\s+
</tr>''', re.VERBOSE)
    for each_proxy in p.findall(html)[1:]:
        if each_proxy[4] == 'HTTP':
            proxies.append(each_proxy[0] + ':' + each_proxy[1])
def change_proxy():
    """Install a global urllib opener that uses a randomly chosen proxy.

    A ``None`` entry in ``proxies`` means "connect from the local machine"
    (no proxy). When the pool is empty the local machine is used as well,
    so this can safely be called before any proxy has been collected.
    """
    # random.choice() raises IndexError on an empty sequence.
    proxy = random.choice(proxies) if proxies else None
    if proxy is None:
        proxy_support = urllib.request.ProxyHandler({})
    else:
        proxy_support = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent', headers['User-Agent'])]
    urllib.request.install_opener(opener)
    print('智能切换代理：%s' % ('本机' if proxy is None else proxy))
def get_page():  # get the highest page number
    """Return the page number shown on the jandan.net/ooxx front page.

    The number is read from the text between the
    ``current-comment-page`` marker (plus three more characters) and the
    following ``]``.

    Raises:
        ValueError: if the marker or the closing ``]`` is missing, or the
            text in between is not an integer.
        SystemExit: if the page could not be fetched.
    """
    home = 'http://jandan.net/ooxx'
    req = urllib.request.Request(home, data, headers)
    response = get_result(req)
    if response is None:
        # get_result() gave up; stop with a non-zero status.
        print('获取页面失败.....')
        sys.exit(1)
    with response:  # make sure the connection is released
        html = response.read().decode(enctype)
    find_string = 'current-comment-page'
    # Skip the marker itself plus the 3 characters that follow it.
    find_start = html.index(find_string) + len(find_string) + 3
    find_end = html.index(']', find_start + 1)
    return int(html[find_start:find_end])
# NOTE(review): not referenced anywhere in the visible code; presumably a leftover, confirm before removing.
test = None
def get_pic(page):  # generator that yields one image link at a time
    """Yield image URLs from jandan.net/ooxx, walking pages downwards.

    Starts at *page* and decrements after each page, stopping once the
    page number drops below 1. Only ``http://`` links ending in jpg, jpeg
    or gif inside ``<img src="...">`` are yielded.

    Exits the program with a non-zero status if a page cannot be fetched.
    """
    # Compiled once; the original rebound this name inside the loop.
    pic_pattern = re.compile(r'<img\s+src="(http://.+?\.(?:jpg|jpeg|gif))"')
    while True:
        url = 'http://jandan.net/ooxx/page-%d' % page
        print('当前页面：%d' % page)
        req = urllib.request.Request(url, data, headers)
        response = get_result(req)
        if response is None:
            print('获取页面失败.....')
            sys.exit(1)
        with response:  # make sure the connection is released
            html = response.read().decode(enctype)
        for match in pic_pattern.finditer(html):
            yield match.group(1)
        # Pause between pages to go easy on the server.
        time.sleep(5)
        page -= 1
        if page < 1:
            break
# Directory the downloaded images are saved into; download() creates it if missing.
save_path = 'G:\\图片\\妹子图'
def download():
    """Download every image yielded by ``get_pic`` into ``save_path``.

    Each file is named after the last path component of its URL. Images
    whose download fails (``get_result`` returns ``None``) are skipped and
    not counted as successful.
    """
    count = 1
    # Create the target directory up front if it does not exist yet.
    os.makedirs(save_path, exist_ok=True)
    # Replace get_page() with a page number (e.g. 1000) to start from that page.
    for pic_url in get_pic(get_page()):
        file_name = os.path.split(pic_url)[1]
        # os.path.join uses the right separator on every platform.
        target = os.path.join(save_path, file_name)
        if get_result(pic_url, True, target) is None:
            continue
        print('本次成功下载第%d个图片! %s' % (count, pic_url))
        count += 1
if __name__ == '__main__':
    # Collect proxies, mix in "local machine" entries, then start downloading.
    get_proxy()
    create_localhost()
    download()

复制代码

codgear · 发表于 2016-7-29 15:48:29

浮云骑士发表于 2016-7-28 20:48
煎蛋网加了反爬虫机制，原来的代码已经不行了，这是我找的其他的，你看看吧：

这个代码是鱼油分享的吧，我也有看到，我就是想测试成功，这个报错跟反爬虫有关？？我连续运行的话会报错503，服务器暂时无法服务，我觉得这个错才是因为反爬虫吧。

neilyoone · 发表于 2016-7-29 16:10:10

我认为完全没必要钻这个牛角尖，有简单方法直接点～

codgear · 发表于 2016-7-29 17:17:02

neilyoone 发表于 2016-7-29 16:10
我认为完全没必要钻这个牛角尖，有简单方法直接点～

那倒也是。。因为查到可能是存在编码问题，才跑来请教的

浮云骑士 · 发表于 2016-7-29 20:56:07

codgear 发表于 2016-7-29 17:17
那倒也是。。因为查到可能是存在编码问题，才跑来请教的

编码应该不会有问题，小甲鱼已经讲的很清楚了，我觉得是我们爬煎蛋网次数太多，所以他们采取了一些措施。

账号		自动登录	找回密码
密码			立即注册

python教程中OOXX中的问题。

最佳答案

浏览过的版块