|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
这是一个大神编的通过wangpansou.com来查找网盘的一个程序,里面有很多比如说urllib.request和re还有一些代码我不是很懂,有没有人一句一句给我解析一下,万分感谢!一人帮忙解析几行也行,人多力量大。
# -*- coding: utf-8 -*-
import urllib.request
import re
import random
def get_html(url, header):
    """Fetch *url* and return the open response, its charset, and the status.

    Parameters
    ----------
    url : str
        Address to request.
    header : dict
        HTTP request headers (at minimum a ``User-Agent`` entry).

    Returns
    -------
    tuple
        ``(response, charset, status)`` where *response* is the still-open
        ``http.client.HTTPResponse`` (caller is expected to ``.read()`` it),
        *charset* is a codec name usable with ``bytes.decode``, and *status*
        is the integer HTTP status code.
    """
    req = urllib.request.Request(url, headers=header)
    html = urllib.request.urlopen(req)
    # Content-Type normally looks like "text/html; charset=utf-8".  The
    # original split('=')[-1] returned the whole Content-Type when no
    # "charset=" parameter was present (breaking .decode() later), and
    # indexing headers['Content-Type'] raised KeyError when the header was
    # missing — fall back to utf-8 in both cases.
    content_type = html.headers.get('Content-Type', '')
    if '=' in content_type:
        head_type = content_type.split('=')[-1]
    else:
        head_type = 'utf-8'
    status = html.getcode()
    return html, head_type, status
# Pool of User-Agent strings; one is picked at random per request so the
# traffic looks less like a single bot.
headers = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3 WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36'},
    {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0"},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/7.0)'},
]

# Build the search URL from the user's (percent-encoded) keyword.
keyword = input('请输入要搜索的资源名:')
keyword = urllib.request.quote(keyword.encode('utf-8'))
url = "http://www.wangpansou.cn/s.php?wp=0&ty=gn&op=gn&q=" + keyword + "&q=" + keyword

header = random.choice(headers)
f_html, f_head_type, f_status = get_html(url, header)

if f_status != 200:
    # Bug fix: the original fell through with `pattern`/`content` undefined
    # and crashed with NameError; report the failure and stop instead.
    print('Website Error!')
else:
    f_html = f_html.read().decode(f_head_type)

    # Pagination links look like:
    #   <a href="..."><div class="cse-search-result_paging_num " tabindex="N">N</div></a>
    # Raw string avoids the invalid-escape warning for \d.
    pattern = re.compile(
        r'<a href="(.+)"><div class="cse-search-result_paging_num " tabindex="\d{1,3}">\d{1,3}</div></a>')
    content = pattern.findall(f_html)

    url_head = 'http://www.wangpansou.cn/'
    url_list = []
    for rel in content:
        full = url_head + rel
        if full not in url_list:  # keep first occurrence, preserve order
            url_list.append(full)

    if url_list:
        # The paging links start at page 2; rebuild the first page's URL by
        # replacing the trailing page offset with 0 and put it in front.
        first_url = url_list[0][:-2] + '0'
        url_list.insert(0, first_url)
    else:
        # Bug fix: with no paging links the original raised IndexError on
        # url_list[0]; a single-page result set is just the search URL itself.
        url_list = [url]

    count = 0
    for each_url in url_list:
        header = random.choice(headers)
        s_html, s_head_type, s_status = get_html(each_url, header)
        if s_status != 200:
            # Bug fix: the original printed the error but then reused the
            # previous page's (or undefined) s_content/t_content; skip the page.
            print('Website Error!')
            continue
        s_html = s_html.read().decode(s_head_type)

        # Resource link and resource title on each result item.
        s_pattern = re.compile(r'<a class=".+" href="(.+)" rel.+')
        s_content = s_pattern.findall(s_html)
        t_pattern = re.compile(r'<div id=".+" class="cse-search-result_content_item_mid">\s+(.+)')
        t_content = t_pattern.findall(s_html)

        # zip() guards against the two lists differing in length, which made
        # the original's t_content[i] raise IndexError on malformed pages.
        for title, link in zip(t_content, s_content):
            count += 1
            print(str(count) + ':' + title + '\n' + link)
            print()

    print('共搜索到%d个资源,已经全部爬取完毕!' % count)
本帖最后由 yongxi 于 2017-7-15 02:13 编辑
urllib这个是用python访问互联网的一个库
re是正则表达式。
一步一步学习吧。不要好高骛远,等你把视频教程关于爬虫的章节都学习了
自然而然就看懂了
|
|