>>>初学爬虫,抓取某音效网站数据, 感谢小甲鱼老师
本帖最后由 zua 于 2017-1-8 01:03 编辑
代码很简单,复制即可运行;还没加代理,主要是学习逻辑。
import urllib.request
import traceback
import sys
import os
import re
def open_url(url, timeout=30):
    """Fetch *url* and return the raw response body as bytes.

    A desktop-browser ``User-Agent`` header is attached to the request.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait on blocking network operations before
            giving up; ``None`` waits indefinitely.

    Returns:
        The undecoded response body (``bytes``); callers decode it.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``HTTPError`` is a subclass).
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393")
    # Close the response deterministically instead of leaking the
    # connection until garbage collection.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read()
def get_url(r, html):
    """Return all non-overlapping matches of regex *r* found in *html*.

    Args:
        r: Regular-expression pattern (string or compiled pattern).
        html: Text to search.

    Returns:
        The list produced by ``re.findall``: whole matches when the
        pattern has no group, the group's text when it has exactly one,
        and tuples when it has several. Empty list when nothing matches.
    """
    return re.findall(r, html)
def chdir(folder):
    """Make *folder* the current working directory, creating it if needed.

    Args:
        folder: Directory path, absolute or relative to the current
            working directory.

    Raises:
        OSError: If the directory cannot be created or entered (for
            example, the path exists but is a regular file).
    """
    # makedirs also creates missing parent directories, and exist_ok avoids
    # the check-then-create race of a separate os.path.exists() test.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
def _safe_name(name):
    """Return *name* with characters that are illegal in file names replaced by "_"."""
    cleaned = re.sub(r'[\\/:*?"<>|]', "_", name).strip()
    return cleaned or "_"


def _unused_wav_name(stem, counter):
    """Return ``(file_name, counter)`` for a ``.wav`` name not yet present in the cwd.

    ``stem + ".wav"`` is used when free; otherwise the running *counter* is
    appended (and advanced) until an unused name is found.
    """
    candidate = stem + ".wav"
    while os.path.exists(candidate):
        candidate = stem + "%d" % counter + ".wav"
        counter += 1
    return candidate, counter


def main(folder=r"C:\Users\Administrator\Desktop\_Download_Sound"):
    """Download the ``.wav`` files linked from the sound-effect index page.

    The index page is scanned for menu links; one sub-folder per menu
    entry is created under *folder*. Each menu page is scanned for item
    links, each item page for the first ``.wav`` URL, and that file is
    saved under the item's ``alt`` text.

    Args:
        folder: Root download directory; created if missing. The process
            working directory is left pointing at it when the function
            returns.

    Raises:
        urllib.error.URLError: If any page or file cannot be fetched.
        OSError: If a directory or file cannot be created.
    """
    # Local import so this function does not depend on the file's import block.
    from urllib.parse import urljoin

    base_url = "http://sc.chinaz.com/"
    original_url = "http://sc.chinaz.com/yinxiao"
    menu_pattern = r'<a target="_blank" href="/(.+?)</a>'  # main-menu links: 'href">title'
    item_pattern = r'<p class="z"><a target="_blank" href="/(.+ alt.+?)">'  # 'href" alt="name'
    wav_pattern = r'http.+?\.wav'

    chdir(folder)  # create the download root and work from inside it
    root = os.getcwd()
    n = 0  # running suffix used to de-duplicate file names

    index_html = open_url(original_url).decode("utf-8")
    for menu_entry in get_url(menu_pattern, index_html):
        # The capture is 'href">title'; skip anything that lacks both halves.
        menu_parts = menu_entry.split('">')
        if len(menu_parts) < 2:
            continue
        menu_href, menu_title = menu_parts[0], menu_parts[1]

        # A "/" (or other reserved character) in the title would otherwise be
        # treated as a path separator or rejected by the file system.
        chdir(_safe_name(menu_title))
        try:
            menu_html = open_url(urljoin(base_url, menu_href)).decode("utf-8")
            for item_entry in get_url(item_pattern, menu_html):
                item_parts = item_entry.split('" alt="')
                if len(item_parts) < 2:
                    continue
                item_href, sound_name = item_parts[0], item_parts[1]

                # The item page holds the direct .wav download link.
                wav_html = open_url(urljoin(base_url, item_href)).decode("utf-8")
                wav_links = get_url(wav_pattern, wav_html)
                if not wav_links:
                    continue

                print("正在下载:%s" % sound_name)
                # Download before opening the target so a failed request
                # does not leave an empty file behind.
                data = open_url(wav_links[0])
                file_name, n = _unused_wav_name(_safe_name(sound_name), n)
                with open(file_name, "wb") as f:
                    f.write(data)
        finally:
            # Return to the download root even if this category failed.
            os.chdir(root)
# Script entry point: run the scraper and report any failure.
if __name__ == "__main__":
    try:
        main()
    except SystemExit:
        # A SystemExit raised inside main() is swallowed so execution
        # continues past this block.
        pass
    except Exception:
        # Show the full traceback, then stop with a non-zero exit status.
        # (sys.quit() does not exist; sys.exit() is the correct call.)
        traceback.print_exc()
        sys.exit(1)
input()
主要是做pygame的时候需要各种背景,所以就直接爬下来了,学以致用啊。
厉害了,谢谢分享。
直接复制粘贴就能用,厉害了word哥!!!!!赞一个
这个爬一小部分可以,要大量爬得用框架,多线程处理。
这周的目标就是自己编一个,能爬一小部分的{:10_327:}
{:10_279:} 楼主辛苦
页:
[1]