|
|

楼主 |
发表于 2015-8-21 23:20:18
|
显示全部楼层
本帖最后由 ~风介~ 于 2015-8-30 23:56 编辑
- 代码是这样的,刚学Python,见笑了:
- import urllib.request
- import re
- import time
def openurl(urls, delay=5):
    """Download every URL in *urls* and return the raw response bodies.

    Each request is sent with a desktop Firefox ``User-Agent`` header.

    Args:
        urls: Iterable of URL strings to fetch, in order.
        delay: Seconds to wait between two consecutive requests. No wait
            happens before the first request or after the last one.

    Returns:
        A list of ``bytes`` objects, one per URL, in the same order as *urls*.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when a request
            fails.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0'
    htmls = []
    for index, url in enumerate(urls):
        # Pause only *between* requests; sleeping after the final download
        # would just delay the caller.
        if index and delay:
            time.sleep(delay)
        req = urllib.request.Request(url)
        req.add_header('User-Agent', user_agent)
        # Open the Request object (not the bare URL) so the header above is
        # actually sent, and close the response once it has been read.
        with urllib.request.urlopen(req) as response:
            htmls.append(response.read())
    return htmls
def jiexi(htmls, out_path='C:\\Users\\Administrator\\Desktop\\test.txt'):
    """Extract title, cover image link and description from each page.

    For every page one record is written to *out_path* (UTF-8):

        期刊名:<title>
        图片链接:<image url up to and including .jpg/.jpeg>
        配诗:<description, with each <br> line break replaced by '#'>

    followed by two blank lines. A field that cannot be found in the page is
    written as an empty string.

    Args:
        htmls: Iterable of pages, each either UTF-8 encoded ``bytes`` or
            ``str``.
        out_path: File to (over)write with the extracted records. Defaults
            to the path the original script used.

    Returns:
        None. The result is the file written at *out_path*.

    Raises:
        UnicodeDecodeError: If a ``bytes`` page is not valid UTF-8.
        OSError: If *out_path* cannot be opened for writing.
    """
    # Compile once, outside the per-page loop.
    pic_re = re.compile(
        r'<div class="player-wrapper".*?<img.*?src="([^"]*?\.jpe?g)[^"]*"', re.S)
    title_re = re.compile(r'class="vol-title">(.*?)</span>', re.S)
    content_re = re.compile(r'<div[^>]*class="vol-desc">(.*?)</div>', re.S)
    br_re = re.compile(r'<br\s*/?>\s*')

    def first_group(pattern, text):
        # Return the first captured group, or '' when the page lacks it.
        match = pattern.search(text)
        return match.group(1) if match else ''

    # Explicit encoding: the platform default (e.g. GBK on Windows) cannot
    # represent every character a page may contain.
    with open(out_path, 'w', encoding='utf-8') as f:
        for x, html in enumerate(htmls):
            if isinstance(html, bytes):
                html = html.decode('utf-8')

            # NOTE(review): the counter runs down from 746 although the
            # driver script fetches issues 647 upwards -- confirm the
            # intended numbering before relying on this message.
            print("正在下载期刊:%d" % (746 - x))

            title = first_group(title_re, html)
            pic = first_group(pic_re, html)
            # str methods return new strings, so the results must be kept:
            # trim surrounding whitespace, then collapse every <br> (plus
            # the newline after it) into a single '#'.
            content = br_re.sub('#', first_group(content_re, html).strip())

            f.write("期刊名:" + title + "\n")
            f.write("图片链接:" + pic + "\n")
            f.write("配诗:" + content + "\n\n\n")
# Base address of the issue pages; the issue number is appended to it.
yur = 'http://www.luoo.net/music/'

# Pages 647 and 648 (the upper bound of range() is exclusive).
urls = [yur + str(number) for number in range(647, 649)]

# Download the pages, then parse them and write the extracted records.
htmls = openurl(urls)
pics = jiexi(htmls)
复制代码 |
评分
-
查看全部评分
|