|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import time
import re
import pymongo
from lxml import etree
import requests
# Request headers sent with every HTTP request in this script; the
# User-Agent is a desktop Chrome string (presumably so the site serves the
# regular browser page rather than rejecting a script client -- confirm).
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
def get_url_muisc(url):
    """Fetch one listing page and scrape every album detail page linked from it.

    Args:
        url: URL of a Top-250 listing page.

    Each detail-page link found is passed to ``get_muisc_info``.
    """
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    html = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(html.text)
    # The original expression read ``@href`` directly off a <div>, which
    # never has an href attribute, so it always yielded an empty list.
    # The link lives on the <a> inside the div.
    # NOTE(review): assumes the listing markup is
    # <div class="pl2"><a href="...">; the original "p12" looks like a
    # misreading of "pl2" -- confirm against the live page source.
    muisc_hrefs = selector.xpath('//div[@class="pl2"]/a/@href')
    for muisc_href in muisc_hrefs:
        get_muisc_info(muisc_href)
def _first_match(pattern, text, default='未知'):
    """Return the first capture group of *pattern* in *text*, stripped.

    Falls back to *default* when the pattern does not match or the captured
    text is empty.
    """
    found = re.search(pattern, text, re.S)
    if found is None:
        return default
    return found.group(1).strip() or default


def get_muisc_info(url):
    """Fetch one album detail page, extract its metadata and print it.

    Args:
        url: URL of the album detail page.

    The extracted fields (name, author, styles, time, publishers, score)
    are collected into a dict and printed; a field that cannot be found on
    the page is reported as '未知'.
    """
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    html = requests.get(url, headers=headers, timeout=10)
    page = html.text
    selector = etree.HTML(page)

    # Guard the XPath lookups: indexing [0] on an empty result would raise
    # IndexError and abort the whole crawl for a single odd page.
    names = selector.xpath('//*[@id="wrapper"]/h1/span/text()')
    name = names[0] if names else '未知'
    scores = selector.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()')
    score = scores[0] if scores else '未知'

    # The original patterns contained literal '"' characters (the way a
    # browser DOM inspector displays text nodes) and required a bare <br>,
    # so they could not match the raw response text.
    # NOTE(review): assumes the raw HTML separates label and value with
    # "&nbsp;" and/or whitespace and ends the value with <br> or <br /> --
    # confirm against a saved copy of the page.
    gap = r'(?:&nbsp;|\s)*'
    author = _first_match(r'表演者[::].*?>(.*?)</a>', page)
    style = _first_match(r'<span class="pl">流派[::]</span>' + gap + r'(.*?)<br', page)
    # Named release_time so the local does not shadow the ``time`` module.
    release_time = _first_match(r'<span class="pl">发行时间[::]</span>' + gap + r'(.*?)<br', page)
    publishers = _first_match(r'<span class="pl">出版者[::]</span>' + gap + r'(.*?)<br', page)

    info = {
        'name': name,
        'author': author,
        # Store the normalised value; the original stored the raw match list
        # and left the computed ``style`` unused.
        'styles': style,
        'time': release_time,
        'publishers': publishers,
        'score': score,
    }
    print(info)
if __name__ == '__main__':
    # Ten listing pages, addressed by start offsets 0, 25, ..., 225.
    urls = ['https://music.douban.com/top250?start={}'.format(i) for i in range(0, 250, 25)]
    for url in urls:
        get_url_muisc(url)
        # Pause between listing pages to throttle the request rate.
        # NOTE(review): the pasted code lost its indentation; the sleep is
        # placed inside the loop on the assumption that it was meant as a
        # per-page delay -- confirm.
        time.sleep(2)
麻烦各位同学看一下,我这段代码中间抓取部分用的匹配方法(XPath 和正则表达式)有什么问题?谢谢了。
你得到页面数据后,先把它 print 出来,然后复制到正则测试工具里,用你的表达式去匹配,看是否能匹配出来:能匹配,则表示你的表达式没有问题;不能匹配,则需要修正表达式。
|
|