from urllib import request
import re
class Spider():
    """Scrape streamer names and viewer counts from a panda.tv category page.

    The pipeline run by :meth:`go` is: download the page, extract every
    ``video-info`` card with regular expressions, normalise each card into a
    ``{'name': str, 'number': str}`` dict, sort the dicts by viewer count and
    print them.
    """

    # Page that is downloaded by __fetch_content.
    url = "https://www.panda.tv/cate/lol?pdt=1.24.s1.3.2c6qoma1l34"
    # Captures the inner HTML of one "video-info" card (non-greedy, so it
    # stops at the first closing </div>).
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    # Captures the text between the icon's closing </i> and the next </span>.
    name_pattern = r'</i>([\s\S]+?)</span>'
    # Captures the text of the "video-number" span.
    number_pattern = r'<span class="video-number">([\s\S]+?)</span>'

    def __fetch_content(self):
        """Download ``url`` and return the response body decoded as UTF-8."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(self.url) as response:
            raw = response.read()
        return str(raw, encoding="utf-8")

    def __analysis(self, htmls):
        """Extract raw name/number matches from every card in ``htmls``.

        Returns a list of ``{'name': [...], 'number': [...]}`` dicts, where
        each value is the (possibly empty) list of regex matches found
        inside one card.
        """
        return [
            {
                'name': re.findall(self.name_pattern, card),
                'number': re.findall(self.number_pattern, card),
            }
            for card in re.findall(self.root_pattern, htmls)
        ]

    def __refine(self, anchors):
        """Lazily yield ``{'name': str, 'number': str}`` for each raw anchor.

        The first match of each field is used and the name is stripped of
        surrounding whitespace. Cards in which either field produced no
        match are skipped instead of raising ``IndexError``.
        """
        return (
            {'name': anchor['name'][0].strip(), 'number': anchor['number'][0]}
            for anchor in anchors
            if anchor['name'] and anchor['number']
        )

    def __sort(self, anchors):
        """Return a new list of ``anchors`` in ascending viewer-count order."""
        return sorted(anchors, key=self.__sort1)

    def __sort1(self, anchor):
        """Sort key: convert ``anchor['number']`` to a float viewer count.

        The first integer or decimal number in the text is parsed. If the
        text contains "万" (the Chinese unit for ten thousand) the value is
        multiplied by 10000. Text without any digits yields ``0.0``.
        """
        text = anchor['number']
        match = re.search(r"\d+(?:\.\d+)?", text)
        if match is None:
            return 0.0
        number = float(match.group())
        if "万" in text:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one ``name------>number`` line per anchor."""
        for anchor in anchors:
            print(anchor['name'] + '------>' + anchor['number'])

    def go(self):
        """Run the full fetch, parse, refine, sort and print pipeline."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)
# Script entry: create a Spider and start the crawl by calling go().
# NOTE(review): these statements run at module level, so importing this file
# also triggers the network request made by go().
s = Spider()
s.go()
|