|
|
发表于 2018-4-10 00:26:55
|
显示全部楼层
# Fetch one daily page from 100ppi.com and pull the table-row fields out of
# it with a regular expression, printing both the raw rows and the matches.
import re
import urllib.request as ur

from bs4 import BeautifulSoup

url = r'http://www.100ppi.com/sf/day-2017-01-04.html'
# Send a browser-like User-Agent instead of urllib's default one.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}
req = ur.Request(url=url, headers=header)

# Use the response as a context manager so the connection is always closed,
# and bound the wait so a stalled server cannot hang the script forever.
with ur.urlopen(req, timeout=30) as resp:
    html = resp.read().decode('utf-8')

soup = BeautifulSoup(html, 'html.parser')

# Only the <tr align="center"> rows are searched below.
trs = soup.find_all('tr', align='center')

# The regex runs over the string form of the whole result list, i.e. over
# BeautifulSoup's re-serialised markup rather than the raw page bytes.
# NOTE(review): the pattern therefore depends on how bs4 serialises
# attributes and newlines -- confirm against the printed output.
trs1 = str(trs)
print(trs1)

# Six capture groups per row: the link text, three plain <td> texts, and the
# two <font> texts inside the nested two-column table.
res2 = (
    r'<td><a style="color:#21469f;" target="_blank">(.*?)</a></td>\n'
    r'<td>(.*?)</td>\n'
    r'<td>(.*?)</td>\n'
    r'<td>(.*?)</td>\n'
    r'<td>\n'
    r'<table width="100%"><tr>'
    r'<td align="center" width="50%"><font color=.*?>(.*?)</font></td>'
    r'<td align="center" width="50%"><font color=.*?>(.*?)</font></td>'
    r'</tr></table>\n'
    r'</td>'
)

# re.S lets `.` span newlines inside a cell; re.M was dropped because the
# pattern has no ^ or $ anchors for it to affect.
l = re.findall(res2, trs1, re.S)

# A bare `l` only echoes in an interactive session; print it so the result
# is also visible when this runs as a script.
print(l)
复制代码 |
|