from lxml import etree
import requests
def openurl(url, timeout=10, encoding='gb2312'):
    """Download *url* and return the response body decoded as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it the call could block
            forever on an unresponsive host.
        encoding: Character set used to decode the response body. The
            default, ``'gb2312'``, is the value this script always used.

    Returns:
        The decoded page source as a ``str``.

    Raises:
        requests.RequestException: On connection failure or timeout, and
            (through ``raise_for_status``) on an HTTP 4xx/5xx response.
    """
    # Send a desktop-browser User-Agent header with the request.
    head = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/69.0.3497.92 Safari/537.36'
        ),
    }
    res = requests.get(url, headers=head, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page
    # to the parser as if it were the timetable.
    res.raise_for_status()
    # Override the detected charset before reading .text so the body is
    # decoded with the requested encoding.
    res.encoding = encoding
    return res.text
def _first_text(node, path, default=''):
    """Return the first XPath result for *path* under *node*, or *default*."""
    found = node.xpath(path)
    return found[0] if found else default


def parseurl(text):
    """Extract the train records from the timetable page source.

    Every ``<tr>`` carrying the page's ``onmouseover`` highlight attribute
    is treated as one record. The text of ``td[1]/a/b`` (the train number,
    ``checi``) becomes the key; the value is a five-item list taken from
    cells 2, 3, 4, 8 and 9, which the original code names model
    (``xinghao``), origin station (``shifazhan``), departure time
    (``shifashijian``), terminal station (``zhongdianzhan``) and arrival
    time (``daodashijian``).

    A row without a train number is skipped and any other missing cell is
    recorded as ``''``, so one malformed row no longer aborts the whole run
    with ``IndexError``. If two rows share a train number, the later row
    replaces the earlier one.

    Args:
        text: HTML source of the timetable page.

    Returns:
        dict mapping train number to
        ``[model, origin, departure, terminal, arrival]``. The dict and its
        length are also printed, as before.
    """
    train_info_dict = {}
    html = etree.HTML(text)
    # Defensive: nothing to query if the parser produced no root element.
    rows = [] if html is None else html.xpath(
        '//tr[@onmouseover="this.bgColor=\'#E6F2E7\';"]')
    # Cells holding, in order: model, origin, departure, terminal, arrival.
    field_paths = (
        './td[2]/text()',
        './td[3]/text()',
        './td[4]/text()',
        './td[8]/text()',
        './td[9]/text()',
    )
    for each in rows:
        checi = _first_text(each, './td[1]/a/b/text()', default=None)
        if checi is None:
            # No train number means there is no usable key for this row.
            continue
        train_info_dict[checi] = [_first_text(each, p) for p in field_paths]
    print(train_info_dict)
    print(len(train_info_dict))
    return train_info_dict
def main():
    """Fetch the hard-coded timetable page and pass its HTML to the parser."""
    parseurl(openurl('http://qq.ip138.com/train/guangdong/guangzhounan.htm'))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
我就直接打印出来了,你可以用pickle把它以二进制保存,或者以json格式保存,当然也可以用openpyxl保存到Excel,但是考虑到原网站就是以表格形式输出,所以感觉没必要。共940行,全部爬取。