# OK, that's settled -- tidied this up for you once more.
import requests
from lxml import etree
# Scrape the dmoztools.net "Python / Books" listing page and print one dict
# per listed site, holding the site's title and its cleaned-up description.

# Page to scrape and the browser-like User-Agent header sent with the request.
url = "http://www.dmoztools.net/Computers/Programming/Languages/Python/Books/"
headers = {"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Firefox/73.0 Safari/537.36"}

# Seconds to wait for the server before giving up; without a timeout
# requests.get() may block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10

# XPath matching one <div> per listed entry; used only to count the entries.
ENTRY_XPATH = "//div[@id='site-list-content']/div"
# Per-entry XPath templates; %d is the 1-based position of the entry.
TITLE_XPATH = "//section[@class='results sites']/div[1]/div[1]/div[%d]/div[3]/a/div[1]/text()"
CONTENT_XPATH = "//section[@class='results sites']/div[1]/div[1]/div[%d]/div[3]/div[1]/text()"

# Translation table that deletes carriage returns, newlines and tabs.
_LINE_BREAKS_AND_TABS = str.maketrans("", "", "\r\n\t")


def clean_text(text):
    """Return *text* with CR/LF/TAB removed and whitespace runs collapsed.

    Carriage returns, newlines and tabs are deleted outright (they are not
    turned into spaces, so text on either side of them is fused). The
    remaining whitespace-separated words are then joined by single spaces,
    which also strips leading and trailing whitespace.
    """
    return " ".join(text.translate(_LINE_BREAKS_AND_TABS).split())


def extract_entries(tree):
    """Collect the title and description of every listed site.

    Args:
        tree: Parsed document exposing an ``xpath(expr)`` method that
            returns a list of matches (here the result of ``etree.HTML``).

    Returns:
        A list of ``{"title": ..., "content": ...}`` dicts in page order.
        Positions whose title lookup matches nothing are skipped, and a
        missing description is stored as an empty string.
    """
    entries = []
    entry_count = len(tree.xpath(ENTRY_XPATH))
    for position in range(1, entry_count + 1):
        title = tree.xpath(TITLE_XPATH % position)
        if not title:
            # The count and the per-entry lookups use different XPaths, so a
            # position may have no title; skip it instead of raising
            # IndexError on title[0].
            continue
        content = tree.xpath(CONTENT_XPATH % position)
        entries.append({
            "title": title[0],
            "content": clean_text(content[0]) if content else "",
        })
    return entries


def main():
    """Download the listing page, extract its entries and print each one."""
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error rather than parsing an error page.
    response.raise_for_status()
    tree = etree.HTML(response.text)
    for entry in extract_entries(tree):
        print(entry)


if __name__ == "__main__":
    main()
|