[Web Scraping] About XPath expressions
I tried running the code below:

import requests
from selenium import webdriver
from lxml import etree
import time
from bs4 import BeautifulSoup # import the required modules
last = []
# list_url = ['http://nce.koolearn.com/20150122/781915.html', 'http://nce.koolearn.com/20150122/781916.html', 'http://nce.koolearn.com/20150122/781917.html', 'http://nce.koolearn.com/20150122/781918.html', 'http://nce.koolearn.com/20150122/781919.html', 'http://nce.koolearn.com/20150122/781920.html', 'http://nce.koolearn.com/20150122/781921.html', 'http://nce.koolearn.com/20150122/781922.html', 'http://nce.koolearn.com/20150122/781923.html', 'http://nce.koolearn.com/20150122/781924.html', 'http://nce.koolearn.com/20150123/781955.html', 'http://nce.koolearn.com/20150123/781956.html', 'http://nce.koolearn.com/20150123/781957.html', 'http://nce.koolearn.com/20150123/781958.html', 'http://nce.koolearn.com/20150123/781959.html', 'http://nce.koolearn.com/20150123/781960.html', 'http://nce.koolearn.com/20150123/781961.html', 'http://nce.koolearn.com/20150123/781962.html', 'http://nce.koolearn.com/20150123/781963.html', 'http://nce.koolearn.com/20150123/781964.html', 'http://nce.koolearn.com/20150126/781995.html', 'http://nce.koolearn.com/20150126/781996.html', 'http://nce.koolearn.com/20150126/781997.html', 'http://nce.koolearn.com/20150126/781998.html', 'http://nce.koolearn.com/20150126/781999.html', 'http://nce.koolearn.com/20150127/782015.html', 'http://nce.koolearn.com/20150127/782016.html', 'http://nce.koolearn.com/20150127/782017.html', 'http://nce.koolearn.com/20150127/782018.html', 'http://nce.koolearn.com/20150127/782019.html', 'http://nce.koolearn.com/20150128/782040.html', 'http://nce.koolearn.com/20150128/782041.html', 'http://nce.koolearn.com/20150128/782042.html', 'http://nce.koolearn.com/20150128/782043.html', 'http://nce.koolearn.com/20150128/782044.html', 'http://nce.koolearn.com/20150129/782060.html', 'http://nce.koolearn.com/20150129/782061.html', 'http://nce.koolearn.com/20150129/782062.html', 'http://nce.koolearn.com/20150129/782063.html', 'http://nce.koolearn.com/20150129/782064.html', 'http://nce.koolearn.com/20150130/782080.html', 'http://nce.koolearn.com/20150130/782081.html', 'http://nce.koolearn.com/20150130/782082.html', 'http://nce.koolearn.com/20150130/782083.html', 'http://nce.koolearn.com/20150130/782084.html', 'http://nce.koolearn.com/20150202/782103.html', 'http://nce.koolearn.com/20150202/782104.html', 'http://nce.koolearn.com/20150202/782105.html', 'http://nce.koolearn.com/20150202/782106.html', 'http://nce.koolearn.com/20150202/782107.html', 'http://nce.koolearn.com/20150203/782113.html', 'http://nce.koolearn.com/20150203/782114.html', 'http://nce.koolearn.com/20150203/782115.html', 'http://nce.koolearn.com/20150203/782116.html', 'http://nce.koolearn.com/20150203/782117.html', 'http://nce.koolearn.com/20150204/782129.html', 'http://nce.koolearn.com/20150204/782130.html', 'http://nce.koolearn.com/20150204/782131.html', 'http://nce.koolearn.com/20150204/782132.html', 'http://nce.koolearn.com/20150204/782133.html', 'http://nce.koolearn.com/20150205/782144.html', 'http://nce.koolearn.com/20150205/782145.html', 'http://nce.koolearn.com/20150205/782146.html', 'http://nce.koolearn.com/20150205/782147.html', 'http://nce.koolearn.com/20150205/782148.html', 'http://nce.koolearn.com/20150206/782154.html', 'http://nce.koolearn.com/20150206/782155.html', 'http://nce.koolearn.com/20150206/782156.html', 'http://nce.koolearn.com/20150206/782157.html', 'http://nce.koolearn.com/20150206/782158.html', 'http://nce.koolearn.com/20150206/782159.html', 'http://nce.koolearn.com/20150206/782160.html']
def get_text(url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"}  # request headers
    response = requests.get(url=url, headers=headers)  # fetch the page
    response.encoding = "utf-8"  # set the encoding explicitly to avoid garbled text
    global result
    result = response.text  # page source as a string
    """
    soup = BeautifulSoup(result, "lxml")
    global b
    b = soup.select('.xqy_container.w1200>.xqy_container_box>.xqy_core>.xqy_core_main>.xqy_core_text>p')
    for test in range(len(b)):
        b = str(b)
        b = b.replace("<p>", "")
        b = b.replace("</p>", "")
        b = b.replace("\u3000", "")
    """
    # the block above does not need to be executed and can be skipped
    global c
    global d
    c = etree.HTML(result)  # instantiate the etree object
    d = c.xpath("/html/body/div/div/div/div/div/p//text()")  # extract the text via XPath

i = 'http://nce.koolearn.com/20150122/781915.html'  # target url
get_text(i)
When I try to read the value of d, it comes back as an empty list. What should I do?
PS: the same XPath expression from the code does locate the element when I run it in the browser.
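A likely explanation worth ruling out first: the element tree the browser inspector shows is the rendered, repaired DOM, which does not have to match the raw HTML that requests downloads (browsers normalize broken markup, and scripts can insert wrapper divs), so an absolute path copied from the inspector can fail in lxml even though it matches in the browser. Here is a small diagnostic sketch, not part of the original code, that borrows the class name xqy_core_main from the commented-out selector above and shows where the absolute path stops matching:

import requests
from lxml import etree

url = 'http://nce.koolearn.com/20150122/781915.html'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"}
html = requests.get(url, headers=headers).text

# 1. Does the class seen in the browser inspector appear in the raw HTML at all?
print('xqy_core_main' in html)

# 2. Walk the absolute path one level at a time to see where it stops matching.
tree = etree.HTML(html)
for path in ("/html/body/div",
             "/html/body/div/div",
             "/html/body/div/div/div",
             "/html/body/div/div/div/div",
             "/html/body/div/div/div/div/div",
             "/html/body/div/div/div/div/div/p"):
    print(path, "->", len(tree.xpath(path)))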
Try changing it to this:
d = c.xpath('//div[@class="xqy_core_main"]/div/p/text()')
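For reference, a minimal sketch of this fix as applied, trimmed down from the code above (same URL and headers; nothing new beyond the suggested expression and the local names url and tree):

import requests
from lxml import etree

url = 'http://nce.koolearn.com/20150122/781915.html'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"}

response = requests.get(url=url, headers=headers)
response.encoding = "utf-8"
tree = etree.HTML(response.text)

# Anchor on the class name instead of counting divs from the document root
d = tree.xpath('//div[@class="xqy_core_main"]/div/p/text()')
print(d)

Note that the suggested expression also uses p/text() rather than p//text(), so it only returns text nodes sitting directly inside each <p>; if some of the text is wrapped in nested tags such as <span>, switch back to //text() inside the relative path.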
Which part of the data are you trying to scrape?