xiaosi4081 发表于 2020-8-5 14:55:19

xpath有误

本帖最后由 xiaosi4081 于 2020-8-5 14:58 编辑

部分代码:
try:
    # Download the thread page and parse it with BeautifulSoup (lxml backend).
    res = get(url,headers=headers).text
    soup = BeautifulSoup(res,'lxml')
    vulstring = ""
    # One output line is built per <table class="plhin"> found in the page.
    for target in soup.find_all("table",class_="plhin"):
      # BUG: test() is not an XPath node test, so lxml rejects the expression
      # with XPathEvalError ("Invalid expression"); text() was intended.
      content = etree.HTML(target.text).xpath('//*[@class="t_f"]/test()')
      # Prefix the XPath result with the text of the first link nested in the
      # "pls favatar" div (presumably the poster's name — confirm).
      string = "%s:%s"% (target.find("div",class_="pls favatar").div.div.a.text,content)
      vulstring += string
      vulstring += "\n"

    print(vulstring)
except exceptions.MissingSchema:
    # requests raises MissingSchema when the URL has no scheme such as http://.
    print('url有误')

url是论坛上的帖子的地址,例:https://fishc.com.cn/thread-176366-1-1.html
主要是xpath的问题:

content = etree.HTML(target.text).xpath('//*[@class="t_f"]/test()')

报错:

Exception in Tkinter callback
Traceback (most recent call last):
File "C:\Users\x4\AppData\Local\Programs\Python\Python38\lib\tkinter\__init__.py", line 1883, in __call__
    return self.func(*args)
File "d:\requests\getwangye.py", line 47, in <lambda>
    startButton = Button(frame1,text="start",command=lambda : self.getting(self.url.get()))
File "d:\requests\getwangye.py", line 21, in getting
    content = etree.HTML(target.text).xpath('//*[@class="t_f"]/test()')
File "src\lxml\etree.pyx", line 1582, in lxml.etree._Element.xpath
File "src\lxml\xpath.pxi", line 305, in lxml.etree.XPathElementEvaluator.__call__
File "src\lxml\xpath.pxi", line 225, in lxml.etree._XPathEvaluatorBase._handle_result
lxml.etree.XPathEvalError: Invalid expression

qiuyouzhi 发表于 2020-8-5 15:01:15

噗,太粗心了,text打成了test
try:
    # Download the thread page and parse it with BeautifulSoup (lxml backend).
    res = get(url,headers=headers).text
    soup = BeautifulSoup(res,'lxml')
    vulstring = ""
    # One output line is built per <table class="plhin"> found in the page.
    for target in soup.find_all("table",class_="plhin"):
      # text() selects the direct text-node children of every element whose
      # class attribute is exactly "t_f".
      # NOTE(review): target.text is the table's text without markup, so the
      # re-parsed document presumably has no class="t_f" element left to
      # match; str(target) would keep the markup — confirm.
      content = etree.HTML(target.text).xpath('//*[@class="t_f"]/text()')
      # Prefix the XPath result with the text of the first link nested in the
      # "pls favatar" div (presumably the poster's name — confirm).
      string = "%s:%s"% (target.find("div",class_="pls favatar").div.div.a.text,content)
      vulstring += string
      vulstring += "\n"

    print(vulstring)
except exceptions.MissingSchema:
    # requests raises MissingSchema when the URL has no scheme such as http://.
    print('url有误')

xiaosi4081 发表于 2020-8-5 15:05:49

本帖最后由 xiaosi4081 于 2020-8-5 15:07 编辑

qiuyouzhi 发表于 2020-8-5 15:01
噗,太粗心了,text打成了test

但我这个代码没法弄到帖子的内容,那个xpath返回的是一个空列表,有什么其他的办法吗?

qiuyouzhi 发表于 2020-8-5 15:07:07

xiaosi4081 发表于 2020-8-5 15:05
但我这个代码没法弄到帖子的内容,那个xpath返回的是一个空列表,有什么其他的办法吗?

咱把代码发完整呗
不然我运行不了

xiaosi4081 发表于 2020-8-5 15:08:40

qiuyouzhi 发表于 2020-8-5 15:07
咱把代码发完整呗
不然我运行不了

我只是怕被人抄袭{:10_245:}

代码:
getwangye.py:
from tkinter import *
from requests import get,exceptions
import tkinter.messagebox
import clipboard
import tkinter.filedialog
from bs4 import BeautifulSoup
from lxml import etree
class getmain:
    """Panel that fetches a forum thread and shows its posts in a Text box.

    All widgets are created inside ``fm``, the Tk container handed to the
    constructor.
    """
    def __init__(self,fm):
      """Store the parent container ``fm`` and build the widgets inside it."""
      self.fm = fm
      self.maincode()

    def getting(self,url):
      """Download ``url`` and write one ``author:content`` line per post table.

      The result replaces the current contents of ``self.result``.  An error
      dialog is shown when requests raises ``MissingSchema``; any other
      exception propagates to the caller.
      """
      try:
            # SECURITY NOTE(review): the Cookie header below looks like a
            # logged-in browser session pasted into the source — treat it as a
            # secret and confirm it should be hard-coded at all.
            headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36","Cookie":"oMVX_2132_saltkey=R74UkqJK; oMVX_2132_lastvisit=1595396069; oMVX_2132_auth=1693Do2EjOr2r8ngcWOluYDuwkzl3LKM8fJ4GL4MhznBYgtr4f80N8ED9JPlvRmsK4KaBbiuX%2FP92S7fwrGzxPc%2Fjnk; oMVX_2132_lastcheckfeed=881467%7C1595399689; oMVX_2132_atarget=1; oMVX_2132_onlineindex=1; oMVX_2132_lastviewtime=881467%7C1595500559; PHPSESSID=avg92ob7gcv06c47fam0im67m0; oMVX_2132_space_top_credit_881467_all=207; oMVX_2132_home_diymode=1; oMVX_2132_ignore_notice=1; oMVX_2132_smile=10D1; oMVX_2132_nofocus_forum=1; oMVX_2132_atlist=566159%2C854664%2C378930%2C702609%2C849582; oMVX_2132_ulastactivity=fc13JyOysM5ExnW8UyySLlDsqBBuncLY%2BI8IMERIoUu78vHR42tc; oMVX_2132_sid=SFa8Bf; oMVX_2132_lip=119.130.231.105%2C1596543963; oMVX_2132_st_t=881467%7C1596546913%7C2f0047256f62124008c3d602f9b61fd7; oMVX_2132_forum_lastvisit=D_39_1595489878D_171_1595680528D_360_1595923195D_243_1596006547D_33_1596011098D_188_1596099175D_38_1596534860D_173_1596546913; oMVX_2132_home_readfeed=1596546916; oMVX_2132_noticeTitle=1; acw_tc=781bad0915965482246555278e7efe279f8a3234011af062f779af11e2dc63; oMVX_2132_visitedfid=173D38D33D188D337D354D241D39D242D335; oMVX_2132_viewid=tid_176798; oMVX_2132_sendmail=1; oMVX_2132_checkpm=1; oMVX_2132_st_p=881467%7C1596548308%7Cc54b90b96d8abef819e979b0397a55cb; _fmdata=OQmTawF8D5QYYw5z1d7VRJZWZr08pj0Nh2V4cP0xTcWdnjXY%2BdGfHTtlF8ZCyqxRG6Ng5pc0cl9klF2pXVNj0STkj9ckn7q%2Fabe950w6FN4%3D; oMVX_2132_lastact=1596548308%09misc.php%09patch"}

            res = get(url,headers=headers).text
            soup = BeautifulSoup(res,'lxml')
            vulstring = ""
            # One output line is built per <table class="plhin"> in the page.
            for target in soup.find_all("table",class_="plhin"):
                # NOTE(review): target.text is the table's text without markup,
                # so the re-parsed document presumably has no class="t_f"
                # element and the XPath yields []; str(target) would keep the
                # markup — confirm.  str() turns the result list into its repr.
                content = str(etree.HTML(target.text).xpath('//*[@class="t_f"]/text()'))
                # The author label is the text of the first link nested in the
                # "pls favatar" div (presumably the poster's name — confirm).
                string = "%s:%s"% (target.find("div",class_="pls favatar").div.div.a.text,content)
                vulstring += string
                vulstring += "\n"

            # Replace whatever the result box currently shows.
            self.result.delete(0.0,END)
            self.result.insert(0.0,vulstring)
      except exceptions.MissingSchema:
            # requests raises MissingSchema when the URL has no scheme.
            tkinter.messagebox.showerror('错误','url有误')
    def copy(self):
      """Copy the whole contents of the result box to the clipboard."""
      clipboard.copy(self.result.get(0.0,END))
    def savefile(self):
      """Ask for a target file and write the result box contents to it.

      Not bound to any widget in ``maincode``.
      """
      # NOTE(review): asksaveasfile() presumably returns None when the dialog
      # is cancelled, which would make the write below fail — confirm.
      path = tkinter.filedialog.asksaveasfile()
      path.write(str(self.result.get(0.0,END)))
      path.close()
    def closewindow(self):
      """Destroy the parent container and exit the interpreter.

      Not bound to any widget in ``maincode``.
      """
      self.fm.destroy()
      exit()
    def maincode(self):
      """Create the input row (label, URL entry, start button) and the result area."""
      frame1 = LabelFrame(self.fm,text="input")

      urllabel = Label(frame1,text="url is:   ")
      # NOTE(review): the label is packed and then gridded in the same frame;
      # the later grid() call presumably supersedes pack() — confirm the
      # pack() call is not needed.
      urllabel.pack()
      urllabel.grid(row=1,column=1)
      self.url = Entry(frame1)
      self.url.grid(row=1,column=2)
      # The lambda defers reading the entry until the button is pressed.
      startButton = Button(frame1,text="start",command=lambda : self.getting(self.url.get()))
      startButton.grid(row=1,column=3)
      frame1.pack()
      resultFrame = LabelFrame(self.fm,text="result")
      self.result = Text(resultFrame,width=35,height=15)
      resultcopy = Button(resultFrame,text="复制到剪贴板",command=self.copy)
      self.result.pack()
      resultcopy.pack()
      resultFrame.pack()

main.py:
# -*- coding: utf-8 -*-
from requests import get
from re import search
import tkinter as tk
import tkinter.messagebox
from threading import Thread
import time as ti
from getwangye import getmain

# 注:\1 用于引用前面编号为 1 的子组
class fishc_get:
    """Main window: announces newly listed help posts and hosts ``getmain``.

    A daemon thread polls ``LIST_URL``.  Whenever the first listed post has a
    title that was not announced before, a message is queued; the Tk main
    thread drains that queue and shows each message in the text box and in a
    warning dialog.  Only the main thread touches widgets, so the worker can
    never race with ``mainloop`` start-up.
    """

    # Page polled for the newest help post.
    LIST_URL = "https://fishc.com.cn/bestanswer.php?mod=huzhu&type=undo"
    POLL_SECONDS = 10      # pause between two polls of LIST_URL
    REQUEST_TIMEOUT = 10   # seconds before a stalled HTTP request is abandoned
    RETRY_SECONDS = 30     # pause before restarting the main loop after an error
    UI_POLL_MS = 200       # how often the main thread checks the message queue

    def __init__(self):
        """Create the root window, both labelled frames and the embedded panel."""
        # Local import keeps this class self-contained; the module's import
        # block does not provide ``queue``.
        import queue

        self.a = []                     # titles that were already announced
        self.func = None                # worker thread, created by duoxian()
        self._pending = queue.Queue()   # messages handed from worker to Tk thread
        self._drain_scheduled = False   # True once _show_pending is on the Tk timer

        self.root = tk.Tk()
        self.root.title("求助帖提醒")
        self.fm1 = tk.LabelFrame(self.root, text="get")
        self.fm1.grid(row=1, column=1)
        self.fm2 = tk.LabelFrame(self.root, text="get_tiezi")
        self.fm2.grid(row=1, column=2)
        self.t = tk.Text(self.fm1)
        self.t.pack()
        getmain(self.fm2)

    def _fetch_latest(self):
        """Return ``(name, url, ans, time)`` for the first listed post.

        Returns ``None`` when the page does not contain every expected field
        (for example an error page or an empty listing).  Network errors
        raised by ``requests`` propagate to the caller.
        """
        res = get(self.LIST_URL, timeout=self.REQUEST_TIMEOUT).text
        matches = (
            # title of the post
            search(r'<a href="https://fishc.com.cn/thread-\d+?-1-1.html" target="_blank">(.+?)</a>', res),
            # thread id used to rebuild the post URL
            search(r'<a href="https://fishc.com.cn/thread-(.+?)-1-1.html" target="_blank"', res),
            # number of replies
            search(r'<font color="#999999">(\d+?)</font>', res),
            # posting time
            search(r'<font color="#999999">(\d+?-\d+?-\d+? \d+?:\d+?)</font>', res),
        )
        if any(m is None for m in matches):
            return None
        name, thread_id, ans, time = (m.group(1) for m in matches)
        url = "https://fishc.com.cn/thread-" + thread_id + "-1-1.html"
        return name, url, ans, time

    def load(self):
        """Poll the listing forever and queue a message for every new title.

        Intended as the target of the worker thread; it never returns.
        """
        while True:
            try:
                post = self._fetch_latest()
            except OSError:
                # requests' exceptions derive from IOError/OSError; a failed
                # poll is simply retried on the next round.
                post = None
            if post is not None and post[0] not in self.a:
                name, url, ans, time = post
                b = f" 标题:{name}\n 回答数:{ans}\n 提问时间:{time}\n 地址:{url}\n\n"
                self._pending.put(b)
                self.a.append(name)
            ti.sleep(self.POLL_SECONDS)

    def _show_pending(self):
        """Display every queued message, then re-arm the Tk timer."""
        try:
            # Single consumer: a non-empty queue guarantees get_nowait() works.
            while not self._pending.empty():
                b = self._pending.get_nowait()
                self.t.insert(tk.END, b)
                tkinter.messagebox.showwarning("提示", b)
        finally:
            self.root.after(self.UI_POLL_MS, self._show_pending)

    def duoxian(self):
        """Start the polling thread and run the Tk main loop until it ends.

        If starting up or the main loop raises an ordinary exception, the
        attempt is repeated after ``RETRY_SECONDS``.  ``KeyboardInterrupt``
        and ``SystemExit`` are not caught, so the program can still be
        stopped, and a worker that is still alive is never started twice.
        """
        while True:
            try:
                if self.func is None or not self.func.is_alive():
                    self.func = Thread(target=self.load, daemon=True)
                    self.func.start()
                if not self._drain_scheduled:
                    self.root.after(self.UI_POLL_MS, self._show_pending)
                    self._drain_scheduled = True
                self.root.mainloop()
                return
            except Exception:
                ti.sleep(self.RETRY_SECONDS)

if __name__ == "__main__":
    # Build the window, then start the polling thread and enter the Tk main loop.
    cl = fishc_get()
    cl.duoxian()
   

qiuyouzhi 发表于 2020-8-5 15:21:27

xiaosi4081 发表于 2020-8-5 15:05
但我这个代码没法弄到帖子的内容,那个xpath返回的是一个空列表,有什么其他的办法吗?

getwangye.py改成这样就行

from tkinter import *
from requests import get,exceptions
import tkinter.messagebox
import clipboard
import tkinter.filedialog
from bs4 import BeautifulSoup
from lxml import etree
class getmain:
    """Panel that fetches a forum thread and lists its posts as ``author:content``.

    All widgets are created inside ``fm``, the Tk container handed to the
    constructor.
    """

    def __init__(self,fm):
        """Store the parent container ``fm`` and build the widgets inside it."""
        self.fm = fm
        self.maincode()

    def getting(self,url):
        """Download ``url`` and show one ``author:content`` line per post.

        Every ``<table class="plhin">`` is treated as one post: the author is
        the first link nested in its "pls favatar" div and the content is all
        text below its ``class="t_f"`` element.  Tables without an author
        link are skipped.  The result replaces the contents of
        ``self.result``.  Request failures are reported in an error dialog
        instead of being raised.
        """
        # SECURITY NOTE(review): the Cookie header below looks like a
        # logged-in browser session pasted into the source — treat it as a
        # secret and confirm it should be hard-coded at all.
        headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36","Cookie":"oMVX_2132_saltkey=R74UkqJK; oMVX_2132_lastvisit=1595396069; oMVX_2132_auth=1693Do2EjOr2r8ngcWOluYDuwkzl3LKM8fJ4GL4MhznBYgtr4f80N8ED9JPlvRmsK4KaBbiuX%2FP92S7fwrGzxPc%2Fjnk; oMVX_2132_lastcheckfeed=881467%7C1595399689; oMVX_2132_atarget=1; oMVX_2132_onlineindex=1; oMVX_2132_lastviewtime=881467%7C1595500559; PHPSESSID=avg92ob7gcv06c47fam0im67m0; oMVX_2132_space_top_credit_881467_all=207; oMVX_2132_home_diymode=1; oMVX_2132_ignore_notice=1; oMVX_2132_smile=10D1; oMVX_2132_nofocus_forum=1; oMVX_2132_atlist=566159%2C854664%2C378930%2C702609%2C849582; oMVX_2132_ulastactivity=fc13JyOysM5ExnW8UyySLlDsqBBuncLY%2BI8IMERIoUu78vHR42tc; oMVX_2132_sid=SFa8Bf; oMVX_2132_lip=119.130.231.105%2C1596543963; oMVX_2132_st_t=881467%7C1596546913%7C2f0047256f62124008c3d602f9b61fd7; oMVX_2132_forum_lastvisit=D_39_1595489878D_171_1595680528D_360_1595923195D_243_1596006547D_33_1596011098D_188_1596099175D_38_1596534860D_173_1596546913; oMVX_2132_home_readfeed=1596546916; oMVX_2132_noticeTitle=1; acw_tc=781bad0915965482246555278e7efe279f8a3234011af062f779af11e2dc63; oMVX_2132_visitedfid=173D38D33D188D337D354D241D39D242D335; oMVX_2132_viewid=tid_176798; oMVX_2132_sendmail=1; oMVX_2132_checkpm=1; oMVX_2132_st_p=881467%7C1596548308%7Cc54b90b96d8abef819e979b0397a55cb; _fmdata=OQmTawF8D5QYYw5z1d7VRJZWZr08pj0Nh2V4cP0xTcWdnjXY%2BdGfHTtlF8ZCyqxRG6Ng5pc0cl9klF2pXVNj0STkj9ckn7q%2Fabe950w6FN4%3D; oMVX_2132_lastact=1596548308%09misc.php%09patch"}

        # Only the request itself sits inside the try block, so parsing bugs
        # are not mistaken for a bad URL.
        try:
            res = get(url,headers=headers,timeout=10).text
        except (exceptions.MissingSchema,exceptions.InvalidSchema,exceptions.InvalidURL):
            tkinter.messagebox.showerror('错误','url有误')
            return
        except exceptions.RequestException as err:
            # Connection problems and timeouts: report them instead of
            # letting the exception escape from the Tk callback.
            tkinter.messagebox.showerror('错误','请求失败: %s' % err)
            return

        soup = BeautifulSoup(res,'lxml')
        lines = []
        for target in soup.find_all("table",class_="plhin"):
            try:
                author = target.find("div",class_="pls favatar").div.div.a.text
            except AttributeError:
                # find() or one of the nested lookups returned None: this
                # table has no author link, so it is not listed.
                continue
            # str(target) keeps the markup (target.text would strip it), so
            # the XPath is evaluated against this post only; //text() also
            # collects text nested inside child tags.
            texts = etree.HTML(str(target)).xpath('//*[@class="t_f"]//text()')
            content = "".join(texts).strip()
            lines.append("%s:%s" % (author,content))
        vulstring = "".join(line + "\n" for line in lines)

        self.result.delete("1.0",END)
        self.result.insert("1.0",vulstring)

    def copy(self):
        """Copy the result box contents to the clipboard."""
        # "end-1c" leaves out the newline the Text widget always appends.
        clipboard.copy(self.result.get("1.0","end-1c"))

    def savefile(self):
        """Ask for a target file and write the result box contents to it."""
        path = tkinter.filedialog.asksaveasfile()
        if path is None:
            # The dialog was cancelled; nothing to write.
            return
        # The context manager closes the file even if the write fails.
        with path:
            path.write(str(self.result.get("1.0",END)))

    def closewindow(self):
        """Destroy the parent container and exit the interpreter."""
        self.fm.destroy()
        # Same effect as exit(), without depending on the helper that the
        # site module injects into builtins.
        raise SystemExit

    def maincode(self):
        """Create the input row (label, URL entry, start button) and the result area."""
        frame1 = LabelFrame(self.fm,text="input")

        # Children of frame1 are laid out with grid only; a pack() call
        # followed by grid() on the same widget is redundant.
        urllabel = Label(frame1,text="url is:   ")
        urllabel.grid(row=1,column=1)
        self.url = Entry(frame1)
        self.url.grid(row=1,column=2)
        # The lambda defers reading the entry until the button is pressed.
        startButton = Button(frame1,text="start",command=lambda : self.getting(self.url.get()))
        startButton.grid(row=1,column=3)
        frame1.pack()

        resultFrame = LabelFrame(self.fm,text="result")
        self.result = Text(resultFrame,width=35,height=15)
        resultcopy = Button(resultFrame,text="复制到剪贴板",command=self.copy)
        self.result.pack()
        resultcopy.pack()
        resultFrame.pack()
页: [1]
查看完整版本: xpath有误