xpath有误
本帖最后由 xiaosi4081 于 2020-8-5 14:58 编辑部分代码:
try:
res = get(url,headers=headers).text
soup = BeautifulSoup(res,'lxml')
vulstring = ""
for target in soup.find_all("table",class_="plhin"):
content = etree.HTML(target.text).xpath('//*[@class="t_f"]/test()')
string = "%s:%s"% (target.find("div",class_="pls favatar").div.div.a.text,content)
vulstring += string
vulstring += "\n"
print(vulstring)
except exceptions.MissingSchema:
print('url有误')
url是论坛上的帖子的地址,例:https://fishc.com.cn/thread-176366-1-1.html
主要是xpath的问题:
content = etree.HTML(target.text).xpath('//*[@class="t_f"]/test()')
报错:
Exception in Tkinter callback
Traceback (most recent call last):
File "C:\Users\x4\AppData\Local\Programs\Python\Python38\lib\tkinter\__init__.py", line 1883, in __call__
return self.func(*args)
File "d:\requests\getwangye.py", line 47, in <lambda>
startButton = Button(frame1,text="start",command=lambda : self.getting(self.url.get()))
File "d:\requests\getwangye.py", line 21, in getting
content = etree.HTML(target.text).xpath('//*[@class="t_f"]/test()')
File "src\lxml\etree.pyx", line 1582, in lxml.etree._Element.xpath
File "src\lxml\xpath.pxi", line 305, in lxml.etree.XPathElementEvaluator.__call__
File "src\lxml\xpath.pxi", line 225, in lxml.etree._XPathEvaluatorBase._handle_result
lxml.etree.XPathEvalError: Invalid expression 噗,太粗心了,text打成了test
try:
res = get(url,headers=headers).text
soup = BeautifulSoup(res,'lxml')
vulstring = ""
for target in soup.find_all("table",class_="plhin"):
content = etree.HTML(target.text).xpath('//*[@class="t_f"]/text()')
string = "%s:%s"% (target.find("div",class_="pls favatar").div.div.a.text,content)
vulstring += string
vulstring += "\n"
print(vulstring)
except exceptions.MissingSchema:
print('url有误') 本帖最后由 xiaosi4081 于 2020-8-5 15:07 编辑
qiuyouzhi 发表于 2020-8-5 15:01
噗,太粗心了,text打成了test
但我这个代码没法弄到帖子的内容,那个xpath返回的是一个空列表,有什么其他的办法吗?
xiaosi4081 发表于 2020-8-5 15:05
但我这个代码没法弄到帖子的内容,那个xpath返回的是一个空列表,有什么其他的办法吗?
咱把代码发完整呗
不然我运行不了 qiuyouzhi 发表于 2020-8-5 15:07
咱把代码发完整呗
不然我运行不了
我只是怕被人抄袭{:10_245:}
代码:
getwangye.py:
from tkinter import *
from requests import get,exceptions
import tkinter.messagebox
import clipboard
import tkinter.filedialog
from bs4 import BeautifulSoup
from lxml import etree
class getmain:
def __init__(self,fm):
self.fm = fm
self.maincode()
def getting(self,url):
try:
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36","Cookie":"oMVX_2132_saltkey=R74UkqJK; oMVX_2132_lastvisit=1595396069; oMVX_2132_auth=1693Do2EjOr2r8ngcWOluYDuwkzl3LKM8fJ4GL4MhznBYgtr4f80N8ED9JPlvRmsK4KaBbiuX%2FP92S7fwrGzxPc%2Fjnk; oMVX_2132_lastcheckfeed=881467%7C1595399689; oMVX_2132_atarget=1; oMVX_2132_onlineindex=1; oMVX_2132_lastviewtime=881467%7C1595500559; PHPSESSID=avg92ob7gcv06c47fam0im67m0; oMVX_2132_space_top_credit_881467_all=207; oMVX_2132_home_diymode=1; oMVX_2132_ignore_notice=1; oMVX_2132_smile=10D1; oMVX_2132_nofocus_forum=1; oMVX_2132_atlist=566159%2C854664%2C378930%2C702609%2C849582; oMVX_2132_ulastactivity=fc13JyOysM5ExnW8UyySLlDsqBBuncLY%2BI8IMERIoUu78vHR42tc; oMVX_2132_sid=SFa8Bf; oMVX_2132_lip=119.130.231.105%2C1596543963; oMVX_2132_st_t=881467%7C1596546913%7C2f0047256f62124008c3d602f9b61fd7; oMVX_2132_forum_lastvisit=D_39_1595489878D_171_1595680528D_360_1595923195D_243_1596006547D_33_1596011098D_188_1596099175D_38_1596534860D_173_1596546913; oMVX_2132_home_readfeed=1596546916; oMVX_2132_noticeTitle=1; acw_tc=781bad0915965482246555278e7efe279f8a3234011af062f779af11e2dc63; oMVX_2132_visitedfid=173D38D33D188D337D354D241D39D242D335; oMVX_2132_viewid=tid_176798; oMVX_2132_sendmail=1; oMVX_2132_checkpm=1; oMVX_2132_st_p=881467%7C1596548308%7Cc54b90b96d8abef819e979b0397a55cb; _fmdata=OQmTawF8D5QYYw5z1d7VRJZWZr08pj0Nh2V4cP0xTcWdnjXY%2BdGfHTtlF8ZCyqxRG6Ng5pc0cl9klF2pXVNj0STkj9ckn7q%2Fabe950w6FN4%3D; oMVX_2132_lastact=1596548308%09misc.php%09patch"}
res = get(url,headers=headers).text
soup = BeautifulSoup(res,'lxml')
vulstring = ""
for target in soup.find_all("table",class_="plhin"):
content = str(etree.HTML(target.text).xpath('//*[@class="t_f"]/text()'))
string = "%s:%s"% (target.find("div",class_="pls favatar").div.div.a.text,content)
vulstring += string
vulstring += "\n"
self.result.delete(0.0,END)
self.result.insert(0.0,vulstring)
except exceptions.MissingSchema:
tkinter.messagebox.showerror('错误','url有误')
def copy(self):
clipboard.copy(self.result.get(0.0,END))
def savefile(self):
path = tkinter.filedialog.asksaveasfile()
path.write(str(self.result.get(0.0,END)))
path.close()
def closewindow(self):
self.fm.destroy()
exit()
def maincode(self):
frame1 = LabelFrame(self.fm,text="input")
urllabel = Label(frame1,text="url is: ")
urllabel.pack()
urllabel.grid(row=1,column=1)
self.url = Entry(frame1)
self.url.grid(row=1,column=2)
startButton = Button(frame1,text="start",command=lambda : self.getting(self.url.get()))
startButton.grid(row=1,column=3)
frame1.pack()
resultFrame = LabelFrame(self.fm,text="result")
self.result = Text(resultFrame,width=35,height=15)
resultcopy = Button(resultFrame,text="复制到剪贴板",command=self.copy)
self.result.pack()
resultcopy.pack()
resultFrame.pack()
main.py:
# -*- coding: utf-8 -*-
from requests import get
from re import search
import tkinter as tk
import tkinter.messagebox
from threading import Thread
import time as ti
from getwangye import getmain
# 注:\1 用于引用前面编号为 1 的子组
class fishc_get:
def __init__(self):
self.a = []
self.root = tk.Tk()
self.root.title("求助帖提醒")
self.fm1 = tk.LabelFrame(self.root,text="get")
self.fm1.grid(row=1,column=1)
self.fm2 = tk.LabelFrame(self.root,text="get_tiezi")
self.fm2.grid(row=1,column=2)
self.t = tk.Text(self.fm1)
self.t.pack()
getmain(self.fm2)
def load(self):
while True:
res = get(f"https://fishc.com.cn/bestanswer.php?mod=huzhu&type=undo").text
# 获取问题帖的名字
name = search(r'<a href="https://fishc.com.cn/thread-\d+?-1-1.html" target="_blank">(.+?)</a>', res).group(1)
# 获取问题帖的URL
url = "https://fishc.com.cn/thread-" + search(
r'<a href="https://fishc.com.cn/thread-(.+?)-1-1.html" target="_blank"', res).group(1) + "-1-1.html"
# 获取回答数
ans = search(r'<font color="#999999">(\d+?)</font>', res).group(1)
# 获取时间
time = search(r'<font color="#999999">(\d+?-\d+?-\d+? \d+?:\d+?)</font>', res).group(1)
if name not in self.a:
b = f" 标题:{name}\n 回答数:{ans}\n 提问时间:{time}\n 地址:{url}\n\n"
self.t.insert(tk.END, b)# 打印相应的内容
tkinter.messagebox.showwarning("提示", b)
self.a.append(name)
ti.sleep(10)
def duoxian(self):
try:
self.func = Thread(target=self.load)
self.func.setDaemon(True)
self.func.start()
self.root.mainloop()
except:
ti.sleep(30)
self.duoxian()
if __name__ == "__main__":
cl = fishc_get()
cl.duoxian()
xiaosi4081 发表于 2020-8-5 15:05
但我这个代码没法弄到帖子的内容,那个xpath返回的是一个空列表,有什么其他的办法吗?
getwangye.py改成这样就行
from tkinter import *
from requests import get,exceptions
import tkinter.messagebox
import clipboard
import tkinter.filedialog
from bs4 import BeautifulSoup
from lxml import etree
class getmain:
def __init__(self,fm):
self.fm = fm
self.maincode()
def getting(self,url):
try:
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36","Cookie":"oMVX_2132_saltkey=R74UkqJK; oMVX_2132_lastvisit=1595396069; oMVX_2132_auth=1693Do2EjOr2r8ngcWOluYDuwkzl3LKM8fJ4GL4MhznBYgtr4f80N8ED9JPlvRmsK4KaBbiuX%2FP92S7fwrGzxPc%2Fjnk; oMVX_2132_lastcheckfeed=881467%7C1595399689; oMVX_2132_atarget=1; oMVX_2132_onlineindex=1; oMVX_2132_lastviewtime=881467%7C1595500559; PHPSESSID=avg92ob7gcv06c47fam0im67m0; oMVX_2132_space_top_credit_881467_all=207; oMVX_2132_home_diymode=1; oMVX_2132_ignore_notice=1; oMVX_2132_smile=10D1; oMVX_2132_nofocus_forum=1; oMVX_2132_atlist=566159%2C854664%2C378930%2C702609%2C849582; oMVX_2132_ulastactivity=fc13JyOysM5ExnW8UyySLlDsqBBuncLY%2BI8IMERIoUu78vHR42tc; oMVX_2132_sid=SFa8Bf; oMVX_2132_lip=119.130.231.105%2C1596543963; oMVX_2132_st_t=881467%7C1596546913%7C2f0047256f62124008c3d602f9b61fd7; oMVX_2132_forum_lastvisit=D_39_1595489878D_171_1595680528D_360_1595923195D_243_1596006547D_33_1596011098D_188_1596099175D_38_1596534860D_173_1596546913; oMVX_2132_home_readfeed=1596546916; oMVX_2132_noticeTitle=1; acw_tc=781bad0915965482246555278e7efe279f8a3234011af062f779af11e2dc63; oMVX_2132_visitedfid=173D38D33D188D337D354D241D39D242D335; oMVX_2132_viewid=tid_176798; oMVX_2132_sendmail=1; oMVX_2132_checkpm=1; oMVX_2132_st_p=881467%7C1596548308%7Cc54b90b96d8abef819e979b0397a55cb; _fmdata=OQmTawF8D5QYYw5z1d7VRJZWZr08pj0Nh2V4cP0xTcWdnjXY%2BdGfHTtlF8ZCyqxRG6Ng5pc0cl9klF2pXVNj0STkj9ckn7q%2Fabe950w6FN4%3D; oMVX_2132_lastact=1596548308%09misc.php%09patch"}
res = get(url,headers=headers).text
soup = BeautifulSoup(res,'lxml')
vulstring = ""
content = str(etree.HTML(res).xpath('//*[@class="t_f"]/text()'))
for target in soup.find_all("table",class_="plhin"):
string = "%s:%s"% (target.find("div",class_="pls favatar").div.div.a.text,content)
vulstring += string
vulstring += "\n"
self.result.delete(0.0,END)
self.result.insert(0.0,vulstring)
except exceptions.MissingSchema:
tkinter.messagebox.showerror('错误','url有误')
def copy(self):
clipboard.copy(self.result.get(0.0,END))
def savefile(self):
path = tkinter.filedialog.asksaveasfile()
path.write(str(self.result.get(0.0,END)))
path.close()
def closewindow(self):
self.fm.destroy()
exit()
def maincode(self):
frame1 = LabelFrame(self.fm,text="input")
urllabel = Label(frame1,text="url is: ")
urllabel.pack()
urllabel.grid(row=1,column=1)
self.url = Entry(frame1)
self.url.grid(row=1,column=2)
startButton = Button(frame1,text="start",command=lambda : self.getting(self.url.get()))
startButton.grid(row=1,column=3)
frame1.pack()
resultFrame = LabelFrame(self.fm,text="result")
self.result = Text(resultFrame,width=35,height=15)
resultcopy = Button(resultFrame,text="复制到剪贴板",command=self.copy)
self.result.pack()
resultcopy.pack()
resultFrame.pack()
页:
[1]