|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
使用xpath抽取贴吧的评论,但是运行代码后发现打印出了网页的信息,并没有单独抽取出评论信息。求大神指点。
代码如下:
import urllib.request, urllib.parse, urllib.robotparser
import re, sys, itertools, os, scrapy
from lxml import etree
def url_open(url):
req = urllib.request.Request(url)
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64; rv:66.0) Gecko/20100101 Firefox/66.0')
response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')
return html
def get_comment(html):
# p = r'(?:(?:[0,1]?\d?\d|2[0-4]\d|25[0-5])\.{3}(?:[0,1]?\d?\d|2[0-4]\d|25[0-5]))'
# iplist = re.findall(p, html)
#sel = scrapy.selector.Selector(html)
#comment = sel.xpath('//*[@id="root"]/div/div[3]/div/div[1]/div[2]/div[1]/div[2]/div/div[2]/div[1]/div[*]/div[2]/text()').extract()
html2 = etree.parse(html, etree.HTMLParser())
comment = html2.xpath('//*[@id="post_content_125598406716"]/text()').extract()
print(comment)
if __name__ == '__main__':
url = "https://tieba.baidu.com/p/6130814835"
get_comment(url_open(url))
运行结果如下:
Traceback (most recent call last):
File ".\csdl.py", line 29, in <module>
get_comment(url_open(url))
File ".\csdl.py", line 20, in get_comment
html2 = etree.parse(html, etree.HTMLParser())
File "src\lxml\etree.pyx", line 3435, in lxml.etree.parse
File "src\lxml\parser.pxi", line 1840, in lxml.etree._parseDocument
File "src\lxml\parser.pxi", line 1866, in lxml.etree._parseDocumentFromURL
File "src\lxml\parser.pxi", line 1770, in lxml.etree._parseDocFromFile
File "src\lxml\parser.pxi", line 1163, in lxml.etree._BaseParser._parseDocFromFile
File "src\lxml\parser.pxi", line 601, in lxml.etree._ParserContext._handleParseResultDoc
File "src\lxml\parser.pxi", line 711, in lxml.etree._handleParseResult
File "src\lxml\parser.pxi", line 638, in lxml.etree._raiseParseError
OSError: Error reading file '
<!DOCTYPE html><!--STATUS OK--><html><head><meta name="keywords" content="百度贴吧,终极斗罗噗,套开,始被"/><meta name="description" content="噗,龙套开始被抛弃了..噗,龙套开始被抛弃了,好惨两男的哈哈哈冰天梁:我还是有价值的唐雨格出来后冰天梁:你们六个别抛弃我啊" /><meta charset="UTF-8"><meta furl="tieba.baidu.com/f?kw=%E7%BB%88%E6%9E%81%E6%96%97%E7%BD%97&ie=utf-8" fname="终极斗罗"><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><meta name="baidu-site-verification" content="jpBCrwX689" /><link rel="search" type="application/opensearchdescription+xml" href="/tb/cms/content-search.xml" title="百度贴吧" /><title>噗,龙套开始被抛弃了,好惨
....后面为页面的代码信息此处省略。
求指点! |
|