937135952 posted on 2022-12-16 16:35:45

How do I fix a scrape that comes back empty?

import requests
from lxml import etree

currenturl = "https://www.htfc.com/main/a/20221216/80146505.shtml"

# Web scraper
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language':'en-US,en;q=0.5',
'Accept-Encoding':'gzip',
'DNT':'1',
'Connection':'close'
}

r = requests.get(currenturl, headers=headers)
r.encoding = 'gbk'
html = etree.HTML(r.text)  # etree.HTML() builds an XPath-capable parse tree and auto-repairs malformed HTML.
#print(html)
title = html.xpath('//div')
print(title)

The code is above. I want to scrape the title and the body text. How should I write it?
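
For reference, '//div' matches element nodes, so the result is a list of Element objects rather than strings; a text() step is needed to pull the actual text out. A minimal sketch, reusing the html tree from the snippet above:

# Illustration only: '//div' yields Element objects, not text.
divs = html.xpath('//div')
print(divs[:3])                        # e.g. [<Element div at 0x...>, ...]

# A text() step returns the text nodes inside the matched elements instead.
texts = html.xpath('//div//text()')
print([t.strip() for t in texts if t.strip()][:5])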

isdkz posted on 2022-12-16 18:21:41

import requests
from lxml import etree

currenturl = "https://www.htfc.com/main/a/20221216/80146505.shtml"

# Web scraper
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language':'en-US,en;q=0.5',
'Accept-Encoding':'gzip',
'DNT':'1',
'Connection':'close'
}

r = requests.get(currenturl, headers=headers)
r.encoding = 'utf-8'
html = etree.HTML(r.text)  # etree.HTML() builds an XPath-capable parse tree and auto-repairs malformed HTML.
#print(html)
title = html.xpath('//div[@id="details"]/h3/text()')
print("标题:")
print(title)
print()
print("正文:")
content = html.xpath('//div[@class="wz_content"]//span/text()')
print('\n'.join(content))
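
A small hardening note: instead of hard-coding the charset, requests can guess it from the response body via apparent_encoding. A minimal sketch, assuming the server's declared charset may be missing or wrong (the guess is heuristic, so verify it on this site):

r = requests.get(currenturl, headers=headers)
# Fall back to requests' content-based guess when the declared charset is absent or the ISO-8859-1 default.
if not r.encoding or r.encoding.lower() == 'iso-8859-1':
    r.encoding = r.apparent_encoding  # should resolve to 'utf-8' for this page
html = etree.HTML(r.text)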

937135952 posted on 2022-12-19 14:02:43

isdkz posted on 2022-12-16 18:21


Hi, I got this URL by driving Chrome automatically, then applied the method above, but the final "Body" content comes out empty. Why is that?

937135952 posted on 2022-12-19 14:03:31

isdkz posted on 2022-12-16 18:21


def get_huatai():

    s = requests.Session()
   
    fb = webdriver.FirefoxProfile.DEFAULT_PREFERENCES
    # op = webdriver.FirefoxOptions()
    # op.set_preference({'user-data-dir':''})
    c = webdriver.ChromeOptions()
    c.add_argument('--user-data-dir=D:/AutomationProfile')
    # c.add_argument('--headless')
    # c.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36')
    driver = webdriver.Chrome()
   
    driver.get('https://www.htfc.com/main/yjzx/yjbg/index.shtml')
   
    time.sleep(3)

    driver.find_element(By.XPATH, '//input[@id="title"]').send_keys('原油')  # search keyword "crude oil"; targeting the input with id="title" directly is enough here.
    time.sleep(3)
    driver.find_element(By.XPATH, '//a').click()
    time.sleep(3)
    driver.find_element(By.XPATH, '//div[@id="articlelist"]/ul/li/a').click()   
   
    # Get all open tabs
    window = driver.window_handles
    # Switch the active tab to the newest one (adjust to your own workflow)
    driver.switch_to.window(window.pop())
    # Get the current page's URL
    currenturl = driver.current_url
    driver.quit()
    #driver.close()  # close the current tab
    time.sleep(3)
    print(currenturl)


    #currenturl = "https://www.htfc.com/main/a/20221216/80146505.shtml"
   
    # Web scraper
    headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language':'en-US,en;q=0.5',
    'Accept-Encoding':'gzip',
    'DNT':'1',
    'Connection':'close'
    }
   
    r = requests.get(currenturl, headers=headers)
    r.encoding = 'utf-8'  # note this line (utf-8, not gbk)
    html = etree.HTML(r.text)  # etree.HTML() builds an XPath-capable parse tree and auto-repairs malformed HTML.
    #print(html)
    title = html.xpath('//div[@id="details"]/h3/text()')  # note this line, and the @
    print("Title:")
    print(title)
    print()
    print("Body:")
    content = html.xpath('//div[@class="wz_content"]//span/text()')
   
    print('\n'.join(content))
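
Since Selenium already has the rendered article open, an alternative sketch (assuming the same "details" / "wz_content" markup as above) is to parse driver.page_source directly, replacing the tab-switch-plus-requests section of the function, which avoids any encoding or anti-bot mismatch between the two clients:

# Sketch: reuse the page Selenium already loaded instead of making a second requests call.
window = driver.window_handles
driver.switch_to.window(window[-1])   # newest tab
page_html = driver.page_source        # rendered HTML of the article page
driver.quit()

tree = etree.HTML(page_html)
title = tree.xpath('//div[@id="details"]/h3/text()')
content = tree.xpath('//div[@class="wz_content"]//text()')
print(title)
print('\n'.join(t.strip() for t in content if t.strip()))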

isdkz posted on 2022-12-19 14:49:47

937135952 posted on 2022-12-19 14:03


Let me download a browser driver and debug this.

isdkz posted on 2022-12-19 15:19:18

This post was last edited by isdkz on 2022-12-19 15:21

937135952 posted on 2022-12-19 14:03


I debugged it. The tags on other pages are a bit different, so I changed the code a little. Take another look.

def get_huatai():

    s = requests.Session()
   
    fb = webdriver.FirefoxProfile.DEFAULT_PREFERENCES
    # op = webdriver.FirefoxOptions()
    # op.set_preference({'user-data-dir':''})
    c = webdriver.ChromeOptions()
    c.add_argument('--user-data-dir=D:/AutomationProfile')
    # c.add_argument('--headless')
    # c.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36')
    driver = webdriver.Chrome()
   
    driver.get('https://www.htfc.com/main/yjzx/yjbg/index.shtml')
   
    time.sleep(3)

    driver.find_element(By.XPATH, '//input[@id="title"]').send_keys('原油')  # search keyword "crude oil"; targeting the input with id="title" directly is enough here.
    time.sleep(3)
    driver.find_element(By.XPATH, '//a').click()
    time.sleep(3)
    driver.find_element(By.XPATH, '//div[@id="articlelist"]/ul/li/a').click()   
   
    # Get all open tabs
    window = driver.window_handles
    # Switch the active tab to the newest one (adjust to your own workflow)
    driver.switch_to.window(window.pop())
    # Get the current page's URL
    currenturl = driver.current_url
    driver.quit()
    #driver.close()  # close the current tab
    time.sleep(3)
    print(currenturl)


    #currenturl = "https://www.htfc.com/main/a/20221216/80146505.shtml"
   
    # Web scraper
    headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language':'en-US,en;q=0.5',
    'Accept-Encoding':'gzip',
    'DNT':'1',
    'Connection':'close'
    }
   
    r = requests.get(currenturl, headers=headers)
    r.encoding = 'utf-8'  # note this line (utf-8, not gbk)
    html = etree.HTML(r.text)  # etree.HTML() builds an XPath-capable parse tree and auto-repairs malformed HTML.
    #print(html)
    title = html.xpath('//div[@id="details"]/h3/text()')  # note this line, and the @
    print("Title:")
    print(title)
    print()
    print("Body:")
    content = html.xpath('//div[@class="wz_content"]//text()')                                        # changed here
   
    print('\n'.join(content))
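
The change from '//span/text()' to '//text()' picks up every text node under the container, whatever tags a given article uses. An even shorter variant (a sketch, assuming the body keeps the wz_content class) is XPath's string() function, which lxml supports and which concatenates all descendant text of the first matched node:

content = html.xpath('string(//div[@class="wz_content"])')  # one concatenated string
print(content.strip())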

937135952 posted on 2022-12-19 15:57:17

isdkz posted on 2022-12-19 15:19
I debugged it. The tags on other pages are a bit different, so I changed the code a little. Take another look.

OK
Thank you very much!