Hi all, can anyone tell me why this code only scrapes the first movie's detail page? It throws an error as soon as it tries the second one.

import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import urljoin
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

INDEX_URL = 'https://spa2.scrape.center/page/{}'
TIME_OUT = 10
PAGE = 10

# Initialize the browser object
path = 'D:/技能/chromedriver.exe'
brower = webdriver.Chrome(executable_path=path)
# Explicit wait
wait = WebDriverWait(brower, TIME_OUT)

# Issue a request
def scrape_page(url, condition, locator):
    logging.info('Scraping: {}'.format(url))
    try:
        brower.get(url)
        wait.until(condition(locator))  # call wait.until() with the expected condition
    except TimeoutException:
        logging.error('error occurred while scraping {}'.format(url), exc_info=True)

# Load a list page
def scrape_index(page):
    url = INDEX_URL.format(page)
    scrape_page(url, condition=EC.visibility_of_all_elements_located, locator=(By.CSS_SELECTOR, '#index .item'))

# Parse the list page
def parse_index():
    elements = brower.find_elements(By.CSS_SELECTOR, '#index .item .name')
    for element in elements:
        # extract the href attribute
        href = element.get_attribute('href')
        yield urljoin(INDEX_URL, href)

# Wait until a detail page has loaded
def scrape_detail(url):
    scrape_page(url, condition=EC.visibility_of_element_located, locator=(By.TAG_NAME, 'h2'))

# Parse a detail page
def parse_detail():
    url = brower.current_url  # use current_url to get the URL of the current page
    name = brower.find_element(By.TAG_NAME, 'h2').text
    categories = [element.text for element in brower.find_elements(By.CSS_SELECTOR, '.categories .category span')]  # categories
    cover = brower.find_element(By.CSS_SELECTOR, '.cover').get_attribute('src')  # cover image
    score = brower.find_element(By.CSS_SELECTOR, '.score').text  # score
    drama = brower.find_element(By.CSS_SELECTOR, '.drama p').text  # synopsis
    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }

def main():
    try:
        for page in range(1, PAGE + 1):
            scrape_index(page)
            detail_urls = parse_index()
            # logging.info('Detail page URLs: {}'.format(list(detail_urls)))
            for detail_url in detail_urls:
                # logging.info('Detail page: {}'.format(detail_url))
                scrape_detail(detail_url)
                detail_data = parse_detail()
                logging.info('data %s', detail_data)
    finally:
        brower.close()

if __name__ == '__main__':
    main()
Error output:

D:\技能\python\venv\Scripts\python.exe D:\技能\python\课后练习\基础爬虫\selieum练习.py
D:\技能\python\课后练习\基础爬虫\selieum练习.py:15: DeprecationWarning: executable_path has been deprecated, please pass in a Service object
brower = webdriver.Chrome(executable_path=path)
2023-07-01 16:16:03,515 - INFO: Scraping: https://spa2.scrape.center/page/1
2023-07-01 16:16:04,640 - INFO: Detail page: https://spa2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWIx
2023-07-01 16:16:04,640 - INFO: Scraping: https://spa2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWIx
2023-07-01 16:16:05,964 - INFO: data {'url': 'https://spa2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWIx', 'name': '霸王别姬 - Farewell My Concubine', 'categories': ['剧情', '爱情'], 'cover': 'https://p0.meituan.net/movie/ce4da3e03e655b5b88ed31b5cd7896cf62472.jpg@464w_644h_1e_1c', 'score': '9.5', 'drama': '影片借一出《霸王别姬》的京戏,牵扯出三个人之间一段随时代风云变幻的爱恨情仇。段小楼(张丰毅 饰)与程蝶衣(张国荣 饰)是一对打小一起长大的师兄弟,两人一个演生,一个饰旦,一向配合天衣无缝,尤其一出《霸王别姬》,更是誉满京城,为此,两人约定合演一辈子《霸王别姬》。但两人对戏剧与人生关系的理解有本质不同,段小楼深知戏非人生,程蝶衣则是人戏不分。段小楼在认为该成家立业之时迎娶了名妓菊仙(巩俐 饰),致使程蝶衣认定菊仙是可耻的第三者,使段小楼做了叛徒,自此,三人围绕一出《霸王别姬》生出的爱恨情仇战开始随着时代风云的变迁不断升级,终酿成悲剧。'}
Traceback (most recent call last):
File "D:\技能\python\课后练习\基础爬虫\selieum练习.py", line 79, in <module>
main()
File "D:\技能\python\课后练习\基础爬虫\selieum练习.py", line 69, in main
for detail_url in detail_urls:
File "D:\技能\python\课后练习\基础爬虫\selieum练习.py", line 39, in parse_index
href = element.get_attribute('href')
File "D:\技能\python\venv\lib\site-packages\selenium\webdriver\remote\webelement.py", line 177, in get_attribute
attribute_value = self.parent.execute_script(
File "D:\技能\python\venv\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 500, in execute_script
return self.execute(command, {"script": script, "args": converted_args})["value"]
File "D:\技能\python\venv\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 440, in execute
self.error_handler.check_response(response)
File "D:\技能\python\venv\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 245, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found
(Session info: chrome=114.0.5735.199)
Process finished with exit code 1
Your error is a selenium.common.exceptions.StaleElementReferenceException, a common Selenium pitfall. It means the web element you are trying to use (here, a link element you located on the movie list page) is no longer attached to the DOM: once the browser refreshes or navigates to another page, every previously located element reference becomes invalid, or "stale". In your code, parse_index() is a generator, so it extracts hrefs lazily. It yields the first link, you navigate to that detail page, and when the loop resumes the generator, the remaining elements still belong to the list page you just left, so get_attribute('href') fails on the second one.
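Concretely, the failure reduces to this sequence (a hypothetical minimal reproduction; detail_url stands in for any detail-page URL):

elements = brower.find_elements(By.CSS_SELECTOR, '#index .item .name')  # located on the list page
brower.get(detail_url)  # navigating away invalidates every element located above
elements[1].get_attribute('href')  # raises StaleElementReferenceException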
To work around this, collect all the detail-page links before you leave the list page. In main(), turn the generator returned by parse_index() into a list up front; you can also reload the list page after finishing each page's detail pages. Modified code:
def main():
    try:
        for page in range(1, PAGE + 1):
            scrape_index(page)
            detail_urls = list(parse_index())  # materialize the generator into a list before the page changes
            for detail_url in detail_urls:
                scrape_detail(detail_url)
                detail_data = parse_detail()
                logging.info('data %s', detail_data)
            scrape_index(page)  # optionally reload the list page after finishing its detail pages
    finally:
        brower.close()

if __name__ == '__main__':
    main()
This change should resolve your problem. Strictly speaking, the list() conversion is what fixes the staleness; the extra scrape_index(page) reload is optional, and it does cost additional network requests, so if the target site rate-limits visitors it raises the risk of getting your IP banned.
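An alternative that avoids the extra reloads entirely is to pull the href strings out of the elements inside parse_index() itself, while the list page is still loaded, so that no WebElement reference ever outlives a navigation. A sketch reusing your existing names (untested against the site):

def parse_index():
    # Read all hrefs immediately: plain strings cannot go stale, unlike WebElements
    elements = brower.find_elements(By.CSS_SELECTOR, '#index .item .name')
    return [urljoin(INDEX_URL, element.get_attribute('href')) for element in elements]

With this version, main() needs no list() call and no reload, because the function returns plain URL strings instead of a lazy generator over live elements.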
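Separately, the DeprecationWarning in your log ("executable_path has been deprecated") comes from Selenium 4: the current way to pass a driver path is through a Service object. A minimal sketch, assuming your existing path variable:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

path = 'D:/技能/chromedriver.exe'
brower = webdriver.Chrome(service=Service(executable_path=path))

One more side note: in the finally block, brower.quit() shuts down the ChromeDriver process completely, whereas close() only closes the current window.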