Hi everyone, could someone tell me why this code only manages to scrape the first movie's detail page? As soon as it moves on to the second detail page it throws an error.
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import urljoin
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

INDEX_URL = 'https://spa2.scrape.center/page/{}'
TIME_OUT = 10
PAGE = 10

# initialize the browser object
path = 'D:/技能/chromedriver.exe'
brower = webdriver.Chrome(executable_path=path)
# explicit wait
wait = WebDriverWait(brower, TIME_OUT)

# request a page and wait for it to load
def scrape_page(url, condition, locator):
    logging.info('正在爬取:{}'.format(url))
    try:
        brower.get(url)
        wait.until(condition(locator))  # call wait.until() with the expected condition
    except TimeoutException:
        logging.error('error occurred while scraping {}'.format(url), exc_info=True)

# load a list page
def scrape_index(page):
    url = INDEX_URL.format(page)
    scrape_page(url, condition=EC.visibility_of_all_elements_located, locator=(By.CSS_SELECTOR, '#index .item'))

# parse the list page
def parse_index():
    elements = brower.find_elements(By.CSS_SELECTOR, '#index .item .name')
    for element in elements:
        # extract the href attribute
        href = element.get_attribute('href')
        yield urljoin(INDEX_URL, href)

# wait until the detail page has loaded
def scrape_detail(url):
    scrape_page(url, condition=EC.visibility_of_element_located, locator=(By.TAG_NAME, 'h2'))

# parse the detail page
def parse_detail():
    url = brower.current_url  # current_url gives the URL of the page being shown
    name = brower.find_element(By.TAG_NAME, 'h2').text
    categories = [element.text for element in brower.find_elements(By.CSS_SELECTOR, '.categories .category span')]  # categories
    cover = brower.find_element(By.CSS_SELECTOR, '.cover').get_attribute('src')  # cover image
    score = brower.find_element(By.CSS_SELECTOR, '.score').text  # score
    drama = brower.find_element(By.CSS_SELECTOR, '.drama p').text  # synopsis
    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }

def main():
    try:
        for page in range(1, PAGE + 1):
            scrape_index(page)
            detail_urls = parse_index()
            # logging.info('详情页地址:{}'.format(list(detail_urls)))
            for detail_url in detail_urls:
                # logging.info('详情页:{}'.format(detail_url))
                scrape_detail(detail_url)
                detail_data = parse_detail()
                logging.info('data %s', detail_data)
    finally:
        brower.close()

if __name__ == '__main__':
    main()
The error output:
D:\技能\python\venv\Scripts\python.exe D:\技能\python\课后练习\基础爬虫\selieum练习.py
D:\技能\python\课后练习\基础爬虫\selieum练习.py:15: DeprecationWarning: executable_path has been deprecated, please pass in a Service object
  brower = webdriver.Chrome(executable_path=path)
2023-07-01 16:16:03,515 - INFO: 正在爬取:https://spa2.scrape.center/page/1
2023-07-01 16:16:04,640 - INFO: 详情页:https://spa2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWIx
2023-07-01 16:16:04,640 - INFO: 正在爬取:https://spa2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWIx
2023-07-01 16:16:05,964 - INFO: data {'url': 'https://spa2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWIx', 'name': '霸王别姬 - Farewell My Concubine', 'categories': ['剧情', '爱情'], 'cover': 'https://p0.meituan.net/movie/ce4da3e03e655b5b88ed31b5cd7896cf62472.jpg@464w_644h_1e_1c', 'score': '9.5', 'drama': '影片借一出《霸王别姬》的京戏,牵扯出三个人之间一段随时代风云变幻的爱恨情仇。段小楼(张丰毅 饰)与程蝶衣(张国荣 饰)是一对打小一起长大的师兄弟,两人一个演生,一个饰旦,一向配合天衣无缝,尤其一出《霸王别姬》,更是誉满京城,为此,两人约定合演一辈子《霸王别姬》。但两人对戏剧与人生关系的理解有本质不同,段小楼深知戏非人生,程蝶衣则是人戏不分。段小楼在认为该成家立业之时迎娶了名妓菊仙(巩俐 饰),致使程蝶衣认定菊仙是可耻的第三者,使段小楼做了叛徒,自此,三人围绕一出《霸王别姬》生出的爱恨情仇战开始随着时代风云的变迁不断升级,终酿成悲剧。'}
Traceback (most recent call last):
  File "D:\技能\python\课后练习\基础爬虫\selieum练习.py", line 79, in <module>
    main()
  File "D:\技能\python\课后练习\基础爬虫\selieum练习.py", line 69, in main
    for detail_url in detail_urls:
  File "D:\技能\python\课后练习\基础爬虫\selieum练习.py", line 39, in parse_index
    href = element.get_attribute('href')
  File "D:\技能\python\venv\lib\site-packages\selenium\webdriver\remote\webelement.py", line 177, in get_attribute
    attribute_value = self.parent.execute_script(
  File "D:\技能\python\venv\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 500, in execute_script
    return self.execute(command, {"script": script, "args": converted_args})["value"]
  File "D:\技能\python\venv\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 440, in execute
    self.error_handler.check_response(response)
  File "D:\技能\python\venv\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 245, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found
  (Session info: chrome=114.0.5735.199)
Process finished with exit code 1
Your error is a selenium.common.exceptions.StaleElementReferenceException, a common Selenium error. It means the web element you are trying to use (here, a link element you found on the movie list page) is no longer in the DOM. In Selenium, once a page is refreshed or replaced, element references found earlier become invalid, or "stale". In your code, parse_index() is a generator, so the second href is only read after scrape_detail() has already navigated the browser to the first movie's detail page; the list-page element it tries to read from no longer exists.
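You can reproduce the mechanism in isolation; a minimal sketch, run against the same site with the same brower object as in your script:

# find elements on the list page, navigate away, then touch one of them again
elements = brower.find_elements(By.CSS_SELECTOR, '#index .item .name')
first = elements[0]
detail_url = first.get_attribute('href')  # fine: the list page is still loaded
brower.get(detail_url)                    # the old DOM is discarded here
first.get_attribute('href')               # raises StaleElementReferenceException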
To solve this, I suggest going back to the movie list page right after each detail-page visit. You can change main() so that the list page is reloaded every time a detail page has been scraped; then the element reference won't be stale when the next detail-page link is fetched. The modified code:
def main():
    try:
        for page in range(1, PAGE + 1):
            scrape_index(page)
            detail_urls = list(parse_index())  # materialize the generator into a list right away, before the page can change
            for detail_url in detail_urls:
                scrape_detail(detail_url)
                detail_data = parse_detail()
                logging.info('data %s', detail_data)
                scrape_index(page)  # reload the list page after scraping each detail page
    finally:
        brower.close()

if __name__ == '__main__':
    main()
With this change your problem should be solved. Note, however, that this approach makes more network requests, because the list page is reloaded after every detail page; if the target site rate-limits requests, that raises the risk of getting your IP banned.
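If you want to avoid those extra reloads entirely, a lighter variant (a sketch only, using the same selectors as your code) is to read every href eagerly inside parse_index() while the list page is still loaded, and return plain URL strings instead of lazily yielding from live WebElements:

def parse_index():
    elements = brower.find_elements(By.CSS_SELECTOR, '#index .item .name')
    # read all hrefs now, while the list page is still in the DOM,
    # so nothing touches a WebElement after the browser navigates away
    return [urljoin(INDEX_URL, element.get_attribute('href')) for element in elements]

With this version the list() call and the extra scrape_index(page) in main() become unnecessary, because plain strings cannot go stale. Separately, the DeprecationWarning at the top of your log is unrelated to the crash: in Selenium 4 the driver path goes into a Service object instead of executable_path, roughly like this:

from selenium.webdriver.chrome.service import Service

brower = webdriver.Chrome(service=Service(path))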