|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 嗷呜呜呜 于 2019-4-25 14:26 编辑
想问下大家,在用selenium对动态页面进行爬取时,整个页面内容是放在子Frame里面的,页面打开子Frame爬到数据后,点击<下一页>的按钮也存放在子Frame里
如果想要爬取下一页的数据
①我需要在第一页先点击<下一页>按钮再返回父Frame然后在进行下一页的爬取;
②也就是我需要每爬一页都返回下一页的父Frame,再从下一页的父Frame中提取子Frame进行数据的提取;
③还是直接在子Frame点击<下一页>按钮之后等待页面自动跳转;
以网易云评论为例,下面代码只爬到第一页评论,已验证len(iframe)=1,
错误提示:iframe无法被点击,主要错误信息在open_page(url)函数中
想问下大家,如果下一页按钮也存放在子iframe中,怎么进行下一页评论内容的爬取
- from lxml import etree
- import time
- import random
- from selenium import webdriver
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support import expected_conditions as EC
- # from selenium.common.exceptions import NoSuchElementException
- # NOTE: driver.switch_to.default_content() returns to the top-level document; it must be called after each iframe interaction before switching into the iframe again.
all_contents = []  # accumulated comment dicts, filled by parse_page()


def open_page(url):
    """Open the playlist page and scrape five pages of comments.

    Both the comment list and the "next page" button live inside
    <iframe id="g_iframe">.  The original code clicked the button inside
    the iframe but never switched back to the top-level document, so the
    next ``switch_to.frame('g_iframe')`` (and the wait that looked for
    the iframe *from within itself*) failed — that is the "iframe cannot
    be clicked / only first page scraped" symptom.  The fix: after the
    click, ``switch_to.default_content()`` so the next iteration can
    re-enter the (reloaded) iframe.

    url: page to open; results are appended to the module-level
    ``all_contents`` via parse_page().
    """
    browser = webdriver.Chrome()
    delays = [0.8, 1, 1.2, 1.3, 1.5, 0.9, 0.5]
    try:
        browser.get(url)
        for _ in range(5):  # five pages of comments
            # Wait for the iframe and switch into it in one step
            # (selected by its id attribute).
            WebDriverWait(browser, 10).until(
                EC.frame_to_be_available_and_switch_to_it((By.ID, "g_iframe"))
            )
            parse_page(etree.HTML(browser.page_source))
            time.sleep(random.choice(delays))
            # The "next page" button is also inside the iframe; wait until
            # it is actually clickable instead of grabbing it blindly.
            button = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//div[@class='m-cmmt']/div[last()]/div/a[last()]")
                )
            )
            time.sleep(random.choice(delays))
            button.click()
            # Return to the top-level document so the next iteration can
            # switch into the iframe again.
            browser.switch_to.default_content()
    finally:
        browser.quit()  # don't leak the Chrome process
def parse_page(html):
    """Extract all comments from one page and append them to ``all_contents``.

    html: lxml element tree of the iframe document (from etree.HTML).

    For every comment <div> this records a dict with the commenter name,
    the comment text and — when the comment is a reply — the user and
    text being replied to (placeholder strings otherwise).

    Fixes over the original: the four parallel lists + zip + re-unpacking
    (which clobbered the list variables with scalars of the same name)
    are replaced by building each dict directly, and the per-page print
    of the entire cumulative ``all_contents`` is removed — it duplicated
    earlier pages on every call, and ``main()`` already prints the result.
    """
    for div in html.xpath("//div[@class='cmmts j-flag']/div"):
        name = div.xpath(".//div[@class='cnt f-brk']/a/text()")[0]

        content = "".join(div.xpath(".//div[@class='cnt f-brk']/text()"))
        # Drop non-breaking spaces, surrounding whitespace and the
        # leading separator character (the original kept content[1:]).
        content = content.replace("\xa0", "").strip()[1:]

        response_name = "".join(
            div.xpath(".//div[@class='que f-brk f-pr s-fc3']/a[@class='s-fc7']/text()")
        ) or "无回复者"

        response_content = "".join(
            div.xpath(".//div[@class='que f-brk f-pr s-fc3']/text()")
        )
        # Non-empty replies carry a leading separator char, as above.
        response_content = response_content[1:] if response_content else "无回复内容"

        all_contents.append({
            "用户名": name,
            "评论内容": content,
            "被回复用户": response_name,
            "被回复评论": response_content,
        })
def main():
    """Scrape the playlist's comments and print every collected record."""
    playlist_url = "https://music.163.com/#/playlist?id=2703033645"
    open_page(playlist_url)
    for record in all_contents:
        print(record)


if __name__ == '__main__':
    main()
复制代码
|
|