|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 zkamsk 于 2017-8-3 13:20 编辑
使用selenium抓取网页
# Excerpt: print the title of every product entry found in the parsed page.
# `doc` is the PyQuery document built from the browser's page source.
for entry in doc('#J_goodsList .gl-warp .gl-item').items():
    title_text = entry.find('.p-name em').text()
    product = {'title': title_text}
    print(product)
复制代码
以上代码的输出一直是 {'title': ''},也就是标题为空。可是我把同一份 HTML 保存成文件后,在 IDLE 中用同样的代码测试时却可以正确返回标题,这是为什么?很苦恼啊。
以下是整个程序的源代码
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.common.exceptions import TimeoutException
- from pyquery import PyQuery as pq
# Shared Chrome session used by every function below.
browser = webdriver.Chrome()
# Explicit-wait helper bound to that session; each wait.until() call gives up
# with TimeoutException after 20 seconds.
wait = WebDriverWait(browser, timeout=20)
def search():
    """Open jd.com, search for '美食' and parse the first page of results.

    The whole sequence (load home page, type the keyword, submit, wait for
    the bottom pager, parse) is restarted from scratch whenever one of the
    explicit waits raises ``TimeoutException``. The retry is a loop rather
    than self-recursion so that repeated timeouts do not grow the call stack.

    Returns:
        None. Parsed products are printed by ``prase_page``.
    """
    while True:
        # Loading the page sits outside the try block, as before: errors from
        # browser.get() itself are not retried and propagate to the caller.
        browser.get('https://www.jd.com/')
        try:
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#key"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#search > div > div.form > button")
                )
            )
            search_box.send_keys('美食')
            submit.click()
            # Block until the bottom pager element exists; its value is not
            # used here, the wait only serves as a "results rendered" signal.
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
                )
            )
            prase_page()
        except TimeoutException:
            # Something did not show up in time: start over.
            continue
        return
def next_page(page_number):
    """Click the top pager's "next" link and parse the page that loads.

    Args:
        page_number: The page expected to be shown after the click. It is
            converted with ``str()`` and used only to confirm that the top
            pager's indicator contains that text.

    Raises:
        TimeoutException: If the link never becomes clickable or the
            indicator never shows ``page_number`` within the wait timeout.
            Unlike ``search``, this function does not retry.
    """
    # Named next_button rather than `next` to avoid shadowing the builtin.
    next_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_topPage > a.fp-next"))
    )
    next_button.click()
    # Wait for the page indicator to reflect the new page before parsing, so
    # the previous page's items are not parsed again.
    wait.until(
        EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#J_topPage > span > b'), str(page_number)
        )
    )
    prase_page()
def prase_page():
    """Print a ``{'title': ...}`` dict for every product on the current page.

    Waits until at least one product entry is present in the DOM, then
    snapshots the page source and extracts the titles with PyQuery.
    """
    item_selector = '#J_goodsList .gl-warp .gl-item'
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, item_selector))
    )
    # Parse a static snapshot of the page instead of querying the live DOM.
    doc = pq(browser.page_source)
    for entry in doc(item_selector).items():
        title_text = entry.find('.p-name em').text()
        print({'title': title_text})
def next_2_page(page_number):
    """Jump to ``page_number`` through the bottom pager's "skip to" box.

    Types the page number into the pager input, clicks the confirm link and
    waits until the highlighted page link contains ``page_number``. The whole
    sequence is repeated whenever a wait raises ``TimeoutException``. The
    retry is a loop rather than self-recursion so that repeated timeouts do
    not grow the call stack.

    Args:
        page_number: Page to jump to; passed to ``send_keys`` as given and
            compared as ``str(page_number)`` against the pager text.

    Returns:
        None. This function only navigates; it does not parse the page.
    """
    while True:
        try:
            # Named page_box rather than `input` to avoid shadowing the builtin.
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > a")
                )
            )
            page_box.clear()
            page_box.send_keys(page_number)
            submit.click()
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'),
                    str(page_number),
                )
            )
        except TimeoutException:
            # Pager did not respond in time: try the jump again.
            continue
        return None
def main():
    """Run the search, then step through result pages 2, 3 and 4."""
    search()
    first_page, stop_page = 2, 5  # stop_page is exclusive
    for page_number in range(first_page, stop_page):
        next_page(page_number)
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
复制代码 |
|