from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Start Chrome with default options; every explicit wait below shares one
# 10-second timeout.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 10)

# Open the Liepin home page.
url = "https://www.liepin.com/"
driver.get(url)

# Search for '会计' (accounting): wait for the search box to be present,
# type the keyword, then click the search button.
search_box_locator = (By.XPATH, '//input[@class="jsx-1374046090"]')
search_button_locator = (By.XPATH, '//span[@class="jsx-1374046090 search-btn"]')
search_input = wait.until(EC.presence_of_element_located(search_box_locator))
search_input.send_keys('会计')
search_button = driver.find_element(*search_button_locator)
search_button.click()

# Wait for the results container to be present, then print the current URL.
results_locator = (By.XPATH, '//div[@class="job-list-box"]')
wait.until(EC.presence_of_element_located(results_locator))
print(driver.current_url)
# Collect the detail-page link and company name from every direct child of
# the results container. job_links and company_names are parallel lists.
job_elements = driver.find_elements(By.XPATH, '//div[@class="job-list-box"]/div')
job_links = []
company_names = []
company_xpath = './/span[@class="company-name ellipsis-1"]'
link_xpath = './/a[contains(@class,"job-card-left")]'
for position, card in enumerate(job_elements, start=1):
    try:
        # Both lookups run before either append, so a card that is missing
        # one of the two elements adds nothing and the lists stay aligned.
        company_name = card.find_element(By.XPATH, company_xpath).text
        job_link = card.find_element(By.XPATH, link_xpath).get_attribute('href')
        job_links.append(job_link)
        company_names.append(company_name)
        print(f"已收集职位 #{position},公司:{company_name}")
    except Exception as e:
        # Best effort: report the failing card and continue with the next one.
        print(f"收集职位 #{position} 出错:{e}")
# Visit each collected posting in its own tab and print its job description.
# The results tab is remembered by handle so that clean-up can never close it.
main_window = driver.current_window_handle
for index, (job_link, company_name) in enumerate(zip(job_links, company_names)):
    if not job_link:
        # A card without an href has nothing to open; skip it instead of
        # opening a blank tab and waiting for the detail selector to time out.
        print(f"处理职位 #{index + 1} 出错:缺少职位链接")
        continue

    # Snapshot of the handles that exist before the new tab is opened. Window
    # handle order is not guaranteed, so the new tab is identified by
    # difference rather than by position.
    known_handles = driver.window_handles
    try:
        # Open the posting in a new tab and switch to it.
        driver.execute_script("window.open(arguments[0]);", job_link)
        wait.until(EC.new_window_is_opened(known_handles))
        new_handle = next(
            handle for handle in driver.window_handles
            if handle not in known_handles
        )
        driver.switch_to.window(new_handle)

        # Wait for the job description to be present before reading it.
        content = wait.until(EC.presence_of_element_located((By.XPATH, '//dd[@data-selector="job-intro-content"]')))
        print(f"正在处理职位 #{index + 1},公司:{company_name}")
        print("职位内容:", content.text)
    except Exception as e:
        # Best effort: report the failure and move on to the next posting.
        print(f"处理职位 #{index + 1} 出错:{e}")
    finally:
        # Close only the tabs opened during this iteration, wherever the
        # failure happened, then return to the results tab.
        for handle in driver.window_handles:
            if handle not in known_handles:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_window)
        time.sleep(2)  # Pause briefly before processing the next posting.

# Keep the browser open for manual inspection, then end the session so the
# browser and driver processes are not left running.
try:
    time.sleep(1000)
finally:
    driver.quit()
|