|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
使用webdriver爬取搜索到的微信公众号的信息,且实现分页爬取的功能:
=============================================================
import selenium, requests
from selenium import webdriver
import time, pymysql, sys, random, traceback
import os,time
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def ope_page(url):
    """Scrape one search-result page and cross-check each listed article.

    Opens ``url`` in a PhantomJS session and, for result indices 0-8, reads
    the entry's title, info text, account label and article link, and prints
    them.  The article link is then clicked and every window it opened is
    checked: ``true`` is printed when the ``activity-name`` heading of that
    window equals the link text, ``false`` otherwise.  The opened window is
    closed afterwards and the browser is always shut down on exit.

    Args:
        url: Address of the result page to load.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If a result entry
            lacks its title, info or account label, or an opened window has
            no ``activity-name`` heading.  An entry that only lacks the
            article link is reported and skipped instead.
    """
    # Imported locally so the narrowed ``except`` below does not depend on
    # the file-level import block.
    from selenium.common.exceptions import NoSuchElementException

    phantomjs_path = r'D:\python\phantomjs-2.1.1-windows\bin\phantomjs.exe'
    driver = webdriver.PhantomJS(phantomjs_path)
    try:
        driver.get(url)
        for k in range(0, 9):
            sogou_vr = "sogou_vr_11002301_box_" + str(k)
            account_article = "account_article_" + str(k)
            entry_xpath = "//li[@id='" + sogou_vr + "']"

            t = driver.find_element_by_xpath(entry_xpath + "//p[@class='tit']")
            i = driver.find_element_by_xpath(entry_xpath + "//p[@class='info']")
            try:
                a = driver.find_element_by_xpath(
                    entry_xpath + "//a[@uigs='" + account_article + "']")
            except NoSuchElementException:
                # Only a missing article link is tolerated; anything else
                # (driver crash, Ctrl-C) must not be swallowed.
                print("文章缺失")
                continue

            title = t.text
            info1 = i.text
            article = a.text
            # The leading "." keeps the search inside this entry's info
            # paragraph.  A bare "//" restarts from the document root and
            # would return the first label on the page for every entry.
            user_name = i.find_element_by_xpath(
                './/label[@name="em_weixinhao"]').text
            print(title)
            print(info1)
            print(user_name)
            print(article)

            nowhandle = driver.current_window_handle
            a.click()
            for handle in driver.window_handles:
                if handle == nowhandle:
                    continue
                driver.switch_to.window(handle)
                try:
                    # Fixed pause, presumably to let the opened page render.
                    time.sleep(3)
                    a3 = driver.find_element_by_xpath(
                        '//h2[@id="activity-name"]')
                    if article == a3.text:
                        print('true')
                    else:
                        print('false')
                finally:
                    # Close the opened window so later entries do not
                    # re-check it against their own article, then return
                    # to the result list.
                    driver.close()
                    driver.switch_to.window(nowhandle)
    finally:
        # Always terminate the PhantomJS process, even when a lookup fails.
        driver.quit()
def get_page_url(url, app_name, sogou_page, times):
    """Run a search in a PhantomJS session and return the resulting page URL.

    Loads ``url``, types ``app_name`` into the ``upquery`` input and clicks
    the element with class ``swz2``.  When ``times`` is greater than 1 the
    pagination link whose id equals ``sogou_page`` is clicked as well.

    Args:
        url: Address of the page that hosts the search form.
        app_name: Text entered into the search box.
        sogou_page: ``id`` attribute of the pagination link to follow.
        times: Page counter; the pagination link is only followed when this
            is greater than 1.

    Returns:
        The browser's current URL after the search (and optional page click).
    """
    phantomjs_path = r'D:\python\phantomjs-2.1.1-windows\bin\phantomjs.exe'
    driver = webdriver.PhantomJS(phantomjs_path)
    try:
        driver.get(url)
        query_box = driver.find_element_by_id("upquery")
        query_box.clear()
        query_box.send_keys(app_name)
        driver.find_element_by_class_name("swz2").click()
        if times > 1:
            page = driver.find_element_by_xpath(
                "//div[@id='pagebar_container']//a[@id='" + sogou_page + "']")
            page.click()
        return driver.current_url
    finally:
        # The original called quit() after the return statement, so it never
        # ran and every call left a PhantomJS process behind.
        driver.quit()
def main():
    """Crawl result pages 1 through 5 for the hard-coded search keyword.

    For each page, resolves the page URL via ``get_page_url``, prints it,
    and hands it to ``ope_page`` for scraping.
    """
    search_home = 'http://weixin.sogou.com/'
    keyword = 'faker'
    # Pagination link ids, paired below with their 1-based page number.
    page_ids = ["sogou_page_" + str(number) for number in range(1, 6)]
    for page_no, page_id in enumerate(page_ids, 1):
        page_url = get_page_url(search_home, keyword, page_id, page_no)
        print(page_url)
        ope_page(page_url)


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|