微信公众号分页爬取 python webdriver

转转将 · 发表于 2017-3-8 11:05:00

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

使用webdriver爬取搜索到的微信公众号的信息，且实现分页爬取的功能：

=============================================================
import selenium, requests
from selenium import webdriver
import time, pymysql, sys, random, traceback
import os,time
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

def ope_page(url):
    """Scrape one search-result page and check every linked article title.

    Opens ``url`` in a PhantomJS session and walks the result items whose
    ``<li>`` ids are ``sogou_vr_11002301_box_0`` .. ``_8``.  For each item it
    prints the account title, the info text, the WeChat ID and the title of
    the latest article, then clicks the article link and prints ``true`` or
    ``false`` depending on whether the opened page's ``<h2
    id="activity-name">`` text equals the link text.

    Args:
        url: Address of the search-result page to load.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If a result item's
            title/info element, its WeChat-ID label, or the article heading
            on the opened page cannot be found.  Only a missing article link
            is tolerated (the item is skipped).
    """
    # Imported here so this function does not rely on a new file-level import.
    from selenium.common.exceptions import NoSuchElementException

    phantomjs_path = r'D:\python\phantomjs-2.1.1-windows\bin\phantomjs.exe'
    driver = webdriver.PhantomJS(phantomjs_path)
    try:
        driver.get(url)

        for k in range(0, 9):
            sogou_vr = "sogou_vr_11002301_box_" + str(k)
            account_article = "account_article_" + str(k)
            item_xpath = "//li[@id='" + sogou_vr + "']"

            t = driver.find_element_by_xpath(item_xpath + "//p[@class='tit']")
            i = driver.find_element_by_xpath(item_xpath + "//p[@class='info']")
            try:
                a = driver.find_element_by_xpath(
                    item_xpath + "//a[@uigs='" + account_article + "']")
            except NoSuchElementException:
                # Accounts without a recent article have no link to follow.
                print("文章缺失")
                continue

            title = t.text
            info1 = i.text
            article = a.text
            # The leading "." keeps the search inside this item's info
            # element; a bare "//label" is evaluated against the whole
            # document and would always return the first account's ID.
            user_name = i.find_element_by_xpath(
                './/label[@name="em_weixinhao"]').text
            print(title)
            print(info1)
            print(user_name)
            print(article)

            nowhandle = driver.current_window_handle

            a.click()
            allhandle = driver.window_handles

            for handle in allhandle:
                if handle != nowhandle:
                    driver.switch_to_window(handle)
                    # Fixed pause to give the article page time to render.
                    time.sleep(3)
                    a3 = driver.find_element_by_xpath(
                        '//h2[@id="activity-name"]')
                    if article == a3.text:
                        print('true')
                    else:
                        print('false')
                    # Close the article window so it is not revisited (and
                    # compared against the wrong title) on later iterations.
                    driver.close()
            driver.switch_to_window(nowhandle)
    finally:
        # Always shut the browser down, even when a lookup above raises.
        driver.quit()

def get_page_url(url, app_name, sogou_page, times):
    """Run a search and return the URL of the requested result page.

    Loads ``url`` in a PhantomJS session, types ``app_name`` into the element
    with id ``upquery`` and clicks the element with class ``swz2`` to submit
    the search.  When ``times`` is greater than 1 it additionally clicks the
    pager link whose id is ``sogou_page`` inside ``#pagebar_container``.

    Args:
        url: Address of the search home page.
        app_name: Search keyword to type into the query box.
        sogou_page: ``id`` attribute of the pager link to click.
        times: Page counter; the pager link is only clicked when it is > 1.

    Returns:
        The browser's current URL after the search (and optional page click).

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the query box,
            the search button or the pager link cannot be found.
    """
    phantomjs_path = r'D:\python\phantomjs-2.1.1-windows\bin\phantomjs.exe'
    driver = webdriver.PhantomJS(phantomjs_path)
    try:
        driver.get(url)

        search_box = driver.find_element_by_id("upquery")
        search_box.clear()
        search_box.send_keys(app_name)
        driver.find_element_by_class_name("swz2").click()

        if times > 1:
            page = driver.find_element_by_xpath(
                "//div[@id='pagebar_container']//a[@id='" + sogou_page + "']")
            page.click()
        return driver.current_url
    finally:
        # The original called quit() after the return statement, so it never
        # ran and every call left a PhantomJS process behind.
        driver.quit()

def main():
    """Crawl result pages 1 through 5 for a fixed search keyword.

    For every page number the URL of that result page is resolved with
    ``get_page_url``, printed, and then handed to ``ope_page``.
    """
    start_url = 'http://weixin.sogou.com/'
    keyword = 'faker'
    page_no = 1
    while page_no < 6:
        # Pager links are addressed by ids of the form "sogou_page_<n>".
        pager_id = "sogou_page_%d" % page_no
        page_url = get_page_url(start_url, keyword, pager_id, page_no)
        print(page_url)
        ope_page(page_url)
        page_no += 1


# Run the crawl only when the file is executed as a script.
if __name__ == "__main__":
    main()

ft3312591 · 发表于 2017-3-8 11:34:47

乱七八糟的，说明都没有
复制粘贴一段代码就好了？谁要看？

账号		自动登录	找回密码
密码			立即注册

[作品展示] 微信公众号分页爬取 python webdriver

马上注册，结交更多好友，享用更多功能^_^

浏览过的版块