|
发表于 2020-6-26 13:25:38
|
显示全部楼层
本帖最后由 suchocolate 于 2020-6-26 13:29 编辑
1.selenium获取的cookie格式和urllib/requests不同,下面是用selenium获取百度的cookies样本:for item in driver.get_cookies():
print(item)
{'name': 'BIDUPSID', 'value': 'AC0C72B5367C32A57FC5C7B28143C4BC', 'path': '/', 'domain': '.baidu.com', 'secure': False, 'httpOnly': False, 'expiry': 3740629700}
{'name': 'PSTM', 'value': '1593146053', 'path': '/', 'domain': '.baidu.com', 'secure': False, 'httpOnly': False, 'expiry': 3740629700}
{'name': 'BAIDUID', 'value': 'AC0C72B5367C32A5D02EE2B4E63B2211:FG=1', 'path': '/', 'domain': '.baidu.com', 'secure': False, 'httpOnly': False, 'expiry': 1624682053}
{'name': 'BD_HOME', 'value': '1', 'path': '/', 'domain': 'www.baidu.com', 'secure': False, 'httpOnly': False}
{'name': 'H_PS_PSSID', 'value': '32098_1456_31671_21082_32139_32046_32089_32107', 'path': '/', 'domain': '.baidu.com', 'secure': False, 'httpOnly': False}
{'name': 'BD_UPN', 'value': '13314752', 'path': '/', 'domain': 'www.baidu.com', 'secure': False, 'httpOnly': False, 'expiry': 1594010053}
urllib/requests的cookie只需要key(name)和value,而selenium还有其他参数,所以selenium的cookie如果给urllib/requests用,得筛选出name和value,反之需要补齐其他参数。
2.不太清楚你具体爬的内容,这个网站我无法登录,不好判断后续具体用什么爬取合适。个人感觉用requests的session就可以,比用selenium更快一些。
3.如果用selenium爬取后续内容,那么cookies就直接保存成pickle就可以,不需要转成txt。import time
import os
import pickle
import requests
from requests.cookies import RequestsCookieJar
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
def ck_cookie():
    """Return True if the cached cookie file 'cookies.pkl' exists."""
    # os.path.exists already returns a bool; no if/else needed.
    return os.path.exists('cookies.pkl')
def main(www, username, password):
    """Log in to *www* with Selenium, caching the session cookies.

    First run: fill in the login form and pickle the cookies to
    'cookies.pkl'. Later runs: restore the pickled cookies instead of
    logging in again.

    :param www: login page URL
    :param username: account name typed into the 'userid' field
    :param password: password typed into the 'pwd' field
    """
    # ========================================================================
    # Optional: headless mode — runs in the background, no visible browser.
    # options = Options()
    # options.add_argument('-headless')
    # Uses the geckodriver binary, which must be downloaded beforehand.
    # driver = Firefox(executable_path='geckodriver', options=options)
    # ========================================================================
    driver = Firefox()
    wait = WebDriverWait(driver, timeout=10)
    if not ck_cookie():
        driver.get(www)
        wait.until(EC.visibility_of_element_located(
            (By.XPATH, '//button[@class="login-btn"]')))
        # find_element_by_* was removed in Selenium 4; use find_element(By...).
        driver.find_element(By.NAME, "userid").send_keys(username)
        time.sleep(1)
        driver.find_element(By.NAME, "pwd").send_keys(password)
        time.sleep(1)
        driver.find_element(By.NAME, "sm1").click()
        time.sleep(10)
        # 'with' closes the file handle; pickle.dump(..., open(...)) leaked it.
        with open("cookies.pkl", "wb") as f:
            pickle.dump(driver.get_cookies(), f)
    else:
        # BUG FIX: add_cookie() only accepts cookies for the domain currently
        # loaded in the browser, so navigate to the site BEFORE injecting the
        # cached cookies, then load the page again with them applied.
        driver.get(www)
        with open("cookies.pkl", "rb") as f:
            cookies = pickle.load(f)
        for cookie in cookies:
            driver.add_cookie(cookie)
        driver.get(www)
    # subsequent actions
if __name__ == '__main__':
    # Entry point: collect credentials interactively, then run the scraper.
    login_url = 'http://www.boshishuini.com/dada/login.php'
    user = input('Please input username')
    pwd = input('Please input password')
    main(login_url, user, pwd)
4.如果后续用requests爬取,需要转成txt:import time
import os
import pickle
import requests
from requests.cookies import RequestsCookieJar
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
def ck_cookie():
    """Return True if the cached cookie file 'cookies.txt' exists."""
    # os.path.exists already returns a bool; no if/else needed.
    return os.path.exists('cookies.txt')
def main(www, username, password):
    """Log in once with Selenium and save cookies as 'name#value' lines
    in cookies.txt; on later runs reuse them with requests.

    :param www: login page URL
    :param username: account name typed into the 'userid' field
    :param password: password typed into the 'pwd' field
    """
    # ========================================================================
    # Optional: headless mode — runs in the background, no visible browser.
    # options = Options()
    # options.add_argument('-headless')
    # Uses the geckodriver binary, which must be downloaded beforehand.
    # driver = Firefox(executable_path='geckodriver', options=options)
    # ========================================================================
    driver = Firefox()
    wait = WebDriverWait(driver, timeout=10)
    if not ck_cookie():
        driver.get(www)
        wait.until(EC.visibility_of_element_located(
            (By.XPATH, '//button[@class="login-btn"]')))
        # find_element_by_* was removed in Selenium 4; use find_element(By...).
        driver.find_element(By.NAME, "userid").send_keys(username)
        time.sleep(1)
        driver.find_element(By.NAME, "pwd").send_keys(password)
        time.sleep(1)
        driver.find_element(By.NAME, "sm1").click()
        time.sleep(10)
        with open('cookies.txt', 'w') as f:
            for item in driver.get_cookies():
                # BUG FIX: one cookie per line. Without the newline every
                # cookie ran together and the file could not be parsed back.
                f.write(item['name'] + '#' + item['value'] + '\n')
    else:
        jar = RequestsCookieJar()
        # BUG FIX: the original read 'cookie.txt' although it wrote
        # 'cookies.txt', so the cached cookies were never found.
        with open('cookies.txt', 'r') as f:
            for line in f:
                # strip() drops the trailing newline; maxsplit=1 keeps
                # cookie values that themselves contain '#'.
                k, v = line.strip().split('#', 1)
                jar.set(k, v)
        r = requests.get(www, cookies=jar)
    # subsequent actions
if __name__ == '__main__':
    # Entry point: collect credentials interactively, then run the scraper.
    login_url = 'http://www.boshishuini.com/dada/login.php'
    user = input('Please input username')
    pwd = input('Please input password')
    main(login_url, user, pw)
5.以上都是基于selenium获取cookie,实际我看可以用requests.session获取:import requests
from lxml import etree
from requests.cookies import RequestsCookieJar
import os
def ck_cookie():
    """Return True if the cached cookie file 'cookies.txt' exists."""
    # os.path.exists already returns a bool; no if/else needed.
    return os.path.exists('cookies.txt')
def main(www, username, password):
    """Log in with a requests session, caching cookies in cookies.txt.

    First run: submit the login form and save the session cookies as
    'name#value' lines. Later runs: rebuild a cookie jar from the file.

    :param www: login page URL
    :param username: value sent as the 'userid' form field
    :param password: value sent as the 'pwd' form field
    """
    headers = {'user-agent': 'firefox', 'host': 'www.boshishuini.com', 'Referer': 'http://www.boshishuini.com/dada/login.php'}
    data = {"gotopage": "", "dopost": "login", "adminstyle": "newdedecms", "sm1": ""}
    # data = {"gotopage": "", "dopost": "login", "adminstyle": "newdedecms", "userid": "admin", "pwd": "admin", "sm1": ""}
    data['userid'] = username
    data['pwd'] = password
    if not ck_cookie():
        s = requests.session()
        # BUG FIX: a login form submit is a POST; requests' get() ignores
        # the `data` body, so the original request never actually logged in.
        r = s.post(www, headers=headers, data=data)
        # html = etree.HTML(r.text)
        with open('cookies.txt', 'w') as f:
            # s.cookies holds everything set during the login (including
            # redirects), not just the final response's cookies.
            for k, v in s.cookies.items():
                print(k, '=', v)
                # One 'name#value' pair per line so it can be parsed back.
                f.write(k + '#' + v + '\n')
    else:
        jar = RequestsCookieJar()
        # BUG FIX: read the same file that was written ('cookies.txt',
        # not 'cookie.txt').
        with open('cookies.txt', 'r') as f:
            for line in f:
                # strip() drops the trailing newline; maxsplit=1 keeps
                # cookie values that themselves contain '#'.
                k, v = line.strip().split('#', 1)
                jar.set(k, v)
        r = requests.get(www, cookies=jar)
    # subsequent actions
if __name__ == '__main__':
    # Entry point: collect credentials interactively, then run the scraper.
    login_url = 'http://www.boshishuini.com/dada/login.php'
    user = input('Please input username')
    pwd = input('Please input password')
    main(login_url, user, pwd)
|
|