|
发表于 2020-6-26 13:25:38
|
显示全部楼层
本帖最后由 suchocolate 于 2020-6-26 13:29 编辑
1.selenium获取的cookie格式和urllib/requests不同,下面是用selenium获取百度的cookies样本:
- for item in driver.get_cookies():
- print(item)
- {'name': 'BIDUPSID', 'value': 'AC0C72B5367C32A57FC5C7B28143C4BC', 'path': '/', 'domain': '.baidu.com', 'secure': False, 'httpOnly': False, 'expiry': 3740629700}
- {'name': 'PSTM', 'value': '1593146053', 'path': '/', 'domain': '.baidu.com', 'secure': False, 'httpOnly': False, 'expiry': 3740629700}
- {'name': 'BAIDUID', 'value': 'AC0C72B5367C32A5D02EE2B4E63B2211:FG=1', 'path': '/', 'domain': '.baidu.com', 'secure': False, 'httpOnly': False, 'expiry': 1624682053}
- {'name': 'BD_HOME', 'value': '1', 'path': '/', 'domain': 'www.baidu.com', 'secure': False, 'httpOnly': False}
- {'name': 'H_PS_PSSID', 'value': '32098_1456_31671_21082_32139_32046_32089_32107', 'path': '/', 'domain': '.baidu.com', 'secure': False, 'httpOnly': False}
- {'name': 'BD_UPN', 'value': '13314752', 'path': '/', 'domain': 'www.baidu.com', 'secure': False, 'httpOnly': False, 'expiry': 1594010053}
复制代码
urllib/requests的cookie只需要key(name)和value,而selenium还有其他参数,所以selenium的cookie如果给urllib/requests用,得筛选出name和value,反之需要补齐其他参数。
2.不太清楚你具体爬的内容,这个网站我无法登录,不好判断后续具体用什么爬取合适。个人感觉用requests的session就可以,比用selenium更快一些。
3.如果用selenium爬取后续内容,那么cookies就直接保存成pickle就可以,不需要转成txt。
- import time
- import os
- import pickle
- import requests
- from requests.cookies import RequestsCookieJar
- from selenium import webdriver
- from selenium.webdriver import Firefox
- from selenium.webdriver.common.by import By
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.firefox.options import Options
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.support.wait import WebDriverWait
def ck_cookie():
    """Report whether the pickled cookie file is present.

    Returns:
        bool: True if ``cookies.pkl`` exists relative to the current working
        directory, False otherwise.
    """
    return os.path.exists('cookies.pkl')
def main(www, username, password):
    """Open ``www`` in Firefox, either logging in or restoring saved cookies.

    If ``cookies.pkl`` does not exist, the login form is filled in and
    submitted, and the browser's cookies are pickled to ``cookies.pkl``.
    Otherwise the pickled cookies are added to the browser and the page is
    loaded again with them.

    Args:
        www: URL of the login page.
        username: Text typed into the form field named ``userid``.
        password: Text typed into the form field named ``pwd``.
    """
    # ========================================================================
    # Optional configuration: headless mode (runs in the background, no
    # browser window is shown).
    # options = Options()
    # options.add_argument('-headless')
    # Uses the geckodriver driver, which has to be downloaded beforehand.
    # driver = Firefox(executable_path='geckodriver', options=options)
    # ========================================================================
    driver = Firefox()
    wait = WebDriverWait(driver, timeout=10)
    if not ck_cookie():
        driver.get(www)
        wait.until(EC.visibility_of_element_located((By.XPATH, '//button[@class="login-btn"]')))
        # find_element(By.NAME, ...) replaces find_element_by_name, which no
        # longer exists in Selenium 4.
        driver.find_element(By.NAME, "userid").send_keys(username)
        time.sleep(1)
        driver.find_element(By.NAME, "pwd").send_keys(password)
        time.sleep(1)
        driver.find_element(By.NAME, "sm1").click()
        # Fixed pause before reading cookies; presumably gives the login
        # request time to complete.
        time.sleep(10)
        cookies = driver.get_cookies()
        with open("cookies.pkl", "wb") as f:
            pickle.dump(cookies, f)
    else:
        # NOTE: unpickling can execute arbitrary code; only load a
        # cookies.pkl that this script wrote itself.
        with open("cookies.pkl", "rb") as f:
            cookies = pickle.load(f)
        # WebDriver only accepts cookies for the domain of the page that is
        # currently loaded, so visit the site once before adding them.
        driver.get(www)
        for cookie in cookies:
            driver.add_cookie(cookie)
        # Reload so the request is sent with the restored cookies.
        driver.get(www)
    # Follow-up actions go here.
if __name__ == '__main__':
    # Local import: only the interactive entry point needs it.
    from getpass import getpass

    url = 'http://www.boshishuini.com/dada/login.php'
    un = input('Please input username')
    # getpass reads the password without echoing it to the terminal.
    pw = getpass('Please input password')
    main(url, un, pw)
复制代码
4.如果后续用requests爬取,需要转成txt:
- import time
- import os
- import pickle
- import requests
- from requests.cookies import RequestsCookieJar
- from selenium import webdriver
- from selenium.webdriver import Firefox
- from selenium.webdriver.common.by import By
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.firefox.options import Options
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.support.wait import WebDriverWait
def ck_cookie():
    """Report whether the text cookie file is present.

    Returns:
        bool: True if ``cookies.txt`` exists relative to the current working
        directory, False otherwise.
    """
    return os.path.exists('cookies.txt')
def main(www, username, password):
    """Log in with Firefox once, then reuse the cookies through requests.

    If ``cookies.txt`` does not exist, the login form is submitted in Firefox
    and every browser cookie is written to ``cookies.txt`` as one
    ``name#value`` line. Otherwise the file is read back into a
    ``RequestsCookieJar`` and ``www`` is fetched with ``requests``.

    Args:
        www: URL of the login page.
        username: Text typed into the form field named ``userid``.
        password: Text typed into the form field named ``pwd``.
    """
    if not ck_cookie():
        # ====================================================================
        # Optional configuration: headless mode (runs in the background, no
        # browser window is shown).
        # options = Options()
        # options.add_argument('-headless')
        # Uses the geckodriver driver, which has to be downloaded beforehand.
        # driver = Firefox(executable_path='geckodriver', options=options)
        # ====================================================================
        # The browser is only started on this branch; the cached-cookie
        # branch below works purely through requests.
        driver = Firefox()
        wait = WebDriverWait(driver, timeout=10)
        driver.get(www)
        wait.until(EC.visibility_of_element_located((By.XPATH, '//button[@class="login-btn"]')))
        # find_element(By.NAME, ...) replaces find_element_by_name, which no
        # longer exists in Selenium 4.
        driver.find_element(By.NAME, "userid").send_keys(username)
        time.sleep(1)
        driver.find_element(By.NAME, "pwd").send_keys(password)
        time.sleep(1)
        driver.find_element(By.NAME, "sm1").click()
        # Fixed pause before reading cookies; presumably gives the login
        # request time to complete.
        time.sleep(10)
        with open('cookies.txt', 'w', encoding='utf-8') as f:
            for item in driver.get_cookies():
                # One cookie per line so the reader can split them apart.
                f.write(item['name'] + '#' + item['value'] + '\n')
    else:
        jar = RequestsCookieJar()
        # Same file name as the one written above.
        with open('cookies.txt', 'r', encoding='utf-8') as f:
            for line in f:
                line = line.rstrip('\n')
                if not line:
                    continue
                # Split on the first '#' only: a value may contain '#'.
                k, _, v = line.partition('#')
                jar.set(k, v)
        r = requests.get(www, cookies=jar)
    # Follow-up actions go here.
if __name__ == '__main__':
    # Local import: only the interactive entry point needs it.
    from getpass import getpass

    url = 'http://www.boshishuini.com/dada/login.php'
    un = input('Please input username')
    # getpass reads the password without echoing it to the terminal.
    pw = getpass('Please input password')
    main(url, un, pw)
复制代码
5.以上都是基于selenium获取cookie,实际我看可以用requests.session获取:
- import requests
- from lxml import etree
- from requests.cookies import RequestsCookieJar
- import os
def ck_cookie():
    """Report whether the text cookie file is present.

    Returns:
        bool: True if ``cookies.txt`` exists relative to the current working
        directory, False otherwise.
    """
    return os.path.exists('cookies.txt')
def main(www, username, password):
    """Log in through a requests session and cache the cookies on disk.

    If ``cookies.txt`` does not exist, the login form is submitted to ``www``
    and every cookie held by the session is printed and written to
    ``cookies.txt`` as one ``name#value`` line. Otherwise the file is read
    back into a ``RequestsCookieJar`` and ``www`` is fetched with it.

    Args:
        www: URL of the login page.
        username: Value sent as the ``userid`` form field.
        password: Value sent as the ``pwd`` form field.
    """
    headers = {'user-agent': 'firefox', 'host': 'www.boshishuini.com', 'Referer': 'http://www.boshishuini.com/dada/login.php'}
    # Example of a complete payload:
    # {"gotopage": "", "dopost": "login", "adminstyle": "newdedecms",
    #  "userid": "admin", "pwd": "admin", "sm1": ""}
    data = {"gotopage": "", "dopost": "login", "adminstyle": "newdedecms", "sm1": ""}
    data['userid'] = username
    data['pwd'] = password
    if not ck_cookie():
        s = requests.session()
        # Submit the form fields as a POST body, the usual way to send a
        # login form.
        r = s.post(www, headers=headers, data=data)
        # html = etree.HTML(r.text)
        with open('cookies.txt', 'w', encoding='utf-8') as f:
            # Use the session jar, not r.cookies: it also keeps cookies set
            # on redirect responses during the login exchange.
            for k, v in s.cookies.items():
                print(k, '=', v)
                # One cookie per line so the reader can split them apart.
                f.write(k + '#' + v + '\n')
    else:
        jar = RequestsCookieJar()
        # Same file name as the one written above.
        with open('cookies.txt', 'r', encoding='utf-8') as f:
            for line in f:
                line = line.rstrip('\n')
                if not line:
                    continue
                # Split on the first '#' only: a value may contain '#'.
                k, _, v = line.partition('#')
                jar.set(k, v)
        r = requests.get(www, cookies=jar)
    # Follow-up actions go here.
if __name__ == '__main__':
    # Local import: only the interactive entry point needs it.
    from getpass import getpass

    url = 'http://www.boshishuini.com/dada/login.php'
    un = input('Please input username')
    # getpass reads the password without echoing it to the terminal.
    pw = getpass('Please input password')
    main(url, un, pw)
复制代码
|
|