|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 DAGECHUIZI 于 2018-4-22 19:14 编辑
- import requests
- from selenium import webdriver
- from urllib.parse import urlencode
- import re
- import json
- import time
def get_page_index(offset=0, keyword='街拍'):
    """Fetch one page of Toutiao gallery search results.

    Args:
        offset: pagination offset (the site pages in multiples of 20).
        keyword: search keyword; defaults to the original hard-coded '街拍'.

    Returns:
        The raw JSON response body as text.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        # NOTE(review): this session cookie is hard-coded and will expire —
        # refresh it (or drop it) when requests start failing.
        'cookie': 'tt_webid=6547097210365822477; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=162eb39d2231ec-08bbf0a2603116-3a614f0b-1fa400-162eb39d22422e; tt_webid=6547097210365822477; uuid="w:d625a48f91744a87887eb7c10eb5c875"; __tasessionId=ro1q9k23r1524377491314; CNZZDATA1259612802=696710449-1524360766-https%253A%252F%252Fwww.baidu.com%252F%7C1524376648',
        # Bug fix: the original key was 'user - agent' (with spaces), which is
        # not a valid header name, so the server saw Python's default UA —
        # a common anti-bot trigger. The UA value was also mangled by
        # auto-formatting ('Mozilla / 5.0(...)'); restored to a real UA string.
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    }
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    # timeout added so a stalled connection cannot hang the scraper forever
    response = requests.get(url, headers=headers, timeout=10)
    return response.text
def parse_page_index(html):
    """Yield article URLs from a Toutiao search_content JSON response.

    Args:
        html: the raw JSON text returned by ``get_page_index`` (may be
            None or malformed if the request failed).

    Yields:
        Non-empty ``article_url`` strings. Entries without an
        ``article_url`` are skipped (the original yielded None for them,
        which later crashed ``browser.get(None)``); unparseable input
        yields nothing instead of raising.
    """
    try:
        data = json.loads(html)
    except (TypeError, ValueError):
        # None / bytes-with-wrong-type -> TypeError; bad JSON -> ValueError
        return
    if data and 'data' in data:
        for item in data['data']:
            url = item.get('article_url') if item else None
            if url:
                yield url
def get_page_detail(url):
    """Render *url* in Chrome via selenium and return the page HTML.

    Bug fix: the original printed ``browser.page_source`` but returned
    nothing, so ``main``'s ``html = get_page_detail(url)`` was always
    None. It also leaked the driver process when ``browser.get`` raised
    (exactly the ConnectionResetError in the pasted traceback) because
    ``browser.close()`` was never reached — and ``close()`` only closes
    the window; ``quit()`` also terminates chromedriver.

    Args:
        url: absolute URL of the article detail page.

    Returns:
        The rendered page source as a string.
    """
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        return browser.page_source
    finally:
        # Always shut down the driver, even if navigation failed.
        browser.quit()
def main():
    """Crawl the first index page and print each article's rendered HTML."""
    index_json = get_page_index()
    for url in parse_page_index(index_json):
        # Guard: some gallery entries carry no article_url, and the original
        # parser yielded None for them — browser.get(None) would then fail.
        if not url:
            continue
        print(url)
        detail_html = get_page_detail(url)
        print(detail_html)
        # Throttle between detail fetches; Toutiao aggressively resets
        # connections from fast scrapers (see the pasted traceback).
        time.sleep(10)


if __name__ == '__main__':
    main()
复制代码
代码是不完整的,写到这里就出现问题了:因为用 requests 的常规方法拿不到详情页的 HTML 代码,就加入了 selenium 和 chromedriver,结果就出现了下面的报错
http://toutiao.com/group/6546175194942145037/
Traceback (most recent call last):
File "C:/Users/17222/PycharmProjects/spyder/toutiao_jiepai.py", line 48, in <module>
main()
File "C:/Users/17222/PycharmProjects/spyder/toutiao_jiepai.py", line 44, in main
html = get_page_detail(url)
File "C:/Users/17222/PycharmProjects/spyder/toutiao_jiepai.py", line 36, in get_page_detail
browser.get(url)
File "C:\Users\17222\PycharmProjects\spyder\venv\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 324, in get
self.execute(Command.GET, {'url': url})
File "C:\Users\17222\PycharmProjects\spyder\venv\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 310, in execute
response = self.command_executor.execute(driver_command, params)
File "C:\Users\17222\PycharmProjects\spyder\venv\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 466, in execute
return self._request(command_info[0], url, body=data)
File "C:\Users\17222\PycharmProjects\spyder\venv\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 490, in _request
resp = self._conn.getresponse()
File "C:\Users\17222\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1331, in getresponse
response.begin()
File "C:\Users\17222\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 297, in begin
version, status, reason = self._read_status()
File "C:\Users\17222\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 258, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Users\17222\AppData\Local\Programs\Python\Python36-32\lib\socket.py", line 586, in readinto
return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] 远程主机强迫关闭了一个现有的连接。
爬了一天卡在着真的是沮丧,,,,求大佬指点
我给的建议就是 既然 from selenium import webdriver 都用上这个了。统一用 selenium 模拟浏览器就好了。
这类大型网站反爬很正常
|
|