##Method 1: the built-in urllib
from urllib.request import Request, urlopen
from scrapy import Selector
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"}
URL = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=ip&oq=requests.get%25E4%25B8%25AD%25E7%259A%2584timeout%25E6%2598%25AF%25E4%25BB%2580%25E4%25B9%2588&rsv_pq=9e512faf000fda05&rsv_t=ce28kApYTuAxtjle%2Fa0i5778f7ssmJJ5hukJt64RFu2JsHyit8EZthEdSk8&rqlang=cn&rsv_enter=0&inputT=10950&bs=requests.get%E4%B8%AD%E7%9A%84timeout%E6%98%AF%E4%BB%80%E4%B9%88'
req = Request(url=URL, headers=HEADERS)
html = urlopen(req).read().decode('utf-8')  # urlopen returns bytes; decode so Selector gets a str
text = Selector(text=html).xpath('//span[@class="c-gap-right"]/text()').extract()[0]
print(text)
##Method 2: requests (third-party, install with pip)
from requests import get
from scrapy import Selector
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"}
URL = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=ip&oq=requests.get%25E4%25B8%25AD%25E7%259A%2584timeout%25E6%2598%25AF%25E4%25BB%2580%25E4%25B9%2588&rsv_pq=9e512faf000fda05&rsv_t=ce28kApYTuAxtjle%2Fa0i5778f7ssmJJ5hukJt64RFu2JsHyit8EZthEdSk8&rqlang=cn&rsv_enter=0&inputT=10950&bs=requests.get%E4%B8%AD%E7%9A%84timeout%E6%98%AF%E4%BB%80%E4%B9%88'
html = get(url=URL, headers=HEADERS).text  # .text decodes the body using the encoding requests detects
text = Selector(text=html).xpath('//span[@class="c-gap-right"]/text()').extract()[0]
print(text)
##Using a proxy 1: built-in urllib
from urllib import request
from scrapy import Selector
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"}
URL = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=ip&oq=requests.get%25E4%25B8%25AD%25E7%259A%2584timeout%25E6%2598%25AF%25E4%25BB%2580%25E4%25B9%2588&rsv_pq=9e512faf000fda05&rsv_t=ce28kApYTuAxtjle%2Fa0i5778f7ssmJJ5hukJt64RFu2JsHyit8EZthEdSk8&rqlang=cn&rsv_enter=0&inputT=10950&bs=requests.get%E4%B8%AD%E7%9A%84timeout%E6%98%AF%E4%BB%80%E4%B9%88'
proxy = {"https": "http://203.42.227.113:8080"}  # example address; substitute a live proxy
proxy_support = request.ProxyHandler(proxy)
opener = request.build_opener(proxy_support)
req = request.Request(url=URL, headers=HEADERS)  # attach HEADERS here, otherwise they are never sent
html = opener.open(req).read().decode('utf-8')  # bytes -> str for Selector
text = Selector(text=html).xpath('//span[@class="c-gap-right"]/text()').extract()[0]
print(text)
##Using a proxy 2: requests
from requests import get
from scrapy import Selector
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"}
URL = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=ip&oq=requests.get%25E4%25B8%25AD%25E7%259A%2584timeout%25E6%2598%25AF%25E4%25BB%2580%25E4%25B9%2588&rsv_pq=9e512faf000fda05&rsv_t=ce28kApYTuAxtjle%2Fa0i5778f7ssmJJ5hukJt64RFu2JsHyit8EZthEdSk8&rqlang=cn&rsv_enter=0&inputT=10950&bs=requests.get%E4%B8%AD%E7%9A%84timeout%E6%98%AF%E4%BB%80%E4%B9%88'
proxy = {"https": "http://203.42.227.113:8080"}  # example address; substitute a live proxy
html = get(url=URL, headers=HEADERS, proxies=proxy).text
text = Selector(text=html).xpath('//span[@class="c-gap-right"]/text()').extract()[0]
print(text)
Notes (a short sketch for each tip follows below):
1. For pages that require a login, add your Cookie to the headers.
2. To scrape links instead of text, change the trailing text() in the XPath to @href.
3. When crawling frequently, use time.sleep() to pause between requests.
4. Regular expressions can be applied with Selector(text=html).re(r'.*')
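Tip 1, a minimal sketch: copy the Cookie value from your browser's developer tools after logging in. The URL and cookie value below are placeholders.
from requests import get
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
           "Cookie": "sessionid=PASTE_YOUR_COOKIE_HERE"}  # placeholder; paste your real cookie string
html = get('https://example.com/protected', headers=HEADERS).text  # hypothetical login-only page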
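Tip 2, a self-contained toy example: ending the XPath with @href returns attribute values instead of text nodes.
from scrapy import Selector
html = '<a class="c-gap-right" href="https://example.com">link</a>'
links = Selector(text=html).xpath('//a/@href').extract()
print(links)  # ['https://example.com']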
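Tip 3, a sketch of a throttled loop; the one-second pause and the URL list are arbitrary choices.
from time import sleep
from requests import get
HEADERS = {"User-Agent": "Mozilla/5.0"}  # shortened UA just for this sketch
for url in ['https://example.com/1', 'https://example.com/2']:  # placeholder URLs
    html = get(url, headers=HEADERS).text
    sleep(1)  # pause between requests to avoid hammering the server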
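Tip 4, a runnable example: Selector.re() returns every regex match in the document as a list of strings.
from scrapy import Selector
html = '<span>IP: 1.2.3.4</span><span>IP: 5.6.7.8</span>'
print(Selector(text=html).re(r'\d+\.\d+\.\d+\.\d+'))  # ['1.2.3.4', '5.6.7.8']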