import requests
from bs4 import BeautifulSoup
from lxml import etree
from cnsenti import Sentiment
import jieba
import smtplib
from email.mime.text import MIMEText
import os
import time
# —————————————————————————————— imports for the custom-dictionary building part
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait  # wait until an element has loaded
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains  # mouse actions
import json

def test2():
    url = 'http://guba.eastmoney.com/news,ustsla,962896962.html'

    proxiess = {'https': '100.100.7.100:3000'}

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip',
        'DNT': '1',
        'Connection': 'close'
    }

    # r = requests.get(url, headers=headers)
    r = requests.get(url, proxies=proxiess)
    r.encoding = 'utf-8'
    html = etree.HTML(r.text)  # etree.HTML() builds an XPath parse tree and auto-repairs the HTML
    result = html.xpath('//div[contains(@id,"zw_body")]/p/text()')  # alternative: '//div[@id="mainNewsContent"]/p/text()'

    # clean up the text: join the paragraphs and drop full-width spaces
    text = ''.join(result).replace('\u3000', '')
    print(text)

def test():
    url = 'https://www.baidu.com/'
    # url = 'http://guba.eastmoney.com/default,1_1.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip',
        'DNT': '1',
        'Connection': 'close'
    }
    proxiess = {'https': '100.100.7.100:3000'}

    try:
        page = requests.get(url, proxies=proxiess, timeout=5)
        print(str(proxiess) + ' this proxy works')
    except requests.exceptions.RequestException:
        print(str(proxiess) + ' this proxy is dead')

test()
test2()
Since the proxy IP is made up at random, test() cannot reach Baidu through it,
yet test2() still fetches its page without any problem. Could anyone explain why this happens, and how to fix it?
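For reference, here is a minimal diagnostic sketch of how requests picks a proxy. It matches entries in the proxies dict against the URL's scheme (and host), so an entry keyed 'https' is only consulted for https:// URLs. The snippet uses the same made-up proxy address as above; requests.utils.select_proxy is an internal helper of the requests library, shown here only to make the matching visible:

from requests.utils import select_proxy

proxiess = {'https': '100.100.7.100:3000'}

# select_proxy() is what requests consults internally when sending a request:
# it returns the matching proxy URL, or None if no key fits the URL's scheme.
for url in ('https://www.baidu.com/',
            'http://guba.eastmoney.com/news,ustsla,962896962.html'):
    print(url, '->', select_proxy(url, proxiess))

If this prints None for one of the URLs, that request is being sent directly, without going through the proxy at all.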