Why does switching proxy IPs still not help when accessing a site?
I am scraping a news site, and after many runs it looks like there is some kind of daily limit on the number of requests (not sure whether that is really it). Once I have crawled more than about 200 pages, every page starts returning the same content, even though the URL printed each time is different. So I thought crawling through proxy IPs might solve it, but the same thing happens: after a certain number of requests, even switching to another IP address does not help and every page is still the same content. Could someone tell me whether this is the site's anti-scraping mechanism or something else, and whether there is a way around it?

Is it the MAC address?

Post your code.

求资专用 posted on 2020-8-18 09:33
Is it the MAC address?
A MAC address only travels within its broadcast domain (look up broadcast domain and collision domain for the background). IP is already layer 3, so the website cannot see your MAC, unless you are directly connected to the site's server or sit in the same VLAN on the same switch.
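If you want to see for yourself which address the server sees, a quick check (only a sketch; the proxy below is a placeholder copied from the pool posted later in this thread) is to ask a service that echoes the client IP back, such as httpbin.org/ip:

import requests

# placeholder proxy taken from the pool posted later in this thread
proxy = {'http': 'http://114.239.171.181:4216',
         'https': 'http://114.239.171.181:4216'}

# httpbin echoes back the source IP it sees: your own public IP, or the proxy's
print(requests.get('https://httpbin.org/ip', timeout=10).json())
print(requests.get('https://httpbin.org/ip', proxies=proxy, timeout=10).json())

If both calls print the same origin, the proxy is not actually being applied to the request.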
suchocolate posted on 2020-8-18 09:48

Post your code.
import requests
from bs4 import BeautifulSoup
from lxml import etree
from cnsenti import Sentiment
import jieba
import smtplib
from email.mime.text import MIMEText
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import json
def get_and_save(url):
    global save_txt
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip',
        'DNT': '1',
        'Connection': 'close'
    }
    r = requests.get(url, headers=headers, proxies=proxiess)
    r.encoding = 'utf-8'
    html = etree.HTML(r.text)
    result = html.xpath('//div/p/text()')
    # clean up the text
    result = str(result)
    result2 = result.replace('\\u3000', '')
    print(result2)
    return result2
proxiess={}
# collect all news links from the site
def get_url():
    global proxiess
    xunhuan = range(1, 16)   # pages to crawl (assumed: 15 pages, per the comment in the proxy loop below)
    liebiao = []
    for k in xunhuan:
        print(k)
        html = 'http://guba.eastmoney.com/default,1_' + str(k) + '.html'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip',
            'DNT': '1',
            'Connection': 'close'
        }
        page = requests.get(html, headers=headers)
        soup_obj = BeautifulSoup(page.content, 'html.parser')
        for link in soup_obj.findAll('a'):
            if "href" in link.attrs:
                a = 'http://guba.eastmoney.com' + link.attrs['href']  # href='/news,000762,954300722.html'
                if 'news' in a:
                    liebiao.append(a)
    # load the proxy ip pool ---------------------------------------------------
    ipp = []
    with open('代理ip池.txt', 'r', encoding='UTF-8') as temp:
        a = list(temp)
    ipp.append(a)
    ipp = ipp[0]                                 # flatten to the list of lines
    ipp = [x.strip() for x in ipp if x.strip()]  # drop newlines and blank lines
    ipp = [eval(x)['https'] for x in ipp]        # each line is a dict literal; keep the 'ip:port' string
    # load the proxy ip pool ---------------------------------------------------
    # use each ip 260 times, then drop it and switch to the next one
    ip = 0  # request counter; a new proxy is taken every XX pages
    for i in liebiao:
        if ip < 260:  # one guba page lists 88 posts; switching every 3 pages covers roughly 15 pages in total
            ip = ip + 1
            proxies = ipp[0]                     # current proxy address, e.g. '46.166.151.181:5836'
            print('正在使用ip:' + proxies)
            proxiess = {'https': proxies}        # the key must match the URL scheme for requests to use the proxy
            try:
                jiebakey(get_and_save(i))
                print("正在爬取东方财富网......")
            except Exception:
                print("正在爬取东方财富网...")
        else:
            ip = 0
            del ipp[0]                           # discard the used proxy; the next one becomes ipp[0]
if __name__ == '__main__':
    get_url()
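One thing to note about the loop above: requests chooses a proxy by matching the keys of the proxies dict against the URL scheme. The guba.eastmoney.com links here are plain http://, so a dict whose only key is 'https' (let alone a misspelled key like 'htttps') never matches and the request goes out from your own IP. A minimal sketch, assuming the proxies in the pool accept plain HTTP:

import requests

proxy_addr = '46.166.151.181:5836'   # example entry taken from 代理ip池.txt

# requests picks the proxy whose key matches the URL scheme; a missing
# or misspelled key means it silently falls back to a direct connection
proxies = {'http': 'http://' + proxy_addr,
           'https': 'http://' + proxy_addr}

r = requests.get('http://guba.eastmoney.com/default,1_1.html',
                 proxies=proxies, timeout=10)
print(r.status_code)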
suchocolate posted on 2020-8-18 09:48

Post your code.
Contents of 代理ip池.txt:
{'https': '46.166.151.181:5836'}
{'https': '114.239.171.181:4216'}
{'https': '114.99.12.100:4216'}
{'https': '175.6.66.48:3128'}
{'https': '47.107.240.107:8888'}
{'https': '113.100.209.237:3128'}
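Since every line of that file is already a Python dict literal, one way to load the pool without the slicing gymnastics in get_url is ast.literal_eval. This is only a sketch based on the file format shown above:

import ast

with open('代理ip池.txt', 'r', encoding='utf-8') as f:
    # each non-empty line looks like {'https': '46.166.151.181:5836'}
    pool = [ast.literal_eval(line.strip()) for line in f if line.strip()]

print(pool[0])             # {'https': '46.166.151.181:5836'}
print(pool[0]['https'])    # '46.166.151.181:5836'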
937135952 posted on 2020-8-18 21:53

Contents of 代理ip池.txt:
{'https': '46.166.151.181:5836'}
{'https': '114.239.171.181:4216'}
Post the full code.
suchocolate posted on 2020-8-18 23:02

Post the full code.
import requests
from bs4 import BeautifulSoup
from lxml import etree
from cnsenti import Sentiment
import jieba
import smtplib
from email.mime.text import MIMEText
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait  # wait until an element has loaded
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains  # mouse actions
import json
hot_stock_concept={}
hot_stock_name={}
def creat_name_dic():
    # headless browser setup
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    path = r'C:\Users\Administrator\Desktop\chromedriver.exe'
    browser = webdriver.Chrome(executable_path=path, chrome_options=chrome_options)
    url = 'http://quote.eastmoney.com/center/boardlist.html#concept_board'
    browser.get(url)
    time.sleep(1)
    a = 1
    b = 1
    global hot_stock_concept, hot_stock_name
    for _ in range(14):
        stock_name = browser.find_elements_by_xpath("//td/a")
        stock_name2 = []
        for i in stock_name:
            stock_name2.append(i.text)
        k = 1
        # fill the dictionaries: odd cells are concept names, even cells are stock names
        for i in stock_name2:
            if k % 2 != 0:
                hot_stock_concept[i] = a
                a = a + 1
            else:
                hot_stock_name[i] = b
                b = b + 1
            k = k + 1
        # click the next-page link (the exact XPath was truncated in the post)
        time.sleep(1)
        browser.find_element_by_xpath("//a").click()
        time.sleep(1)
    file_name = '热点概念' + '.txt'
    with open(file_name, 'w', encoding='utf-8') as temp:   # open once so earlier concepts are not overwritten
        for i in list(hot_stock_concept.keys()):
            temp.write(i + '\n')
    browser.quit()
# -----------------------------------------------------------------------------
# scrape proxy ips and store the working ones ----------------------------------
def get_ip():  # the page is rendered dynamically, so selenium is used
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    path = r'C:\Users\Administrator\Desktop\chromedriver.exe'
    browser = webdriver.Chrome(executable_path=path, chrome_options=chrome_options)
    url = 'https://proxy.seofangfa.com/'
    browser.get(url)
    time.sleep(1)
    a = browser.find_elements_by_xpath("//table/tbody/tr/td")
    # print(a)
    al = []
    bl = []
    for n, v in enumerate(a):  # enumerate() numbers the cells: n is the index, v the element
        if n % 5 == 0:         # first column of each row: the ip
            print(v.text)
            al.append(v.text)
        elif n % 5 == 1:       # second column: the port
            print(v.text)
            bl.append(v.text)
    browser.quit()
    # print(al)
    # print(bl)
    # build the ip pool: keep only addresses that can actually fetch a page
    url = 'https://www.baidu.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.226.188.131:4216 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip',
        'DNT': '1',
        'Connection': 'close'
    }
    for x, y in zip(al, bl):
        z = x + ':' + y
        proxies = {'https': '%s' % z}
        print(proxies)
        try:
            page = requests.get(url, headers=headers, proxies=proxies, timeout=5)  # short timeout so dead proxies do not hang the check
            with open('代理ip池.txt', 'a', encoding='UTF-8') as temp:
                temp.write(str(proxies) + '\n')
        except Exception:
            print("此ip无效")
# keyword counting and storage
savekeyword={}
s=''
save_txt=''
jishu_total=0
total=0
total_count = {}
total_count_l=[]
def jiebakey(key1):
    global s
    global savekeyword
    global total
    global jishu_total
    global total_count
    global total_count_l
    # load the user-defined dictionary
    jieba.load_userdict("热点概念.txt")
    txt = key1
    words = jieba.lcut(txt)
    count = {}
    for word in words:
        if len(word) < 2:
            continue
        else:
            count[word] = count.get(word, 0) + 1
    exclude = []
    with open('热点概念备份.txt', 'r', encoding='UTF-8') as temp:
        a = list(temp)
    exclude.append(a)
    exclude = [x.strip() for x in exclude[0]]   # whitelist of hot-concept words, one per line
    for key in list(count.keys()):
        if key in exclude:
            continue
        else:
            del count[key]                      # drop words that are not hot concepts
    for word in count:
        if len(word) > 1:
            total_count[word] = total_count.get(word, 0) + 1
    total_count_l = list(total_count.items())
    total_count_l.sort(key=lambda x: x[1], reverse=True)   # sort by count, highest first
def get_and_save(url):
    global save_txt
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip',
        'DNT': '1',
        'Connection': 'close'
    }
    r = requests.get(url, headers=headers, proxies=proxiess)
    r.encoding = 'utf-8'
    html = etree.HTML(r.text)
    result = html.xpath('//div/p/text()')
    result = str(result)
    result2 = result.replace('\\u3000', '')
    print(result2)
    return result2
proxiess={}
def get_url():
    global proxiess
    xunhuan = range(1, 16)   # pages to crawl (assumed: 15 pages, per the comment in the proxy loop below)
    liebiao = []
    for k in xunhuan:
        print(k)
        html = 'http://guba.eastmoney.com/default,1_' + str(k) + '.html'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip',
            'DNT': '1',
            'Connection': 'close'
        }
        page = requests.get(html, headers=headers)
        soup_obj = BeautifulSoup(page.content, 'html.parser')
        for link in soup_obj.findAll('a'):
            if "href" in link.attrs:
                a = 'http://guba.eastmoney.com' + link.attrs['href']
                if 'news' in a:
                    liebiao.append(a)
    # load the proxy ip pool ---------------------------------------------------
    ipp = []
    with open('代理ip池.txt', 'r', encoding='UTF-8') as temp:
        a = list(temp)
    ipp.append(a)
    ipp = ipp[0]                                 # flatten to the list of lines
    ipp = [x.strip() for x in ipp if x.strip()]  # drop newlines and blank lines
    ipp = [eval(x)['https'] for x in ipp]        # each line is a dict literal; keep the 'ip:port' string
    # load the proxy ip pool ---------------------------------------------------
    # use each ip XX times, then switch to the next one ________________________
    ip = 0
    for i in liebiao:
        if ip < 260:
            ip = ip + 1
            proxies = ipp[0]                     # current proxy address, e.g. '46.166.151.181:5836'
            print('正在使用ip:' + proxies)
            proxiess = {'https': proxies}        # the key must match the URL scheme for requests to use the proxy
            try:
                jiebakey(get_and_save(i))
                print("正在爬取东方财富网......")
            except Exception:
                print("正在爬取东方财富网...")
        else:
            ip = 0
            del ipp[0]                           # discard the used proxy; the next one becomes ipp[0]
    # use each ip XX times, then switch to the next one ________________________
if __name__ == '__main__':
    creat_name_dic()
    time.sleep(3)
    get_url()
    time.sleep(1)
    time.sleep(1)
    print(total_count_l)
    # send_email(s)
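As a side note, the counter-and-delete bookkeeping in get_url can be written more compactly with itertools.cycle, which also means the pool is never exhausted. This is only a sketch of the idea (the per_proxy value and the 'http'/'https' keys are assumptions), not a drop-in replacement:

import itertools
import requests

def crawl_with_rotation(urls, pool, per_proxy=260, headers=None):
    # pool is a list of 'ip:port' strings; switch to the next one every per_proxy pages
    proxy_iter = itertools.cycle(pool)
    current, used = next(proxy_iter), 0
    for url in urls:
        if used >= per_proxy:
            current, used = next(proxy_iter), 0
        proxies = {'http': 'http://' + current, 'https': 'http://' + current}
        used += 1
        try:
            r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
            r.encoding = 'utf-8'
            yield url, r.text
        except requests.RequestException:
            print('request via', current, 'failed:', url)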
suchocolate posted on 2020-8-18 23:02

Post the full code.

The full code has been posted.