[quote][url=forum.php?mod=redirect&goto=findpost&pid=4921454&ptid=178056]suchocolate posted on 2020-8-18 23:02[/url]
Full code[/quote]
import requests
from bs4 import BeautifulSoup
from lxml import etree
from cnsenti import Sentiment
import jieba
import smtplib
from email.mime.text import MIMEText
import os
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait  # wait until an element has finished loading
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains  # mouse actions

hot_stock_concept = {}
hot_stock_name = {}


def creat_name_dic():
    # Headless browser setup
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    path = r'C:\Users\Administrator\Desktop\chromedriver.exe'
    browser = webdriver.Chrome(executable_path=path, chrome_options=chrome_options)
    url = 'http://quote.eastmoney.com/center/boardlist.html#concept_board'
    browser.get(url)
    time.sleep(1)
    a = 1
    b = 1
    global hot_stock_concept, hot_stock_name
    for page in range(14):
        stock_name = browser.find_elements_by_xpath("//td[contains(@class,'mywidth3')]/a")
        stock_name2 = []
        for i in stock_name:
            stock_name2.append(i.text)
        k = 1
        # Write into the dicts: the cells alternate between concept name and leading-stock name
        for i in stock_name2:
            if k % 2 != 0:
                hot_stock_concept[i] = a
                a = a + 1
            else:
                hot_stock_name[i] = b
                b = b + 1
            k = k + 1
        # Click "next page" (the link text on the page is 下一页)
        time.sleep(1)
        browser.find_element_by_xpath("//a[text()='下一页']").click()
        time.sleep(1)
    # Save every concept name, one per line (open the file once so earlier names are not overwritten)
    file_name = '热点概念' + '.txt'
    with open(file_name, 'w', encoding='utf-8') as temp:
        for i in hot_stock_concept:
            temp.write(i + '\n')
    browser.quit()


# ------------------------------------------------------------------------------------------------
# Scrape free proxy IPs and store the working ones
# ------------------------------------------------------------------------------------------------
def get_ip():  # the proxy list page is rendered dynamically, so scrape it with selenium
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    path = r'C:\Users\Administrator\Desktop\chromedriver.exe'
    browser = webdriver.Chrome(executable_path=path, chrome_options=chrome_options)
    url = 'https://proxy.seofangfa.com/'
    browser.get(url)
    time.sleep(1)
    a = browser.find_elements_by_xpath("//table[contains(@class,'table')]/tbody/tr/td")
    al = []
    bl = []
    # enumerate() numbers the table cells; each row has 5 cells, so cell 0 is the IP and cell 1 is the port
    for n, v in enumerate(a):
        if n % 5 == 0:
            print(v.text)
            al.append(v.text)
        elif n % 5 == 1:
            print(v.text)
            bl.append(v.text)
    browser.quit()
    # Build the proxy pool: test each ip:port against Baidu and keep only the ones that respond
    url = 'https://www.baidu.com/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Language': 'en-US,en;q=0.5',
               'Accept-Encoding': 'gzip',
               'DNT': '1',
               'Connection': 'close'
               }
    for x, y in zip(al, bl):
        z = x + ':' + y
        proxies = {'https': z}
        print(proxies)
        try:
            # quick liveness check through the proxy; drop it if the request fails or hangs
            page = requests.get(url, headers=headers, proxies=proxies, timeout=5)
            # store each working proxy as one JSON line so it can be parsed back reliably later
            with open('代理ip池.txt', 'a', encoding='UTF-8') as temp:
                temp.write(json.dumps(proxies) + '\n')
        except Exception:
            print('This proxy is unusable')


# Keyword-count storage: module-level state shared by jiebakey()
savekeyword = {}
s = ''
save_txt = ''
jishu_total = 0
total = 0
total_count = {}
total_count_l = []


def jiebakey(key1):
    global s
    global savekeyword
    global total
    global jishu_total
    global total_count
    global total_count_l
    # Load the custom dictionary so jieba keeps the concept names as single tokens
    jieba.load_userdict("热点概念.txt")
    txt = key1
    words = jieba.lcut(txt)
    count = {}
    for word in words:
        if len(word) < 2:
            continue
        else:
            count[word] = count.get(word, 0) + 1
    # Keep only the words that appear in the hot-concept list; everything else is deleted
    exclude = []
    with open('热点概念备份.txt', 'r', encoding='UTF-8') as temp:
        exclude = [line.rstrip('\n') for line in temp]
    for key in list(count.keys()):
        if key in exclude:
            continue
        else:
            del count[key]
    # Accumulate the per-post counts into the global totals and keep a list sorted by frequency
    for word in count:
        if len(word) > 1:
            total_count[word] = total_count.get(word, 0) + 1
    total_count_l = list(total_count.items())
    total_count_l.sort(key=lambda x: x[1], reverse=True)


def get_and_save(url):
    global save_txt
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip',
        'DNT': '1',
        'Connection': 'close'
    }
    # Fetch the post body through the current proxy and pull the text out with XPath
    r = requests.get(url, headers=headers, proxies=proxiess, timeout=10)
    r.encoding = 'utf-8'
    html = etree.HTML(r.text)
    result = html.xpath('//div[contains(@id,"zw_body")]/p/text()')
    result = str(result)
    result2 = result.replace('\\u3000', '')
    print(result2)
    return result2


proxiess = {}


def get_url():
    global proxiess
    liebiao = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip',
        'DNT': '1',
        'Connection': 'close'
    }
    # Collect the post URLs from the first 10 list pages of the guba board
    for k in range(1, 11):
        print(k)
        html = 'http://guba.eastmoney.com/default,1_' + str(k) + '.html'
        page = requests.get(html, headers=headers)
        soup_obj = BeautifulSoup(page.content, 'html.parser')
        for link in soup_obj.findAll('a'):
            if "href" in link.attrs:
                a = 'http://guba.eastmoney.com' + link.attrs['href']
                if 'news' in a:
                    liebiao.append(a)
    # Load the proxy pool (one JSON object per line, as written by get_ip)
    with open('代理ip池.txt', 'r', encoding='UTF-8') as temp:
        ipp = [json.loads(line) for line in temp]
    # Rotate proxies: after a proxy has been used 260 times, drop it and switch to the next one
    ip = 0
    for i in liebiao:
        if ip >= 260 and len(ipp) > 1:
            ip = 0
            del ipp[0]
        ip = ip + 1
        proxiess = ipp[0]
        print('Using proxy: ' + proxiess['https'])
        try:
            jiebakey(get_and_save(i))
            print('Crawling guba.eastmoney.com ...')
        except Exception:
            print('Request failed, skipping this post')


if __name__ == '__main__':
    # Note: get_ip() has to have been run at least once beforehand so that 代理ip池.txt exists.
    creat_name_dic()
    time.sleep(3)
    get_url()
    time.sleep(1)
    time.sleep(1)
    print(total_count_l)
    # send_email(s)
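
smtplib and MIMEText are imported and there is a commented-out send_email(s) call at the end, but the function itself is not part of the code above. A minimal sketch of what it could look like is below, assuming the goal is simply to mail yourself the keyword statistics; the SMTP server, account, password and recipient are placeholders that have to be replaced with your own values:

def send_email(content):
    # Minimal sketch (not from the original post): mail the keyword statistics to yourself.
    # Every SMTP setting below is a placeholder (assumed value); replace it with your own.
    smtp_host = 'smtp.example.com'           # placeholder SMTP server
    sender = 'your_account@example.com'      # placeholder sender address
    password = 'your_password_or_auth_code'  # placeholder password / authorization code
    receiver = 'your_account@example.com'    # placeholder recipient

    msg = MIMEText(str(content), 'plain', 'utf-8')
    msg['Subject'] = 'Hot-concept keyword counts'
    msg['From'] = sender
    msg['To'] = receiver

    server = smtplib.SMTP_SSL(smtp_host, 465)  # 465 is the common SMTP-over-SSL port
    server.login(sender, password)
    server.sendmail(sender, [receiver], msg.as_string())
    server.quit()

Note that s is never actually assigned anywhere in the script, so if you wire this in, calling send_email(total_count_l) at the end of the main block is probably closer to what was intended.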