In reply to suchocolate (2020-8-18 23:02), who asked for the full code:

import requests
from bs4 import BeautifulSoup
from lxml import etree
from cnsenti import Sentiment
import jieba
import smtplib
from email.mime.text import MIMEText
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait    # wait until an element has loaded
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains  # mouse actions
import json

hot_stock_concept = {}
hot_stock_name = {}

def creat_name_dic():
    # headless browser setup
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    path = r'C:\Users\Administrator\Desktop\chromedriver.exe'
    browser = webdriver.Chrome(executable_path=path, chrome_options=chrome_options)
    url = 'http://quote.eastmoney.com/center/boardlist.html#concept_board'
    browser.get(url)
    time.sleep(1)

    a = 1
    b = 1
    global hot_stock_concept, hot_stock_name
    for _ in range(14):  # walk the first 14 pages of the concept board
        stock_name = browser.find_elements_by_xpath("//td[contains(@class,'mywidth3')]/a")
        stock_name2 = [cell.text for cell in stock_name]

        # cells alternate: concept name first, then its leading stock
        k = 1
        for i in stock_name2:
            if k % 2 != 0:
                hot_stock_concept[i] = a
                a = a + 1
            else:
                hot_stock_name[i] = b
                b = b + 1
            k = k + 1

        # click "next page" (下一页)
        time.sleep(1)
        browser.find_element_by_xpath("//a[text()='下一页']").click()
        time.sleep(1)

    # write each concept on its own line; open the file once in 'w' mode,
    # so earlier lines are not overwritten on every iteration
    file_name = '热点概念.txt'
    with open(file_name, 'w', encoding='utf-8') as temp:
        for i in hot_stock_concept:
            temp.write(i + '\n')

    browser.quit()

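# A minimal sketch (not called by the script) of the odd/even split used
# above, on made-up cell texts rather than real scraped values; the
# helper name _demo_alternating_split is hypothetical.
def _demo_alternating_split():
    cells = ['概念A', '股票A', '概念B', '股票B']  # pretend td texts
    concepts = {t: n + 1 for n, t in enumerate(cells[0::2])}  # k odd -> concept
    names = {t: n + 1 for n, t in enumerate(cells[1::2])}     # k even -> stock
    print(concepts)  # {'概念A': 1, '概念B': 2}
    print(names)     # {'股票A': 1, '股票B': 2}
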
# --------------------------------------------------------------------
# Scrape proxy IPs and keep the ones that work
# --------------------------------------------------------------------
def get_ip():  # the page is rendered dynamically, so Selenium is needed
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    path = r'C:\Users\Administrator\Desktop\chromedriver.exe'
    browser = webdriver.Chrome(executable_path=path, chrome_options=chrome_options)
    url = 'https://proxy.seofangfa.com/'
    browser.get(url)
    time.sleep(1)
    cells = browser.find_elements_by_xpath("//table[contains(@class,'table')]/tbody/tr/td")

    # the table has 5 columns per row: column 0 is the IP, column 1 the port
    al = []
    bl = []
    for n, v in enumerate(cells):  # enumerate() pairs each cell with its index
        if n % 5 == 0:
            al.append(v.text)
        elif n % 5 == 1:
            bl.append(v.text)
    browser.quit()

    # build the IP pool: try each proxy against baidu and store the survivors
    url = 'https://www.baidu.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip',
        'DNT': '1',
        'Connection': 'close'
    }

    for x, y in zip(al, bl):
        z = x + ':' + y
        proxies = {'https': z}
        print(proxies)
        try:
            requests.get(url, headers=headers, proxies=proxies, timeout=5)
            with open('代理ip池.txt', 'a', encoding='UTF-8') as temp:
                temp.write(z + '\n')  # store plain ip:port, one per line
        except Exception:
            print('This IP is invalid')

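# A quick sanity check (sketch, not called by the script): read the pool
# back and report its size. Assumes get_ip() has run at least once; the
# helper name _demo_pool_size is made up.
def _demo_pool_size():
    with open('代理ip池.txt', 'r', encoding='UTF-8') as f:
        pool = [line.strip() for line in f if line.strip()]
    print('%d usable proxies in the pool' % len(pool))
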
# keyword extraction and frequency counting
savekeyword = {}
s = ''
save_txt = ''
jishu_total = 0
total = 0
total_count = {}
total_count_l = []

def jiebakey(key1):
    global s, savekeyword, total, jishu_total, total_count, total_count_l

    # load the self-built dictionary so jieba keeps concept names intact
    jieba.load_userdict("热点概念.txt")

    txt = key1
    words = jieba.lcut(txt)

    count = {}
    for word in words:
        if len(word) < 2:
            continue
        count[word] = count.get(word, 0) + 1

    # keep only words that appear in the concept list; 热点概念备份.txt is
    # presumably a manually kept copy (the script itself only writes 热点概念.txt)
    with open('热点概念备份.txt', 'r', encoding='UTF-8') as temp:
        exclude = [line.rstrip('\n') for line in temp]

    for key in list(count.keys()):
        if key not in exclude:
            del count[key]

    # accumulate into the global counter and keep it sorted by frequency
    for word in count:
        if len(word) > 1:
            total_count[word] = total_count.get(word, 0) + 1
    total_count_l = list(total_count.items())
    total_count_l.sort(key=lambda x: x[1], reverse=True)
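
# A minimal sketch (not called by the script) of the cut-then-filter idea
# above: the sentence and the one-entry keep-list are made up; real runs
# read the list from 热点概念备份.txt.
def _demo_jieba_filter():
    keep = {'白酒'}  # stands in for the concept list
    words = jieba.lcut('白酒板块今天大涨')  # made-up headline
    counts = {}
    for w in words:
        if len(w) >= 2 and w in keep:
            counts[w] = counts.get(w, 0) + 1
    print(counts)  # expected: {'白酒': 1}
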
def get_and_save(url):
    global save_txt
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip',
        'DNT': '1',
        'Connection': 'close'
    }

    # fetch the post body through the current proxy
    r = requests.get(url, headers=headers, proxies=proxiess, timeout=10)
    r.encoding = 'utf-8'
    html = etree.HTML(r.text)
    result = html.xpath('//div[contains(@id,"zw_body")]/p/text()')

    result = str(result)
    result2 = result.replace('\\u3000', '')  # drop the escaped full-width spaces left by str()
    print(result2)
    return result2
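
# A minimal offline sketch (not called by the script): the same xpath run
# over a tiny hand-written page, so the selector can be tested without
# the network.
def _demo_xpath():
    page = '<div id="zw_body"><p>hello</p><p>world</p></div>'
    print(etree.HTML(page).xpath('//div[contains(@id,"zw_body")]/p/text()'))
    # expected: ['hello', 'world']
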
proxiess = {}

def get_url():
    global proxiess
    liebiao = []

    # collect post links from the first 10 pages of the guba board
    for k in range(1, 11):
        print(k)
        html = 'http://guba.eastmoney.com/default,1_' + str(k) + '.html'

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip',
            'DNT': '1',
            'Connection': 'close'
        }

        page = requests.get(html, headers=headers)
        soup_obj = BeautifulSoup(page.content, 'html.parser')

        # keep only links that point at posts ("news" pages)
        for link in soup_obj.findAll('a'):
            if 'href' in link.attrs:
                a = 'http://guba.eastmoney.com' + link.attrs['href']
                if 'news' in a:
                    liebiao.append(a)

    # load the proxy pool (one ip:port per line) ------------------------
    with open('代理ip池.txt', 'r', encoding='UTF-8') as temp:
        ipp = [line.rstrip('\n') for line in temp]

    # rotate: each proxy serves up to 260 requests, then switch to the next
    ip = 0
    for i in liebiao:
        if ip >= 260:
            ip = 0
            del ipp[0]
        ip = ip + 1
        proxies = ipp[0]
        print('Using proxy: ' + proxies)
        proxiess = {'https': proxies}

        try:
            jiebakey(get_and_save(i))
            print('Crawling guba.eastmoney.com......')
        except Exception:
            print('Request failed, moving on...')

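# A minimal sketch (not called by the script) of the rotation rule in
# isolation: each proxy serves `limit` requests, then is dropped; the
# helper and its arguments are made up for illustration.
def _demo_rotation(pool, total_requests, limit=3):
    used = 0
    for _ in range(total_requests):
        if used >= limit:
            used = 0
            del pool[0]
        used += 1
        print('using', pool[0])
# e.g. _demo_rotation(['a', 'b'], 5) prints a, a, a, b, b
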
if __name__ == '__main__':
    creat_name_dic()
    time.sleep(3)
    get_ip()  # build the proxy pool (代理ip池.txt) that get_url() reads
    time.sleep(1)
    get_url()
    time.sleep(1)
    print(total_count_l)
    # send_email(s)