# selenium微博_登录_抓取疫情评论_存储数据库
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 16:40:18 2020
@author: HCF
"""
import urllib.parse
from selenium.webdriver import Chrome, ChromeOptions
import time
import requests
import chaojiying
import urllib
import random
import traceback
import re
import datetime
from lxml import etree
import pymysql
import json
def get_conn():
    """Open a connection to the local `cov` MySQL database.

    :return: (connection, cursor) pair; caller is responsible for
        releasing both via close_conn().
    """
    conn = pymysql.connect(
        host="localhost",
        user="root",
        password="123456789",
        db="cov",
        charset='utf8mb4',
    )
    # Default cursor: result sets come back as tuples of tuples.
    cursor = conn.cursor()
    return conn, cursor
def close_conn(conn, cursor):
    """Release a (connection, cursor) pair obtained from get_conn().

    Either argument may be None/falsy, in which case it is skipped.
    Cursor is closed before the connection, mirroring acquisition order.
    """
    for resource in (cursor, conn):
        if resource:
            resource.close()
def query(sql, *args):
    """Run a parameterized SELECT and split the result into parallel lists.

    :param sql: SQL string with %s placeholders
    :param args: bind parameters for the placeholders
    :return: (uids, dts) — the first and second column of every fetched row.

    BUG FIX: the original appended the whole row tuple to both lists
    (`uids.append(r)` / `dts.append(r)`); the column indices [0]/[1] were
    evidently lost when the source was pasted.  NOTE(review): assumes
    two-column result rows — confirm against the callers.
    """
    conn, cursor = get_conn()
    cursor.execute(sql, args)
    res = cursor.fetchall()
    close_conn(conn, cursor)
    uids, dts = [], []
    for r in res:
        uids.append(r[0])
        dts.append(r[1])
    return uids, dts
#转换时间
def check_time(date):
    """Normalize a Weibo timestamp string to 'YYYY-MM-DD HH:MM:SS'.

    Supported input forms:
      * '今天 HH:MM'      -> today's date + the given time
      * 'N分钟前'          -> now minus N minutes
      * 'N秒前'            -> now minus N seconds
      * absolute, e.g. '7月22日 16:40' -> year hard-coded to 2020
        (matches the crawl period of this script).

    BUG FIX: the original used `int(li)` / `li` where list indices were
    required (indices were lost in the pasted source) — restored as
    li[0]=month, li[1]=day, li[2]=hour, li[3]=minute.  Regex is now a raw
    string.
    """
    if '今天' in date:
        _, hhmm = date.split()
        t = time.strftime("%Y-%m-%d") + ' ' + hhmm + ':' + '00'
    elif '分钟前' in date:
        minutes, _ = date.split('分钟前')
        # Compute via epoch arithmetic, then format back to local time.
        timestemp = int(time.time()) - int(minutes) * 60
        t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestemp))
    elif '秒前' in date:
        seconds, _ = date.split('秒前')
        timestemp = int(time.time()) - int(seconds)
        t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestemp))
    else:
        # e.g. '7月22日 16:40' -> ['7', '22', '16', '40']
        li = re.findall(r"\d+", date)
        t = (datetime.date(2020, int(li[0]), int(li[1])).strftime('%Y-%m-%d')
             + ' ' + li[2] + ':' + li[3] + ':' + '00')
    return t
#模拟下拉
def drop_down():
    """Scroll the current page down in five increasing steps (10%%..90%%
    of full height) so lazily-loaded search results get rendered.

    Uses the module-level `driver`; sleeps 1s before each step.
    """
    for step in range(1, 11, 2):
        time.sleep(1)
        fraction = step / 10
        script = ('document.documentElement.scrollTop = '
                  'document.documentElement.scrollHeight * %f' % fraction)
        driver.execute_script(script)
#获取评论
def outputtext(html):
global a
html = etree.HTML(html)
commentlist = html.xpath('//div[@class="list_ul"]/div[@class="list_li S_line1 clearfix"]')
content = []
id = []
tt = []
conn, cursor = get_conn()
sql = "insert into comments_p(user_id,content,dt,nichen,jianjie,gender,address) values(%s,%s,%s,%s,%s,%s,%s)"
try:
for div in commentlist:
text = div.xpath('.//div[@class="list_con"]/div[@class="WB_text"]/text()')
g1 = ''
for c in text:
if c.strip() == '':
pass
else:
g1 += c.strip()
content.append(g1)# 评论内容
g1 = g1
g2 = div.xpath('.//div[@class="list_con"]/div[@class="WB_text"]/a/@usercard')
id.append(g2)# 评论idd
g2 = g2#评论者id
g3 = div.xpath(
'.//div[@class="list_con"]/div[@class="WB_func clearfix"]/div[@class="WB_from S_txt2"]/text()')
g3 = check_time(g3)
tt.append(g3)# 评论时间
if len(g1):
try:
############################person###############
nichen,jianjie,gender,address = get_address(g2)
except json.decoder.JSONDecodeError as e:
print(e)
time.sleep(random.choice())
continue
try:
cursor.execute(sql, (g2, g1, g3,nichen,jianjie,gender,address))# 插入数据
except UnicodeEncodeError as e:
print(g2, ':', e)
nichen = '---含表情---'
jianjie = '---含表情---'
cursor.execute(sql, (g2, g1, g3,nichen,jianjie,gender,address))# 插入数据
a += 1
conn.commit()# 提交事务保存数据
except:
traceback.print_exc()
finally:
close_conn(conn, cursor)
print(content, '\n', id, '\n', tt, '\n')
def login():
    """Log in to weibo.com inside the already-open module-level driver.

    Types the global account/password (num1/num2), submits the form, and —
    if a captcha image appears — downloads it and solves it through the
    chaojiying client before re-submitting.  Returns the session cookies.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'
    }
    time.sleep(8)  # give the login form time to render
    driver.find_element_by_xpath('//*[@id="loginname"]').send_keys(num1)  # account field
    password_box = driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div/div/div/input')
    password_box.send_keys(num2)  # password field
    driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div/div/a').click()  # submit
    time.sleep(3)
    try:
        # A captcha may be required; fetch the image and solve it remotely.
        img_url = driver.find_element_by_xpath('//a[@class="code W_fl"]/img').get_attribute('src')
        response = requests.get(img_url, headers=headers)
        with open('yzm.jpg', 'wb') as f:
            f.write(response.content)
        code = chaojiying.run('yzm.jpg')
        driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div/div/div/input').send_keys(code)
        driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div/div/a').click()
    except Exception as e:
        # No captcha element found (or download failed) — proceed anyway.
        print(e)
    driver.refresh()
    return driver.get_cookies()
def comment_id():
    """Search Weibo for each epidemic keyword and harvest article ids/URLs.

    For every keyword: load the search page, scroll to trigger lazy loading,
    collect the `mid` attribute of each result card and the detail-page
    links, persist (mid, url) pairs into artical_id, and return both lists.

    BUG FIX: the original executed `cursor.execute(sql, (mids, urls))`,
    passing the whole lists as the two bind values; restored the per-row
    indices `(mids[i], urls[i])` lost in the pasted source.
    NOTE(review): mids and urls are harvested independently and may differ
    in length — rows are paired by position, confirm that is intended.
    """
    title = ['新冠肺炎', '新冠疫情', '肺炎疫情', '抗击疫情', '抗击肺炎']
    mids = []
    urls = []
    for ti in title:
        key = urllib.parse.quote(ti)
        url = 'https://s.weibo.com/weibo?q={}&wvr=6&b=1&Refer=SWeibo_box'.format(key)
        driver.get(url)
        drop_down()  # scroll so all result cards render
        time.sleep(1)
        divs = driver.find_elements_by_xpath('//div[@class="card-wrap"]')
        url_divs = driver.find_elements_by_xpath('//p[@class="from"]/a')
        for div in divs:
            mid = div.find_element_by_xpath('.').get_attribute('mid')
            if mid:
                mids.append(mid)
        for url_div in url_divs:
            href = url_div.find_element_by_xpath('.').get_attribute('href')
            # Keep only full article links on weibo.com.
            if len(href) > 60 and href[:17] == 'https://weibo.com':
                urls.append(href)
    print(len(mids), len(urls))
    conn, cursor = get_conn()
    sql = "insert into artical_id(mid,url) values(%s,%s)"
    try:
        for i in range(len(mids)):
            cursor.execute(sql, (mids[i], urls[i]))
        conn.commit()
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return mids, urls
def comments(mids):
    """Page through the AJAX comment endpoint for each article mid.

    For every mid, repeatedly fetches /aj/v6/comment/big, extracts the
    embedded HTML fragment, hands it to outputtext() for storage, and
    follows the `root_comment_max_id` continuation token until the
    response no longer contains one.

    NOTE(review): the pasted source lost every string slice (they read
    `max_id = html`, `key = html`, etc.) and the HTML-entity replacements
    were decoded into no-ops ('&' -> '&').  The slices and entity names
    below are reconstructed and should be verified against a live
    response.
    """
    for mid in mids:
        time.sleep(0.5)
        data = {
            'ajwvr': 6,
            'id': mid,
            'root_comment_max_id': None,
            'root_comment_max_id_type': 0,
            'root_comment_ext_param': '',
            'page': 1,
            'filter': 'hot',
            'filter_tips_before': 0,
            'from': 'singleWeiBo',
            '__rnd': int(time.time() * 1000)
        }
        base_url = 'https://weibo.com/aj/v6/comment/big?{}'
        url = base_url.format(urllib.parse.urlencode(data))
        while True:
            time.sleep(0.5)
            print(url)
            driver.get(url)
            html = driver.page_source
            # The endpoint returns \uXXXX-escaped JSON; decode it to text.
            html = html.encode('utf-8').decode('unicode_escape')
            html = html.replace('&amp;', '&')
            # Continuation token for the next comment page.
            a = html.find('&root_comment_max_id=')
            key = html[a + 1:]  # from the token onward — reconstructed slice
            b = key.find('"')
            url1 = key[:b]  # query-string tail up to the closing quote
            print('忽略', url1)
            url = 'https://weibo.com/aj/v6/comment/big?ajwvr=6&{}&from=singleWeiBo&__rnd={}'.format(
                url1, int(time.time() * 1000))
            # The JSON body is wrapped in a <pre style="...pre-wrap;"> tag.
            a1 = html.find('pre-wrap;">') + 11
            b1 = html.find('</pre>', a1)
            html = html[a1:b1]  # JSON dict text — reconstructed slice
            # Cut out the "html" field holding the comment-list fragment.
            a2 = html.find('"html":"') + 8
            b2 = html.find(',"count"') - 1
            c = html[a2:b2]  # embedded HTML — reconstructed slice
            d = c.replace('&lt;', '<')
            e = d.replace('&gt;', '>')
            f = e.replace('\\/', '/')
            outputtext(f)
            if a == -1:
                # No continuation token in this response: last page reached.
                break
def get_address(uid):
    """Fetch a commenter's profile via the m.weibo.cn INFO container API.

    :param uid: numeric Weibo user id as a string
    :return: (nickname, bio, gender, region); missing fields fall back to
        placeholders ('---无名氏---', None) or a random gender/region.

    NOTE(review): every `cards[...]` / `card_group[...]` list index and all
    string slices were lost in the pasted source (they read
    `html['data']['cards']['card_group']...`).  The indices below are
    reconstructed guesses for the 230283..._-_INFO container layout —
    verify against a live API response before trusting the field mapping.
    """
    url = f'https://m.weibo.cn/api/container/getIndex?containerid=230283{uid}_-_INFO&title=%E5%9F%BA%E6%9C%AC%E8%B5%84%E6%96%99&luicode=10000011&lfid=230283{uid}'
    driver.get(url)
    html = driver.page_source
    # Response is \uXXXX-escaped JSON inside a <pre> tag; decode and unwrap.
    html = html.encode('utf-8').decode('unicode_escape')
    html = html.replace('&amp;', '&')
    html = html.replace('\\/', '/')
    a1 = html.find('pre-wrap;">') + 11
    b1 = html.find('</pre>', a1)
    html = json.loads(html[a1:b1])  # reconstructed slice
    cards = html['data']['cards']
    try:
        nichen = cards[0]['card_group'][1]['item_content']
    except IndexError as e:
        print(uid, ':', e)
        nichen = '---无名氏---'
    try:
        jianjie = cards[0]['card_group'][2]['item_content']
    except Exception:
        jianjie = None
    try:
        gender = cards[1]['card_group'][0]['item_content']
    except IndexError as e:
        print(uid, ':', e)
        gender = random.choice(['男', '女'])
    try:
        address = cards[1]['card_group'][1]['item_content']
        if cards[1]['card_group'][1]['item_name'] != '所在地':
            # The region row shifted; try the next item.
            address = cards[1]['card_group'][2]['item_content']
    except Exception:
        try:
            address = cards[0]['card_group'][3]['item_content']
        except Exception:
            # Original fallback: reuse the gender string as the region and
            # re-randomize gender (kept as-is, though it looks suspicious).
            address = gender
            gender = random.choice(['男', '女'])
    return nichen, jianjie, gender, address
def chroms():
    """Create a headless Chrome driver sized tall enough for long pages.

    :return: a selenium Chrome webdriver instance.
    """
    opts = ChromeOptions()
    opts.add_argument('window-size=1920x3000')
    opts.add_argument("--headless")  # no visible browser window
    opts.add_argument('--disable-gpu')
    opts.add_argument('--hide-scrollbars')
    return Chrome(options=opts)
if __name__ == "__main__":
    # BUG FIX: the pasted source read `num1 = #账号`, a syntax error — the
    # credentials were stripped when the code was shared.  Fill these in.
    num1 = ''  # TODO: Weibo account (账号)
    num2 = ''  # TODO: Weibo password (密码)
    while True:
        a = 1  # global row counter incremented by outputtext()
        driver = chroms()
        driver.get('https://weibo.com/')
        cookies = login()
        print('\n', f"{time.asctime()}开始更新评论数据", '\n')
        mids, urls = comment_id()  # article ids to crawl comments for
        comments(mids)
        driver.quit()
        print('\n', f"{time.asctime()}评论数据更新结束", '\n')
        time.sleep(3600)  # refresh the comment data hourly
# (forum pagination residue from the pasted source: 页: [1])