import requests,re,time,random
import pymysql
from fake_useragent import UserAgent
from lxml import etree
import concurrent.futures
def get_danmu(params):#获取弹幕
    """Download one page of danmu (bullet comments) and store it in MySQL.

    params: dict with keys
        'url'  -- danmu API url for one timestamp page
        'name' -- target table name, e.g. 'danmu3' (episode index as suffix)
    Side effect: inserts extracted rows into the table via insert().
    """
    url = params['url']#获取弹幕url
    name = params['name']#获取数据库表名
    page = name.split('danmu')[1]#获取集数 (episode number comes from the table name)
    danmus = []
    #建立代理ip池 -- proxy pool; drop proxies=proxies below to use the local IP
    iplist = ['112.195.202.102:4278']#代理ip库,从网上选几个可用的
    headers = {'User-Agent': UserAgent().random}#导入随机文件头库,用于反爬虫
    res = None
    # First attempt plus up to 10 retries; original code never broke out of the
    # retry loop on success and left `res` unbound when every attempt failed.
    for attempt in range(11):
        ip = random.choice(iplist)
        proxies = {"http": ip, "https": ip}
        try:
            res = requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
            break  # success: stop retrying
        except requests.RequestException:
            if attempt < 10:#如果连接失败,重新进行连接,最多连接10次
                print('正在进行第{}次重新连接--------------------------------------------'.format(attempt))
                time.sleep(2)
    if res is None:
        # All attempts failed; skip this page instead of crashing on an unbound name.
        return
    data = res.json()  # res.json() instead of eval(): never eval untrusted response text
    print('正在下载第{}集弹幕,本页弹幕共{}条'.format(page, data['count']))
    if data['count'] != 0 and res.status_code == 200:#如果网页请求成功及弹幕数量大于0
        for each in data['comments']:
            author = ''.join(re.findall(u"[\u4e00-\u9fa5]+", each['opername'].replace(' ', '')))#作者名字只提取中文 (keep Chinese chars only)
            content = ''.join(re.findall(u"[\u4e00-\u9fa5]+", each['content'].replace('\xa0', '')))#评论内容只提取中文
            danmus.append({
                '作者': '未知作者' if len(author) == 0 else author,#如果为空值则默认为未知作者
                '弹幕内容': '未知弹幕' if len(content) == 0 else content,#如果为空值则默认为未知弹幕
                '点赞数': each['upcount'],
                '弹幕发送时间': each['timepoint'],
                '作者等级': each['uservip_degree'],
                '弹幕id': each['commentid']})
        insert([tuple(row.values()) for row in danmus], name)#将提取的弹幕写入数据库,此处必须要以数组传入
def get_after_param():#获得后缀id
    """Scrape the series page and return {episode label: video id}.

    Previews and VIP-only episodes (spans containing an <i> tag) are excluded.
    """
    page_url = 'https://v.qq.com/x/cover/o8mbrpo92gni5uc/i0036rltvk1.html'#输入任意电视剧某一集的网站
    ua = {'User-Agent': UserAgent().random}
    response = requests.get(url=page_url, headers=ua)
    tree = etree.HTML(response.content)
    #不包含预告片及vip集数
    video_ids = tree.xpath("//div[@class='mod_episode']/span[@_stat='videolist:click' and not(i)]/@id")
    raw_labels = tree.xpath("//div[@class='mod_episode']/span[@_stat='videolist:click' and not(i)]/a/text()")
    labels = [text.replace(' ', '').replace('\n', '') for text in raw_labels]
    #压缩成字典,方便数据传递
    return dict(zip(labels, video_ids))
def get_before_param():#获得前缀id
    """Exchange every video id for its danmu target id via the regist API.

    Returns {episode label: danmu target id}, reusing get_after_param()'s dict.
    """
    episodes = get_after_param()
    api = 'https://access.video.qq.com/danmu_manage/regist?vappid=97767206&vsecret=c0bdcbae120669fff425d0ef853674614aa659c605a613a4&raw=1'
    ua = {'User-Agent': UserAgent().random}
    for label, vid in episodes.items():
        payload = {"wRegistType": 2, "vecIdList": [vid], "wSpeSource": 0, "bIsGetUserCfg": 1,
                   "mapExtData": {vid: {"strCid": "o8mbrpo92gni5uc", "strLid": ""}}}
        reply = requests.post(url=api, json=payload, headers=ua).json()
        # The target id is the part after 'targetid=' in the danmu key string.
        episodes[label] = reply['data']['stMap'][vid]['strDanMuKey'].split('targetid=')[1]
    return episodes
def create_table(name):#创建表
    """Drop and recreate MySQL table `name` holding danmu rows (all varchar).

    Uses try/finally so the connection is closed even if a statement fails
    (the original leaked the connection on any cursor error).
    """
    db = pymysql.connect(host='localhost', user='root', password='mxy', port=3306, db='python', charset='UTF8')
    try:
        cursor = db.cursor()
        cursor.execute("DROP TABLE IF EXISTS {}".format(name))
        sql = """CREATE TABLE {}
        (
        作者 varchar(255),
        弹幕内容 varchar(255),
        点赞数 varchar(255),
        弹幕发送时间 varchar(255),
        作者等级 varchar(255),
        弹幕id varchar(255)
        )""".format(name)
        cursor.execute(sql)
    finally:
        db.close()
def insert(value, name):#插入数据
    """Bulk-insert danmu rows into table `name`.

    value: list of 6-tuples (作者, 弹幕内容, 点赞数, 弹幕发送时间, 作者等级, 弹幕id).
    Rolls back and dumps the batch on a MySQL error; the connection is always
    closed (original skipped close if the rollback path raised, and used a
    bare except that swallowed even KeyboardInterrupt).
    """
    # charset added for consistency with create_table(): rows contain Chinese text.
    db = pymysql.connect(host='localhost', user='root', password='mxy', port=3306, db='python', charset='UTF8')
    try:
        cursor = db.cursor()
        sql = "INSERT INTO {}(作者, 弹幕内容,点赞数, 弹幕发送时间,作者等级,弹幕id) values(%s, %s, %s,%s,%s,%s)".format(name)
        try:
            cursor.executemany(sql, value)
            db.commit()
        except pymysql.MySQLError:  # narrow: only database errors trigger rollback
            db.rollback()
            print(value)
    finally:
        db.close()
def thread(after_param, name):
    """Fan out danmu page downloads for one episode across a thread pool.

    after_param: the episode's danmu target id.
    name: destination table name for this episode.
    One API page is fetched per 30-second timestamp step (15, 45, ... 2985).
    """
    # BUG FIX: the query string had been corrupted to '×tamp=' (HTML entity
    # '&times' mangling of '&timestamp'); the API parameter is 'timestamp'.
    urls = ['https://mfm.video.qq.com/danmu?target_id={}&timestamp={}'.format(after_param, i)
            for i in range(15, 3000, 30)]
    params = [{'url': url, 'name': name, 'after_param': after_param} for url in urls]
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Drain the iterator so worker exceptions surface here.
        list(pool.map(get_danmu, params))
def main():
    """Drive the full crawl: resolve danmu target ids, then fetch every episode.

    For episode i, creates table 'danmu<i>' and downloads its danmu pages,
    printing cumulative and per-episode timings.
    """
    overall_start = time.time()
    targets = get_before_param()
    for episode in range(1, len(targets) + 1):
        print('正在爬取第{}集-----------------------------------------------------------------'.format(episode))
        episode_start = time.time()
        table = 'danmu' + str(episode)
        create_table(table)
        thread(targets[str(episode)], table)
        now = time.time()
        print('-------------------------累计运行{}秒,本集弹幕爬取共用{}秒----------------------------------'.format(now - overall_start, now - episode_start))
    print('弹幕已经爬取完成')
main()