|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- import requests,re,time,random
- import pymysql
- from fake_useragent import UserAgent
- from lxml import etree
- import concurrent.futures
def _fetch_with_retry(url, headers, proxy_pool, retries=10, delay=2):
    """Request *url* through a randomly chosen proxy, reconnecting on failure.

    Makes one initial attempt plus at most *retries* reconnects, waiting
    *delay* seconds before each reconnect, and stops at the first attempt
    that does not raise.

    Returns:
        A tuple ``(response, error)``. ``response`` is the ``requests``
        response of the first successful attempt, or ``None`` when every
        attempt raised; ``error`` is the last exception seen (or ``None``).
    """
    last_error = None
    for attempt in range(retries + 1):
        if attempt:
            print('正在进行第{}次重新连接--------------------------------------------'.format(attempt))
            time.sleep(delay)
        # Pick a fresh proxy for every attempt so one dead proxy does not
        # doom all the retries.
        ip = random.choice(proxy_pool)
        proxies = {"http": ip, "https": ip}
        try:
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
        except requests.RequestException as exc:
            last_error = exc
        else:
            return response, None
    return None, last_error


def get_danmu(params):
    """Download one page of danmu (bullet comments) and store it in the database.

    Args:
        params: dict with the keys ``'url'`` (danmu page URL) and ``'name'``
            (database table name of the form ``'danmu<episode>'``). Any
            other key is ignored here.

    Returns:
        None. Parsed rows are handed to ``insert``. A page that cannot be
        fetched or parsed is reported on stdout and skipped, so one bad page
        does not abort the rest of the crawl.
    """
    # Local import keeps this block self-contained; the file-level imports
    # do not include json.
    import json

    url = params['url']
    name = params['name']
    page = name.split('danmu')[1]  # episode number embedded in the table name

    # Proxy pool: replace with proxies that are currently reachable.
    proxy_pool = ['112.195.202.102:4278']
    # A random User-Agent makes the requests look less like a crawler.
    headers = {'User-Agent': UserAgent().random}

    res, error = _fetch_with_retry(url, headers, proxy_pool)
    if res is None:
        print('第{}集弹幕请求失败,已放弃本页: {}'.format(page, error))
        return
    if res.status_code != 200:
        print('第{}集弹幕请求返回状态码{},已跳过本页'.format(page, res.status_code))
        return

    # Parse the body as JSON instead of eval(): eval would execute whatever
    # code the remote server (or a proxy in between) chose to send back.
    # strict=False tolerates raw control characters inside comment text.
    try:
        payload = json.loads(res.text, strict=False)
    except ValueError as exc:
        print('第{}集弹幕响应解析失败,已跳过本页: {}'.format(page, exc))
        return

    count = payload.get('count', 0)
    print('正在下载第{}集弹幕,本页弹幕共{}条'.format(page, count))
    if count == 0:
        return

    chinese = re.compile(u"[\u4e00-\u9fa5]+")  # keep only CJK ideographs
    rows = []
    for each in payload.get('comments', []):
        author = ''.join(chinese.findall(each['opername'].replace(' ', '')))
        content = ''.join(chinese.findall(each['content'].replace('\xa0', '')))
        # Column order must match the INSERT statement in insert():
        # author, content, like count, send time, author level, danmu id.
        rows.append((
            author if author else '未知作者',    # placeholder for an empty author
            content if content else '未知弹幕',  # placeholder for empty content
            each['upcount'],
            each['timepoint'],
            each['uservip_degree'],
            each['commentid'],
        ))
    # insert() expects a list of tuples (it uses executemany).
    insert(rows, name)
def get_after_param():
    """Scrape the episode list from a hard-coded episode page.

    Returns:
        dict mapping each episode label (the link text with spaces and
        newlines removed) to the ``id`` attribute of its ``span`` element.
    """
    # Any episode page of the target series works as the entry point.
    page_url = 'https://v.qq.com/x/cover/o8mbrpo92gni5uc/i0036rltvk1.html'
    request_headers = {'User-Agent': UserAgent().random}
    response = requests.get(url=page_url, headers=request_headers)
    tree = etree.HTML(response.content)

    # Both queries select episode spans that have no <i> child; per the
    # original author's note this leaves out trailers and VIP-only episodes.
    span_ids = tree.xpath("//div[@class='mod_episode']/span[@_stat='videolist:click' and not(i)]/@id")
    raw_labels = tree.xpath("//div[@class='mod_episode']/span[@_stat='videolist:click' and not(i)]/a/text()")

    labels = []
    for text in raw_labels:
        labels.append(text.replace(' ', '').replace('\n', ''))

    # Pair label -> span id so both values travel together.
    return dict(zip(labels, span_ids))
def get_before_param():
    """Resolve each episode's span id into its danmu target id.

    Calls ``get_after_param`` for the episode-label -> span-id mapping, then
    posts one registration request per episode and replaces the span id with
    the text after ``'targetid='`` in the returned ``strDanMuKey``.

    Returns:
        dict mapping episode label to danmu target id.
    """
    episodes = get_after_param()
    regist_url = 'https://access.video.qq.com/danmu_manage/regist?vappid=97767206&vsecret=c0bdcbae120669fff425d0ef853674614aa659c605a613a4&raw=1'
    request_headers = {'User-Agent': UserAgent().random}

    # Snapshot the items first: values are overwritten while looping.
    for label, span_id in list(episodes.items()):
        body = {
            "wRegistType": 2,
            "vecIdList": [span_id],
            "wSpeSource": 0,
            "bIsGetUserCfg": 1,
            "mapExtData": {span_id: {"strCid": "o8mbrpo92gni5uc", "strLid": ""}},
        }
        reply = requests.post(url=regist_url, json=body, headers=request_headers).json()
        danmu_key = reply['data']['stMap'][span_id]['strDanMuKey']
        episodes[label] = danmu_key.split('targetid=')[1]
    return episodes
def create_table(name):
    """Drop the table *name* if it exists and create it again, empty.

    Args:
        name: table name. It is interpolated straight into the SQL text
            (identifiers cannot be bound as query parameters), so it must
            come from trusted code, never from user input.

    Raises:
        Whatever ``pymysql`` raises when connecting or executing; the
        connection is closed in either case.
    """
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration.
    db = pymysql.connect(host='localhost', user='root', password='mxy', port=3306, db='python', charset='UTF8')
    try:
        with db.cursor() as cursor:
            cursor.execute("DROP TABLE IF EXISTS {}".format(name))
            # Every column is stored as text, matching what insert() writes.
            sql = """CREATE TABLE {}
            (
            作者 varchar(255),
            弹幕内容 varchar(255),
            点赞数 varchar(255),
            弹幕发送时间 varchar(255),
            作者等级 varchar(255),
            弹幕id varchar(255)
            )""".format(name)
            cursor.execute(sql)
    finally:
        # Close even when a statement fails so the connection is not leaked.
        db.close()
def insert(value, name):
    """Bulk-insert danmu rows into the table *name*.

    Args:
        value: list of 6-tuples in the column order author, content,
            like count, send time, author level, danmu id.
        name: target table name. It is interpolated into the SQL text, so
            it must come from trusted code.

    Returns:
        None. A failed insert is rolled back and reported on stdout rather
        than raised, so one bad batch does not stop the crawl.
    """
    if not value:
        # Nothing to write; skip opening a connection.
        return

    # charset matches create_table(); the rows contain Chinese text.
    db = pymysql.connect(host='localhost', user='root', password='mxy', port=3306, db='python', charset='UTF8')
    try:
        sql = "INSERT INTO {}(作者, 弹幕内容,点赞数, 弹幕发送时间,作者等级,弹幕id) values(%s, %s, %s,%s,%s,%s)".format(name)
        try:
            with db.cursor() as cursor:
                cursor.executemany(sql, value)
            db.commit()
        except Exception as exc:
            # Deliberately best-effort: undo the batch, show what failed and
            # why, then carry on with the next page.
            db.rollback()
            print(exc)
            print(value)
    finally:
        db.close()
def thread(after_param, name):
    """Download all danmu pages of one episode with a thread pool.

    Args:
        after_param: danmu target id of the episode, placed in the
            ``target_id`` query parameter.
        name: database table name passed on to every ``get_danmu`` call.

    Returns:
        None. Returns after every submitted page has been processed.
    """
    # Timestamps 15, 45, ..., 2985 — one request per step of 30.
    # NOTE(review): presumably seconds into the episode, which would cap
    # coverage at roughly 50 minutes — confirm against the API.
    # The separator must be "&timestamp="; the pasted source had it mangled
    # into "×tamp=" by HTML-entity rendering of "&times".
    urls = ['https://mfm.video.qq.com/danmu?target_id={}&timestamp={}'.format(after_param, i)
            for i in range(15, 3000, 30)]
    params = [{'url': url, 'name': name, 'after_param': after_param} for url in urls]

    # Leaving the with-block waits for all workers. The map results are
    # intentionally not consumed: get_danmu returns nothing, and an
    # exception inside one worker must not abort the other pages.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        pool.map(get_danmu, params)
def main():
    """Crawl every episode in turn: recreate its table, then download its danmu.

    Episodes are looked up by the string form of their 1-based position, so
    the mapping returned by ``get_before_param`` must have the keys
    ``'1'`` .. ``'N'``.
    """
    crawl_start = time.time()
    targets = get_before_param()
    episode_total = len(targets)

    for episode in range(1, episode_total + 1):
        print('正在爬取第{}集-----------------------------------------------------------------'.format(episode))
        episode_start = time.time()
        table = 'danmu' + str(episode)
        create_table(table)
        thread(targets[str(episode)], table)
        now = time.time()
        # Report total elapsed time and the time spent on this episode.
        print('-------------------------累计运行{}秒,本集弹幕爬取共用{}秒----------------------------------'.format(now - crawl_start, now - episode_start))

    print('弹幕已经爬取完成')


# Start the crawl as soon as the file is executed.
main()
复制代码 |
|