underwood_yo 发表于 2023-9-7 12:08:33

微博评论内容爬取

本帖最后由 underwood_yo 于 2023-9-8 17:33 编辑

同学需要爬取微博的评论内容以及时间，给了一个 Excel 文件，其中包含需要爬取的网址：

static/image/hrline/line2.png

import numpy as np
import pandas as pd
from selenium import webdriver
import re
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

# Configure Chrome with the '--headless' flag and start the single WebDriver
# session that crawer() uses for every page load.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(options=chrome_options)

# Wall-clock start, used at the end of the script to report total running time.
start_time = time.time()

def crawer(url):
    """Load ``url`` in the module-level ``browser`` and return its page source.

    Prints a progress line, navigates the browser to ``url``, sleeps for one
    second (presumably to let the page finish rendering), and returns the
    browser's current page source.
    """
    banner = '开始爬取' + url
    print(banner)
    browser.get(url)
    time.sleep(1)
    return browser.page_source

def execution_data(res):
    """Extract the post text and the ``created_at`` values from a page.

    Args:
        res: Page source (HTML string) as returned by ``crawer``.

    Returns:
        A ``(text, time_)`` tuple. ``text`` is the text content of the first
        element matching the ``div.weibo-text`` selector; ``time_`` is the
        list of every ``"created_at": "..."`` value found in ``res``.

    Raises:
        ValueError: If no element matches the selector.
    """
    bs4_res = BeautifulSoup(res, 'html.parser')
    # select() returns a list-like ResultSet, which has no .text attribute;
    # select_one() returns the first matching tag (or None when absent).
    node = bs4_res.select_one('#app > div.lite-page-wrap > div > div.main > div > article > div > div > div.weibo-text')
    if node is None:
        raise ValueError('未找到微博正文节点 (div.weibo-text)')
    text = node.text
    # The timestamps are pulled from the raw page source rather than the
    # parsed tree, via the '"created_at": "..."' key/value pattern.
    created_at = r'"created_at": "(.*?)"'
    time_ = re.findall(created_at, res, re.S)
    return text, time_

# Load the spreadsheet listing the pages to crawl; URLs are read from the
# '网址' column.
data = pd.read_excel('wangzhi.xlsx')

text_all = []
time_all = []
try:
    # Visit every row of the URL column (1-based position is i + 1).
    for i, url in enumerate(data['网址']):
        try:
            res = crawer(url)
            # execution_data returns (text, times): unpack it once instead of
            # storing the whole tuple in both result columns.
            text, time_ = execution_data(res)
        except Exception as exc:
            # Best-effort crawl: report the failing page and keep going, but
            # let KeyboardInterrupt/SystemExit propagate.
            print('第'+ str(i+1) + '个网页出现问题')
            print(url)
            print(repr(exc))
            text = []
            time_ = []
        # Always append so both columns stay aligned with the input rows.
        text_all.append(text)
        time_all.append(time_)
finally:
    # Shut down the headless Chrome process even if the crawl is interrupted.
    browser.quit()

context = pd.DataFrame({'文本':text_all,'时间':time_all})
context.to_excel('微博正文.xlsx')
print('爬取完成')

end_time = time.time()
total_time = end_time - start_time
print("所有任务结束,总耗时为:" + str(total_time))

rtiuyttr 发表于 2023-9-9 12:01:44

页: [1]
查看完整版本: 微博评论内容爬取