As the title says, I want to crawl a forum for posts containing a specific keyword. I wrote a crawler, and it runs without errors, but it doesn't scrape any data. I suspect the URL construction is the problem, but I don't know how to fix it. The first page's URL is "https://example_url-383-1.html", and the second page is "https://example_url-383-2.html".
The code is as follows:
```python
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import os
import warnings

# Suppress the InsecureRequestWarning
warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made.*')

def fetch_posts(url, keyword):
    # Initialize the page counter
    page = 1

    while True:
        # Build the URL for this page
        full_url = f"{url}-{page}.html"

        # Send the HTTP request
        # Note: verify=False disables SSL certificate verification
        response = requests.get(full_url, verify=False)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Assume post titles live in tags with class="post-title"
        titles = soup.find_all(class_='post-title')

        for title in titles:
            if keyword in title.text:
                # Get the post content; assume it lives in a tag with class="post-content"
                content = title.find_next_sibling(class_='post-content')
                if content and len(content.text) > 200:
                    yield (title.text, full_url, content.text)

        # Check whether there is a next page
        next_page = soup.find(class_='next-page')  # adjust the selector to match the actual page
        if not next_page:
            break  # no next page, stop looping

        page += 1  # advance the page counter

def process_page(page_url, keyword):
    # This function is called by the ThreadPoolExecutor
    results = []
    for title, url, post_content in fetch_posts(page_url, keyword):
        results.append((title, url, post_content))
    return results

# Example usage
base_url = 'https://example_url-383-1.html'  # forum front-page URL
keyword = '特定词语'

# Process pages in parallel with a ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=5) as executor:
    # Build the list of page URLs
    page_urls = [f"{base_url}-{i}.html" for i in range(1, 10)]  # assuming 10 pages
    futures = {executor.submit(process_page, page_url, keyword): page_url for page_url in page_urls}

    # Collect the results
    all_results = []
    for future in futures:
        all_results.extend(future.result())

# Output the results
# 1. Write every matching post's title and URL to "mulu.txt"
with open("mulu.txt", "w", encoding="utf-8") as file:
    for title, url, _ in all_results:
        file.write(f"{title}: {url}\n")

# 2. Write each post's content to a file named after the post title
output_dir = "posts"
os.makedirs(output_dir, exist_ok=True)  # create the output directory if it does not exist
for title, _, post_content in all_results:
    # Strip characters that are illegal in file names
    safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_'))
    with open(os.path.join(output_dir, f"{safe_title}.txt"), "w", encoding="utf-8") as file:
        file.write(post_content)

print("All pages processed.")
```
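To illustrate why I suspect the URL: `base_url` already ends in `-1.html`, so the list comprehension in the example usage appears to produce doubled suffixes (and `fetch_posts` then appends `-{page}.html` a second time on top of that):

```python
base_url = 'https://example_url-383-1.html'
page_urls = [f"{base_url}-{i}.html" for i in range(1, 3)]
print(page_urls)
# ['https://example_url-383-1.html-1.html', 'https://example_url-383-1.html-2.html']
```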
We can implement this step by step. The complete code follows:
1. **First, import the libraries we will use:**
```python
import requests
from bs4 import BeautifulSoup
import os
import warnings

# Suppress the InsecureRequestWarning
warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made.*')
```
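As a side note, if the message filter above does not suppress the warning in your environment, urllib3 also provides a direct switch for it. This uses the standard urllib3 API, shown here only as an alternative:

```python
import urllib3

# Silence only the InsecureRequestWarning that verify=False requests trigger
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
```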
2. **Create a function that crawls the listing pages:**
```python
def fetch_posts(url, keyword):
    results = []

    # Walk through pages 1 to 10
    for page in range(1, 11):
        # Build the URL for this page
        full_url = f"{url}-{page}.html"

        # Send the HTTP request (verify=False disables SSL verification)
        response = requests.get(full_url, verify=False)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Assume post titles live in tags with class="post-title"
        titles = soup.find_all(class_='post-title')

        for title in titles:
            if keyword in title.text:
                link = title.find('a')  # the title is expected to wrap a link to the post
                if link and link.get('href'):
                    results.append((title.text.strip(), link['href']))

    return results
```
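One caveat: many forums emit relative hrefs (for example `thread-123-1-1.html`), and the raw href collected above would then fail when requested directly. A minimal sketch using `urllib.parse.urljoin` from the standard library to absolutize such links; the base URL and href here are hypothetical:

```python
from urllib.parse import urljoin

listing_url = 'https://forum.example.com/forum-383-1.html'  # hypothetical listing page
href = 'thread-123-1-1.html'                                # hypothetical relative link

# urljoin resolves the relative href against the page it was scraped from
post_url = urljoin(listing_url, href)
print(post_url)  # https://forum.example.com/thread-123-1-1.html
```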
3. **Fetch a post's HTML and extract its text:**
```python
def fetch_post_content(post_url):
    response = requests.get(post_url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Assume the post body lives in a tag with class="post-content"
    post_content = soup.find(class_='post-content')
    if post_content:
        # Strip HTML layout characters; this is a simple example
        # and can be refined to match the actual page
        text_content = post_content.get_text(separator=' ', strip=True)
        return text_content
    return None
```
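Neither function guards against slow or failing requests, so one hung connection stalls the whole crawl. Below is a hedged sketch of a wrapper that adds a timeout and simple retries; the helper name `safe_get`, the retry count, and the delay are my own choices, not part of the original code:

```python
import time
import requests

def safe_get(url, retries=3, timeout=10):
    """Fetch a URL with a timeout, retrying a few times before giving up."""
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout, verify=False)
            response.raise_for_status()  # treat HTTP 4xx/5xx responses as errors
            return response
        except requests.RequestException as exc:
            print(f"Request failed ({exc}), attempt {attempt + 1}/{retries}")
            time.sleep(2)  # short pause before retrying
    return None
```

The `requests.get(...)` calls in `fetch_posts` and `fetch_post_content` could then be replaced with `safe_get(...)`, skipping a page whenever it returns `None`.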
4. **Put it all together:**
```python
def main():
    base_url = 'https://example_url-383'  # base part of the forum URL, without the page suffix
    keyword = 'key'  # replace with the keyword you are searching for

    # Steps 1-4: find posts containing the keyword and save titles and URLs to ML.txt
    all_results = fetch_posts(base_url, keyword)

    with open("ML.txt", "w", encoding="utf-8") as file:
        for title, url in all_results:
            file.write(f"{title}: {url}\n")

    # Step 5: fetch each post's content and append it to 小说.txt
    with open("小说.txt", "w", encoding="utf-8") as novel_file:
        for title, url in all_results:
            print(f"Processing: {title}")
            content = fetch_post_content(url)
            if content:
                novel_file.write(f"{title}\n{content}\n\n")

    # Step 6: print a completion message
    print("Search finished")

if __name__ == "__main__":
    main()
```
This code is structured clearly and implements the requirements step by step. You may need to adjust the selectors inside `fetch_posts` and `fetch_post_content` (such as `class_='post-title'` and `class_='post-content'`) so they match the actual page markup.
Run it and it will crawl the posts containing the keyword and save their content to text files.
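If you are unsure what the right class names are, one quick check is to list every class attribute that actually appears on a real listing page. A minimal sketch, reusing the placeholder URL from the question:

```python
import requests
from bs4 import BeautifulSoup

response = requests.get('https://example_url-383-1.html', verify=False)
soup = BeautifulSoup(response.text, 'html.parser')

# Collect every distinct CSS class on the page, to spot the ones
# that actually mark post titles and post bodies
classes = {c for tag in soup.find_all(class_=True) for c in tag['class']}
print(sorted(classes))
```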
The reply above is from the FishC AI assistant. If it did not correctly answer your question, please continue to ask.