|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
RT,不太懂为什么,基本思路就是直接开爬然后历遍,查了好几次F12也没看出来哪里有问题
- import requests
- from bs4 import BeautifulSoup
- import pandas as pd
- from time import sleep
- import re
- import random
- import os
- headers = {
- "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
- "Connection":"keep-alive",
- "Accept":"text/html,application/xhtml+xml,application/xml;q=8.9,image/avif.image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
- "Accept-Encoding":"gzip,deflate",
- "Host":"www.baidu.com",
- "Cookie": "PSTM=1701005830; BIDUPSID=3847DE4A87F6BBA415E599085D83A664; BAIDUID=15153A5F7368E797908F9D80E823DA32:FG=1; BD_UPN=12314753; H_WISE_SIDS_BFESS=60360; MCITY=-360%3A; BDUSS=lrTlh4NTFOV2d2aVFDTWk0V20ySGNIZDJKT09QMFA0cHFyWkJ5QkxDZkZ1Y0ZtRVFBQUFBJCQAAAAAAAAAAAEAAAAPPSBIWW9ya19oYXBweQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMUsmmbFLJpmQ2; BDUSS_BFESS=lrTlh4NTFOV2d2aVFDTWk0V20ySGNIZDJKT09QMFA0cHFyWkJ5QkxDZkZ1Y0ZtRVFBQUFBJCQAAAAAAAAAAAEAAAAPPSBIWW9ya19oYXBweQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMUsmmbFLJpmQ2; H_PS_PSSID=60449_60359_60468_60440_60491_60501; H_WISE_SIDS=60449_60359_60468_60440_60491_60501; BAIDUID_BFESS=15153A5F7368E797908F9D80E823DA32:FG=1; BA_HECTOR=8lah8l8l05a12524ak20a5a1152kgl1j9ovqd1u; ZFY=wQT92WwXTov0LZiyOtHB0sWFAGuHJDp:BwvWiwRx018g:C; delPer=0; BD_CK_SAM=1; PSINO=1; BDRCVFR[K6RW1DeE3Dm]=mk3SLVN4HKm; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; H_PS_645EC=41244eUtCO11J%2FCYVZbKoFXLf9Q%2B2xUUYIMe%2FpRqCMbzuDBa9I%2FRH69TQ6cCalLJyL%2F7ag"
- }
- def search(v_keyword,v_result_file,v_max_pages):
- for page in range(v_max_pages):
- print("开始爬取第{}行".format(page+1))
- wait_seconds = random.uniform(1,2)
- print("开始等待{}行".format(wait_seconds))
- sleep(wait_seconds)
- url = 'https://www.baidu.com/s?wd='+ v_keyword+ '&pn=' + str(page * 10)
- response = requests.get(url, headers=headers)
- response.encoding = 'utf-8'
- html = response.text
- print("响应码是:{}",format(response.status_code))
- print(response.text)
- soup = BeautifulSoup(response.text, 'html.parser')
- result_list = soup.find_all('div', class_='result c-container new-pmd')
- print("正在爬取{},共查询到{}个结果".format(url,len(result_list)))
- kw_list = []
- page_list = []
- title_list = []
- href_list = []
- realurl_list = []
- desc_list = [] #简介
- site_list = [] #网站名称
- for result in result_list:
- title = result.find("a").text
- print("标题是",title)
- href = result.find("a")['href']
- realurl= get_real_url(v_url=href)
- try:
- desc = result.find(class_='c-abstract').text
- except:
- desc = ""
- try:
- site = result.find(class_='c-showurl c-color-gray').text
- except:
- site = ""
- kw_list.append(v_keyword)
- page_list.append(page+1)
- title_list.append(title)
- href_list.append(href)
- realurl_list.append(realurl)
- desc_list.append(desc)
- site_list.append(site)
- df = pd.DataFrame(
- {
- '关键词':kw_list,
- '页码':page_list,
- '标题':title_list,
- '百度链接':href_list,
- '真实链接':realurl_list,
- '简介':desc_list,
- '网站名称':site_list,
- }
- )
- if os.path.exists(v_result_file):
- header = None
- else:
- header = ['关键词','页码','标题','百度链接', '真实链接','简介','网站名称']
- df.to_csv(v_result_file,mode="a+",index=False,header=header,encoding='utf_8_sig')
- print("数据保存成功:{}".format(v_result_file))
- def get_real_url(v_url):
- r = requests.get(v_url,headers=headers,allow_redirects=False)
- if r.status_code == 382:
- real_url = r.headers.get("location")
- else:
- real_url = re.findall("URL = '(.*?)'",r.text)[0]
- print("real_url is :",real_url)
- return real_url
- if __name__ == '__main__':
- keyword = input("请输入关键词")
- max_pages = 5
- result_file = "爬取{}前{}页.csv".format(keyword,max_pages)
- if os.path.exists(result_file):
- os.remove(result_file)
- print("该爬取关键词存在已被删除")
-
- search(v_keyword=keyword,v_result_file=result_file,v_max_pages=max_pages)
复制代码 |
|