|

楼主 |
发表于 2020-12-24 10:21:48
|
显示全部楼层
import csv

import requests
# HTTP status code that marks a successful request.
SUCCESS_CODE = 200
# Name of the CSV file the ranking table is written to.
FILE_NAME = "test.csv"
# The five keywords whose occurrences are counted on every page.
keyWords = ["中国", "科技", "文化", "政策", "北京"]
# Headers sent with every page request.
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
# The six websites to crawl.
webs = [
    "https://www.bnu.edu.cn/index.htm",
    "http://www.bnuzh.edu.cn/",
    "http://jwb.bnu.edu.cn/",
    "https://www.bnu.edu.cn/xysh/index.htm",
    "https://www.bnu.edu.cn/kxyj/index.htm",
    "http://www.bnuzh.edu.cn/xqjj/bnsd/index.htm",
]
# Save text to a file (used for testing).
def saveFile(fileName, text, encoding="utf-8"):
    """Write *text* to *fileName*, replacing any existing content.

    fileName -- path of the file to create or overwrite.
    text     -- the string to write.
    encoding -- text encoding used for the file (default "utf-8").
    """
    with open(fileName, "w", encoding=encoding) as f:
        f.write(text)
class MyWeb:
    """Keyword statistics for one fetched web page.

    The constructor sets two attributes:
    website -- the URL that was requested.
    count   -- dict mapping every entry of ``keyWords`` to the number of
               times it occurs in the page text.
    """

    def __init__(self, web):
        """Fetch *web* and count each keyword in the returned HTML.

        Raises Exception when the response status is not SUCCESS_CODE.
        """
        self.website = web
        # Built from keyWords so the counters can never drift out of sync
        # with the keyword list.
        self.count = {word: 0 for word in keyWords}
        response = requests.get(self.website, headers=headers)
        if response.status_code != SUCCESS_CODE:
            # The request did not succeed.
            raise Exception("Error:failed to get response from " + web)
        # Re-interpret the HTML as UTF-8: undo the decoding requests applied
        # and decode the same bytes as UTF-8 instead.
        st = response.text
        # response.encoding may be None when the server declares no charset;
        # str.encode(None) would raise TypeError, so fall back to UTF-8.
        encoding = response.encoding or "utf-8"
        try:
            st = st.encode(encoding).decode("utf-8")
        except UnicodeError:
            # The bytes are not valid UTF-8 (or cannot be round-tripped);
            # keep the text as requests decoded it rather than crashing.
            pass
        # Count the occurrences of every keyword.
        for word in keyWords:
            self.count[word] = self._countOccurrences(st, word)

    @staticmethod
    def _countOccurrences(text, word):
        """Return how often *word* occurs in *text*, overlaps included.

        An empty *word* yields 0.
        """
        if not word:
            return 0
        total = 0
        pos = text.find(word)
        # str.find signals "not found" with -1; index 0 is a valid match.
        while pos != -1:
            total += 1
            pos = text.find(word, pos + 1)
        return total

    # Print the per-keyword statistics (used for testing).
    def printData(self):
        """Print the keyword count dictionary to standard output."""
        print(self.count)
# Quick sort: arr is a list of MyWeb objects, key is the keyword to rank by.
def quickSort(arr, key):
    """Return a new list with the items of *arr* ordered by ``count[key]``,
    highest count first.

    arr -- sequence of objects exposing a ``count`` dict (e.g. MyWeb).
    key -- the keyword whose count is used for ordering.

    Items with equal counts keep their relative input order. The input
    list is never modified and never returned as-is.
    """
    if len(arr) <= 1:
        return list(arr)
    pivot = arr[0].count[key]
    # Three-way partition: gathering all ties with the pivot in one step
    # keeps the sort stable and avoids one recursion level per duplicate.
    higher = [item for item in arr if item.count[key] > pivot]
    equal = [item for item in arr if item.count[key] == pivot]
    lower = [item for item in arr if item.count[key] < pivot]
    return quickSort(higher, key) + equal + quickSort(lower, key)
# Main program
if __name__ == "__main__":
    # Crawl every site and collect its keyword statistics.
    allWeb = [MyWeb(web) for web in webs]
    # Write the ranking table to the CSV file.
    # newline="" is what the csv module requires to avoid blank rows on
    # Windows; the explicit encoding keeps the Chinese text writable under
    # any locale, and the BOM lets spreadsheet tools detect UTF-8.
    with open(FILE_NAME, "wt", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        # One rank column per crawled site instead of a hard-coded 1..6.
        ranks = [str(i) for i in range(1, len(allWeb) + 1)]
        writer.writerow(['关键词/频率排位'] + ranks)
        for word in keyWords:
            # Sites ordered from most to fewest occurrences of this keyword.
            tmpWebs = quickSort(allWeb, word)
            row = [word]
            row.extend(w.website + ':' + str(w.count[word]) for w in tmpWebs)
            writer.writerow(row)
复制代码 |
|