|
发表于 2019-1-3 20:44:17
|
显示全部楼层
本帖最后由 Stubborn 于 2019-1-3 20:54 编辑
发个文本给我参考一下,我看看。这需要用到正则表达式。或者,如果你看得懂下面的代码,可以借鉴一下,自己学习、修改。下面的代码用于统计文本中出现频率最高的前10个单词。
"""Print the 10 most frequent words of a text file.

Four interchangeable ways of splitting text into lowercase words are
provided for study; ``main`` uses the first one.
"""
import re
from collections import Counter


def split_by_replace(text):
    """Split *text* into lowercase words by turning non-letters into '-'.

    Every character for which ``str.isalpha()`` is false becomes a '-'
    separator; empty fragments produced by consecutive separators are
    dropped.
    """
    # A single pass builds the normalized string; calling str.replace once
    # per non-letter character would rescan the whole text every time.
    normalized = ''.join(ch if ch.isalpha() else '-' for ch in text)
    return [word for word in normalized.lower().split('-') if word]


def split_by_regex_class(text):
    """Split *text* with a regex character class of its non-letter characters.

    Returns the lowercase words with empty fragments removed.
    """
    lowered = text.lower()
    separators = {ch for ch in lowered if not ch.isalpha()}
    if not separators:
        # An empty class '[]' is not a valid pattern; nothing to split on.
        return [lowered] if lowered else []
    # Escape each character so ']', '\\', '^' and '-' are taken literally
    # inside the class; sorting keeps the pattern deterministic.
    pattern = '[' + ''.join(re.escape(ch) for ch in sorted(separators)) + ']'
    return [word for word in re.split(pattern, lowered) if word]


def split_by_scan(text):
    """Split *text* into lowercase words by scanning character by character."""
    words = []
    current = []
    for ch in text.lower():
        if ch.isalpha():
            current.append(ch)
        elif current:
            # A non-letter ends the word collected so far.
            words.append(''.join(current))
            current = []
    if current:
        # Flush the final word when the text ends with a letter.
        words.append(''.join(current))
    return words


def split_by_findall(text):
    """Return every run of ASCII letters in *text*, lowercased.

    Unlike the ``str.isalpha()`` based helpers, this pattern only matches
    the letters a-z / A-Z.
    """
    return re.findall(r'[a-zA-Z]+', text.lower())


def top_words(words, n=10):
    """Return the *n* most frequent items of *words* as (word, count) pairs.

    Pairs are ordered by descending count; words with equal counts keep
    the order in which they first appear in *words*.
    """
    return Counter(words).most_common(n)


def main(path="text.txt"):
    """Read the file at *path* and print its 10 most frequent words."""
    # The context manager closes the file even if reading fails.
    with open(path) as file_object:
        text = file_object.read()
    # Empty strings are already filtered out by the splitter, so exactly the
    # top 10 real words are reported (no '' entry, no need to take 11).
    print(top_words(split_by_replace(text)))
    # Example output: [('i', 44), ('the', 43), ('and', 37), ...]


if __name__ == "__main__":
    main()
复制代码 |
|