1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

def read_file(file):
    """
    Read a text file and return its content converted to lowercase.

    The file is decoded as UTF-8. A leading byte-order mark, if present, is
    dropped so that it cannot end up attached to the first word of the text.

    :param file: str --> path of the file to read
    :return: str --> the whole file content, lowercased
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged and strips a BOM when there is one.
    with open(file, "r", encoding="utf-8-sig") as f:
        return f.read().lower()


def count_text(text, is_1w, *, limit=10000):
    """
    Count how often each word occurs in the given text.

    The characters ``,.!;…`` and newlines are treated as separators: each is
    replaced by a space, then the text is split on whitespace. No other
    punctuation is stripped, so e.g. ``"why?"`` and ``"why"`` are counted as
    different words.

    :param text: str --> text to analyse
    :param is_1w: bool --> if true, only the first ``limit`` words are counted
    :param limit: int --> number of leading words kept when ``is_1w`` is true
        (keyword-only, defaults to 10000)
    :return: list --> [(word, occurrences), ...] sorted by descending count;
        words with equal counts stay in the order they were first seen
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    # One translate() pass instead of one replace() pass per separator.
    separators = str.maketrans(dict.fromkeys(",.!;…\n", " "))
    words = text.translate(separators).split()
    if is_1w:
        words = words[:limit]
    return Counter(words).most_common()


def print_result(text_list, is_1w, is_detail):
    """
    Print a word-frequency result to standard output.

    A summary line with the number of distinct words is always printed; its
    wording depends on whether the result covers the first 10,000 words or the
    whole text. After that, either one ``word:count`` line per entry is printed
    or a single notice saying that the detail listing is switched off.

    :param text_list: list --> word-frequency result, [(word, occurrences), ...]
    :param is_1w: bool --> whether the result covers only the first 10,000 words
    :param is_detail: bool --> whether to print the per-word listing
    :return: None
    """
    # Summary line: "first 10,000 words" wording vs. "full text" wording.
    if is_1w:
        print("首万词不重复词数为:{}个,词频统计详情如下:".format(len(text_list)))
    else:
        print("全文不重复词数为:{}个,词频统计详情如下:".format(len(text_list)))

    if is_detail:
        for word, count in text_list:
            print("{}:{}".format(word, count))
    else:
        # Notice that the per-word listing has been disabled.
        print("已设置不显示词频详情。")


if __name__ == "__main__":
    import sys

    # Text to analyse: the first command-line argument when one is given,
    # otherwise the original hard-coded sample file.
    FILE_PATH = sys.argv[1] if len(sys.argv) > 1 else "D:/nny.txt"
    text_all = read_file(FILE_PATH)
    # Frequencies over the first 10,000 words and over the whole text.
    text_list_1w = count_text(text_all, True)
    text_list_all = count_text(text_all, False)
    # First-10,000-words result: summary only. Full-text result: summary
    # followed by the per-word listing.
    print_result(text_list_1w, True, False)
    print_result(text_list_all, False, True)

代码不难,就不过多解释了,大家一看便知。