1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
def read_file(file):
    """
    Load a UTF-8 text file and return its contents in lower case.

    :param file: str --> path of the file to read
    :return: str --> the whole file content, lower-cased
    """
    with open(file, mode="r", encoding="utf-8") as source:
        raw_text = source.read()
    return raw_text.lower()
def count_text(text, is_1w):
    """
    Count how often each word occurs in the given text.

    Words are the whitespace-separated tokens that remain after the
    characters ``, . ! ; …`` and newline have been replaced by spaces.
    No other punctuation is stripped and letter case is not changed here.

    :param text: str --> text to analyse
    :param is_1w: Boolean --> when true, only the first 10,000 words are counted
    :return: list --> [(word, occurrences), ...] sorted by occurrences,
        highest first; words with equal counts keep the order in which
        they first appear in the text
    """
    from collections import Counter

    # A single translate pass replaces every separator at once instead of
    # scanning the whole text once per separator character.
    separators = str.maketrans(dict.fromkeys(",.!;…\n", " "))
    words = text.translate(separators).split()
    if is_1w:
        words = words[:10000]
    # most_common() with no argument returns all (word, count) pairs ordered
    # by descending count, with ties in first-encountered order.
    return Counter(words).most_common()
def print_result(text_list, is_1w, is_detail):
    """
    Print a word-frequency report to standard output.

    A header line with the number of distinct words is always printed.
    It is followed either by one ``word:count`` line per entry or by a
    notice that the detailed listing is switched off.

    :param text_list: list --> frequency result, [(word, occurrences), ...]
    :param is_1w: Boolean --> whether the result covers only the first 10,000 words
    :param is_detail: Boolean --> whether to print the per-word listing
    :return: None
    """
    # Choose the header wording first, then fill in the distinct-word count.
    if is_1w:
        header = "首万词不重复词数为:{}个,词频统计详情如下:"
    else:
        header = "全文不重复词数为:{}个,词频统计详情如下:"
    print(header.format(len(text_list)))

    if not is_detail:
        print("已设置不显示词频详情。")
        return

    for entry in text_list:
        word, occurrences = entry[0], entry[1]
        print("{}:{}".format(word, occurrences))
if __name__ == "__main__":
    import sys

    # The input file defaults to the original hard-coded location; a
    # different path may be supplied as the first command-line argument.
    FILE_PATH = sys.argv[1] if len(sys.argv) > 1 else "D:/nny.txt"
    text_all = read_file(FILE_PATH)

    # Count the same text twice: once limited to the first 10,000 words
    # and once over the whole text.
    text_list_1w = count_text(text_all, True)
    text_list_all = count_text(text_all, False)

    # Summary only for the 10,000-word sample, full listing for the whole text.
    print_result(text_list_1w, True, False)
    print_result(text_list_all, False, True)
|