From 01586f45246887dc983321e4dc035b80c2678b32 Mon Sep 17 00:00:00 2001 From: shaohuzhang1 Date: Thu, 9 May 2024 15:52:19 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=88=86=E8=AF=8D?= =?UTF-8?q?=E8=B6=85=E8=BF=87=E6=95=B0=E6=8D=AE=E5=BA=93=E6=9C=80=E5=A4=A7?= =?UTF-8?q?=E9=99=90=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/util/ts_vecto_util.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/common/util/ts_vecto_util.py b/apps/common/util/ts_vecto_util.py index 9def9585fab..451d87bf870 100644 --- a/apps/common/util/ts_vecto_util.py +++ b/apps/common/util/ts_vecto_util.py @@ -85,10 +85,11 @@ def to_ts_vector(text: str): # 替换字符串 text = replace_word(word_dict, text) # 分词 - result = jieba.posseg.lcut(text, HMM=True, use_paddle=True) + filter_word = jieba.analyse.extract_tags(text, topK=100) + result = jieba.lcut(text, HMM=True, use_paddle=True) # 过滤标点符号 - result = [item for item in result if not jieba_remove_flag_list.__contains__(item.flag)] - result_ = [{'word': get_key_by_word_dict(result[index].word, word_dict), 'index': index} for index in + result = [item for item in result if filter_word.__contains__(item) and len(item) < 10] + result_ = [{'word': get_key_by_word_dict(result[index], word_dict), 'index': index} for index in range(len(result))] result_group = group_by(result_, lambda r: r['word']) return " ".join(