Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions apps/common/util/ts_vecto_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from typing import List

import jieba
import jieba.posseg
from jieba import analyse

from common.util.split_model import group_by
Expand All @@ -25,7 +26,9 @@
word_pattern_list = [r"v\d+.\d+.\d+",
r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}"]

remove_chars = '\n , :\'<>!@#¥%……&*()!@#$%^&*(): ;,/"./-'
remove_chars = '\n , :\'<>!@#¥%……&*()!@#$%^&*(): ;,/"./'

jieba_remove_flag_list = ['x', 'w']


def get_word_list(text: str):
Expand Down Expand Up @@ -82,8 +85,11 @@ def to_ts_vector(text: str):
# 替换字符串
text = replace_word(word_dict, text)
# 分词
result = jieba.tokenize(text, mode='search')
result_ = [{'word': get_key_by_word_dict(item[0], word_dict), 'index': item[1]} for item in result]
result = jieba.posseg.lcut(text, HMM=True, use_paddle=True)
# 过滤标点符号
result = [item for item in result if not jieba_remove_flag_list.__contains__(item.flag)]
result_ = [{'word': get_key_by_word_dict(result[index].word, word_dict), 'index': index} for index in
range(len(result))]
result_group = group_by(result_, lambda r: r['word'])
return " ".join(
[f"{key.lower()}:{','.join([str(item['index'] + 1) for item in result_group[key]][:20])}" for key in
Expand Down
8 changes: 7 additions & 1 deletion apps/embedding/migrations/0002_embedding_search_vector.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Generated by Django 4.1.13 on 2024-04-16 11:43
import threading

import django.contrib.postgres.search
from django.db import migrations
Expand Down Expand Up @@ -44,6 +45,11 @@ def save_keywords(apps, schema_editor):
print(e)


def async_save_keywords(apps, schema_editor):
    """Run ``save_keywords`` on a background thread without waiting for it.

    Both arguments are forwarded unchanged to ``save_keywords``. The thread
    is started but never joined, so this function returns ``None`` right
    away while the work continues in the background.

    :param apps: passed through to ``save_keywords`` as its first argument.
    :param schema_editor: passed through to ``save_keywords`` as its second
        argument.
    """
    # NOTE(review): because nothing joins the thread, the caller cannot tell
    # when (or whether) save_keywords finished, and an exception raised inside
    # it will not propagate to the caller. If save_keywords touches the
    # database through schema_editor, confirm that doing so from another
    # thread is safe -- Django connections are normally bound to the thread
    # that created them. TODO confirm.
    threading.Thread(
        target=save_keywords,
        args=(apps, schema_editor),
    ).start()


class Migration(migrations.Migration):
dependencies = [
('embedding', '0001_initial'),
Expand All @@ -55,5 +61,5 @@ class Migration(migrations.Migration):
name='search_vector',
field=django.contrib.postgres.search.SearchVectorField(default='', verbose_name='分词'),
),
migrations.RunPython(save_keywords)
migrations.RunPython(async_save_keywords)
]