File tree: 2 files changed, +16 -4 lines changed
Original file line number Diff line number Diff line change
1111from typing import List
1212
1313import jieba
14+ import jieba .posseg
1415from jieba import analyse
1516
1617from common .util .split_model import group_by
2526word_pattern_list = [r"v\d+.\d+.\d+" ,
2627 r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}" ]
2728
28- remove_chars = '\n , :\' <>!@#¥%……&*()!@#$%^&*(): ;,/"./-'
29+ remove_chars = '\n , :\' <>!@#¥%……&*()!@#$%^&*(): ;,/"./'
30+
31+ jieba_remove_flag_list = ['x' , 'w' ]
2932
3033
3134def get_word_list (text : str ):
@@ -82,8 +85,11 @@ def to_ts_vector(text: str):
8285 # 替换字符串
8386 text = replace_word (word_dict , text )
8487 # 分词
85- result = jieba .tokenize (text , mode = 'search' )
86- result_ = [{'word' : get_key_by_word_dict (item [0 ], word_dict ), 'index' : item [1 ]} for item in result ]
88+ result = jieba .posseg .lcut (text , HMM = True , use_paddle = True )
89+ # 过滤标点符号
90+ result = [item for item in result if not jieba_remove_flag_list .__contains__ (item .flag )]
91+ result_ = [{'word' : get_key_by_word_dict (result [index ].word , word_dict ), 'index' : index } for index in
92+ range (len (result ))]
8793 result_group = group_by (result_ , lambda r : r ['word' ])
8894 return " " .join (
8995 [f"{ key .lower ()} :{ ',' .join ([str (item ['index' ] + 1 ) for item in result_group [key ]][:20 ])} " for key in
Original file line number Diff line number Diff line change
11# Generated by Django 4.1.13 on 2024-04-16 11:43
2+ import threading
23
34import django .contrib .postgres .search
45from django .db import migrations
@@ -44,6 +45,11 @@ def save_keywords(apps, schema_editor):
4445 print (e )
4546
4647
# Migration entry point: starts save_keywords(apps, schema_editor) on a new
# thread and returns immediately, without joining the thread.
# NOTE(review): since the thread is never joined, the RunPython step finishes
# before save_keywords does; presumably the migration can be recorded as
# applied while keywords are still being saved -- confirm this is intended.
# NOTE(review): apps and schema_editor are handed to another thread; verify
# that save_keywords does not depend on the migration's own connection or
# transaction (e.g. the newly added search_vector column being visible).
48+ def async_save_keywords (apps , schema_editor ):
49+ thread = threading .Thread (target = save_keywords , args = (apps , schema_editor ))
50+ thread .start ()
51+
52+
4753class Migration (migrations .Migration ):
4854 dependencies = [
4955 ('embedding' , '0001_initial' ),
@@ -55,5 +61,5 @@ class Migration(migrations.Migration):
5561 name = 'search_vector' ,
5662 field = django .contrib .postgres .search .SearchVectorField (default = '' , verbose_name = '分词' ),
5763 ),
58- migrations .RunPython (save_keywords )
64+ migrations .RunPython (async_save_keywords )
5965 ]
You can’t perform that action at this time.
0 commit comments