新增关键字检索类

2024-08-22 11:07:23 +08:00
parent 043aea6cca
commit 8d7190d0b6
2 changed files with 179 additions and 0 deletions
@@ -0,0 +1,46 @@
+from typing import Any, Dict, List, Union, Callable, NamedTuple
+from bm25s.tokenization import *
+
+try:  # tqdm is optional; use the real progress bar when it is importable
+    from tqdm.auto import tqdm
+except ImportError:
+    # No tqdm available: define a stand-in with the same name so call sites need no branching.
+    def tqdm(iterable, *args, **kwargs):  # progress-bar options are accepted but ignored
+        return iterable  # the iterable is handed back untouched, i.e. no progress display
+
+
+def chinese_tokenizer(text: str) -> List[str]:
+    import jieba
+    from nltk.corpus import stopwords
+    tokens = jieba.lcut(text)
+    return [token for token in tokens if token not in stopwords.words('chinese')]
+
+def chTokenize(
+    texts,
+    show_progress: bool = True,
+    leave: bool = False,
+) -> Union[List[List[str]], Tokenized]:
+    if isinstance(texts, str):
+        texts = [texts]
+
+    corpus_ids = []
+    token_to_index = {}
+
+    for text in tqdm(
+        texts, desc="Split strings", leave=leave, disable=not show_progress
+    ):
+        
+        splitted = chinese_tokenizer(text)
+        doc_ids = []
+
+        for token in splitted:
+            if token not in token_to_index:
+                token_to_index[token] = len(token_to_index)
+
+            token_id = token_to_index[token]
+            doc_ids.append(token_id)
+
+        corpus_ids.append(doc_ids)
+
+    return Tokenized(ids=corpus_ids, vocab=token_to_index)
+