47 lines
1.1 KiB
Python
47 lines
1.1 KiB
Python
from typing import Any, Dict, List, Union, Callable, NamedTuple
|
|
from bm25s.tokenization import *
|
|
|
|
try:
    from tqdm.auto import tqdm
except ImportError:
    # tqdm is optional: without it, progress reporting silently becomes a no-op.

    def tqdm(iterable, *_args, **_kwargs):
        """Stand-in for ``tqdm`` that returns ``iterable`` unchanged.

        Any extra positional or keyword arguments (for example ``desc``,
        ``leave`` or ``disable``) are accepted and ignored so call sites do
        not need to know whether the real ``tqdm`` is installed.
        """
        return iterable
|
|
|
|
|
|
def chinese_tokenizer(text: str) -> List[str]:
    """Segment Chinese text with jieba and drop NLTK Chinese stopwords.

    Args:
        text: The string to segment.

    Returns:
        The tokens produced by ``jieba.lcut(text)``, in their original order,
        excluding every token that appears in NLTK's ``'chinese'`` stopword
        list.

    Note:
        ``jieba`` and ``nltk`` are imported inside the function, so they are
        only required when the function is actually called.
    """
    import jieba
    from nltk.corpus import stopwords

    tokens = jieba.lcut(text)
    if not tokens:
        # Nothing to filter, so the stopword corpus is never touched
        # (same as the original comprehension over an empty list).
        return []

    # Fetch the stopword list once per call instead of once per token, and
    # keep it in a set so each membership test is a hash lookup rather than
    # a linear scan of a list.
    chinese_stopwords = frozenset(stopwords.words("chinese"))
    return [token for token in tokens if token not in chinese_stopwords]
|
|
|
|
def chTokenize(
    texts,
    show_progress: bool = True,
    leave: bool = False,
) -> Union[List[List[str]], Tokenized]:
    """Tokenize Chinese text(s) and encode each token as an integer id.

    Args:
        texts: A single string or an iterable of strings. A single string is
            treated as a corpus containing one document.
        show_progress: When false, ``disable=True`` is passed to ``tqdm``.
        leave: Forwarded to ``tqdm`` as its ``leave`` argument.

    Returns:
        ``Tokenized(ids=..., vocab=...)`` where ``ids`` holds one list of
        token ids per input text and ``vocab`` maps each distinct token to
        its id. Ids are assigned in order of first appearance, starting at 0.
    """
    # A bare string is wrapped so the loop below always sees a sequence of documents.
    documents = [texts] if isinstance(texts, str) else texts

    vocab = {}
    encoded_corpus = []

    progress = tqdm(
        documents, desc="Split strings", leave=leave, disable=not show_progress
    )
    for document in progress:
        # setdefault registers an unseen token under the next free id
        # (the current vocabulary size) and returns the id either way.
        encoded_corpus.append(
            [
                vocab.setdefault(token, len(vocab))
                for token in chinese_tokenizer(document)
            ]
        )

    return Tokenized(ids=encoded_corpus, vocab=vocab)
|
|
|