"""Chinese tokenization helpers built on jieba for use with bm25s."""

import functools
import os
from typing import Any, Dict, FrozenSet, List, Union, Callable, NamedTuple

# The star import supplies ``Tokenized`` (used below) and keeps every other
# bm25s tokenization name available from this module, as before.
from bm25s.tokenization import *

try:
    from tqdm.auto import tqdm
except ImportError:

    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is not installed: return the iterable as is."""
        return iterable


import jieba

# Directory holding jieba's ``dict.txt``; taken from the JIEBA_DATA environment
# variable. When the variable is unset the path resolves to ``dict.txt``
# relative to the current working directory.
jiebapath = os.environ.get("JIEBA_DATA", "")
jieba.set_dictionary(os.path.join(jiebapath, 'dict.txt'))  # set the dictionary
jieba.initialize()  # initialize jieba


@functools.lru_cache(maxsize=1)
def _chinese_stopwords() -> FrozenSet[str]:
    """Return the NLTK Chinese stop-word list as a frozenset, built once."""
    # Imported lazily so that importing this module does not require nltk.
    from nltk.corpus import stopwords

    return frozenset(stopwords.words('chinese'))


def chinese_tokenizer(text: str) -> List[str]:
    """Segment ``text`` with jieba and drop Chinese stop words.

    Args:
        text: The string to segment.

    Returns:
        The tokens produced by ``jieba.lcut(text)``, in order, excluding any
        token found in NLTK's ``'chinese'`` stop-word list.
    """
    # Fetch the stop words once per call (cached across calls) instead of
    # rebuilding the list for every token as the filter runs.
    stop_words = _chinese_stopwords()
    return [token for token in jieba.lcut(text) if token not in stop_words]


def chTokenize(
    texts,
    show_progress: bool = True,
    leave: bool = False,
) -> Union[List[List[str]], Tokenized]:
    """Tokenize Chinese text(s) into integer token ids plus a vocabulary.

    Args:
        texts: A single string or an iterable of strings. A single string is
            treated as a one-document corpus.
        show_progress: Whether to display a progress bar while tokenizing.
        leave: Passed to tqdm as ``leave``.

    Returns:
        ``Tokenized(ids=..., vocab=...)`` where ``ids`` holds one list of token
        ids per input text and ``vocab`` maps each token to its id. Ids are
        assigned in order of first appearance, starting at 0.
    """
    if isinstance(texts, str):
        texts = [texts]

    corpus_ids = []
    token_to_index = {}

    for text in tqdm(
        texts, desc="Split strings", leave=leave, disable=not show_progress
    ):
        # setdefault gives an unseen token the next free id
        # (len(token_to_index) is evaluated before insertion) and returns the
        # existing id for a token that is already known.
        doc_ids = [
            token_to_index.setdefault(token, len(token_to_index))
            for token in chinese_tokenizer(text)
        ]
        corpus_ids.append(doc_ids)

    return Tokenized(ids=corpus_ids, vocab=token_to_index)