# zjdataai-app/backend/app/engine/retriever/CHTokener.py

import os
from typing import Any, Dict, List, Union, Callable, NamedTuple
from bm25s.tokenization import *

try:
    from tqdm.auto import tqdm
except ImportError:
    # tqdm is optional: without it, fall back to a stand-in that shows no
    # progress bar and simply hands the iterable back.
    def tqdm(iterable, *_args, **_kwargs):
        """Return *iterable* unchanged, ignoring all other arguments."""
        return iterable

import jieba
# Directory holding jieba's dictionary file, taken from the JIEBA_DATA
# environment variable. When the variable is unset the directory is "" and
# the path below resolves to "dict.txt" relative to the working directory.
jiebapath = os.environ.get("JIEBA_DATA", "")
jieba.set_dictionary(os.path.join(jiebapath, 'dict.txt')) # set the dictionary
jieba.initialize() # initialize jieba

_CHINESE_STOPWORDS = None  # filled on first use by _load_chinese_stopwords()


def _load_chinese_stopwords():
    """Return NLTK's 'chinese' stop words as a frozenset, loading them once."""
    global _CHINESE_STOPWORDS
    if _CHINESE_STOPWORDS is None:
        # Imported lazily so NLTK is only required once tokens actually need
        # filtering, not when this module is imported.
        from nltk.corpus import stopwords

        # A frozenset gives constant-time membership tests; the cache is only
        # assigned after a successful load, so a failed load is retried.
        _CHINESE_STOPWORDS = frozenset(stopwords.words('chinese'))
    return _CHINESE_STOPWORDS


def chinese_tokenizer(text: str) -> List[str]:
    """Segment *text* with jieba and drop NLTK Chinese stop words.

    Args:
        text: The string to segment.

    Returns:
        The tokens produced by ``jieba.lcut`` in their original order, minus
        every token present in NLTK's ``'chinese'`` stop-word list.
    """
    tokens = jieba.lcut(text)
    if not tokens:
        # Nothing to filter, so the stop-word corpus is not consulted.
        return []
    # Fetch the stop words once per call instead of once per token.
    stop_words = _load_chinese_stopwords()
    return [token for token in tokens if token not in stop_words]

def chTokenize(
    texts,
    show_progress: bool = True,
    leave: bool = False,
) -> Union[List[List[str]], Tokenized]:
    """Tokenize Chinese text(s) into integer ids over a shared vocabulary.

    Args:
        texts: A single string or an iterable of strings. A lone string is
            treated as a corpus containing just that one document.
        show_progress: When False the progress bar is disabled.
        leave: Passed through as the progress bar's ``leave`` argument.

    Returns:
        A ``Tokenized`` whose ``ids`` holds one list of token ids per input
        text and whose ``vocab`` maps each token to its id. Ids are assigned
        in order of first appearance, starting at 0.
    """
    documents = [texts] if isinstance(texts, str) else texts

    vocab = {}
    progress = tqdm(
        documents, desc="Split strings", leave=leave, disable=not show_progress
    )

    # setdefault() hands back the existing id for a known token; for a new
    # token it stores and returns the current vocabulary size as the next id.
    ids = [
        [vocab.setdefault(word, len(vocab)) for word in chinese_tokenizer(doc)]
        for doc in progress
    ]

    return Tokenized(ids=ids, vocab=vocab)