调整NLTK数据目录和JIEBA字典位置到本项目中,避免重新安装时需要从网上下载

This commit is contained in:
2024-08-30 01:20:29 +08:00
parent 2901bd9eaf
commit 0a5f335981
8 changed files with 349061 additions and 1 deletions
+5 -1
View File
@@ -1,3 +1,4 @@
import os
from typing import Any, Dict, List, Union, Callable, NamedTuple
from bm25s.tokenization import *
@@ -8,9 +9,12 @@ except ImportError:
def tqdm(iterable, *args, **kwargs):
    """Stand-in for ``tqdm.tqdm`` used when the import of tqdm failed.

    Accepts the same call shape as the real progress bar but displays
    nothing: every extra positional and keyword argument is ignored and
    the iterable is handed back unchanged.
    """
    return iterable
import warnings

import jieba

# Directory that holds the project-local jieba dictionary. It is read from
# the JIEBA_DATA environment variable; when the variable is unset the path
# below resolves to "dict.txt" relative to the current working directory.
jiebapath = os.environ.get("JIEBA_DATA", "")
_jieba_dict_file = os.path.join(jiebapath, 'dict.txt')

if os.path.isfile(_jieba_dict_file):
    # Point jieba at the project-local dictionary.
    jieba.set_dictionary(_jieba_dict_file)
else:
    # jieba.set_dictionary raises when the file does not exist, which would
    # make this whole module unimportable. Fall back to the dictionary
    # shipped inside the jieba package instead.
    if jiebapath:
        # Only complain when the location was configured explicitly.
        warnings.warn(
            "JIEBA_DATA is set but %r does not exist; "
            "using jieba's built-in dictionary" % _jieba_dict_file
        )

# Load the dictionary eagerly so the first tokenization call does not pay
# the initialization cost.
jieba.initialize()
def chinese_tokenizer(text: str) -> List[str]:
    """Segment Chinese text with jieba and drop NLTK's Chinese stopwords.

    Args:
        text: The raw text to tokenize.

    Returns:
        The tokens produced by ``jieba.lcut`` in their original order,
        excluding every token found in NLTK's ``'chinese'`` stopword list.
    """
    import jieba
    from nltk.corpus import stopwords

    tokens = jieba.lcut(text)
    if not tokens:
        # Nothing to filter, so the stopword corpus is never touched.
        return []

    # Fetch the stopword list once. Evaluating stopwords.words(...) inside
    # the comprehension condition would reload the list for every token and
    # then scan it linearly; a frozenset gives constant-time membership.
    stop_words = frozenset(stopwords.words('chinese'))
    return [token for token in tokens if token not in stop_words]