调整NLTK数据目录和JIEBA字典位置到本项目中，避免重新安装时需要从网上下载

2024-08-30 01:20:29 +08:00
parent 2901bd9eaf
commit 0a5f335981
8 changed files with 349061 additions and 1 deletions
@@ -1,3 +1,8 @@
 JIEBA_DATA=./nltk_data
 NLTK_DATA=./nltk_data
 SQLITE_DATABASE_URL=sqlite:///./source.db
 DATA_SOURCE_CACHE=./restapi
 # The Llama Cloud API key.
 # LLAMA_CLOUD_API_KEY=
 SQL_DATABASE_URL=mysql+pymysql://zjinfo1:Dy2Bcr53Hm5xRkba@110.42.234.166:3306/zjinfo1
@@ -1,3 +1,8 @@
 JIEBA_DATA=./nltk_data
 NLTK_DATA=./nltk_data
 SQLITE_DATABASE_URL=sqlite:///./source.db
 DATA_SOURCE_CACHE=./restapi
 # The Llama Cloud API key.
 # LLAMA_CLOUD_API_KEY=
 SQL_DATABASE_URL=mysql+pymysql://zjinfo1:Dy2Bcr53Hm5xRkba@110.42.234.166:3306/zjinfo1
@@ -1,3 +1,4 @@
 import os
 from typing import Any, Dict, List, Union, Callable, NamedTuple
 from bm25s.tokenization import *
@@ -8,9 +9,12 @@ except ImportError:
    def tqdm(iterable, *args, **kwargs):
        return iterable
 import jieba
 # Directory holding the project-local jieba dictionary, read from the
 # JIEBA_DATA environment variable (empty string when the variable is unset).
 # NOTE(review): with JIEBA_DATA unset the path below collapses to a bare
 # 'dict.txt' resolved against the current working directory -- confirm that
 # the environment is always populated before this module is imported.
 jiebapath = os.environ.get("JIEBA_DATA", "")
 jieba.set_dictionary(os.path.join(jiebapath, 'dict.txt'))  # point jieba at the bundled dictionary file
 jieba.initialize()  # initialize jieba eagerly at import time
 # Lazily populated cache of the NLTK Chinese stopword list.
 _CHINESE_STOPWORDS = None


 def _chinese_stopwords():
     """Return NLTK's 'chinese' stopword list as a frozenset, loading it once."""
     global _CHINESE_STOPWORDS
     if _CHINESE_STOPWORDS is None:
         # Kept as a function-scope import, as in the original tokenizer.
         from nltk.corpus import stopwords
         _CHINESE_STOPWORDS = frozenset(stopwords.words('chinese'))
     return _CHINESE_STOPWORDS


 def chinese_tokenizer(text: str) -> List[str]:
     """Segment Chinese text with jieba and drop stopwords.

     Args:
         text: The string to segment.

     Returns:
         The tokens produced by ``jieba.lcut(text)``, in order, excluding any
         token that appears in NLTK's ``'chinese'`` stopword list.
     """
     import jieba

     # The stopword list is loop-invariant: fetch it once and use a set so
     # each membership test is O(1) instead of rebuilding and scanning the
     # list for every token.
     stop_set = _chinese_stopwords()
     return [token for token in jieba.lcut(text) if token not in stop_set]