调整NLTK数据目录和JIEBA字典位置到本项目中，避免重新安装时需要从网上下载

2024-08-30 01:20:29 +08:00
parent 2901bd9eaf
commit 0a5f335981
8 changed files with 349061 additions and 1 deletions
@@ -1,3 +1,8 @@
+JIEBA_DATA=./nltk_data
+NLTK_DATA=./nltk_data
+SQLITE_DATABASE_URL=sqlite:///./source.db
+DATA_SOURCE_CACHE=./restapi
+
 # The Llama Cloud API key.
 # LLAMA_CLOUD_API_KEY=
 SQL_DATABASE_URL=mysql+pymysql://zjinfo1:Dy2Bcr53Hm5xRkba@110.42.234.166:3306/zjinfo1
@@ -1,3 +1,8 @@
+JIEBA_DATA=./nltk_data
+NLTK_DATA=./nltk_data
+SQLITE_DATABASE_URL=sqlite:///./source.db
+DATA_SOURCE_CACHE=./restapi
+
 # The Llama Cloud API key.
 # LLAMA_CLOUD_API_KEY=
 SQL_DATABASE_URL=mysql+pymysql://zjinfo1:Dy2Bcr53Hm5xRkba@110.42.234.166:3306/zjinfo1
@@ -1,3 +1,4 @@
+import os
 from typing import Any, Dict, List, Union, Callable, NamedTuple
 from bm25s.tokenization import *

@@ -8,9 +9,12 @@ except ImportError:
    def tqdm(iterable, *args, **kwargs):
        return iterable

+import jieba
+jiebapath = os.environ.get("JIEBA_DATA", "")
+jieba.set_dictionary(os.path.join(jiebapath, 'dict.txt')) #设置字典
+jieba.initialize() #初始化jieba

 def chinese_tokenizer(text: str) -> List[str]:
-    import jieba
    from nltk.corpus import stopwords
    tokens = jieba.lcut(text)
    return [token for token in tokens if token not in stopwords.words('chinese')]