调整NLTK数据目录和JIEBA字典位置到本项目中,避免重新安装时需要从网上下载

This commit is contained in:
2024-08-30 01:20:29 +08:00
parent 2901bd9eaf
commit 0a5f335981
8 changed files with 349061 additions and 1 deletions
+5
View File
@@ -1,3 +1,8 @@
JIEBA_DATA=./nltk_data
NLTK_DATA=./nltk_data
SQLITE_DATABASE_URL=sqlite:///./source.db
DATA_SOURCE_CACHE=./restapi
# The Llama Cloud API key. # The Llama Cloud API key.
# LLAMA_CLOUD_API_KEY= # LLAMA_CLOUD_API_KEY=
SQL_DATABASE_URL=mysql+pymysql://zjinfo1:Dy2Bcr53Hm5xRkba@110.42.234.166:3306/zjinfo1 SQL_DATABASE_URL=mysql+pymysql://zjinfo1:Dy2Bcr53Hm5xRkba@110.42.234.166:3306/zjinfo1
+5
View File
@@ -1,3 +1,8 @@
JIEBA_DATA=./nltk_data
NLTK_DATA=./nltk_data
SQLITE_DATABASE_URL=sqlite:///./source.db
DATA_SOURCE_CACHE=./restapi
# The Llama Cloud API key. # The Llama Cloud API key.
# LLAMA_CLOUD_API_KEY= # LLAMA_CLOUD_API_KEY=
SQL_DATABASE_URL=mysql+pymysql://zjinfo1:Dy2Bcr53Hm5xRkba@110.42.234.166:3306/zjinfo1 SQL_DATABASE_URL=mysql+pymysql://zjinfo1:Dy2Bcr53Hm5xRkba@110.42.234.166:3306/zjinfo1
+5 -1
View File
@@ -1,3 +1,4 @@
import os
from typing import Any, Dict, List, Union, Callable, NamedTuple from typing import Any, Dict, List, Union, Callable, NamedTuple
from bm25s.tokenization import * from bm25s.tokenization import *
@@ -8,9 +9,12 @@ except ImportError:
def tqdm(iterable, *args, **kwargs): def tqdm(iterable, *args, **kwargs):
return iterable return iterable
import jieba

# Directory expected to hold the project-local jieba dictionary, read from the
# JIEBA_DATA environment variable (empty string when the variable is unset,
# which makes the lookup relative to the current working directory).
jiebapath = os.environ.get("JIEBA_DATA", "")
_jieba_dict_file = os.path.join(jiebapath, 'dict.txt')
if os.path.isfile(_jieba_dict_file):
    # Point jieba at the dictionary shipped with the project.
    jieba.set_dictionary(_jieba_dict_file)
# Otherwise keep jieba's built-in dictionary: set_dictionary() rejects a path
# that does not exist, which would make importing this module fail whenever
# JIEBA_DATA is unset or points at the wrong directory.
jieba.initialize()  # Load the dictionary eagerly instead of on first use.
def chinese_tokenizer(text: str) -> List[str]:
    """Segment Chinese text with jieba and drop NLTK's Chinese stopwords.

    Args:
        text: The string to tokenize.

    Returns:
        The tokens produced by ``jieba.lcut(text)``, in order, excluding every
        token that appears in ``stopwords.words('chinese')``.

    Relies on the module-level ``import jieba`` (and the dictionary setup done
    there); the NLTK stopwords corpus must be available to NLTK.
    """
    from nltk.corpus import stopwords

    # Load the stopword list once and use a set for membership tests. Calling
    # stopwords.words() inside the filter condition re-evaluated it for every
    # single token and scanned the resulting list linearly each time.
    chinese_stopwords = frozenset(stopwords.words('chinese'))
    tokens = jieba.lcut(text)
    return [token for token in tokens if token not in chinese_stopwords]
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large Load Diff
Binary file not shown.