调整NLTK数据目录和JIEBA字典位置到本项目中,避免重新安装时需要从网上下载
This commit is contained in:
@@ -1,3 +1,8 @@
|
|||||||
|
JIEBA_DATA=./nltk_data
|
||||||
|
NLTK_DATA=./nltk_data
|
||||||
|
SQLITE_DATABASE_URL=sqlite:///./source.db
|
||||||
|
DATA_SOURCE_CACHE=./restapi
|
||||||
|
|
||||||
# The Llama Cloud API key.
|
# The Llama Cloud API key.
|
||||||
# LLAMA_CLOUD_API_KEY=
|
# LLAMA_CLOUD_API_KEY=
|
||||||
SQL_DATABASE_URL=mysql+pymysql://zjinfo1:Dy2Bcr53Hm5xRkba@110.42.234.166:3306/zjinfo1
|
SQL_DATABASE_URL=mysql+pymysql://zjinfo1:Dy2Bcr53Hm5xRkba@110.42.234.166:3306/zjinfo1
|
||||||
|
|||||||
@@ -1,3 +1,8 @@
|
|||||||
|
JIEBA_DATA=./nltk_data
|
||||||
|
NLTK_DATA=./nltk_data
|
||||||
|
SQLITE_DATABASE_URL=sqlite:///./source.db
|
||||||
|
DATA_SOURCE_CACHE=./restapi
|
||||||
|
|
||||||
# The Llama Cloud API key.
|
# The Llama Cloud API key.
|
||||||
# LLAMA_CLOUD_API_KEY=
|
# LLAMA_CLOUD_API_KEY=
|
||||||
SQL_DATABASE_URL=mysql+pymysql://zjinfo1:Dy2Bcr53Hm5xRkba@110.42.234.166:3306/zjinfo1
|
SQL_DATABASE_URL=mysql+pymysql://zjinfo1:Dy2Bcr53Hm5xRkba@110.42.234.166:3306/zjinfo1
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import os
|
||||||
from typing import Any, Dict, List, Union, Callable, NamedTuple
|
from typing import Any, Dict, List, Union, Callable, NamedTuple
|
||||||
from bm25s.tokenization import *
|
from bm25s.tokenization import *
|
||||||
|
|
||||||
@@ -8,9 +9,12 @@ except ImportError:
|
|||||||
def tqdm(iterable, *args, **kwargs):
|
def tqdm(iterable, *args, **kwargs):
|
||||||
return iterable
|
return iterable
|
||||||
|
|
||||||
|
import jieba
|
||||||
|
jiebapath = os.environ.get("JIEBA_DATA", "")
|
||||||
|
jieba.set_dictionary(os.path.join(jiebapath, 'dict.txt')) #设置字典
|
||||||
|
jieba.initialize() #初始化jieba
|
||||||
|
|
||||||
def chinese_tokenizer(text: str) -> List[str]:
|
def chinese_tokenizer(text: str) -> List[str]:
|
||||||
import jieba
|
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
tokens = jieba.lcut(text)
|
tokens = jieba.lcut(text)
|
||||||
return [token for token in tokens if token not in stopwords.words('chinese')]
|
return [token for token in tokens if token not in stopwords.words('chinese')]
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
+349046
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user