83 lines
2.6 KiB
Python
83 lines
2.6 KiB
Python
import logging
|
|
import yaml
|
|
from app.engine.loaders.db import DBLoaderConfig, get_db_documents
|
|
from app.engine.loaders.file import FileLoaderConfig, get_file_documents
|
|
from app.engine.loaders.web import WebLoaderConfig, get_web_documents
|
|
import os
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
def load_configs():
    """Parse ``config/loaders.yaml`` and return its content.

    The path is relative, so it is resolved against the current working
    directory of the process.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    with open("config/loaders.yaml", encoding='utf-8') as stream:
        return yaml.safe_load(stream)
|
|
|
|
def path_difference(path1: str, path2: str) -> str:
    """Return the part of ``path2`` that lies beyond its common prefix with ``path1``.

    Both paths are made absolute and split on ``os.path.sep``. The leading
    components shared by the two paths are dropped, and the remaining
    components of ``path2`` are joined with ``'_'``.

    Args:
        path1: Reference path.
        path2: Path whose trailing components are wanted.

    Returns:
        The remaining components of ``path2`` joined by underscores, or an
        empty string when ``path2`` has none left (the paths are identical,
        or ``path2`` is an ancestor of ``path1``).
    """
    path1_parts = os.path.abspath(path1).split(os.path.sep)
    path2_parts = os.path.abspath(path2).split(os.path.sep)

    # zip() stops at the shorter of the two lists, so a path2 with fewer
    # components than path1 can no longer be indexed past its end.
    shared = 0
    for left, right in zip(path1_parts, path2_parts):
        if left != right:
            break
        shared += 1

    return '_'.join(path2_parts[shared:])
|
|
|
|
def getFileCacahePath():
    """Return the data directory configured for the ``file`` loader.

    Looks for a ``"file"`` entry in the loader configuration and, when one
    exists, returns the ``data_dir`` attribute of the ``FileLoaderConfig``
    built from it.

    Returns:
        The configured ``data_dir``, or ``'data'`` when the configuration is
        missing, empty, or has no ``"file"`` entry.
    """
    loaders = load_configs()
    if loaders is None:
        return 'data'

    for name, settings in loaders.items():
        if name == "file":
            return FileLoaderConfig(**settings).data_dir

    # No file loader configured: fall back to the default directory.
    return 'data'
|
|
|
|
def get_document_Types():
    """Collect a type key for every leaf directory under the file-loader root.

    Starting from the directory returned by ``getFileCacahePath()``, the tree
    is walked with an explicit stack. A directory that contains no
    sub-directories is treated as a leaf and contributes
    ``path_difference(root, leaf)`` to the result.

    Returns:
        list: One key per leaf directory, in traversal order.
    """
    base_dir = getFileCacahePath()
    found = []
    pending = [base_dir]

    while pending:
        current = pending.pop()

        children = []
        for entry in os.listdir(current):
            candidate = os.path.join(current, entry)
            if os.path.isdir(candidate):
                children.append(candidate)

        if children:
            # Not a leaf: keep descending into its sub-directories.
            pending.extend(children)
        else:
            found.append(path_difference(base_dir, current))

    return found
|
|
|
|
def get_documents(docType:str):
|
|
documents = []
|
|
config = load_configs()
|
|
|
|
if config is None or len(config.items()) == 0:
|
|
return documents
|
|
|
|
for loader_type, loader_config in config.items():
|
|
if loader_config.get('enable', True): # 检查 enable 字段
|
|
logger.info(
|
|
f"Loading documents from loader: {loader_type}, config: {loader_config}"
|
|
)
|
|
|
|
loader_config = loader_config or []
|
|
match loader_type:
|
|
case "file":
|
|
document = get_file_documents(FileLoaderConfig(**loader_config),docType)
|
|
case "web":
|
|
document = get_web_documents(WebLoaderConfig(**loader_config))
|
|
case "db":
|
|
document = get_db_documents(configs=[DBLoaderConfig(**cfg) for cfg in loader_config])
|
|
case _:
|
|
raise ValueError(f"Invalid loader type: {loader_type}")
|
|
documents.extend(document)
|
|
|
|
return documents |