import logging import yaml from app.engine.loaders.db import DBLoaderConfig, get_db_documents from app.engine.loaders.file import FileLoaderConfig, get_file_documents from app.engine.loaders.web import WebLoaderConfig, get_web_documents logger = logging.getLogger(__name__) def load_configs(): with open("config/loaders.yaml") as f: configs = yaml.safe_load(f) return configs def path_difference(path1:str, path2:str): import os path1 = os.path.abspath(path1) path2 = os.path.abspath(path2) path1_parts = path1.split(os.path.sep) path2_parts = path2.split(os.path.sep) for i, part in enumerate(path1_parts): if part != path2_parts[i]: break else: i += 1 pathKey = '' for j in range(i,len(path2_parts)): pathKey+=path2_parts[j] + '_' return pathKey[0:-1] def get_document_Types(): import os rootPath = 'data' configs = load_configs() if configs is not None and len(configs.items()) > 0: for loader_type, loader_config in configs.items(): if loader_type == "file": rootPath = FileLoaderConfig(**loader_config).data_dir break types = [] dirStack = [rootPath] while len(dirStack) > 0: curDir = dirStack.pop() dirs = [os.path.join(curDir, d) for d in os.listdir(curDir) if os.path.isdir(os.path.join(curDir, d))] if len(dirs) > 0: for dir in dirs: dirStack.append(dir) else: types.append(path_difference(rootPath,curDir)) return types def get_documents(docType:str): documents = [] config = load_configs() if config is None or len(config.items()) == 0: return documents for loader_type, loader_config in config.items(): logger.info( f"Loading documents from loader: {loader_type}, config: {loader_config}" ) loader_config = loader_config or [] match loader_type: case "file": document = get_file_documents(FileLoaderConfig(**loader_config),docType) case "web": document = get_web_documents(WebLoaderConfig(**loader_config)) case "db": document = get_db_documents(configs=[DBLoaderConfig(**cfg) for cfg in loader_config]) case _: raise ValueError(f"Invalid loader type: {loader_type}") documents.extend(document) return documents