合并代码
This commit is contained in:
@@ -3,39 +3,79 @@ import yaml
|
||||
from app.engine.loaders.db import DBLoaderConfig, get_db_documents
|
||||
from app.engine.loaders.file import FileLoaderConfig, get_file_documents
|
||||
from app.engine.loaders.web import WebLoaderConfig, get_web_documents
|
||||
import os
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def load_configs():
    """Read ``config/loaders.yaml`` and return its parsed content.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file. Callers in this
        module treat ``None`` or an empty mapping as "no loaders configured".
    """
    # Decode explicitly as UTF-8 so non-ASCII content in the config parses
    # the same way regardless of the platform's default encoding.
    with open("config/loaders.yaml", encoding="utf-8") as f:
        return yaml.safe_load(f)
|
||||
|
||||
def path_difference(path1: str, path2: str) -> str:
    """Return the part of ``path2`` that lies beyond its common prefix with ``path1``.

    Both paths are made absolute, split on ``os.path.sep`` and compared
    component by component. The components of ``path2`` that follow the
    shared leading components are joined with ``'_'``.

    Args:
        path1: Base path.
        path2: Path whose trailing components are wanted.

    Returns:
        The remaining components of ``path2`` joined by ``'_'``, or ``''``
        when ``path2`` has no components beyond the shared prefix.
    """
    import os

    base_parts = os.path.abspath(path1).split(os.path.sep)
    target_parts = os.path.abspath(path2).split(os.path.sep)

    # Count the leading components both paths share. zip() stops at the
    # shorter path, so a base path deeper than the target cannot index past
    # the end of the target's components.
    shared = 0
    for base_part, target_part in zip(base_parts, target_parts):
        if base_part != target_part:
            break
        shared += 1

    return "_".join(target_parts[shared:])
|
||||
|
||||
def getFileCacahePath():
    """Return the directory used by the "file" loader.

    Reads the loader configuration and returns the ``data_dir`` of the
    ``"file"`` entry. Falls back to ``'data'`` when the configuration is
    missing, empty, or has no ``"file"`` entry.
    """
    loader_configs = load_configs()

    # Nothing configured at all: use the default directory.
    if loader_configs is None or len(loader_configs.items()) == 0:
        return 'data'

    if "file" in loader_configs:
        return FileLoaderConfig(**loader_configs["file"]).data_dir

    return 'data'
|
||||
|
||||
def get_document_Types():
    """List the leaf directories under the file loader's root as type keys.

    Walks the directory tree iteratively, starting at the path returned by
    ``getFileCacahePath``. Every directory that contains no sub-directories
    is reported as the ``path_difference`` between the root and that
    directory.

    Returns:
        A list of type keys, one per leaf directory.
    """
    root = getFileCacahePath()
    leaf_keys = []
    pending = [root]

    while pending:
        current = pending.pop()

        # Collect the immediate sub-directories of the current directory.
        children = []
        for entry in os.listdir(current):
            candidate = os.path.join(current, entry)
            if os.path.isdir(candidate):
                children.append(candidate)

        if children:
            pending.extend(children)
        else:
            # No sub-directories: this is a leaf, record its key.
            leaf_keys.append(path_difference(root, current))

    return leaf_keys
|
||||
|
||||
def get_documents(docType:str):
|
||||
documents = []
|
||||
config = load_configs()
|
||||
|
||||
if config is None or len(config.items()) == 0:
|
||||
return documents
|
||||
return documents
|
||||
|
||||
for loader_type, loader_config in config.items():
|
||||
if loader_config.get('enable', True): # 检查 enable 字段
|
||||
logger.info(
|
||||
f"Loading documents from loader: {loader_type}, config: {loader_config}"
|
||||
)
|
||||
logger.info(
|
||||
f"Loading documents from loader: {loader_type}, config: {loader_config}"
|
||||
)
|
||||
|
||||
loader_config = loader_config or []
|
||||
match loader_type:
|
||||
case "file":
|
||||
document = get_file_documents(FileLoaderConfig(**loader_config))
|
||||
case "web":
|
||||
document = get_web_documents(WebLoaderConfig(**loader_config))
|
||||
case "db":
|
||||
document = get_db_documents(configs=[DBLoaderConfig(**cfg) for cfg in loader_config])
|
||||
case _:
|
||||
raise ValueError(f"Invalid loader type: {loader_type}")
|
||||
documents.extend(document)
|
||||
loader_config = loader_config or []
|
||||
match loader_type:
|
||||
case "file":
|
||||
document = get_file_documents(FileLoaderConfig(**loader_config),docType)
|
||||
case "web":
|
||||
document = get_web_documents(WebLoaderConfig(**loader_config))
|
||||
case "db":
|
||||
document = get_db_documents(configs=[DBLoaderConfig(**cfg) for cfg in loader_config])
|
||||
case _:
|
||||
raise ValueError(f"Invalid loader type: {loader_type}")
|
||||
documents.extend(document)
|
||||
|
||||
return documents
|
||||
@@ -46,7 +46,7 @@ def llama_local_extractor() -> Dict[str, BaseReader]:
|
||||
return {".json" : JSONReader(clean_json=False,levels_back=0)}
|
||||
|
||||
|
||||
def get_file_documents(config: FileLoaderConfig):
|
||||
def get_file_documents(config: FileLoaderConfig,childPath: str):
|
||||
from llama_index.core.readers import SimpleDirectoryReader
|
||||
|
||||
try:
|
||||
@@ -63,7 +63,7 @@ def get_file_documents(config: FileLoaderConfig):
|
||||
file_extractor = llama_local_extractor()
|
||||
|
||||
reader = SimpleDirectoryReader(
|
||||
config.data_dir,
|
||||
os.path.join(config.data_dir,childPath.replace('_','\\')),
|
||||
recursive=True,
|
||||
filename_as_id=True,
|
||||
raise_on_error=True,
|
||||
|
||||
Reference in New Issue
Block a user