class CustomFileMetadataFunc:
    """Callable that builds a metadata dict for a file on an fsspec filesystem.

    The filesystem is kept on the instance rather than captured in a closure,
    which allows the callable to be pickled. An instance can be used as a
    ``file_metadata`` callback: it is called with a file path and returns a
    dict of the metadata values that could be determined.
    """

    # The fsspec annotations below are written as strings so that defining
    # the class does not require the ``fsspec`` name to be evaluated.
    def __init__(self, fs: Optional["fsspec.AbstractFileSystem"] = None):
        """Store *fs*, falling back to the default filesystem when omitted.

        Args:
            fs: filesystem used for ``stat`` calls; ``None`` selects the
                filesystem returned by ``_get_default_fs``.
        """
        self.fs = fs if fs is not None else self._get_default_fs()

    def __call__(self, file_path: str) -> Dict:
        """Return the metadata dict for *file_path* using the stored fs."""
        return self._default_file_metadata_func(file_path, self.fs)

    def _default_file_metadata_func(
        self, file_path: str, fs: Optional["fsspec.AbstractFileSystem"] = None
    ) -> Dict:
        """Collect basic metadata for *file_path* from the filesystem.

        Args:
            file_path: path of the file, as a string.
            fs: filesystem to query; ``None`` selects the default filesystem.

        Returns:
            A dict that may contain ``file_name``, ``file_type``,
            ``file_size``, ``creation_date``, ``last_modified_date`` and
            ``last_accessed_date``. Keys whose value could not be determined
            are omitted.
        """
        if fs is None:
            fs = self._get_default_fs()
        stat_result = fs.stat(file_path)

        # Prefer the name reported by the filesystem; fall back to the path
        # the caller supplied when the stat result has no usable "name".
        try:
            file_name = os.path.basename(str(stat_result["name"]))
        except (KeyError, TypeError):
            file_name = os.path.basename(file_path)

        default_meta = {
            "file_name": file_name,
            "file_type": mimetypes.guess_type(file_path)[0],
            "file_size": stat_result.get("size"),
            "creation_date": self._format_file_timestamp(
                stat_result.get("created")
            ),
            "last_modified_date": self._format_file_timestamp(
                stat_result.get("mtime")
            ),
            "last_accessed_date": self._format_file_timestamp(
                stat_result.get("atime")
            ),
        }

        # Only report values that are actually known.
        return {
            meta_key: meta_value
            for meta_key, meta_value in default_meta.items()
            if meta_value is not None
        }

    # Must be a staticmethod: it is invoked as ``self._format_file_timestamp(x)``.
    # As a plain function in the class body, the instance would be bound to
    # ``timestamp`` and every date would silently come back as None.
    @staticmethod
    def _format_file_timestamp(
        timestamp: Optional[float], include_time: bool = False
    ) -> Optional[str]:
        """Format a POSIX timestamp as a date (or date-time) string.

        Args:
            timestamp: seconds since the epoch, or ``None`` when unknown.
            include_time: when true, return a UTC ``%Y-%m-%dT%H:%M:%SZ``
                string; otherwise return a ``%Y-%m-%d`` date in local time.

        Returns:
            The formatted string, or ``None`` if *timestamp* is missing or
            cannot be converted.
        """
        # Local import: the surrounding module only imports ``datetime``.
        from datetime import timezone

        if timestamp is None:
            return None
        try:
            if include_time:
                return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime(
                    "%Y-%m-%dT%H:%M:%SZ"
                )
            return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OverflowError, OSError):
            # Non-numeric or out-of-range value: treat the date as unknown.
            return None

    def _get_default_fs(self) -> "fsspec.AbstractFileSystem":
        """Return a new ``LocalFileSystem`` to use when no fs was supplied."""
        return LocalFileSystem()

    def _is_default_fs(self, fs: "fsspec.AbstractFileSystem") -> bool:
        """Return True if *fs* is a ``LocalFileSystem`` without ``auto_mkdir``."""
        return isinstance(fs, LocalFileSystem) and not fs.auto_mkdir