新增自定义节点元数据回调函数
This commit is contained in:
@@ -8,7 +8,10 @@ from llama_parse import LlamaParse
|
||||
from pydantic import BaseModel, validator
|
||||
from app.engine.loaders.markdownReader import ChunkMarkdownReader
|
||||
from app.engine.loaders.projectJson import ProjectJson
|
||||
|
||||
from typing import Any, Callable, Dict, Generator, List, Optional, Type, Set
|
||||
import fsspec,mimetypes
|
||||
from fsspec.implementations.local import LocalFileSystem
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -23,6 +26,80 @@ class FileLoaderConfig(BaseModel):
|
||||
raise ValueError(f"Directory '{v}' does not exist")
|
||||
return v
|
||||
|
||||
class CustomFileMetadataFunc:
    """
    Default file metadata function wrapper which stores the fs.

    Allows for pickling of the function.

    Instances are callable: calling one with a file path returns a dict of
    metadata read from ``fs.stat`` (name, MIME type, size and dates), with
    entries whose value is ``None`` left out.
    """

    def __init__(self, fs: Optional[fsspec.AbstractFileSystem] = None):
        """
        Store the filesystem used for stat lookups.

        Args:
            fs: filesystem to query; when omitted (or falsy) a
                ``LocalFileSystem`` is created.
        """
        self.fs = fs or self._get_default_fs()

    def __call__(self, file_path: str) -> Dict:
        """Return the metadata dict for ``file_path`` using the stored fs."""
        return self._default_file_metadata_func(file_path, self.fs)

    def _default_file_metadata_func(
        self, file_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
    ) -> Dict:
        """
        Get some handy metadata from filesystem.

        Args:
            file_path: str: file path in str
            fs: filesystem to stat the path on; falls back to a
                ``LocalFileSystem`` when omitted (or falsy).

        Returns:
            Dict containing whichever of ``file_name``, ``file_type``,
            ``file_size``, ``creation_date``, ``last_modified_date`` and
            ``last_accessed_date`` are not ``None``.
        """
        fs = fs or self._get_default_fs()
        stat_result = fs.stat(file_path)

        # Prefer the name reported by the filesystem; fall back to the path
        # we were given when the stat result carries no usable "name" entry.
        try:
            file_name = os.path.basename(str(stat_result["name"]))
        except (KeyError, TypeError):
            file_name = os.path.basename(file_path)

        default_meta = {
            "file_name": file_name,
            "file_type": mimetypes.guess_type(file_path)[0],
            "file_size": stat_result.get("size"),
            "creation_date": self._format_file_timestamp(
                stat_result.get("created")
            ),
            "last_modified_date": self._format_file_timestamp(
                stat_result.get("mtime")
            ),
            "last_accessed_date": self._format_file_timestamp(
                stat_result.get("atime")
            ),
        }

        # Return not null value
        return {
            meta_key: meta_value
            for meta_key, meta_value in default_meta.items()
            if meta_value is not None
        }

    # Must be a staticmethod: it is invoked as ``self._format_file_timestamp(x)``.
    # Without the decorator the instance was bound to ``timestamp`` and the real
    # value to ``include_time``, so every call failed and returned None.
    @staticmethod
    def _format_file_timestamp(
        timestamp: Optional[float], include_time: bool = False
    ) -> Optional[str]:
        """
        Format file timestamp to a %Y-%m-%d string.

        Args:
            timestamp (float): timestamp in float
            include_time (bool): whether to include time in the formatted string

        Returns:
            str: formatted timestamp (``%Y-%m-%dT%H:%M:%SZ`` in UTC when
            ``include_time`` is true, otherwise a local-time ``%Y-%m-%d``
            date), or ``None`` when the timestamp is missing or cannot be
            converted.
        """
        # Local import keeps this block self-contained; the module only
        # imports ``datetime`` from the datetime package.
        from datetime import timezone

        try:
            if include_time:
                return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime(
                    "%Y-%m-%dT%H:%M:%SZ"
                )
            return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OverflowError, OSError):
            # None / non-numeric / out-of-range timestamps are reported as
            # "no date" rather than raised.
            return None

    def _get_default_fs(self) -> fsspec.AbstractFileSystem:
        """Return a new ``LocalFileSystem`` to use as the default filesystem."""
        return LocalFileSystem()

    def _is_default_fs(self, fs: fsspec.AbstractFileSystem) -> bool:
        """Return True when ``fs`` is a ``LocalFileSystem`` with ``auto_mkdir`` off."""
        return isinstance(fs, LocalFileSystem) and not fs.auto_mkdir
|
||||
|
||||
def llama_parse_parser():
|
||||
if os.getenv("LLAMA_CLOUD_API_KEY") is None:
|
||||
raise ValueError(
|
||||
@@ -72,6 +149,7 @@ def get_file_documents(config: FileLoaderConfig,childPath: str):
|
||||
filename_as_id=True,
|
||||
raise_on_error=True,
|
||||
file_extractor=file_extractor,
|
||||
file_metadata = CustomFileMetadataFunc()
|
||||
)
|
||||
return reader.load_data()
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user