新增自定义节点元数据回调函数

This commit is contained in:
wanyaokun
2024-09-10 15:30:58 +08:00
parent 5edfecef30
commit 47437044cb
+79 -1
View File
@@ -8,7 +8,10 @@ from llama_parse import LlamaParse
from pydantic import BaseModel, validator from pydantic import BaseModel, validator
from app.engine.loaders.markdownReader import ChunkMarkdownReader from app.engine.loaders.markdownReader import ChunkMarkdownReader
from app.engine.loaders.projectJson import ProjectJson from app.engine.loaders.projectJson import ProjectJson
from typing import Any, Callable, Dict, Generator, List, Optional, Type, Set
import fsspec,mimetypes
from fsspec.implementations.local import LocalFileSystem
from datetime import datetime
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -23,6 +26,80 @@ class FileLoaderConfig(BaseModel):
raise ValueError(f"Directory '{v}' does not exist") raise ValueError(f"Directory '{v}' does not exist")
return v return v
class CustomFileMetadataFunc:
    """
    Callable that builds a metadata dict for a file from its filesystem stat.

    The filesystem handle is stored on the instance so the object can be
    passed around as a plain callable (``func(file_path) -> dict``); the
    original author notes this wrapper form is meant to allow pickling.
    """

    def __init__(self, fs: Optional[fsspec.AbstractFileSystem] = None):
        """
        Args:
            fs: filesystem used for ``stat`` calls; a ``LocalFileSystem`` is
                created when none (or a falsy value) is given.
        """
        self.fs = fs or self._get_default_fs()

    def __call__(self, file_path: str) -> Dict:
        """Return the metadata dict for ``file_path`` using the stored fs."""
        return self._default_file_metadata_func(file_path, self.fs)

    def _default_file_metadata_func(
        self, file_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
    ) -> Dict:
        """
        Get some handy metadata from the filesystem.

        Args:
            file_path: path of the file to describe.
            fs: filesystem to query; falls back to the default local one.

        Returns:
            Dict with any of ``file_name``, ``file_type``, ``file_size``,
            ``creation_date``, ``last_modified_date``, ``last_accessed_date``.
            Keys whose value could not be determined (``None``) are omitted.
        """
        fs = fs or self._get_default_fs()
        stat_result = fs.stat(file_path)

        # Prefer the name reported by the filesystem; fall back to the
        # requested path when the stat result carries no usable "name".
        try:
            file_name = os.path.basename(str(stat_result["name"]))
        except (KeyError, TypeError):
            file_name = os.path.basename(file_path)

        default_meta = {
            "file_name": file_name,
            "file_type": mimetypes.guess_type(file_path)[0],
            "file_size": stat_result.get("size"),
            "creation_date": self._format_file_timestamp(
                stat_result.get("created")
            ),
            "last_modified_date": self._format_file_timestamp(
                stat_result.get("mtime")
            ),
            "last_accessed_date": self._format_file_timestamp(
                stat_result.get("atime")
            ),
        }

        # Only return entries that actually have a value.
        return {
            meta_key: meta_value
            for meta_key, meta_value in default_meta.items()
            if meta_value is not None
        }

    @staticmethod
    def _format_file_timestamp(
        timestamp: Optional[float], include_time: bool = False
    ) -> Optional[str]:
        """
        Format a file timestamp as a string.

        This must be a static method: it is invoked as
        ``self._format_file_timestamp(value)``, and without the decorator the
        instance would be bound to ``timestamp`` and every date would
        silently come back as ``None``.

        Args:
            timestamp: POSIX timestamp in seconds, or ``None`` when the
                filesystem did not report one.
            include_time: when true, return a UTC ``%Y-%m-%dT%H:%M:%SZ``
                string; otherwise a local-time ``%Y-%m-%d`` string.

        Returns:
            The formatted timestamp, or ``None`` if it is missing or cannot
            be converted.
        """
        # Local import: the file only imports ``datetime`` from ``datetime``.
        from datetime import timezone

        try:
            if include_time:
                # Timezone-aware equivalent of the deprecated
                # ``datetime.utcfromtimestamp``; the output is identical.
                return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime(
                    "%Y-%m-%dT%H:%M:%SZ"
                )
            return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OverflowError, OSError):
            # Missing (None) or out-of-range timestamp: report "unknown".
            return None

    def _get_default_fs(self) -> fsspec.AbstractFileSystem:
        """Return a fresh local filesystem used when no fs is supplied."""
        return LocalFileSystem()

    def _is_default_fs(self, fs: fsspec.AbstractFileSystem) -> bool:
        """Return True if ``fs`` is a ``LocalFileSystem`` without auto_mkdir."""
        return isinstance(fs, LocalFileSystem) and not fs.auto_mkdir
def llama_parse_parser(): def llama_parse_parser():
if os.getenv("LLAMA_CLOUD_API_KEY") is None: if os.getenv("LLAMA_CLOUD_API_KEY") is None:
raise ValueError( raise ValueError(
@@ -72,6 +149,7 @@ def get_file_documents(config: FileLoaderConfig,childPath: str):
filename_as_id=True, filename_as_id=True,
raise_on_error=True, raise_on_error=True,
file_extractor=file_extractor, file_extractor=file_extractor,
file_metadata = CustomFileMetadataFunc()
) )
return reader.load_data() return reader.load_data()
except Exception as e: except Exception as e: