新增自定义节点元数据回调函数
This commit is contained in:
@@ -8,7 +8,10 @@ from llama_parse import LlamaParse
|
|||||||
from pydantic import BaseModel, validator
|
from pydantic import BaseModel, validator
|
||||||
from app.engine.loaders.markdownReader import ChunkMarkdownReader
|
from app.engine.loaders.markdownReader import ChunkMarkdownReader
|
||||||
from app.engine.loaders.projectJson import ProjectJson
|
from app.engine.loaders.projectJson import ProjectJson
|
||||||
|
from typing import Any, Callable, Dict, Generator, List, Optional, Type, Set
|
||||||
|
import fsspec,mimetypes
|
||||||
|
from fsspec.implementations.local import LocalFileSystem
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -23,6 +26,80 @@ class FileLoaderConfig(BaseModel):
|
|||||||
raise ValueError(f"Directory '{v}' does not exist")
|
raise ValueError(f"Directory '{v}' does not exist")
|
||||||
return v
|
return v
|
||||||
|
|
||||||
|
class CustomFileMetadataFunc:
    """
    Callable file-metadata function that stores the filesystem it reads from.

    Implemented as a class (rather than a closure) so that the function can
    be pickled. An instance is called with a file path and returns a dict of
    basic file metadata.
    """

    def __init__(self, fs: Optional["fsspec.AbstractFileSystem"] = None):
        """
        Args:
            fs: filesystem used for ``stat`` calls; when omitted, a
                ``LocalFileSystem`` is created.
        """
        self.fs = fs or self._get_default_fs()

    def __call__(self, file_path: str) -> Dict:
        """Return the metadata dict for ``file_path`` using the stored fs."""
        return self._default_file_metadata_func(file_path, self.fs)

    def _default_file_metadata_func(
        self,
        file_path: str,
        fs: Optional["fsspec.AbstractFileSystem"] = None,
    ) -> Dict:
        """
        Get some handy metadata from the filesystem.

        Args:
            file_path: file path as a string.
            fs: filesystem to query; falls back to the default local
                filesystem when not given.

        Returns:
            Dict with the keys ``file_name``, ``file_type``, ``file_size``,
            ``creation_date``, ``last_modified_date`` and
            ``last_accessed_date``. Keys whose value is ``None`` are omitted.
        """
        fs = fs or self._get_default_fs()
        stat_result = fs.stat(file_path)

        # Prefer the name reported by the filesystem; fall back to the path
        # the caller gave us if the stat result has no usable "name" entry.
        try:
            file_name = os.path.basename(str(stat_result["name"]))
        except Exception:
            file_name = os.path.basename(file_path)

        default_meta = {
            "file_name": file_name,
            "file_type": mimetypes.guess_type(file_path)[0],
            "file_size": stat_result.get("size"),
            "creation_date": self._format_file_timestamp(
                stat_result.get("created")
            ),
            "last_modified_date": self._format_file_timestamp(
                stat_result.get("mtime")
            ),
            "last_accessed_date": self._format_file_timestamp(
                stat_result.get("atime")
            ),
        }

        # Return only the entries that actually have a value.
        return {
            meta_key: meta_value
            for meta_key, meta_value in default_meta.items()
            if meta_value is not None
        }

    # Must be a staticmethod: it is invoked as ``self._format_file_timestamp(ts)``.
    # Without the decorator the instance was bound to ``timestamp`` and the
    # real timestamp to ``include_time``, so every call failed internally and
    # returned None, silently dropping all date fields from the metadata.
    @staticmethod
    def _format_file_timestamp(
        timestamp: Optional[float], include_time: bool = False
    ) -> Optional[str]:
        """
        Format a file timestamp as a string.

        Args:
            timestamp: POSIX timestamp in seconds, or ``None``.
            include_time: when true, return a UTC ``%Y-%m-%dT%H:%M:%SZ``
                string; otherwise return a ``%Y-%m-%d`` date in local time.

        Returns:
            The formatted timestamp, or ``None`` when ``timestamp`` is
            missing or cannot be converted.
        """
        if timestamp is None:
            return None
        try:
            if include_time:
                # Local import: the module only imports ``datetime`` itself.
                from datetime import timezone

                return datetime.fromtimestamp(
                    timestamp, tz=timezone.utc
                ).strftime("%Y-%m-%dT%H:%M:%SZ")
            return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OverflowError, OSError):
            # Best effort: an unusable timestamp just means "no date".
            return None

    def _get_default_fs(self) -> "fsspec.AbstractFileSystem":
        """Return a new ``LocalFileSystem`` instance."""
        return LocalFileSystem()

    def _is_default_fs(self, fs: "fsspec.AbstractFileSystem") -> bool:
        """Return True if ``fs`` is a ``LocalFileSystem`` without ``auto_mkdir``."""
        return isinstance(fs, LocalFileSystem) and not fs.auto_mkdir
|
||||||
|
|
||||||
def llama_parse_parser():
|
def llama_parse_parser():
|
||||||
if os.getenv("LLAMA_CLOUD_API_KEY") is None:
|
if os.getenv("LLAMA_CLOUD_API_KEY") is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@@ -72,6 +149,7 @@ def get_file_documents(config: FileLoaderConfig,childPath: str):
|
|||||||
filename_as_id=True,
|
filename_as_id=True,
|
||||||
raise_on_error=True,
|
raise_on_error=True,
|
||||||
file_extractor=file_extractor,
|
file_extractor=file_extractor,
|
||||||
|
file_metadata = CustomFileMetadataFunc()
|
||||||
)
|
)
|
||||||
return reader.load_data()
|
return reader.load_data()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user