Initial commit from Create Llama

This commit is contained in:
2024-08-08 18:33:08 +08:00
commit 4923337038
97 changed files with 5378 additions and 0 deletions
+37
View File
@@ -0,0 +1,37 @@
import logging
import yaml
from app.engine.loaders.db import DBLoaderConfig, get_db_documents
from app.engine.loaders.file import FileLoaderConfig, get_file_documents
from app.engine.loaders.web import WebLoaderConfig, get_web_documents
logger = logging.getLogger(__name__)
def load_configs():
    """Read the loader configuration from ``config/loaders.yaml``.

    Returns:
        The parsed YAML content. An empty file, which ``yaml.safe_load``
        parses to ``None``, is returned as an empty dict so the result can
        always be iterated with ``.items()``.
    """
    # YAML files are conventionally UTF-8; do not depend on the locale default.
    with open("config/loaders.yaml", encoding="utf-8") as f:
        configs = yaml.safe_load(f)
    return configs if configs is not None else {}
def get_documents():
    """Load documents from every loader listed in the loaders config.

    Each top-level key of the config names a loader type ("file", "web" or
    "db") and its value holds that loader's settings. The "db" value is a
    list of entries; the other two are keyword mappings.

    Returns:
        One flat list with the documents of all loaders, in config order.

    Raises:
        ValueError: If the config contains an unknown loader type.
    """
    collected = []
    loaders = load_configs()
    for loader_type, loader_config in loaders.items():
        logger.info(
            f"Loading documents from loader: {loader_type}, config: {loader_config}"
        )
        if loader_type == "file":
            loaded = get_file_documents(FileLoaderConfig(**loader_config))
        elif loader_type == "web":
            loaded = get_web_documents(WebLoaderConfig(**loader_config))
        elif loader_type == "db":
            db_configs = [DBLoaderConfig(**cfg) for cfg in loader_config]
            loaded = get_db_documents(configs=db_configs)
        else:
            raise ValueError(f"Invalid loader type: {loader_type}")
        collected.extend(loaded)
    return collected
+26
View File
@@ -0,0 +1,26 @@
import os
import logging
from typing import List
from pydantic import BaseModel, validator
from llama_index.core.indices.vector_store import VectorStoreIndex
logger = logging.getLogger(__name__)
class DBLoaderConfig(BaseModel):
    """Settings for one database source."""

    # Connection URI handed to the database reader.
    uri: str
    # Queries to run against that connection, one load per query.
    queries: list[str]
def get_db_documents(configs: list[DBLoaderConfig]):
    """Run every configured query and return all resulting documents.

    Args:
        configs: Database loader entries; each supplies a connection ``uri``
            and the ``queries`` to run against it.

    Returns:
        A single list with the documents of all queries of all entries, in
        order. Empty when there are no entries or no queries.
    """
    # Imported inside the function, so the reader package is only needed
    # when this function is actually called.
    from llama_index.readers.database import DatabaseReader

    docs = []
    for entry in configs:
        loader = DatabaseReader(uri=entry.uri)
        for query in entry.queries:
            logger.info(f"Loading data from database with query: {query}")
            docs.extend(loader.load_data(query=query))

    # Return the accumulated list, not just the last query's batch.
    return docs
+79
View File
@@ -0,0 +1,79 @@
import os
import logging
from typing import Dict
from llama_parse import LlamaParse
from pydantic import BaseModel, validator
logger = logging.getLogger(__name__)
class FileLoaderConfig(BaseModel):
    """Settings for loading documents from a local directory."""

    # Directory to read; validated below to be an existing directory.
    data_dir: str = "data"
    # Whether files should be parsed with LlamaParse.
    use_llama_parse: bool = False

    @validator("data_dir")
    def data_dir_must_exist(cls, v):
        """Reject a ``data_dir`` that is not an existing directory."""
        if os.path.isdir(v):
            return v
        raise ValueError(f"Directory '{v}' does not exist")
def llama_parse_parser():
    """Create a LlamaParse parser with markdown output and English language.

    Returns:
        A ``LlamaParse`` instance created with verbose output enabled and
        ``ignore_errors`` disabled.

    Raises:
        ValueError: If ``LLAMA_CLOUD_API_KEY`` is not set in the environment.
    """
    if "LLAMA_CLOUD_API_KEY" not in os.environ:
        raise ValueError(
            "LLAMA_CLOUD_API_KEY environment variable is not set. "
            "Please set it in .env file or in your shell environment then run again!"
        )
    return LlamaParse(
        result_type="markdown",
        verbose=True,
        language="en",
        ignore_errors=False,
    )
def llama_parse_extractor() -> Dict[str, LlamaParse]:
    """Map every file type in ``SUPPORTED_FILE_TYPES`` to one shared parser.

    Returns:
        A dict keyed by file type whose values are all the same parser
        instance, as returned by ``llama_parse_parser()``.
    """
    from llama_parse.utils import SUPPORTED_FILE_TYPES

    shared_parser = llama_parse_parser()
    return dict.fromkeys(SUPPORTED_FILE_TYPES, shared_parser)
def get_file_documents(config: FileLoaderConfig):
    """Read the files under ``config.data_dir`` into documents.

    The directory is read recursively with file names used as document ids.
    When ``config.use_llama_parse`` is set, the LlamaParse extractor mapping
    is passed to the reader.

    Returns:
        The documents produced by the directory reader, or an empty list
        when the failure originated in a function named ``_add_files``.

    Raises:
        Exception: Any other error raised while loading is re-raised.
    """
    from llama_index.core.readers import SimpleDirectoryReader

    try:
        extractor = None
        if config.use_llama_parse:
            # LlamaParse is async first, so nest_asyncio is applied to let
            # it run in sync mode.
            import nest_asyncio

            nest_asyncio.apply()
            extractor = llama_parse_extractor()
        return SimpleDirectoryReader(
            config.data_dir,
            recursive=True,
            filename_as_id=True,
            raise_on_error=True,
            file_extractor=extractor,
        ).load_data()
    except Exception as e:
        import traceback

        # The innermost traceback frame tells where the error was raised.
        # A failure inside `_add_files` is intended to mean that the data
        # dir is empty, which is reported as "no documents".
        innermost_frame = traceback.extract_tb(e.__traceback__)[-1]
        if innermost_frame.name != "_add_files":
            # Anything else is a real failure and must reach the caller.
            raise e
        logger.warning(
            f"Failed to load file documents, error message: {e} . Return as empty document list."
        )
        return []
+36
View File
@@ -0,0 +1,36 @@
import os
import json
from pydantic import BaseModel, Field
class CrawlUrl(BaseModel):
    """One site to crawl."""

    # URL where the crawl starts.
    base_url: str
    # Prefix passed to the site reader together with the start URL.
    prefix: str
    # Crawl depth limit; must be zero or greater, defaults to 1.
    max_depth: int = Field(1, ge=0)
class WebLoaderConfig(BaseModel):
    """Settings for the web loader."""

    # Extra command-line arguments for the Chrome driver. The default is
    # None, so the annotation is nullable as well: an explicit null in the
    # config then validates instead of being rejected as "not a list".
    driver_arguments: list[str] | None = Field(default=None)
    # Sites to crawl.
    urls: list[CrawlUrl]
def get_web_documents(config: WebLoaderConfig):
    """Crawl every configured site and return the scraped documents.

    Args:
        config: Web loader settings: optional Chrome driver arguments and
            the list of sites to crawl.

    Returns:
        One flat list with the documents of all sites, in config order.
    """
    from llama_index.readers.web import WholeSiteReader
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    chrome_options = Options()
    # `driver_arguments` may be None, which means "no extra arguments".
    for argument in config.driver_arguments or []:
        chrome_options.add_argument(argument)

    pages = []
    for target in config.urls:
        # A new Chrome driver is created for each configured site.
        site_reader = WholeSiteReader(
            prefix=target.prefix,
            max_depth=target.max_depth,
            driver=webdriver.Chrome(options=chrome_options),
        )
        pages.extend(site_reader.load_data(target.base_url))

    return pages