合并Dev分支代码

This commit is contained in:
wanyaokun
2024-08-30 10:49:05 +08:00
parent e9ccd7db35
commit 73565b26e4
16 changed files with 486 additions and 409 deletions
+17 -15
View File
@@ -5,7 +5,7 @@ load_dotenv()
import logging
import os
from app.engine.loaders import get_documents
from app.engine.loaders import get_document_Types, get_documents
from app.engine.vectordb import get_vector_store
from app.settings import init_settings
from app.engine.retriever.CHBM25Retriever import CHBM25Retriever
@@ -21,12 +21,13 @@ logger = logging.getLogger()
STORAGE_DIR = os.getenv("STORAGE_DIR", "storage")
def get_doc_store():
def get_doc_store(docType:str):
# If the storage directory for this document type exists, load the document store from it.
# Otherwise, fall back to an in-memory document store, since there is nothing on disk to load.
if os.path.exists(STORAGE_DIR):
return SimpleDocumentStore.from_persist_dir(STORAGE_DIR)
storeDir = os.path.join(STORAGE_DIR,docType)
if os.path.exists(storeDir):
return SimpleDocumentStore.from_persist_dir(storeDir)
else:
return SimpleDocumentStore()
@@ -71,19 +72,20 @@ def generate_datasource():
logger.info("Generate index for the provided data")
# Get the stores and documents or create new ones
documents = get_documents()
# Set private=false to mark the document as public (required for filtering)
for doc in documents:
doc.metadata["private"] = "false"
docstore = get_doc_store()
vector_store = get_vector_store()
docTypes = get_document_Types()
for docType in docTypes:
documents = get_documents(docType)
# Set private=false to mark the document as public (required for filtering)
for doc in documents:
doc.metadata["private"] = "false"
docstore = get_doc_store(docType)
vector_store = get_vector_store(docType)
# Run the ingestion pipeline
_ = run_pipeline(docstore, vector_store, documents)
# Run the ingestion pipeline
_ = run_pipeline(docstore, vector_store, documents)
# Build the index and persist storage
persist_storage(docstore, vector_store)
persist_BMRetriever(vector_store)
# Build the index and persist storage
persist_storage(docstore, vector_store)
logger.info("Finished generating the index")