diff --git a/backend/app/engine/generate.py b/backend/app/engine/generate.py index 1581194..d591ce6 100644 --- a/backend/app/engine/generate.py +++ b/backend/app/engine/generate.py @@ -10,7 +10,7 @@ from app.engine.vectordb import get_vector_store from app.settings import init_settings from app.engine.retriever.CHBM25Retriever import CHBM25Retriever from llama_index.core.ingestion import IngestionPipeline -from llama_index.core.node_parser import SentenceSplitter +from llama_index.core.node_parser import SentenceSplitter,MarkdownNodeParser from llama_index.core.settings import Settings from llama_index.core.storage import StorageContext from llama_index.core.storage.docstore import SimpleDocumentStore @@ -35,10 +35,11 @@ def get_doc_store(docType:str): def run_pipeline(docstore, vector_store, documents): pipeline = IngestionPipeline( transformations=[ - SentenceSplitter( - chunk_size=Settings.chunk_size, - chunk_overlap=Settings.chunk_overlap, - ), + #SentenceSplitter( + #chunk_size=Settings.chunk_size, + #chunk_overlap=Settings.chunk_overlap, + #), + MarkdownNodeParser(), Settings.embed_model, ], docstore=docstore, @@ -86,6 +87,7 @@ def generate_datasource(): # Build the index and persist storage persist_storage(docstore, vector_store) + persist_BMRetriever(vector_store) logger.info("Finished generating the index") diff --git a/backend/app/engine/retriever/HybridRetriever.py b/backend/app/engine/retriever/HybridRetriever.py index 00b5495..be03e08 100644 --- a/backend/app/engine/retriever/HybridRetriever.py +++ b/backend/app/engine/retriever/HybridRetriever.py @@ -24,13 +24,13 @@ class HybridRetriever(BaseRetriever): self._vecRetriever = vector_index.as_retriever( similarity_top_k=similarity_top_k,filters = filters ) - + self._bm25Retriever = None STORAGE_DIR = os.getenv("BM_RETRIEVER_PATH", "storage_bm") if os.path.exists(STORAGE_DIR) and len(os.listdir(STORAGE_DIR)) > 0: self._bm25Retriever = CHBM25Retriever.from_persist_dir(STORAGE_DIR) else: - bmRetriver = CHBM25Retriever.from_defaults(similarity_top_k=similarity_top_k,nodes=self._vector_index.vector_store.get_nodes(None)) - bmRetriver.persist(STORAGE_DIR) + self._bm25Retriever = CHBM25Retriever.from_defaults(similarity_top_k=similarity_top_k,nodes=self._vector_index.vector_store.get_nodes(None)) + self._bm25Retriever.persist(STORAGE_DIR) self._alpha = alpha