From f171282a0c9858b1d47ae9d7efdf35debd1c091f Mon Sep 17 00:00:00 2001 From: wanyaokun <12345678> Date: Thu, 5 Sep 2024 18:13:39 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=85=B3=E9=94=AE=E8=AF=8D?= =?UTF-8?q?=E6=A3=80=E7=B4=A2=E6=BA=A2=E5=87=BA=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/app/engine/generate.py | 12 +++++++----- backend/app/engine/retriever/HybridRetriever.py | 6 +++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/backend/app/engine/generate.py b/backend/app/engine/generate.py index 1581194..d591ce6 100644 --- a/backend/app/engine/generate.py +++ b/backend/app/engine/generate.py @@ -10,7 +10,7 @@ from app.engine.vectordb import get_vector_store from app.settings import init_settings from app.engine.retriever.CHBM25Retriever import CHBM25Retriever from llama_index.core.ingestion import IngestionPipeline -from llama_index.core.node_parser import SentenceSplitter +from llama_index.core.node_parser import SentenceSplitter,MarkdownNodeParser from llama_index.core.settings import Settings from llama_index.core.storage import StorageContext from llama_index.core.storage.docstore import SimpleDocumentStore @@ -35,10 +35,11 @@ def get_doc_store(docType:str): def run_pipeline(docstore, vector_store, documents): pipeline = IngestionPipeline( transformations=[ - SentenceSplitter( - chunk_size=Settings.chunk_size, - chunk_overlap=Settings.chunk_overlap, - ), + #SentenceSplitter( + #chunk_size=Settings.chunk_size, + #chunk_overlap=Settings.chunk_overlap, + #), + MarkdownNodeParser(), Settings.embed_model, ], docstore=docstore, @@ -86,6 +87,7 @@ def generate_datasource(): # Build the index and persist storage persist_storage(docstore, vector_store) + persist_BMRetriever(vector_store) logger.info("Finished generating the index") diff --git a/backend/app/engine/retriever/HybridRetriever.py b/backend/app/engine/retriever/HybridRetriever.py index 00b5495..be03e08 100644 --- a/backend/app/engine/retriever/HybridRetriever.py +++ b/backend/app/engine/retriever/HybridRetriever.py @@ -24,13 +24,13 @@ class HybridRetriever(BaseRetriever): self._vecRetriever = vector_index.as_retriever( similarity_top_k=similarity_top_k,filters = filters ) - + self._bm25Retriever = None STORAGE_DIR = os.getenv("BM_RETRIEVER_PATH", "storage_bm") if os.path.exists(STORAGE_DIR) and len(os.listdir(STORAGE_DIR)) > 0: self._bm25Retriever = CHBM25Retriever.from_persist_dir(STORAGE_DIR) else: - bmRetriver = CHBM25Retriever.from_defaults(similarity_top_k=similarity_top_k,nodes=self._vector_index.vector_store.get_nodes(None)) - bmRetriver.persist(STORAGE_DIR) + self._bm25Retriever = CHBM25Retriever.from_defaults(similarity_top_k=similarity_top_k,nodes=self._vector_index.vector_store.get_nodes(None)) + self._bm25Retriever.persist(STORAGE_DIR) self._alpha = alpha