修改关键词检索溢出问题

2024-09-05 18:13:39 +08:00
parent 626ff1e632
commit f171282a0c
2 changed files with 10 additions and 8 deletions
@@ -10,7 +10,7 @@ from app.engine.vectordb import get_vector_store
 from app.settings import init_settings
 from app.engine.retriever.CHBM25Retriever import CHBM25Retriever
 from llama_index.core.ingestion import IngestionPipeline
-from llama_index.core.node_parser import SentenceSplitter
+from llama_index.core.node_parser import SentenceSplitter,MarkdownNodeParser
 from llama_index.core.settings import Settings
 from llama_index.core.storage import StorageContext
 from llama_index.core.storage.docstore import SimpleDocumentStore
@@ -35,10 +35,11 @@ def get_doc_store(docType:str):
 def run_pipeline(docstore, vector_store, documents):
    pipeline = IngestionPipeline(
        transformations=[
-            SentenceSplitter(
+            #SentenceSplitter(
-                chunk_size=Settings.chunk_size,
+                #chunk_size=Settings.chunk_size,
-                chunk_overlap=Settings.chunk_overlap,
+                #chunk_overlap=Settings.chunk_overlap,
-            ),
+            #),
            MarkdownNodeParser(),
            Settings.embed_model,
        ],
        docstore=docstore,
@@ -86,6 +87,7 @@ def generate_datasource():
        # Build the index and persist storage
        persist_storage(docstore, vector_store)
        persist_BMRetriever(vector_store)
    logger.info("Finished generating the index")
@@ -24,13 +24,13 @@ class HybridRetriever(BaseRetriever):
        self._vecRetriever = vector_index.as_retriever(
            similarity_top_k=similarity_top_k,filters = filters
        )
-
+        self._bm25Retriever = None
        STORAGE_DIR = os.getenv("BM_RETRIEVER_PATH", "storage_bm")
        if os.path.exists(STORAGE_DIR) and len(os.listdir(STORAGE_DIR)) > 0:
            self._bm25Retriever = CHBM25Retriever.from_persist_dir(STORAGE_DIR)
        else:
-            bmRetriver = CHBM25Retriever.from_defaults(similarity_top_k=similarity_top_k,nodes=self._vector_index.vector_store.get_nodes(None))
+            self._bm25Retriever = CHBM25Retriever.from_defaults(similarity_top_k=similarity_top_k,nodes=self._vector_index.vector_store.get_nodes(None))
-            bmRetriver.persist(STORAGE_DIR)
+            self._bm25Retriever.persist(STORAGE_DIR)
        self._alpha = alpha