修改关键词检索溢出问题
This commit is contained in:
@@ -10,7 +10,7 @@ from app.engine.vectordb import get_vector_store
|
|||||||
from app.settings import init_settings
|
from app.settings import init_settings
|
||||||
from app.engine.retriever.CHBM25Retriever import CHBM25Retriever
|
from app.engine.retriever.CHBM25Retriever import CHBM25Retriever
|
||||||
from llama_index.core.ingestion import IngestionPipeline
|
from llama_index.core.ingestion import IngestionPipeline
|
||||||
from llama_index.core.node_parser import SentenceSplitter
|
from llama_index.core.node_parser import SentenceSplitter,MarkdownNodeParser
|
||||||
from llama_index.core.settings import Settings
|
from llama_index.core.settings import Settings
|
||||||
from llama_index.core.storage import StorageContext
|
from llama_index.core.storage import StorageContext
|
||||||
from llama_index.core.storage.docstore import SimpleDocumentStore
|
from llama_index.core.storage.docstore import SimpleDocumentStore
|
||||||
@@ -35,10 +35,11 @@ def get_doc_store(docType:str):
|
|||||||
def run_pipeline(docstore, vector_store, documents):
|
def run_pipeline(docstore, vector_store, documents):
|
||||||
pipeline = IngestionPipeline(
|
pipeline = IngestionPipeline(
|
||||||
transformations=[
|
transformations=[
|
||||||
SentenceSplitter(
|
#SentenceSplitter(
|
||||||
chunk_size=Settings.chunk_size,
|
#chunk_size=Settings.chunk_size,
|
||||||
chunk_overlap=Settings.chunk_overlap,
|
#chunk_overlap=Settings.chunk_overlap,
|
||||||
),
|
#),
|
||||||
|
MarkdownNodeParser(),
|
||||||
Settings.embed_model,
|
Settings.embed_model,
|
||||||
],
|
],
|
||||||
docstore=docstore,
|
docstore=docstore,
|
||||||
@@ -86,6 +87,7 @@ def generate_datasource():
|
|||||||
|
|
||||||
# Build the index and persist storage
|
# Build the index and persist storage
|
||||||
persist_storage(docstore, vector_store)
|
persist_storage(docstore, vector_store)
|
||||||
|
persist_BMRetriever(vector_store)
|
||||||
|
|
||||||
logger.info("Finished generating the index")
|
logger.info("Finished generating the index")
|
||||||
|
|
||||||
|
|||||||
@@ -24,13 +24,13 @@ class HybridRetriever(BaseRetriever):
|
|||||||
self._vecRetriever = vector_index.as_retriever(
|
self._vecRetriever = vector_index.as_retriever(
|
||||||
similarity_top_k=similarity_top_k,filters = filters
|
similarity_top_k=similarity_top_k,filters = filters
|
||||||
)
|
)
|
||||||
|
self._bm25Retriever = None
|
||||||
STORAGE_DIR = os.getenv("BM_RETRIEVER_PATH", "storage_bm")
|
STORAGE_DIR = os.getenv("BM_RETRIEVER_PATH", "storage_bm")
|
||||||
if os.path.exists(STORAGE_DIR) and len(os.listdir(STORAGE_DIR)) > 0:
|
if os.path.exists(STORAGE_DIR) and len(os.listdir(STORAGE_DIR)) > 0:
|
||||||
self._bm25Retriever = CHBM25Retriever.from_persist_dir(STORAGE_DIR)
|
self._bm25Retriever = CHBM25Retriever.from_persist_dir(STORAGE_DIR)
|
||||||
else:
|
else:
|
||||||
bmRetriver = CHBM25Retriever.from_defaults(similarity_top_k=similarity_top_k,nodes=self._vector_index.vector_store.get_nodes(None))
|
self._bm25Retriever = CHBM25Retriever.from_defaults(similarity_top_k=similarity_top_k,nodes=self._vector_index.vector_store.get_nodes(None))
|
||||||
bmRetriver.persist(STORAGE_DIR)
|
self._bm25Retriever.persist(STORAGE_DIR)
|
||||||
self._alpha = alpha
|
self._alpha = alpha
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user