import os
from typing import Optional, Any, Dict, List

from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.core.schema import NodeWithScore, QueryBundle

from app.engine.retriever.CHBM25Retriever import CHBM25Retriever


class HybridRetriever(BaseRetriever):
    """Fuse vector-similarity results with CHBM25Retriever results.

    Both result lists are min-max normalised independently and then
    combined as ``alpha * vector + (1 - alpha) * bm25``.
    """

    def __init__(
        self,
        vector_index: Any,
        similarity_top_k: int = 2,
        out_top_k: Optional[int] = None,
        alpha: float = 0.5,
        filters: Optional[Any] = None,
        **kwargs: Any,
    ) -> None:
        """Build the vector retriever and load or create the BM25 retriever.

        Args:
            vector_index: Index providing ``as_retriever``, ``vector_store``
                and ``_embed_model``.
            similarity_top_k: Number of hits requested from the vector
                retriever (and from a freshly built BM25 retriever).
            out_top_k: Number of fused results returned by ``_retrieve``;
                falls back to ``similarity_top_k`` when falsy.
            alpha: Weight of the vector score; the BM25 score is weighted
                by ``1 - alpha``.
            filters: Forwarded to ``vector_index.as_retriever``.
            **kwargs: Forwarded to ``BaseRetriever.__init__``.
        """
        super().__init__(**kwargs)
        self._vector_index = vector_index
        self._embed_model = vector_index._embed_model
        self._out_top_k = out_top_k or similarity_top_k
        self._alpha = alpha
        self._vecRetriever = vector_index.as_retriever(
            similarity_top_k=similarity_top_k, filters=filters
        )

        storage_dir = os.getenv("BM_RETRIEVER_PATH", "storage_bm")
        # isdir (rather than exists) so a stray file at this path does not
        # make os.listdir raise; it is treated like a missing store instead.
        if os.path.isdir(storage_dir) and os.listdir(storage_dir):
            # NOTE(review): similarity_top_k is not applied on this path; the
            # persisted retriever presumably keeps its own setting - confirm.
            self._bm25Retriever = CHBM25Retriever.from_persist_dir(storage_dir)
        else:
            nodes = self._vector_index.vector_store.get_nodes(None)
            # BM25 cannot return more hits than there are nodes.
            bm_top_k = min(len(nodes), similarity_top_k)
            self._bm25Retriever = CHBM25Retriever.from_defaults(
                similarity_top_k=bm_top_k, nodes=nodes
            )
            self._bm25Retriever.persist(storage_dir)

    @staticmethod
    def _score_of(node: NodeWithScore) -> float:
        """Return the node's score, treating a missing (None) score as 0.0."""
        return node.score if node.score is not None else 0.0

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve from both retrievers and return the fused top results.

        Args:
            query_bundle: Query whose ``query_str`` is sent to both retrievers.

        Returns:
            Up to ``out_top_k`` nodes sorted by descending fused score. The
            ``score`` attribute of every retrieved node is overwritten with
            its fused score.
        """
        vec_nodes: List[NodeWithScore] = self._vecRetriever.retrieve(
            query_bundle.query_str
        )
        bm_nodes: List[NodeWithScore] = self._bm25Retriever.retrieve(
            query_bundle.query_str
        )

        # BM25 hits indexed by node id; entries are popped as they are
        # matched so that only BM25-exclusive hits remain afterwards.
        bm_by_id: Dict[str, NodeWithScore] = {
            node.node_id: node for node in bm_nodes
        }

        vec_scores = [self._score_of(node) for node in vec_nodes]
        bm_scores = [self._score_of(node) for node in bm_nodes]
        vec_min = min(vec_scores) if vec_scores else 0
        vec_max = max(vec_scores) if vec_scores else 0
        bm_min = min(bm_scores) if bm_scores else 0
        bm_max = max(bm_scores) if bm_scores else 0

        vec_weight = self._alpha
        bm_weight = 1 - self._alpha

        scored = []
        for node in vec_nodes:
            bm_match = bm_by_id.pop(node.node_id, None)
            if bm_match is None:
                # No BM25 hit: contribute exactly 0 instead of normalising a
                # fake 0.0, which would go negative whenever bm_min > 0.
                bm_part = 0.0
            else:
                bm_part = self.normal_score(
                    self._score_of(bm_match), bm_min, bm_max
                )
            vec_part = self.normal_score(
                self._score_of(node), vec_min, vec_max
            )
            scored.append((vec_weight * vec_part + bm_weight * bm_part, node))

        # BM25-exclusive hits: normalise like every other BM25 score so the
        # unbounded raw value cannot dominate the fused ranking.
        for node in bm_by_id.values():
            bm_part = self.normal_score(self._score_of(node), bm_min, bm_max)
            scored.append((bm_weight * bm_part, node))

        scored.sort(key=lambda pair: pair[0], reverse=True)
        for fused_score, node in scored:
            node.score = fused_score
        return [node for _, node in scored][: self._out_top_k]

    def normal_score(self, score: float, min: float, max: float) -> float:  # noqa: A002
        """Min-max normalise ``score`` against the range ``[min, max]``.

        The parameter names shadow builtins but are kept so that existing
        keyword callers continue to work.

        Args:
            score: Raw score to normalise.
            min: Smallest score in the result list.
            max: Largest score in the result list.

        Returns:
            ``(score - min) / (max - min)``. When the range is degenerate
            (``min == max``) returns 1.0 for a positive score, else 0.0.
        """
        if min == max:
            return 1.0 if score > 0 else 0.0
        return (score - min) / (max - min)