Files
zjdataai-app/backend/app/engine/retriever/HybridRetriever.py
T
2024-09-06 08:23:17 +08:00

89 lines
3.3 KiB
Python

import os
from typing import Optional, Any, Dict, List
from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.core.schema import NodeWithScore, QueryBundle
from app.engine.retriever.CHBM25Retriever import CHBM25Retriever
class HybridRetriever(BaseRetriever):
    """Retriever that fuses vector-similarity results with BM25 keyword results.

    Each query is sent to both a vector retriever (built from ``vector_index``)
    and a ``CHBM25Retriever``. Scores from each retriever are min-max normalized
    and combined as ``alpha * vector + (1 - alpha) * bm25``. A node returned by
    only one retriever gets a contribution of 0 from the other.
    """

    def __init__(
        self,
        vector_index,
        similarity_top_k: int = 2,
        out_top_k: Optional[int] = None,
        alpha: float = 0.5,
        filters=None,
        **kwargs: Any,
    ) -> None:
        """Build the vector retriever and load or create the BM25 retriever.

        Args:
            vector_index: Index providing ``as_retriever`` and ``vector_store``.
            similarity_top_k: Number of hits requested from the vector
                retriever (and from a newly built BM25 retriever).
            out_top_k: Maximum number of fused results returned by
                ``_retrieve``; falls back to ``similarity_top_k`` when falsy.
            alpha: Weight of the vector score; ``1 - alpha`` weights BM25.
            filters: Forwarded unchanged to ``vector_index.as_retriever``.
            **kwargs: Forwarded to ``BaseRetriever.__init__``.
        """
        super().__init__(**kwargs)
        self._vector_index = vector_index
        self._embed_model = vector_index._embed_model
        self._out_top_k = out_top_k or similarity_top_k
        self._alpha = alpha
        self._vecRetriever = vector_index.as_retriever(
            similarity_top_k=similarity_top_k, filters=filters
        )

        self._bm25Retriever = None
        storage_dir = os.getenv("BM_RETRIEVER_PATH", "storage_bm")
        if os.path.exists(storage_dir) and os.listdir(storage_dir):
            # Reuse the persisted BM25 index.
            # NOTE(review): similarity_top_k is not applied on this path; the
            # loaded retriever presumably keeps whatever was persisted - confirm.
            self._bm25Retriever = CHBM25Retriever.from_persist_dir(storage_dir)
        else:
            # No persisted index yet: build one from every node in the vector
            # store and persist it for subsequent start-ups.
            nodes = self._vector_index.vector_store.get_nodes(None)
            bm_top_k = min(len(nodes), similarity_top_k)
            self._bm25Retriever = CHBM25Retriever.from_defaults(
                similarity_top_k=bm_top_k, nodes=nodes
            )
            self._bm25Retriever.persist(storage_dir)

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Run both retrievers and return the fused, re-scored top results.

        Args:
            query_bundle: The query; only ``query_str`` is forwarded to the
                underlying retrievers.

        Returns:
            At most ``out_top_k`` nodes sorted by fused score (descending).
            Each returned node's ``score`` is overwritten with the fused score.
        """
        # NOTE(review): only the query string is forwarded, so anything else
        # carried by the bundle (e.g. a precomputed embedding) is not passed on.
        vec_nodes: List[NodeWithScore] = self._vecRetriever.retrieve(
            query_bundle.query_str
        )
        bm_nodes: List[NodeWithScore] = self._bm25Retriever.retrieve(
            query_bundle.query_str
        )

        bm_by_id: Dict[str, NodeWithScore] = {n.node_id: n for n in bm_nodes}

        # A score may be None; treat it as 0.0 so min/max and arithmetic work.
        vec_scores = [n.score or 0.0 for n in vec_nodes]
        bm_scores = [n.score or 0.0 for n in bm_nodes]
        vec_min = min(vec_scores, default=0)
        vec_max = max(vec_scores, default=0)
        bm_min = min(bm_scores, default=0)
        bm_max = max(bm_scores, default=0)

        alpha = self._alpha
        fused = []

        # Nodes found by the vector retriever (optionally also by BM25).
        for node, raw_vec in zip(vec_nodes, vec_scores):
            vec_part = self.normal_score(raw_vec, vec_min, vec_max)
            bm_hit = bm_by_id.pop(node.node_id, None)
            if bm_hit is None:
                # No keyword match: contribute exactly 0 instead of pushing 0
                # through min-max normalization, which would go negative
                # whenever the smallest BM25 score is positive.
                bm_part = 0.0
            else:
                bm_part = self.normal_score(bm_hit.score or 0.0, bm_min, bm_max)
            fused.append((alpha * vec_part + (1 - alpha) * bm_part, node))

        # Nodes found only by BM25. Normalize before weighting so they are on
        # the same [0, 1] scale as every other contribution.
        for node in bm_by_id.values():
            bm_part = self.normal_score(node.score or 0.0, bm_min, bm_max)
            fused.append(((1 - alpha) * bm_part, node))

        fused.sort(key=lambda pair: pair[0], reverse=True)
        for fused_score, node in fused:
            node.score = fused_score
        return [node for _, node in fused][: self._out_top_k]

    def normal_score(self, score, min, max):
        """Min-max normalize ``score`` into the range spanned by ``min``/``max``.

        When ``min == max`` the range is degenerate; the result is then 1.0
        for a positive score and 0.0 otherwise.

        The parameter names shadow the builtins but are kept so existing
        keyword callers continue to work.
        """
        if min == max:
            return 1.0 if score > 0 else 0.0
        return (score - min) / (max - min)