517691c2d6
6.18 更新数据配置路径统一,和前端demo
78 lines
2.7 KiB
Python
78 lines
2.7 KiB
Python
import os
|
|
from langchain_community.vectorstores import FAISS
|
|
from langchain_huggingface import HuggingFaceEmbeddings
|
|
from langchain.embeddings.base import Embeddings
|
|
from openai import OpenAI
|
|
import requests
|
|
import httpx
|
|
import logging
|
|
|
|
from extraction_info import info_data_txt, info_faiss_archived
|
|
|
|
class SiliconFlowEmbeddings(Embeddings):
    """Embeddings wrapper that calls a SiliconFlow-style HTTP embeddings endpoint.

    Each call POSTs a JSON payload (``model``, ``input``, ``encoding_format``)
    to ``url`` with a bearer token and reads the vectors from the ``data``
    list of the JSON response.

    Args:
        api_key: Token sent in the ``Authorization: Bearer`` header.
        model: Model name placed in the request payload.
        url: Full URL of the embeddings endpoint. Defaults to the address
            that was previously hard-coded, so existing callers are unaffected.
        timeout: Value passed to ``requests.post(timeout=...)``; a
            ``(connect, read)`` tuple in seconds or a single number. Without
            a timeout a stalled server would block the caller forever.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "bge-m3",
        url: str = "http://10.1.16.39:9995/v1/embeddings",
        timeout=(10, 300),
    ):
        self.api_key = api_key
        self.model = model
        self.url = url
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

    def _embed(self, input):
        """POST ``input`` (a list of strings) and return one vector per item.

        The parameter keeps the name ``input`` because it mirrors the request
        field it is sent as.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not respond within ``timeout``.
        """
        payload = {
            "model": self.model,
            "input": input,
            "encoding_format": "float"
        }
        response = requests.post(
            self.url,
            json=payload,
            headers=self.headers,
            timeout=self.timeout,
        )
        response.raise_for_status()
        data = response.json()
        return [item["embedding"] for item in data["data"]]

    def embed_documents(self, texts):
        """Return the embedding vectors for ``texts``, one per input text."""
        texts = list(texts)
        if not texts:
            # Nothing to embed: skip the network round trip entirely.
            return []
        return self._embed(texts)

    def embed_query(self, text):
        """Return the embedding vector for a single query string."""
        return self._embed([text])[0]
|
|
|
|
|
|
# Embedding client used to build the index.
# NOTE(review): "xxx" looks like a placeholder API key - confirm before deploying.
# Alternative client kept for reference:
# embeddings = Embedding(url="http://10.1.16.39:9995/v1", api_key="xxx", model_name="bge-m3")
embeddings = SiliconFlowEmbeddings(api_key="xxx")

# Load the corpus: one text entry per line, surrounding whitespace removed.
# Blank lines are skipped so that no empty string is sent to the embedding
# service or stored in the index.
with open(info_data_txt, 'r', encoding='utf-8') as file:
    txt_list = [line for line in map(str.strip, file) if line]

# Alternative: embed with a local HuggingFace model instead of the HTTP service.
# embedding_path = "/data/Z_LLM_data/Embed_data/bge-m3"
# embeddings = HuggingFaceEmbeddings(model_name=embedding_path)

# Build the FAISS index from the texts and persist it to disk.
# faiss_archived = "./data/faiss_data/data"
vectorstore_txt_faiss = FAISS.from_texts(txt_list, embeddings)
vectorstore_txt_faiss.save_local(info_faiss_archived)

# Retriever 1: plain similarity search, top 3 results.
retriever_txt_faiss1 = vectorstore_txt_faiss.as_retriever(search_kwargs={"k": 3})

# Retriever 2: maximal marginal relevance (MMR) search.
retriever_txt_faiss2 = vectorstore_txt_faiss.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 3,  # number of results returned
        # Size of the candidate pool MMR selects from. It must be at least
        # k; the previous value of 1 limited this retriever to one result.
        "fetch_k": 20,
        "lambda_mult": 0.5,  # balance factor: 1 = relevance, 0 = diversity
    },
)

# Retriever 3: only results whose relevance score reaches the threshold.
retriever_txt_faiss3 = vectorstore_txt_faiss.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.5}
)
|
|
|
|
def intersection_of_three_lists(input_str):
    """Return the texts that all three retrievers agree on for ``input_str``.

    The query is run through ``retriever_txt_faiss1``, ``retriever_txt_faiss2``
    and ``retriever_txt_faiss3``; only ``page_content`` values present in all
    three result lists are kept.

    Args:
        input_str: Query string passed to each retriever's ``invoke``.

    Returns:
        A list of unique ``page_content`` strings, in the order in which
        ``retriever_txt_faiss1`` returned them. (A bare set intersection would
        yield the same elements in an arbitrary order that can differ between
        runs.)
    """

    def _page_contents(retrieval_results):
        # Reduce retrieved documents to their text payload.
        return [doc.page_content for doc in retrieval_results]

    ranked_texts = _page_contents(retriever_txt_faiss1.invoke(input_str))
    mmr_texts = set(_page_contents(retriever_txt_faiss2.invoke(input_str)))
    threshold_texts = set(_page_contents(retriever_txt_faiss3.invoke(input_str)))

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return [
        text
        for text in dict.fromkeys(ranked_texts)
        if text in mmr_texts and text in threshold_texts
    ]