上传文件至 /

4.3更新
2025-04-03 17:24:18 +08:00
parent c152fb8714
commit 9b9453a1a3
2 changed files with 196 additions and 146 deletions
@@ -120,6 +120,17 @@ def get_keywords_v3(input_str):
    else:
        return None
 def get_keywords_v4(input_str):
    """Return the contents of the first two 【...】 groups in ``input_str``.

    The result is always a 2-tuple ``(first, second)``; a position for which
    no bracketed group exists is filled with ``None``.
    """
    import re

    # Keep at most the first two captures, then pad with None so that both
    # tuple positions always exist.
    found = re.findall(r'【(.*?)】', input_str)[:2]
    found += [None] * (2 - len(found))
    return found[0], found[1]
 def normalize_text(text: str, synonym_dict: dict) -> str:
    import re
@@ -141,3 +152,5 @@ def normalize_text(text: str, synonym_dict: dict) -> str:
@@ -1,9 +1,44 @@
 import os
 from langchain_community.vectorstores import FAISS
-from langchain_huggingface import HuggingFaceEmbeddings
+# from langchain_huggingface import HuggingFaceEmbeddings
-embedding_path = "/data/Z_LLM_data/Embed_data/bge-m3"
+# embedding_path = "/data/Z/Z_llm_dm/vector_data/bge-m3"
-embeddings = HuggingFaceEmbeddings(model_name=embedding_path)
+# embeddings = HuggingFaceEmbeddings(model_name=embedding_path)
 from typing import List
 import requests
 from langchain.embeddings.base import Embeddings
 class SiliconFlowEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by an HTTP embeddings endpoint.

    Each call POSTs a JSON payload (``model``, ``input``, ``encoding_format``)
    to ``url`` with a bearer token and reads the vectors from the ``data``
    list of the JSON response.

    Args:
        api_key: Token sent in the ``Authorization: Bearer`` header.
        model: Model name forwarded in the request payload.
        url: Endpoint to POST to. Defaults to the address this class
            previously hard-coded.
        timeout: Seconds passed to ``requests.post`` as ``timeout``. Without
            it an unresponsive server would block the caller indefinitely.
    """

    def __init__(self, api_key: str, model: str = "bge-m3",
                 url: str = "http://10.1.16.39:9995/v1/embeddings",
                 timeout: float = 300.0):
        self.api_key = api_key
        self.model = model
        self.url = url
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

    def _embed(self, input: List[str]) -> List[List[float]]:
        """Request embeddings for ``input`` and return one vector per text.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not respond within
                ``self.timeout`` seconds.
        """
        # Nothing to embed: skip the network round trip entirely.
        if not input:
            return []
        payload = {
            "model": self.model,
            "input": input,
            "encoding_format": "float"
        }
        response = requests.post(
            self.url, json=payload, headers=self.headers, timeout=self.timeout
        )
        response.raise_for_status()
        items = response.json()["data"]
        # If the server tags each item with an "index", honour it so vectors
        # line up with the input texts; the sort is stable, so items without
        # an index keep the order in which they were returned.
        items = sorted(items, key=lambda item: item.get("index", 0))
        return [item["embedding"] for item in items]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of documents in a single request."""
        return self._embed(texts)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string and return its vector."""
        return self._embed([text])[0]
 # SECURITY NOTE(review): an API key is committed in source on this line. The
 # literal is kept only as a fallback so existing deployments keep working;
 # set SILICONFLOW_API_KEY in the environment to override it, then rotate the
 # key and remove the literal.
 embeddings = SiliconFlowEmbeddings(
     api_key=os.environ.get(
         "SILICONFLOW_API_KEY",
         "sk-ftnofbucchwnscojohyxwmfzgaykdxihafnlphohsinftkbr",
     )
 )
 def Mixed_retrieval(input_path):
    file_name = os.path.splitext(os.path.basename(input_path))[0]
@@ -33,6 +68,8 @@ def Mixed_retrieval(input_path):
    return retriever_txt_faiss1, retriever_txt_faiss2, retriever_txt_faiss3
 def interface_search(input_str, retriever_txt_faiss1, retriever_txt_faiss2, retriever_txt_faiss3):
    index_keyword1 = []
    for i in retriever_txt_faiss1.invoke(input_str):