上传文件至 /

4.3更新
This commit is contained in:
2025-04-03 17:24:18 +08:00
parent c152fb8714
commit 9b9453a1a3
2 changed files with 196 additions and 146 deletions
+13
View File
@@ -120,6 +120,17 @@ def get_keywords_v3(input_str):
else: else:
return None return None
def get_keywords_v4(input_str):
    """Return the text inside the first two 【...】 pairs of *input_str*.

    Args:
        input_str: Text to scan for full-width bracket pairs.

    Returns:
        A ``(first, second)`` tuple. Each element is the text captured
        inside the corresponding bracket pair, or ``None`` when the string
        contains fewer pairs than that position requires. An empty pair
        (``【】``) yields an empty string, not ``None``.
    """
    import re
    from itertools import islice

    # finditer is lazy, so the scan stops as soon as two pairs are found
    # instead of collecting every match in the string.
    leading = [
        match.group(1)
        for match in islice(re.finditer(r'【(.*?)】', input_str), 2)
    ]
    # Pad with None so the result always has exactly two slots.
    leading.extend([None] * (2 - len(leading)))
    first, second = leading
    return first, second
def normalize_text(text: str, synonym_dict: dict) -> str: def normalize_text(text: str, synonym_dict: dict) -> str:
import re import re
@@ -141,3 +152,5 @@ def normalize_text(text: str, synonym_dict: dict) -> str:
+40 -3
View File
@@ -1,9 +1,44 @@
import os import os
from langchain_community.vectorstores import FAISS from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings # from langchain_huggingface import HuggingFaceEmbeddings
embedding_path = "/data/Z_LLM_data/Embed_data/bge-m3" # embedding_path = "/data/Z/Z_llm_dm/vector_data/bge-m3"
embeddings = HuggingFaceEmbeddings(model_name=embedding_path) # embeddings = HuggingFaceEmbeddings(model_name=embedding_path)
from typing import List
import requests
from langchain.embeddings.base import Embeddings
class SiliconFlowEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that fetches vectors over HTTP.

    Texts are POSTed as JSON to ``url`` with a bearer token, and the vectors
    are read from the ``embedding`` field of each item under ``data`` in the
    JSON reply.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "bge-m3",
        url: str = "http://10.1.16.39:9995/v1/embeddings",
        timeout: float = 60.0,
    ):
        """Store the connection settings; no request is made here.

        Args:
            api_key: Token sent in the ``Authorization: Bearer`` header.
            model: Model name placed in the request payload.
            url: Embeddings endpoint to POST to.
            timeout: Seconds to wait for the server before giving up.
        """
        self.api_key = api_key
        self.model = model
        self.url = url
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

    def _embed(self, input: List[str]) -> List[List[float]]:
        """Embed a batch of texts with a single HTTP request.

        Args:
            input: Texts to embed.

        Returns:
            One vector per item of the response's ``data`` list, in the
            order the server returned them.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no response arrives within ``self.timeout``.
        """
        # Nothing to embed: skip the network round trip entirely.
        if not input:
            return []
        payload = {
            "model": self.model,
            "input": input,
            "encoding_format": "float"
        }
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.post(
            self.url,
            json=payload,
            headers=self.headers,
            timeout=self.timeout,
        )
        response.raise_for_status()
        data = response.json()
        return [item["embedding"] for item in data["data"]]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding vector per text in *texts*."""
        return self._embed(list(texts))

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding vector for a single query string."""
        return self._embed([text])[0]
# SECURITY: an API key committed to source control should be treated as
# leaked. Set SILICONFLOW_API_KEY in the environment to override it; the
# literal fallback is kept only so existing deployments keep working.
# TODO: rotate this key and remove the fallback.
embeddings = SiliconFlowEmbeddings(
    api_key=os.environ.get(
        "SILICONFLOW_API_KEY",
        "sk-ftnofbucchwnscojohyxwmfzgaykdxihafnlphohsinftkbr",
    )
)
def Mixed_retrieval(input_path): def Mixed_retrieval(input_path):
file_name = os.path.splitext(os.path.basename(input_path))[0] file_name = os.path.splitext(os.path.basename(input_path))[0]
@@ -33,6 +68,8 @@ def Mixed_retrieval(input_path):
return retriever_txt_faiss1, retriever_txt_faiss2, retriever_txt_faiss3 return retriever_txt_faiss1, retriever_txt_faiss2, retriever_txt_faiss3
def interface_search(input_str, retriever_txt_faiss1, retriever_txt_faiss2, retriever_txt_faiss3): def interface_search(input_str, retriever_txt_faiss1, retriever_txt_faiss2, retriever_txt_faiss3):
index_keyword1 = [] index_keyword1 = []
for i in retriever_txt_faiss1.invoke(input_str): for i in retriever_txt_faiss1.invoke(input_str):