上传文件至 kg_lab_6.13
6.17 更新:增强对工程数据复杂表达式的检索能力
This commit is contained in:
+17
-5
@@ -3,10 +3,20 @@ from vector_lab import intersection_of_three_lists
|
|||||||
from utils import find_target_item, find_target_items, pre_mapping, pre_mapping2
|
from utils import find_target_item, find_target_items, pre_mapping, pre_mapping2
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
# 样例
|
||||||
|
# input_str1 = "杆塔总基数是多少?"
|
||||||
|
# input_str2 = "单回路长度是多少?"
|
||||||
|
# input_str3 = "计算一下角钢塔的塔材装材费"
|
||||||
|
# input_str4 = "计算一下土石方总量"
|
||||||
|
# input_str5 = "板式塔基的各类基础数量占总塔基数比例是多少?"
|
||||||
|
# input_str6 = "基础混凝土总量是多少"
|
||||||
|
# input_str7 = "计算一下本体工程机械费"
|
||||||
|
# input_str8 = "项目建设技术服务费合计"
|
||||||
|
|
||||||
# 初始化
|
# 初始化
|
||||||
problem_rewrite = Problem_rewrite()
|
problem_rewrite = Problem_rewrite()
|
||||||
|
|
||||||
from utils import extract_concrete_info, extract_query_prefix_list
|
from utils import extract_concrete_info, extract_query_prefix_list, split_chinese_bracketed_phrases
|
||||||
|
|
||||||
from chains_lab import question_answer, question_answer_calculation
|
from chains_lab import question_answer, question_answer_calculation
|
||||||
|
|
||||||
@@ -76,15 +86,17 @@ while True:
|
|||||||
|
|
||||||
elif isinstance(question, list):
|
elif isinstance(question, list):
|
||||||
ques = extract_query_prefix_list(question)
|
ques = extract_query_prefix_list(question)
|
||||||
ques_info = extract_concrete_info(question)
|
temp = extract_concrete_info(question)
|
||||||
|
ques_info = split_chinese_bracketed_phrases(temp[0])
|
||||||
|
print(ques)
|
||||||
|
|
||||||
retriever_info = []
|
retriever_info = []
|
||||||
for i in ques:
|
for idx, i in enumerate(ques):
|
||||||
response = booway_cypher_chain.invoke(ques)
|
response = booway_cypher_chain.invoke(i)
|
||||||
# generated_cypher = response.get("intermediate_steps")[0]
|
|
||||||
temp = response.get("result")
|
temp = response.get("result")
|
||||||
retriever_info.append(temp)
|
retriever_info.append(temp)
|
||||||
|
|
||||||
|
|
||||||
retriever_keywords = ques_info[0]
|
retriever_keywords = ques_info[0]
|
||||||
calculation = ques_info[-1]
|
calculation = ques_info[-1]
|
||||||
|
|
||||||
|
|||||||
+29
-35
@@ -112,7 +112,7 @@ def pre_mapping(keywords, data):
|
|||||||
if judge_exists(item, data):
|
if judge_exists(item, data):
|
||||||
temp0 = item
|
temp0 = item
|
||||||
# temp0 = find_target_items(ceshi["指标描述"]["映射规则"], item, data)
|
# temp0 = find_target_items(ceshi["指标描述"]["映射规则"], item, data)
|
||||||
result.append(f"模糊查找一下【{temp0}】,换算规则:【{temp1}】,233")
|
result.append(f"模糊查找一下【{temp0}】,换算规则:【{temp1}】")
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -125,45 +125,39 @@ def pre_mapping(keywords, data):
|
|||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
#
|
||||||
def extract_concrete_info(outputs):
|
def extract_query_prefix_list(text_list):
|
||||||
import re
|
import re
|
||||||
from typing import List
|
pattern = r'^.*?【[^】]*】'
|
||||||
|
return [re.search(pattern, s).group() for s in text_list if re.search(pattern, s)]
|
||||||
|
|
||||||
"""
|
def extract_concrete_info(ceshi):
|
||||||
从多个句子中提取第一个“【】”作为查找信息,最后一个“【】”作为换算规则,
|
|
||||||
返回格式为:[合并的查找句子, 换算规则]
|
|
||||||
"""
|
|
||||||
prefixes = []
|
|
||||||
suffix = ''
|
|
||||||
|
|
||||||
for item in outputs:
|
|
||||||
matches = re.findall(r'【([^】]+)】', item)
|
|
||||||
if len(matches) >= 2:
|
|
||||||
prefixes.append(f"查找一下【{matches[0]}】")
|
|
||||||
# 假设所有换算规则一致,取第一个即可
|
|
||||||
if not suffix:
|
|
||||||
suffix = f'换算规则:【{matches[-1]}】'
|
|
||||||
|
|
||||||
if not prefixes or not suffix:
|
|
||||||
return []
|
|
||||||
|
|
||||||
return ['; '.join(prefixes), suffix]
|
|
||||||
|
|
||||||
|
|
||||||
def extract_query_prefix_list(input_list):
|
|
||||||
import re
|
import re
|
||||||
"""
|
keyword_list = []
|
||||||
输入一个字符串列表,提取每个字符串中符合格式的前缀内容(例如:'查找一下【样式】')
|
rule_text = None
|
||||||
|
|
||||||
|
for item in ceshi:
|
||||||
|
# 提取关键词
|
||||||
|
keyword_match = re.search(r'查找一下【(.*?)】', item)
|
||||||
|
if keyword_match:
|
||||||
|
keyword_list.append(keyword_match.group(1))
|
||||||
|
|
||||||
|
# 提取多行规则,使用 DOTALL 模式使 . 匹配换行符
|
||||||
|
rule_match = re.search(r'换算规则:【(.*?)】', item, re.DOTALL)
|
||||||
|
if rule_match and rule_text is None:
|
||||||
|
rule_text = rule_match.group(1) # 只取第一个规则内容,假设所有项规则一致
|
||||||
|
|
||||||
|
merged = f"模糊查找一下【{';'.join(keyword_list)}】,换算规则:【{rule_text}】"
|
||||||
|
return [merged]
|
||||||
|
|
||||||
|
def split_chinese_bracketed_phrases(text):
    """Split *text* into consecutive segments that each end with a 【...】 group.

    A segment is the run of non-'【' characters preceding a bracketed group,
    followed by the group itself. Leading and trailing whitespace is stripped
    from every segment. Anything after the last '】' is not returned, and a
    text without a complete 【...】 group yields an empty list.
    """
    import re

    # Label text (anything that is not an opening bracket) plus the 【...】 group.
    segment_re = re.compile(r'[^【]*?【[^】]*】')
    return [found.group().strip() for found in segment_re.finditer(text)]
|
||||||
|
|
||||||
参数:
|
|
||||||
input_list (list[str]): 包含描述性语句的字符串列表
|
|
||||||
|
|
||||||
返回:
|
|
||||||
list[str]: 提取出的前缀部分列表(如 '查找一下【大板式】')
|
|
||||||
"""
|
|
||||||
pattern = r'(查找一下【[^】]+】)'
|
|
||||||
return [re.match(pattern, text).group(1) for text in input_list if re.match(pattern, text)]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,12 +1,49 @@
|
|||||||
import os
|
import os
|
||||||
from langchain_community.vectorstores import FAISS
|
from langchain_community.vectorstores import FAISS
|
||||||
from langchain_huggingface import HuggingFaceEmbeddings
|
from langchain_huggingface import HuggingFaceEmbeddings
|
||||||
|
from langchain.embeddings.base import Embeddings
|
||||||
|
from openai import OpenAI
|
||||||
|
import requests
|
||||||
|
import httpx
|
||||||
|
import logging
|
||||||
|
|
||||||
|
class SiliconFlowEmbeddings(Embeddings):
    """Embeddings wrapper that fetches vectors from an HTTP embeddings endpoint.

    Each call POSTs a JSON payload with ``model``, ``input`` and
    ``encoding_format`` to ``self.url`` and reads the vectors from the
    ``data[*].embedding`` entries of the JSON response.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "bge-m3",
        url: str = "http://10.1.16.39:9995/v1/embeddings",
        timeout=(10, 300),
    ):
        """Store connection settings and pre-build the request headers.

        Args:
            api_key: Token sent as ``Authorization: Bearer <api_key>``.
            model: Model name placed in the request payload.
            url: Full URL of the embeddings endpoint.
            timeout: Value forwarded to ``requests.post(timeout=...)``; either
                a number of seconds or a ``(connect, read)`` tuple. Pass
                ``None`` to wait indefinitely.
        """
        self.api_key = api_key
        self.model = model
        self.url = url
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

    # NOTE: the parameter name ``input`` shadows the builtin; it is kept
    # unchanged so existing keyword callers keep working.
    def _embed(self, input):
        """POST *input* to the endpoint and return one embedding per item.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the request exceeds ``self.timeout``.
        """
        payload = {
            "model": self.model,
            "input": input,
            "encoding_format": "float"
        }
        # Without a timeout an unreachable server would block this call forever.
        response = requests.post(
            self.url,
            json=payload,
            headers=self.headers,
            timeout=self.timeout,
        )
        response.raise_for_status()
        data = response.json()
        return [item["embedding"] for item in data["data"]]

    def embed_documents(self, texts):
        """Return the embeddings for a list of texts."""
        # Nothing to embed: skip the network round trip entirely.
        if not texts:
            return []
        return self._embed(texts)

    def embed_query(self, text):
        """Return the embedding for a single query string."""
        return self._embed([text])[0]
|
||||||
|
|
||||||
|
|
||||||
|
# embeddings = Embedding(url="http://10.1.16.39:9995/v1", api_key="xxx", model_name="bge-m3")
|
||||||
|
embeddings = SiliconFlowEmbeddings(api_key="xxx")
|
||||||
|
|
||||||
with open("./data/data.txt", 'r', encoding='utf-8') as file:
|
with open("./data/data.txt", 'r', encoding='utf-8') as file:
|
||||||
txt_list = [line.strip() for line in file]
|
txt_list = [line.strip() for line in file]
|
||||||
|
|
||||||
embedding_path = "/data/Z_LLM_data/Embed_data/bge-m3"
|
# embedding_path = "/data/Z_LLM_data/Embed_data/bge-m3"
|
||||||
embeddings = HuggingFaceEmbeddings(model_name=embedding_path)
|
# embeddings = HuggingFaceEmbeddings(model_name=embedding_path)
|
||||||
|
|
||||||
faiss_archived = "./data/faiss_data/data"
|
faiss_archived = "./data/faiss_data/data"
|
||||||
vectorstore_txt_faiss = FAISS.from_texts(txt_list, embeddings)
|
vectorstore_txt_faiss = FAISS.from_texts(txt_list, embeddings)
|
||||||
|
|||||||
Reference in New Issue
Block a user