上传文件至 kg_lab_6.13

6.17 更新：增强检索工程数据时对复杂表达式的处理能力
2025-06-17 17:17:20 +08:00
parent 8a44b9780d
commit fad7c5de4a
3 changed files with 85 additions and 42 deletions
@@ -3,10 +3,20 @@ from vector_lab import intersection_of_three_lists
 from utils import find_target_item, find_target_items, pre_mapping, pre_mapping2
 import json
 # 样例
 # input_str1 = "杆塔总基数是多少？"
 # input_str2 = "单回路长度是多少？"
 # input_str3 = "计算一下角钢塔的塔材装材费"
 # input_str4 = "计算一下土石方总量"
 # input_str5 = "板式塔基的各类基础数量占总塔基数比例是多少？"
 # input_str6 = "基础混凝土总量是多少"
 # input_str7 = "计算一下本体工程机械费"
 # input_str8 = "项目建设技术服务费合计"
 # 初始化
 problem_rewrite = Problem_rewrite()
-from utils import extract_concrete_info, extract_query_prefix_list
+from utils import extract_concrete_info, extract_query_prefix_list, split_chinese_bracketed_phrases
 from chains_lab import question_answer, question_answer_calculation
@@ -76,15 +86,17 @@ while True:
        elif isinstance(question, list):
            ques = extract_query_prefix_list(question)
-            ques_info = extract_concrete_info(question)
+            temp = extract_concrete_info(question)
            ques_info = split_chinese_bracketed_phrases(temp[0])
            print(ques)
            retriever_info = []
-            for i in ques:
+            for idx, i in enumerate(ques):
-                response = booway_cypher_chain.invoke(ques)    
+                response = booway_cypher_chain.invoke(i)    
                # generated_cypher = response.get("intermediate_steps")[0]
                temp = response.get("result")
                retriever_info.append(temp)
            retriever_keywords = ques_info[0]
            calculation = ques_info[-1]
@@ -112,7 +112,7 @@ def pre_mapping(keywords, data):
                            if judge_exists(item, data):
                                temp0 = item
                                # temp0 = find_target_items(ceshi["指标描述"]["映射规则"], item, data)
-                                result.append(f"模糊查找一下【{temp0}】，换算规则：【{temp1}】,233")
+                                result.append(f"模糊查找一下【{temp0}】，换算规则：【{temp1}】")
                            else:
                                continue
@@ -125,45 +125,39 @@ def pre_mapping(keywords, data):
                return result
-
+# 
-def extract_concrete_info(outputs):
+def extract_query_prefix_list(text_list):
    import re
-    from typing import List
+    pattern = r'^.*?【[^】]*】'
    return [re.search(pattern, s).group() for s in text_list if re.search(pattern, s)]
-    """
+def extract_concrete_info(ceshi):
    从多个句子中提取第一个“【】”作为查找信息，最后一个“【】”作为换算规则，
    返回格式为：[合并的查找句子, 换算规则]
    """
    prefixes = []
    suffix = ''
    for item in outputs:
        matches = re.findall(r'【([^】]+)】', item)
        if len(matches) >= 2:
            prefixes.append(f"查找一下【{matches[0]}】")
            # 假设所有换算规则一致，取第一个即可
            if not suffix:
                suffix = f'换算规则：【{matches[-1]}】'
    if not prefixes or not suffix:
        return []
    return ['; '.join(prefixes), suffix]
 def extract_query_prefix_list(input_list):
    import re
-    """
+    keyword_list = []
-    输入一个字符串列表，提取每个字符串中符合格式的前缀内容（例如：'查找一下【样式】'）
+    rule_text = None
    for item in ceshi:
        # 提取关键词
        keyword_match = re.search(r'查找一下【(.*?)】', item)
        if keyword_match:
            keyword_list.append(keyword_match.group(1))
        # 提取多行规则，使用 DOTALL 模式使 . 匹配换行符
        rule_match = re.search(r'换算规则：【(.*?)】', item, re.DOTALL)
        if rule_match and rule_text is None:
            rule_text = rule_match.group(1)  # 只取第一个规则内容，假设所有项规则一致
    merged = f"模糊查找一下【{'；'.join(keyword_list)}】，换算规则：【{rule_text}】"
    return [merged]
 def split_chinese_bracketed_phrases(text):
    """Split *text* into consecutive chunks that each end with a 【...】 group.

    Every chunk consists of the characters that precede a 【...】 group
    (anything that is not '【') plus the group itself. Text after the last
    '】' is not returned. Leading/trailing whitespace of each chunk is
    stripped; other separators (e.g. a leading '，') are kept as-is.

    Args:
        text: The string to split.

    Returns:
        A list of stripped chunks in their order of appearance; empty when
        *text* contains no complete 【...】 group.
    """
    import re

    # One hit = the label in front of a bracket group + the 【...】 group.
    segment_re = re.compile(r'[^【]*?【[^】]*】')
    phrases = []
    for hit in segment_re.finditer(text):
        phrases.append(hit.group(0).strip())
    return phrases
    参数:
        input_list (list[str]): 包含描述性语句的字符串列表
    返回:
        list[str]: 提取出的前缀部分列表（如 '查找一下【大板式】'）
    """
    pattern = r'(查找一下【[^】]+】)'
    return [re.match(pattern, text).group(1) for text in input_list if re.match(pattern, text)]
@@ -1,12 +1,49 @@
 import os
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain.embeddings.base import Embeddings
 from openai import OpenAI
 import requests
 import httpx
 import logging
 class SiliconFlowEmbeddings(Embeddings):
    """Wrapper around a SiliconFlow embedding model served over HTTP.

    Texts are sent as a JSON POST to ``url``; the response is expected to be
    a JSON object with a ``data`` list whose items each carry an
    ``embedding`` entry.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "bge-m3",
        url: str = "http://10.1.16.39:9995/v1/embeddings",
        timeout=300.0,
    ):
        """Store the connection settings.

        Args:
            api_key: Token sent as ``Authorization: Bearer <api_key>``.
            model: Model name placed in the request payload.
            url: Full URL of the embeddings endpoint. Defaults to the
                address that was previously hard-coded.
            timeout: Value forwarded to ``requests.post(timeout=...)``
                (seconds, or a ``(connect, read)`` tuple). Pass ``None`` to
                wait indefinitely, which was the previous behaviour.
        """
        self.api_key = api_key
        self.model = model
        self.url = url
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

    def _embed(self, input):
        """POST ``input`` to the endpoint and return one vector per item.

        Args:
            input: List of strings to embed.

        Returns:
            The ``embedding`` value of every item in the response's ``data``
            list, in the order the server returned them.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the request exceeds ``self.timeout``.
        """
        payload = {
            "model": self.model,
            "input": input,
            "encoding_format": "float"
        }
        # Without a timeout, requests blocks forever on an unresponsive
        # server, so the timeout is always passed explicitly.
        response = requests.post(
            self.url,
            json=payload,
            headers=self.headers,
            timeout=self.timeout,
        )
        response.raise_for_status()
        data = response.json()
        return [item["embedding"] for item in data["data"]]

    def embed_documents(self, texts):
        """Embed a list of texts; returns a list of vectors."""
        return self._embed(texts)

    def embed_query(self, text):
        """Embed a single text; returns its vector."""
        return self._embed([text])[0]
 # embeddings = Embedding(url="http://10.1.16.39:9995/v1", api_key="xxx", model_name="bge-m3")
 embeddings  = SiliconFlowEmbeddings(api_key="xxx")
 with open("./data/data.txt", 'r', encoding='utf-8') as file:
    txt_list = [line.strip() for line in file]
-embedding_path = "/data/Z_LLM_data/Embed_data/bge-m3"
+# embedding_path = "/data/Z_LLM_data/Embed_data/bge-m3"
-embeddings = HuggingFaceEmbeddings(model_name=embedding_path)
+# embeddings = HuggingFaceEmbeddings(model_name=embedding_path)
 faiss_archived = "./data/faiss_data/data"
 vectorstore_txt_faiss = FAISS.from_texts(txt_list, embeddings)