import re

# Shortest prefix of a string up to and including its first 【...】 group.
_QUERY_PREFIX_RE = re.compile(r'^.*?【[^】]*】')
# Keyword carried by a "查找一下【...】" marker.
_KEYWORD_RE = re.compile(r'查找一下【(.*?)】')
# Rule body of a "换算规则:【...】" marker; DOTALL because a rule may span lines.
_RULE_RE = re.compile(r'换算规则:【(.*?)】', re.DOTALL)
# One 【...】 group together with whatever label text precedes it.
_BRACKETED_PHRASE_RE = re.compile(r'[^【]*?【[^】]*】')
# Whitespace / list delimiters that join two phrases and are not part of a label.
_LEADING_SEPARATOR_RE = re.compile(r'^[\s,,;;、]+')


def extract_query_prefix_list(text_list):
    """Return the leading "...【...】" part of every string that has one.

    Args:
        text_list: Iterable of strings such as
            ``"查找一下【大板式】,换算规则:【...】"``.

    Returns:
        A list with, for each input string that contains a 【...】 group, the
        text from the start of the string through the first closing ``】``.
        Strings without such a group are skipped, so the result may be
        shorter than the input.
    """
    prefixes = []
    for text in text_list:
        # Search once per item and reuse the match object.
        match = _QUERY_PREFIX_RE.search(text)
        if match:
            prefixes.append(match.group())
    return prefixes


def extract_concrete_info(ceshi):
    """Merge several "查找一下【kw】,换算规则:【rule】" sentences into one.

    Args:
        ceshi: Iterable of sentences. Each may contain a ``查找一下【...】``
            keyword marker and/or a ``换算规则:【...】`` rule marker.

    Returns:
        A one-element list holding
        ``"模糊查找一下【kw1;kw2;...】,换算规则:【rule】"``. Keywords are joined
        with ``;`` in input order. Only the first rule found is used (all
        items are assumed to share the same rule). When no rule is present
        the rule brackets are left empty instead of containing the text
        ``None``.
    """
    keywords = []
    rule_text = None

    for item in ceshi:
        keyword_match = _KEYWORD_RE.search(item)
        if keyword_match:
            keywords.append(keyword_match.group(1))

        # Keep only the first rule; later items are assumed to repeat it.
        if rule_text is None:
            rule_match = _RULE_RE.search(item)
            if rule_match:
                rule_text = rule_match.group(1)

    # Never leak the literal "None" into text that is passed on downstream.
    rule = "" if rule_text is None else rule_text
    # A single-element list is kept because callers index the result with [0].
    return [f"模糊查找一下【{';'.join(keywords)}】,换算规则:【{rule}】"]


def split_chinese_bracketed_phrases(text):
    """Split *text* into consecutive "label【content】" phrases.

    Args:
        text: A string such as ``"模糊查找一下【A;B】,换算规则:【rule】"``.
            ``None`` or an empty string is accepted.

    Returns:
        The phrases in order of appearance, each consisting of a 【...】 group
        and the label text in front of it. Delimiters that merely join two
        phrases (commas, semicolons, whitespace) are removed from the start
        of each phrase. Returns ``[]`` when there is nothing to split.
    """
    if not text:
        return []
    return [
        _LEADING_SEPARATOR_RE.sub('', phrase).rstrip()
        for phrase in _BRACKETED_PHRASE_RE.findall(text)
    ]
import httpx
import logging

import requests
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import FAISS


class SiliconFlowEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that fetches vectors over HTTP.

    Each call POSTs a JSON body with ``model``, ``input`` and
    ``encoding_format`` to ``url`` and reads the vectors from the
    ``data[*].embedding`` fields of the JSON response.

    Args:
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        model: Model name placed in the request payload.
        url: Full URL of the embeddings endpoint.
        timeout: Value forwarded to ``requests.post(timeout=...)``; either a
            number of seconds or a ``(connect, read)`` tuple.
    """

    DEFAULT_URL = "http://10.1.16.39:9995/v1/embeddings"
    # Without a timeout ``requests`` waits forever on a stalled server. The
    # read timeout is generous because a whole corpus may be sent at once.
    DEFAULT_TIMEOUT = (10, 300)

    def __init__(self, api_key: str, model: str = "bge-m3",
                 url: str = DEFAULT_URL, timeout=DEFAULT_TIMEOUT):
        super().__init__()
        self.api_key = api_key
        self.model = model
        self.url = url
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

    def _embed(self, input):
        """Request embeddings for a list of strings.

        The parameter keeps its original name ``input`` (it mirrors the
        payload field) so existing keyword callers keep working.

        Returns:
            One embedding per response item, in the order the server
            returned them. An empty ``input`` yields ``[]`` without any
            HTTP request.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not answer within
                ``self.timeout``.
        """
        if not input:
            # Nothing to embed; skip the network round-trip.
            return []
        payload = {
            "model": self.model,
            "input": input,
            "encoding_format": "float"
        }
        response = requests.post(
            self.url, json=payload, headers=self.headers, timeout=self.timeout
        )
        response.raise_for_status()
        data = response.json()
        return [item["embedding"] for item in data["data"]]

    def embed_documents(self, texts):
        """Return one embedding per entry of *texts* (single request)."""
        return self._embed(texts)

    def embed_query(self, text):
        """Return the embedding of a single query string."""
        return self._embed([text])[0]


# embeddings = Embedding(url="http://10.1.16.39:9995/v1", api_key="xxx", model_name="bge-m3")
embeddings = SiliconFlowEmbeddings(api_key="xxx")

with open("./data/data.txt", 'r', encoding='utf-8') as file:
    txt_list = [line.strip() for line in file]

# embedding_path = "/data/Z_LLM_data/Embed_data/bge-m3"
# embeddings = HuggingFaceEmbeddings(model_name=embedding_path)

faiss_archived = "./data/faiss_data/data"
vectorstore_txt_faiss = FAISS.from_texts(txt_list, embeddings)