优化意图识别模块,新增文档相关性判断功能,更新DifyQueryRetrieval类以支持多线程检索,增强数据模型,改进日志记录,调整Excel数据验证逻辑,更新多个提示词模板以提升用户体验。
This commit is contained in:
@@ -9,6 +9,7 @@ Description: 意图分类、改写核心逻辑
|
||||
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
from langchain.output_parsers import PydanticOutputParser
|
||||
import json
|
||||
from typing import List, Tuple, Dict, Any, Optional
|
||||
@@ -18,7 +19,8 @@ import time
|
||||
|
||||
from .PromptTemplates import (classification_prompt, query_rewrite_prompt,
|
||||
extract_nouns_prompt, classification_info,
|
||||
slot_filling_prompt)
|
||||
slot_filling_prompt, step_back_prompt,
|
||||
follow_up_questions_prompt, hyde_prompt, multi_questions_prompt)
|
||||
|
||||
from .Multi_PromptTemplates import (
|
||||
intent_and_slot_prompt, output_example,
|
||||
@@ -29,7 +31,8 @@ from .DataModels import (
|
||||
Classification, QueryRewrite, Term, TermList,
|
||||
SoftwareFunctionSlots, SoftwareTroubleShootingSlots, ProfessionalConsultingSlots,
|
||||
DataProblemSlots, FileExtensionConsultingSlots, SoftwareLockSlots,
|
||||
InstallationDownloadSlots, ProblemDiagnosisSlots, OtherSlots, IntentAndSlotResult
|
||||
InstallationDownloadSlots, ProblemDiagnosisSlots, OtherSlots, IntentAndSlotResult,
|
||||
StepBackPrompt, FollowUpQuestions, HypotheticalDocument, MultiQuestions
|
||||
)
|
||||
from .ProfessionalNounVector import ProfessionalNounRetriever
|
||||
from rag2_0.tool.ModelTool import XinferenceReRankerModel, OpenAiLLM, SiliconFlowReRankerModel
|
||||
@@ -166,34 +169,32 @@ class IntentRecognizer:
|
||||
Returns:
|
||||
提取的术语列表
|
||||
"""
|
||||
try:
|
||||
# 如果使用jieba分词
|
||||
if use_jieba:
|
||||
# 先使用jieba分词
|
||||
tokens = self._tokenize_with_jieba(query)
|
||||
|
||||
# 构建术语列表
|
||||
terms = []
|
||||
for token in tokens:
|
||||
if len(token) > 1: # 过滤掉单字词
|
||||
terms.append(Term(name=token, synonymous=[], description=""))
|
||||
|
||||
return terms
|
||||
else:
|
||||
# 使用LLM提取关键词
|
||||
# 准备提示词
|
||||
formatted_prompt = extract_nouns_prompt.replace("{content}", query)
|
||||
terms_list_parser = PydanticOutputParser(pydantic_object=TermList)
|
||||
formatted_prompt = formatted_prompt.replace("{output_format}", terms_list_parser.get_format_instructions())
|
||||
|
||||
# 调用LLM
|
||||
response = self._llm.invoke(formatted_prompt, False)
|
||||
|
||||
# 尝试使用Pydantic解析器解析TermList
|
||||
parsed_output = terms_list_parser.parse(response.content)
|
||||
return parsed_output.terms
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"无法解析LLM关键词提取响应: {e}") from e
|
||||
# 如果使用jieba分词
|
||||
if use_jieba:
|
||||
# 先使用jieba分词
|
||||
tokens = self._tokenize_with_jieba(query)
|
||||
|
||||
# 构建术语列表
|
||||
terms = []
|
||||
for token in tokens:
|
||||
if len(token) > 1: # 过滤掉单字词
|
||||
terms.append(Term(name=token, synonymous=[], description=""))
|
||||
|
||||
return terms
|
||||
else:
|
||||
# 使用LLM提取关键词
|
||||
# 准备提示词
|
||||
formatted_prompt = extract_nouns_prompt.replace("{content}", query)
|
||||
terms_list_parser = PydanticOutputParser(pydantic_object=TermList)
|
||||
formatted_prompt = formatted_prompt.replace("{output_format}", terms_list_parser.get_format_instructions())
|
||||
|
||||
# 调用LLM
|
||||
response = self._llm.invoke(formatted_prompt, False)
|
||||
|
||||
# 尝试使用Pydantic解析器解析TermList
|
||||
parsed_output = terms_list_parser.parse(response.content)
|
||||
return parsed_output.terms
|
||||
|
||||
|
||||
def _rerank_matched_terms(self, query_key: str, matched_terms: set, top_k: int = 2, rerank_score:float = 0.6) -> List[Term]:
|
||||
"""
|
||||
@@ -358,7 +359,8 @@ class IntentRecognizer:
|
||||
def process_query(self, query: str, conversation_context: str = "",
|
||||
chat_history: List[Dict[str, str]] = None,
|
||||
previous_slots: Dict[str, Any] = None,
|
||||
use_jieba: bool = False) -> Dict[str, Any]:
|
||||
use_jieba: bool = False,
|
||||
enable_query_expansion: bool = False) -> Dict[str, Any]:
|
||||
"""
|
||||
处理用户问题的完整流程
|
||||
|
||||
@@ -388,10 +390,29 @@ class IntentRecognizer:
|
||||
# suffix_terms.append(suffix_term)
|
||||
|
||||
# return Classification(vertical_classification="安装下载", sub_classification="查询"), TermList(terms=suffix_terms), QueryRewrite(rewrite=query), matched_suffixes
|
||||
|
||||
if chat_history is None:
|
||||
chat_history = []
|
||||
if previous_slots is None:
|
||||
previous_slots = {}
|
||||
|
||||
# 步骤: 并行执行提问扩展
|
||||
if enable_query_expansion:
|
||||
# 创建线程和结果容器
|
||||
threads_and_results = [
|
||||
# 5.1: 后退提示
|
||||
self._run_in_thread(self._generate_step_back_prompt, args=(query, chat_history, conversation_context)),
|
||||
|
||||
# 5.2: Follow Up Questions
|
||||
self._run_in_thread(self._generate_follow_up_questions, args=(query, chat_history, conversation_context)),
|
||||
|
||||
# 5.3: HyDE
|
||||
self._run_in_thread(self._generate_hypothetical_document, args=(query, chat_history, conversation_context)),
|
||||
|
||||
# 5.4: 多问题查询
|
||||
self._run_in_thread(self._generate_multi_questions, args=(query, chat_history, conversation_context))
|
||||
]
|
||||
|
||||
# 步骤1: 匹配关键词
|
||||
keywords_terms, query_keys = self._match_keywords(query, use_jieba)
|
||||
|
||||
@@ -419,12 +440,47 @@ class IntentRecognizer:
|
||||
if classification.vertical_classification not in ["其他", "闲聊"] and classification.sub_classification not in ["其他", "闲聊"]:
|
||||
slot_filling_result = self._fill_slots(rewrite.rewrite, classification, conversation_context, chat_history, previous_slots)
|
||||
|
||||
if not enable_query_expansion:
|
||||
return {
|
||||
"classification": classification.model_dump(),
|
||||
"keywords": keywords_terms.model_dump(),
|
||||
"rewrite": rewrite.model_dump(),
|
||||
"query_keys": query_keys,
|
||||
"slot_filling": slot_filling_result
|
||||
}
|
||||
|
||||
# 等待所有线程完成
|
||||
start_time = time.time()
|
||||
for thread, _ in threads_and_results:
|
||||
thread.join()
|
||||
end_time = time.time()
|
||||
logging.info(f"问题扩展环节耗时统计 - 总耗时: {end_time - start_time:.2f}秒")
|
||||
|
||||
# 收集结果
|
||||
step_back_result = threads_and_results[0][1][0] if threads_and_results[0][1] else StepBackPrompt(original_query=query, step_back_query=query)
|
||||
follow_up_result = threads_and_results[1][1][0] if threads_and_results[1][1] else FollowUpQuestions(original_query=query, follow_up_query=query)
|
||||
hyde_result = threads_and_results[2][1][0] if threads_and_results[2][1] else HypotheticalDocument(original_query=query, hypothetical_answer="")
|
||||
multi_questions_result = threads_and_results[3][1][0] if threads_and_results[3][1] else MultiQuestions(original_query=query, sub_questions=[query])
|
||||
all_questions=multi_questions_result.sub_questions
|
||||
all_questions.append(query)
|
||||
all_questions.append(step_back_result.step_back_query)
|
||||
all_questions.append(follow_up_result.follow_up_query)
|
||||
all_questions.append(hyde_result.hypothetical_answer)
|
||||
all_questions = list(set(all_questions))
|
||||
|
||||
query_expand={"all":all_questions,
|
||||
"step_back":step_back_result.model_dump(),
|
||||
"follow_up":follow_up_result.model_dump(),
|
||||
"hyde":hyde_result.model_dump(),
|
||||
"multi_questions":multi_questions_result.model_dump()}
|
||||
# 返回所有结果
|
||||
return {
|
||||
"classification": classification.model_dump(),
|
||||
"keywords": keywords_terms.model_dump(),
|
||||
"rewrite": rewrite.model_dump(),
|
||||
"query_keys": query_keys,
|
||||
"slot_filling": slot_filling_result
|
||||
"slot_filling": slot_filling_result,
|
||||
"query_expand": query_expand
|
||||
}
|
||||
|
||||
|
||||
@@ -544,7 +600,182 @@ class IntentRecognizer:
|
||||
# 如果解析失败,创建一个空的模型实例
|
||||
empty_instance = slot_model_class()
|
||||
return empty_instance
|
||||
|
||||
def _generate_step_back_prompt(self, query: str, chat_history: List[Dict[str, str]] = None, conversation_context: str = "") -> StepBackPrompt:
    """
    Build a step-back prompt for the user's question via the LLM.

    The ``step_back_prompt`` template is filled with the question, the
    JSON-serialized chat history, the conversation context and the parser's
    format instructions; the LLM reply is then parsed into a
    ``StepBackPrompt``.

    Args:
        query: The user's original question.
        chat_history: Previous conversation turns; serialized as "[]" when
            empty or None.
        conversation_context: Background information about the session.

    Returns:
        The parsed ``StepBackPrompt``. If the LLM call or the parsing raises,
        the error is logged and a ``StepBackPrompt`` whose ``step_back_query``
        equals ``query`` is returned instead.
    """
    started_at = time.time()

    # Template filling happens outside the try block, so formatting errors
    # propagate to the caller instead of triggering the fallback.
    parser = PydanticOutputParser(pydantic_object=StepBackPrompt)
    if chat_history:
        history_json = json.dumps(chat_history, ensure_ascii=False)
    else:
        history_json = "[]"
    prompt_text = step_back_prompt.format(
        query=query,
        chat_history=history_json,
        conversation_context=conversation_context,
        output_format=parser.get_format_instructions(),
    )

    try:
        # The second positional argument is forwarded as-is; its meaning is
        # defined by the LLM wrapper (not visible here).
        llm_reply = self._llm.invoke(prompt_text, False)
        result = parser.parse(llm_reply.content)
        elapsed = time.time() - started_at
        logging.debug(f"后退提示生成耗时统计 - 总耗时: {elapsed:.2f}秒")
    except Exception as exc:
        # Best-effort: fall back to the original question.
        logging.error(f"后退提示生成失败: {exc}")
        return StepBackPrompt(original_query=query, step_back_query=query)
    else:
        return result
|
||||
|
||||
def _generate_follow_up_questions(self, query: str, chat_history: List[Dict[str, str]] = None, conversation_context: str = "") -> FollowUpQuestions:
    """
    Generate a follow-up question for the user's question via the LLM.

    Fills the ``follow_up_questions_prompt`` template, invokes the LLM and
    parses the reply into a ``FollowUpQuestions`` model.

    Args:
        query: The user's original question.
        chat_history: Previous conversation turns; serialized as "[]" when
            empty or None.
        conversation_context: Background information about the session.

    Returns:
        The parsed ``FollowUpQuestions``. If the LLM call or the parsing
        raises, the error is logged and a ``FollowUpQuestions`` whose
        ``follow_up_query`` equals ``query`` is returned instead.
    """
    t_begin = time.time()

    # Prompt preparation is deliberately left outside the try block, exactly
    # as before: only the LLM call and the parsing are guarded.
    output_parser = PydanticOutputParser(pydantic_object=FollowUpQuestions)
    serialized_history = (
        json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]"
    )
    format_instructions = output_parser.get_format_instructions()
    prompt_text = follow_up_questions_prompt.format(
        query=query,
        chat_history=serialized_history,
        conversation_context=conversation_context,
        output_format=format_instructions,
    )

    try:
        reply = self._llm.invoke(prompt_text, False)
        follow_up = output_parser.parse(reply.content)
        duration = time.time() - t_begin
        logging.debug(f"后续问题生成耗时统计 - 总耗时: {duration:.2f}秒")
        return follow_up
    except Exception as exc:
        # Best-effort: reuse the original question as the follow-up.
        logging.error(f"后续问题生成失败: {exc}")
        return FollowUpQuestions(original_query=query, follow_up_query=query)
|
||||
|
||||
def _generate_hypothetical_document(self, query: str, chat_history: List[Dict[str, str]] = None, conversation_context: str = "") -> HypotheticalDocument:
    """
    Generate a hypothetical document (HyDE) for the user's question.

    Fills the ``hyde_prompt`` template, invokes the LLM and parses the reply
    into a ``HypotheticalDocument`` model.

    Args:
        query: The user's original question.
        chat_history: Previous conversation turns; serialized as "[]" when
            empty or None.
        conversation_context: Background information about the session.

    Returns:
        The parsed ``HypotheticalDocument``. If the LLM call or the parsing
        raises, the error is logged and a ``HypotheticalDocument`` with an
        empty ``hypothetical_answer`` is returned instead.
    """
    clock_start = time.time()

    doc_parser = PydanticOutputParser(pydantic_object=HypotheticalDocument)
    if not chat_history:
        history_payload = "[]"
    else:
        history_payload = json.dumps(chat_history, ensure_ascii=False)
    rendered_prompt = hyde_prompt.format(
        query=query,
        chat_history=history_payload,
        conversation_context=conversation_context,
        output_format=doc_parser.get_format_instructions(),
    )

    try:
        # Only the LLM round trip and the parsing are guarded by the fallback.
        answer = self._llm.invoke(rendered_prompt, False)
        hypothetical = doc_parser.parse(answer.content)
        spent = time.time() - clock_start
        logging.debug(f"假设性文档生成耗时统计 - 总耗时: {spent:.2f}秒")
    except Exception as exc:
        # Best-effort: return an empty hypothetical answer.
        logging.error(f"假设性文档生成失败: {exc}")
        return HypotheticalDocument(original_query=query, hypothetical_answer="")
    return hypothetical
|
||||
|
||||
def _generate_multi_questions(self, query: str, chat_history: List[Dict[str, str]] = None, conversation_context: str = "") -> MultiQuestions:
    """
    Generate multi-angle sub-questions for the user's question via the LLM.

    Fills the ``multi_questions_prompt`` template, invokes the LLM and parses
    the reply into a ``MultiQuestions`` model.

    Args:
        query: The user's original question.
        chat_history: Previous conversation turns; serialized as "[]" when
            empty or None.
        conversation_context: Background information about the session.

    Returns:
        The parsed ``MultiQuestions``. If the LLM call or the parsing raises,
        the error is logged (with the raw LLM content when one was received)
        and a ``MultiQuestions`` whose only sub-question is ``query`` is
        returned instead.
    """
    multi_questions_start_time = time.time()

    # Prepare the prompt.
    multi_questions_parser = PydanticOutputParser(pydantic_object=MultiQuestions)
    formatted_prompt = multi_questions_prompt.format(
        query=query,
        chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
        conversation_context=conversation_context,
        output_format=multi_questions_parser.get_format_instructions()
    )

    # Bound before the try block so the except handler can reference it even
    # when the LLM call itself fails; previously that case raised
    # UnboundLocalError inside the handler and the fallback was never returned.
    response = None
    try:
        # Call the LLM.
        response = self._llm.invoke(formatted_prompt, False)

        # Parse the output.
        parsed_output = multi_questions_parser.parse(response.content)
        multi_questions_time = time.time() - multi_questions_start_time
        logging.debug(f"多角度问题生成耗时统计 - 总耗时: {multi_questions_time:.2f}秒")
        return parsed_output
    except Exception as e:
        # On failure, fall back to the original query as the only sub-question.
        # getattr keeps the handler safe when no reply was received or the
        # reply has no ``content`` attribute.
        raw_content = getattr(response, "content", None)
        logging.error(f"多角度问题生成失败: {e},LLM返回内容:{raw_content}")
        return MultiQuestions(original_query=query, sub_questions=[query])
|
||||
|
||||
def _run_in_thread(self, func, args=(), kwargs={}):
|
||||
"""
|
||||
在线程中执行函数并返回结果
|
||||
|
||||
Args:
|
||||
func: 要执行的函数
|
||||
args: 函数的位置参数
|
||||
kwargs: 函数的关键字参数
|
||||
|
||||
Returns:
|
||||
(thread, result_container): 线程对象和存放结果的容器
|
||||
"""
|
||||
result_container = []
|
||||
|
||||
def thread_target():
|
||||
try:
|
||||
result = func(*args, **kwargs)
|
||||
result_container.append(result)
|
||||
except Exception as e:
|
||||
logging.error(f"线程执行函数 {func.__name__} 时出错: {e}")
|
||||
result_container.append(None)
|
||||
|
||||
thread = threading.Thread(target=thread_target)
|
||||
thread.start()
|
||||
return thread, result_container
|
||||
|
||||
|
||||
def _process_intent_and_slot(self, user_input: str, conversation_context: str = "",
|
||||
chat_history: List[Dict[str, str]] = None,
|
||||
previous_slots: Dict[str, Any] = None) -> Dict[str, Any]:
|
||||
|
||||
Reference in New Issue
Block a user