更新环境变量配置,调整模型名称获取方式,新增Dify API相关配置,删除无用的脚本文件,优化意图识别逻辑,添加LLM提取词条逻辑
This commit is contained in:
@@ -39,10 +39,21 @@ from .ProfessionalNounVector import ProfessionalNounRetriever, AsyncProfessional
|
||||
from rag2_0.tool.ModelTool import XinferenceReRankerModel, OpenAiLLM, SiliconFlowReRankerModel
|
||||
|
||||
class AsyncIntentRecognizer:
|
||||
"""
Asynchronous intent recognition and query rewriting.
"""
# NOTE: the docstring above must stay the first statement of the class body;
# placed after the constants it is a discarded expression and
# AsyncIntentRecognizer.__doc__ would be None.

# Directory that holds one plain-text wiki file per software product.
SOFT_WIKI_PATH = "data/wiki_data"

# Software display name -> wiki file name inside SOFT_WIKI_PATH.
SOFT_NAMETOWIKI_MAP = {
    "配网计价通D3软件": "配网计价通D3软件.txt",
    "西藏计价通Z1软件": "西藏计价通Z1软件.txt",
    "储能计价通C1软件": "储能计价通C1软件.txt",
    "技改检修工程计价通T1软件": "技改检修工程计价通T1软件.txt",
    "技改检修清单计价通T1软件": "技改检修清单计价通T1软件.txt",
    "电力建设计价通软件": "电力建设计价通软件.txt",
    "下载安装注册": "下载安装注册.txt",
}
|
||||
def __init__(self, api_key: str = None, base_url: str = None, model_name: str = "gpt-3.5-turbo", vector_index_dir: str = None):
|
||||
def __init__(self):
|
||||
"""
|
||||
初始化异步意图识别器
|
||||
|
||||
@@ -52,51 +63,53 @@ class AsyncIntentRecognizer:
|
||||
model_name: 要使用的模型名称
|
||||
vector_index_dir: 向量索引目录,如果为None则使用默认目录
|
||||
"""
|
||||
api_key = os.getenv("OPENAI_API_KEY")
|
||||
base_url = os.getenv("OPENAI_API_BASE")
|
||||
model_name = os.getenv("MODEL_NAME", "gpt-3.5-turbo")
|
||||
# 初始化LLM
|
||||
llm_params = {
|
||||
"temperature": 0.2, # 降低随机性,使结果更确定
|
||||
"top_p": 0.7,
|
||||
"model": model_name
|
||||
"model": model_name,
|
||||
"api_key": api_key,
|
||||
"base_url": base_url
|
||||
}
|
||||
|
||||
# 如果提供了API密钥,则使用提供的密钥
|
||||
if api_key:
|
||||
llm_params["api_key"] = api_key
|
||||
|
||||
# 如果提供了自定义URL,则使用提供的URL
|
||||
if base_url:
|
||||
llm_params["base_url"] = base_url
|
||||
|
||||
self._llm = OpenAiLLM(**llm_params)
|
||||
llm_params["model"] = os.getenv("MINI_MODEL_NAME", "gpt-3.5-turbo")
|
||||
self._llm_mini = OpenAiLLM(**llm_params)
|
||||
|
||||
# 加载suffix关键词
|
||||
self._suffix_keywords = self._load_suffix_keywords()
|
||||
|
||||
# 加载软件词条名称库
|
||||
self._soft_wiki_library = self._load_soft_wiki_library()
|
||||
# 异步检索器将在create方法中初始化
|
||||
self._noun_retriever = None
|
||||
self._api_key = api_key
|
||||
self._vector_index_dir = vector_index_dir
|
||||
|
||||
def _load_soft_wiki_library(self):
    """
    Read every software wiki file into memory.

    Returns:
        A dict keyed by the names in SOFT_NAMETOWIKI_MAP. Each value is the
        list of lines of the matching file under SOFT_WIKI_PATH, with
        surrounding whitespace stripped and blank lines dropped.
    """
    library = {}
    for soft_name, wiki_file_name in self.SOFT_NAMETOWIKI_MAP.items():
        entries = []
        with open(f"{self.SOFT_WIKI_PATH}/{wiki_file_name}", "r", encoding="utf-8") as wiki_file:
            for raw_line in wiki_file:
                entry = raw_line.strip()
                # Whitespace-only lines carry no wiki entry.
                if entry:
                    entries.append(entry)
        library[soft_name] = entries
    return library
|
||||
|
||||
@classmethod
async def create(cls):
    """
    Async factory: build an AsyncIntentRecognizer and finish its async setup.

    The constructor leaves ``_noun_retriever`` unset; this method awaits
    ``AsyncProfessionalNounRetriever.create()`` and stores the result there.
    It takes no arguments.

    Returns:
        The initialised AsyncIntentRecognizer instance.
    """
    recognizer = cls()
    # The retriever has to be awaited, so it cannot be created in __init__.
    recognizer._noun_retriever = await AsyncProfessionalNounRetriever.create()
    return recognizer
|
||||
|
||||
def _load_suffix_keywords(self, filepath: str = None) -> List[str]:
|
||||
@@ -402,11 +415,12 @@ class AsyncIntentRecognizer:
|
||||
return f"通过博微软件助手查询软件锁信息,锁注册号为{lock_number}"
|
||||
|
||||
|
||||
async def process_query_async(self, query: str, conversation_context: str = "",
|
||||
async def process_query_async(self, query: str, conversation_context: Dict = None,
|
||||
chat_history: List[Dict[str, str]] = None,
|
||||
previous_slots: Dict[str, Any] = None,
|
||||
use_jieba: bool = False,
|
||||
enable_query_expansion: bool = False) -> Dict[str, Any]:
|
||||
enable_query_expansion: bool = False,
|
||||
cur_soft_name: str = "") -> Dict[str, Any]:
|
||||
"""
|
||||
异步处理用户问题的完整流程
|
||||
|
||||
@@ -417,7 +431,7 @@ class AsyncIntentRecognizer:
|
||||
previous_slots: 历史槽位信息
|
||||
use_jieba: 是否使用jieba分词辅助提取关键词
|
||||
enable_query_expansion: 是否启用查询扩展
|
||||
|
||||
cur_soft_name: 当前查询的软件名称
|
||||
Returns:
|
||||
包含分类、关键词、改写和槽位填充结果的字典
|
||||
"""
|
||||
@@ -425,7 +439,8 @@ class AsyncIntentRecognizer:
|
||||
chat_history = []
|
||||
if previous_slots is None:
|
||||
previous_slots = {}
|
||||
|
||||
if conversation_context is None:
|
||||
conversation_context = {}
|
||||
# 步骤: 并行执行提问扩展
|
||||
query_expand_tasks = []
|
||||
if enable_query_expansion:
|
||||
@@ -437,9 +452,9 @@ class AsyncIntentRecognizer:
|
||||
# 5.2: Follow Up Questions
|
||||
asyncio.create_task(self._generate_follow_up_questions_async(query, chat_history, conversation_context)),
|
||||
|
||||
# 5.3: HyDE
|
||||
# asyncio.create_task(self._generate_hypothetical_document_async(query, chat_history, conversation_context)),
|
||||
|
||||
# 5.3: 文档查询
|
||||
asyncio.create_task(self._find_matching_software_docs_async(query, cur_soft_name, chat_history)),
|
||||
|
||||
# 5.4: 多问题查询
|
||||
asyncio.create_task(self._generate_multi_questions_async(query, chat_history, conversation_context))
|
||||
]
|
||||
@@ -497,23 +512,22 @@ class AsyncIntentRecognizer:
|
||||
# 收集结果
|
||||
step_back_result = query_expand_results[0] if query_expand_results[0] else StepBackPrompt(original_query=query, can_use_back_prompt=False, step_back_query=[query])
|
||||
follow_up_result = query_expand_results[1] if query_expand_results[1] else FollowUpQuestions(original_query=query, follow_up_query=query)
|
||||
# hyde_result = query_expand_results[2] if query_expand_results[2] else HypotheticalDocument(original_query=query, hypothetical_answer="")
|
||||
multi_questions_result = query_expand_results[2] if query_expand_results[2] else MultiQuestions(original_query=query, sub_questions=[query])
|
||||
wiki_result = query_expand_results[2] if query_expand_results[2] else []
|
||||
multi_questions_result = query_expand_results[3] if query_expand_results[3] else MultiQuestions(original_query=query, sub_questions=[query])
|
||||
|
||||
all_questions = multi_questions_result.sub_questions
|
||||
all_questions.append(query)
|
||||
all_questions.append(rewrite.rewrite)
|
||||
all_questions.extend(step_back_result.step_back_query)
|
||||
all_questions.append(follow_up_result.follow_up_query)
|
||||
# all_questions.append(hyde_result.hypothetical_answer)
|
||||
all_questions.extend(wiki_result)
|
||||
all_questions = list(set(all_questions))
|
||||
|
||||
query_expand = {
|
||||
"all": all_questions,
|
||||
"step_back": step_back_result.model_dump(),
|
||||
"follow_up": follow_up_result.model_dump(),
|
||||
# "hyde": hyde_result.model_dump(),
|
||||
"multi_questions": multi_questions_result.model_dump()
|
||||
"multi_questions": multi_questions_result.model_dump(),
|
||||
}
|
||||
|
||||
# 返回所有结果
|
||||
@@ -721,45 +735,72 @@ class AsyncIntentRecognizer:
|
||||
logging.error(f"异步后续问题生成失败: {e}", exc_info=True)
|
||||
return FollowUpQuestions(original_query=query, follow_up_query=query)
|
||||
|
||||
async def _generate_hypothetical_document_async(self, query: str, chat_history: List[Dict[str, str]] = None, conversation_context: str = "") -> HypotheticalDocument:
|
||||
async def _find_matching_software_docs_async(self, query: str, soft_name: str,
                                             chat_history: List[Dict[str, str]] = None,
                                             top_k: int = 3) -> List[str]:
    """
    Ask the LLM which wiki entries of a software product best match the question.

    The entries of ``soft_name`` plus the shared "下载安装注册" entries are
    sent to the LLM together with the question and the chat history; the
    entries it selects are returned.

    Args:
        query: The user's question.
        soft_name: Software name; must be a key of SOFT_NAMETOWIKI_MAP.
        chat_history: Previous conversation turns (defaults to an empty list).
        top_k: Maximum number of entries to return (default 3).

    Returns:
        The selected entry texts, at most ``top_k`` of them. An empty list is
        returned when the software is unknown, has no entries, the reply is
        not valid JSON, or any other error occurs (errors are logged).
    """
    if chat_history is None:
        chat_history = []

    # Only software listed in the wiki map has a document to search.
    if soft_name not in self.SOFT_NAMETOWIKI_MAP:
        return []

    product_docs = self._soft_wiki_library.get(soft_name, [])
    if not product_docs:
        return []

    # Work on a copy: extending the cached list in place would append the
    # shared entries again on every call and grow the prompt without bound.
    soft_docs = list(product_docs)
    if soft_name != "下载安装注册":
        soft_docs.extend(self._soft_wiki_library.get("下载安装注册", []))

    soft_docs_str = "\n".join(doc.strip() for doc in soft_docs)

    # Assembled line by line so code indentation never leaks into the prompt.
    prompt = (
        f"\n{soft_docs_str}\n"
        "================================\n"
        "以上为软件功能操作、常见问题排查等功能,结合历史对话,请输出与当前提问最相关的1-3个功能名称,\n"
        "使用Json格式输出,如下:\n"
        '[{"content": "行内容"},...]\n'
        f"当前问题: {query}\n"
        f"历史对话: {json.dumps(chat_history, ensure_ascii=False)}\n"
    )

    try:
        start_time = time.time()
        response = await self._llm.invoke_async(prompt, False, response_format={"type": "json_object"})
        end_time = time.time()

        try:
            payload = json.loads(response.content)
        except json.JSONDecodeError as e:
            logging.error(f"解析JSON响应时出错: {e}")
            return []

        # JSON-object mode yields an object at top level, so the requested
        # array may arrive wrapped ({"any_key": [...]}) or as one bare item.
        if isinstance(payload, dict):
            if "content" in payload:
                payload = [payload]
            else:
                payload = next((value for value in payload.values() if isinstance(value, list)), [])
        elif not isinstance(payload, list):
            payload = []

        wiki_names = []
        for item in payload:
            content = item.get("content") if isinstance(item, dict) else item
            # Skip malformed items instead of discarding the whole answer.
            if isinstance(content, str) and content.strip():
                wiki_names.append(content)

        logging.debug(f"软件文档匹配耗时: {end_time - start_time:.2f}秒")
        return wiki_names[:max(top_k, 0)]

    except Exception as e:
        # Best effort: query expansion must not fail because of this step.
        logging.error(f"查找匹配软件文档时出错: {e}", exc_info=True)
        return []
|
||||
|
||||
async def _generate_multi_questions_async(self, query: str, chat_history: List[Dict[str, str]] = None, conversation_context: str = "") -> MultiQuestions:
|
||||
"""
|
||||
异步生成多角度问题
|
||||
|
||||
Reference in New Issue
Block a user