Compare commits
3 Commits
b06a84c059
...
82724d206b
| Author | SHA1 | Date | |
|---|---|---|---|
| 82724d206b | |||
| 5b5a2f2b16 | |||
| af1e1a9d9b |
-49
@@ -1,52 +1,3 @@
|
|||||||
sk-uollmeyatyiwfzszvxkpyndmzfrbqjpyixewmrastbmaqbhy
|
|
||||||
sk-xdlsjytiwilvodadkjxvwdgulhhdytkqvfpyrcnllclgzqkb
|
|
||||||
sk-ffkltifkylutornjhwmnmfjsqsywrjibvujhjtjctzgnkvlp
|
|
||||||
sk-vmwocqqjqxnsvzmeyvqskahjaclifpmsbhywvnrvwygkfyuj
|
|
||||||
sk-gzwkmzxeeunaywrdrgirdatqhdtqdgvzqpesvprwbbjhcchn
|
|
||||||
sk-duchutcxmygrnkhzmmlykvtzwaylqtdxfbbuhvfvzuapazii
|
|
||||||
sk-nlddwexmjxqtgdvahwvlotnomrzcgskxeakxkxauicknzfkp
|
|
||||||
sk-lopwluipwvilwpwztvaxfebueeyilefwgncgpeprqvwazxom
|
|
||||||
sk-rgwrklpvhhrluokkbgavzukuhhpfhqzmozpjzoezfhkxyorc
|
|
||||||
sk-cdrpglnfmyeeqyhtvxvkpcpwscsbfouwkagjpphuksfzeipy
|
|
||||||
sk-eyktixexxjqvwufezcmdrazcedtsphyiunhqpamlkrcaxwtg
|
|
||||||
sk-euzbguamsxqspfdjnrpbchkjkouncqipjvhnkkbvoihgwspe
|
|
||||||
sk-qlpoqleqaodseswzqklbwjwwcdjrqthbmvweuablibiszpnw
|
|
||||||
sk-gqjtkwjmrupugviflhsffhkpzxxmjcewviqsneurxnlfqewy
|
|
||||||
sk-hkxgjpdyxuxjklksunfzaetrhveelucmrldfjnlztibxgjgl
|
|
||||||
sk-fqjkatqvpkmvlkbqhjfkzmiifmiodayututyprdtldszmacd
|
|
||||||
sk-wrybirbfwtbfdijjrfpdfxlxzmvcrhfgqqbhyuysibmcmcez
|
|
||||||
sk-vgkpsmbcchymktakjsheqnjlkopiqcvntqcvxuxmxlllifod
|
|
||||||
sk-afvrxowsmjmlhguhfpfhmefcldzsohmnyumjdwwkrvzwjrym
|
|
||||||
sk-jwfemfolekojghqwfwvjxwmzdvdnhaznngqeurvzgwqepddp
|
|
||||||
sk-vxzwcizvmykzwbxaypvshdkbgjymnmelryyqftkxzxnnpibr
|
|
||||||
sk-bwcbyczfakqkwrsvmmraeikjfpiurhhanflfqkunpzezvagv
|
|
||||||
sk-dicoheyoylocrtjsacbvwqzwwibnnasuwhnhsudoemieengi
|
|
||||||
sk-mvuzqvuqmryqrqolztwthhzcnzibygjycdbeplbbcnbxkncq
|
|
||||||
sk-qavyzqmnfjbhqhuesfkjcyszkzmjsykpnaebuwzcxhkitoir
|
|
||||||
sk-jhrbmqmzyrlrqyehkkachwhztkuisphqbmoyvchmaqrcfxtf
|
|
||||||
sk-mtcfttyljuzteasdjbcrtwwwmqxetmaonbwvojzfijzfiplq
|
|
||||||
sk-iooturknmbpxhbiulovovtzpyayovyidifzpzkjgqacroxjt
|
|
||||||
sk-jgjhasobxeuzfgutdyuhcejocwdiwonhkdithchkonhxnawc
|
|
||||||
sk-mvyqkciotllopyozsfzwtjeicuhnvoihnrrxadfsfiakperx
|
|
||||||
sk-wticrisyjehvnlrlhmmxhrnlzknqpnkfxowlzvnozskvtvzc
|
|
||||||
sk-hiniqrfvuqlsgmhrqlezlribsaqdefuhpxxfavoqtszxtasw
|
|
||||||
sk-zlfncovfrpzczmjquirolpogdrzfarkcwwqkluvifwcvrezq
|
|
||||||
sk-blhnvpenedysngftlghrkxhweoququkvduikziuypzilpyrp
|
|
||||||
sk-rynvudqktwvnjleiahwdpeqqdkncsvawvjyicyiojoviiges
|
|
||||||
sk-ncskqkwomgqpfnfnachehkeaczzgiiuripeyzjrpnuzeosnn
|
|
||||||
sk-ijqdmtyeuqrbndqjggxyicfjrmpsgbsjwwkitgsmxqvcjrri
|
|
||||||
sk-ajgisqnkpgoiwxigrnjachusupagqpukteuknemhmnxsasre
|
|
||||||
sk-zsskrhjnoepgjngcsseklxfpwpozhenurtrluxlstxujdsti
|
|
||||||
sk-lubrliuefgmrxpfafdwrhzletyvhemqkpvriuuqncivlewgx
|
|
||||||
sk-pavwfgrpftdtzeiiaoousmkdptwujoeocyzzoudeqkfyoxks
|
|
||||||
sk-jfwhzkhpgxacedwxzkbwrwvlfqvnlhxaeyghjcmtshogsqub
|
|
||||||
sk-sqmaankkcsqpbtktdfcmpuxjxgarfzgvygdgxgztlmyxfpkp
|
|
||||||
sk-xhlgjmwmtkahrpdncwoqynjdrkekmsyftwqmomsodbggvbdd
|
|
||||||
sk-ynqqptobbeazmjyrmaytsvyczsqwrukpezizrlcloncxtwvc
|
|
||||||
sk-wswttgfrxrwijvqhctfilhvlxgdkgogrjhvjkdbzvqrocofa
|
|
||||||
sk-jdijeubeygjmqtxwryrbwmrpvqawinzwpcxodpolhcupzmpa
|
|
||||||
sk-xbloemctsowwicjvrtrrewreosnfojoijtygsfxfnjntridv
|
|
||||||
sk-isovavcefvkzlbjewnumeqqevmnoucojsxwskkitfktkemtq
|
|
||||||
sk-vxrlvvdzgythgyycuqehdloubxcdwhgojpowgxvgxsstjtvk
|
sk-vxrlvvdzgythgyycuqehdloubxcdwhgojpowgxvgxsstjtvk
|
||||||
sk-krgctzbdqekohpowmvftsjswgpxnwxadezeosdspelmtmukx
|
sk-krgctzbdqekohpowmvftsjswgpxnwxadezeosdspelmtmukx
|
||||||
sk-slcgfmphmbqwuvshoaygfkfaxpzcabtlpkhvfqjodajuynsl
|
sk-slcgfmphmbqwuvshoaygfkfaxpzcabtlpkhvfqjodajuynsl
|
||||||
|
|||||||
@@ -14,20 +14,21 @@ import httpx
|
|||||||
import traceback
|
import traceback
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
# 将项目根目录添加到Python路径
|
# 将项目根目录添加到Python路径
|
||||||
sys.path.append(os.getcwd())
|
sys.path.append(os.getcwd())
|
||||||
from rag2_0.tool.ModelTool import OpenAiLLM
|
from rag2_0.tool.ModelTool import OpenAiLLM
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
os.makedirs("data/logs", exist_ok=True)
|
||||||
# 配置日志
|
# 配置日志
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||||
handlers=[
|
handlers=[
|
||||||
logging.StreamHandler(),
|
logging.StreamHandler(),
|
||||||
logging.FileHandler('dialogue_to_workorder.log', encoding='utf-8')
|
logging.FileHandler('data/logs/dialogue_to_workorder.log', encoding='utf-8')
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
logger = logging.getLogger("dialogue_to_workorder")
|
logger = logging.getLogger("dialogue_to_workorder")
|
||||||
@@ -469,9 +470,10 @@ class DialogueToWorkorder:
|
|||||||
"客户问题": user_question_str,
|
"客户问题": user_question_str,
|
||||||
"问题类型": problem_type,
|
"问题类型": problem_type,
|
||||||
"是否抱怨": "是" if is_dissatisfaction else '否',
|
"是否抱怨": "是" if is_dissatisfaction else '否',
|
||||||
|
"抱怨内容": dissatisfaction_reasoning if is_dissatisfaction else '',
|
||||||
"抱怨级别": dissatisfaction_level if is_dissatisfaction else '',
|
"抱怨级别": dissatisfaction_level if is_dissatisfaction else '',
|
||||||
"是否投诉": "是" if is_complaint else '否',
|
"是否投诉": "是" if is_complaint else '否',
|
||||||
"解决方案": (solution_str + '\n存在抱怨:' + dissatisfaction_reasoning) if is_dissatisfaction else solution_str
|
"解决方案": solution_str
|
||||||
})
|
})
|
||||||
workorder_list.append(base_workorder_dict)
|
workorder_list.append(base_workorder_dict)
|
||||||
# for user_question in user_question_list:
|
# for user_question in user_question_list:
|
||||||
@@ -544,7 +546,7 @@ class DialogueToWorkorder:
|
|||||||
}
|
}
|
||||||
|
|
||||||
# 获取结果
|
# 获取结果
|
||||||
for future in concurrent.futures.as_completed(future_to_conversation):
|
for future in tqdm(concurrent.futures.as_completed(future_to_conversation), total=len(future_to_conversation), desc="处理会话进度"):
|
||||||
conversation_id = future_to_conversation[future]
|
conversation_id = future_to_conversation[future]
|
||||||
try:
|
try:
|
||||||
result_workorders = future.result()
|
result_workorders = future.result()
|
||||||
@@ -563,7 +565,7 @@ class DialogueToWorkorder:
|
|||||||
# 按照指定的列顺序重新排列DataFrame的列
|
# 按照指定的列顺序重新排列DataFrame的列
|
||||||
columns_order = [
|
columns_order = [
|
||||||
'工单编号', '产品线', '产品名称', '模块名称', '问题类型',
|
'工单编号', '产品线', '产品名称', '模块名称', '问题类型',
|
||||||
'客户问题', '解决方案', '是否抱怨', '是否投诉', '抱怨级别',
|
'客户问题', '解决方案', '是否抱怨', "抱怨内容", '是否投诉', '抱怨级别',
|
||||||
'会话id', '访客昵称', '处理坐席', '创建时间'
|
'会话id', '访客昵称', '处理坐席', '创建时间'
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -607,6 +609,7 @@ class DialogueToWorkorder:
|
|||||||
'客户问题': 20,
|
'客户问题': 20,
|
||||||
'解决方案': 30,
|
'解决方案': 30,
|
||||||
'是否抱怨': 9,
|
'是否抱怨': 9,
|
||||||
|
'抱怨内容': 30,
|
||||||
'是否投诉': 9,
|
'是否投诉': 9,
|
||||||
'抱怨级别': 9,
|
'抱怨级别': 9,
|
||||||
'会话id': 9,
|
'会话id': 9,
|
||||||
@@ -637,9 +640,9 @@ def parse_arguments():
|
|||||||
help='产品详情Excel文件路径')
|
help='产品详情Excel文件路径')
|
||||||
parser.add_argument('--max_workers', type=int, default=16,
|
parser.add_argument('--max_workers', type=int, default=16,
|
||||||
help='并发处理线程数,默认为16')
|
help='并发处理线程数,默认为16')
|
||||||
parser.add_argument('--start_date', type=str, required=False,
|
parser.add_argument('--start_date', type=str, required=False,default="2025-05-01 00:00:00",
|
||||||
help='开始日期,格式为YYYY-MM-DD')
|
help='开始日期,格式为YYYY-MM-DD')
|
||||||
parser.add_argument('--end_date', type=str, required=False,
|
parser.add_argument('--end_date', type=str, required=False,default="2025-05-24 23:59:59",
|
||||||
help='结束日期,格式为YYYY-MM-DD')
|
help='结束日期,格式为YYYY-MM-DD')
|
||||||
|
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|||||||
@@ -23,17 +23,18 @@ from tqdm import tqdm
|
|||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
os.makedirs('./data/log', exist_ok=True)
|
||||||
# 配置日志
|
# 配置日志
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||||
handlers=[
|
handlers=[
|
||||||
logging.FileHandler('./data/log/mariadb_client.log'),
|
logging.FileHandler('./data/logs/mariadb_client.log'),
|
||||||
logging.StreamHandler()
|
logging.StreamHandler()
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
os.makedirs('./data/log', exist_ok=True)
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class DatabaseConfig:
|
class DatabaseConfig:
|
||||||
@@ -266,6 +267,9 @@ class DataProcessor:
|
|||||||
# 转换为字典列表
|
# 转换为字典列表
|
||||||
result = []
|
result = []
|
||||||
for record in filtered_df.to_dict('records'):
|
for record in filtered_df.to_dict('records'):
|
||||||
|
# 如果上一条消息和当前消息的发送者、创建时间、消息内容相同,则跳过
|
||||||
|
if result and result[-1]['会话id'] == record['SESSION_ID'] and result[-1]['消息发送者'] == record['message_sender'] and result[-1]['创建时间'] == record['CREATE_TIME'] and result[-1]['消息内容'] == record['processed_content']:
|
||||||
|
continue
|
||||||
result.append({
|
result.append({
|
||||||
"账号id": record["ACCOUNT"],
|
"账号id": record["ACCOUNT"],
|
||||||
"会话id": record["SESSION_ID"],
|
"会话id": record["SESSION_ID"],
|
||||||
@@ -396,37 +400,15 @@ class MariaDBClient:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def process_session_batch(db_client: MariaDBClient, session_batch: pd.DataFrame) -> List[List[Dict[str, Any]]]:
|
|
||||||
"""批量处理会话数据"""
|
|
||||||
conversations = []
|
|
||||||
|
|
||||||
for _, session_row in session_batch.iterrows():
|
|
||||||
try:
|
|
||||||
session_id = session_row['SESSION_ID']
|
|
||||||
messages_df = db_client.query_messages_by_session_id(session_id)
|
|
||||||
|
|
||||||
if messages_df is not None and not messages_df.empty:
|
|
||||||
conversation = db_client.data_processor.messages_df_to_list(messages_df)
|
|
||||||
if conversation:
|
|
||||||
conversations.append(conversation)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"处理会话 {session_row.get('SESSION_ID', 'unknown')} 时出错: {e}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
return conversations
|
|
||||||
|
|
||||||
|
|
||||||
class SessionProcessor:
|
class SessionProcessor:
|
||||||
"""会话处理器,负责批量和并发处理"""
|
"""会话处理器,负责并发处理"""
|
||||||
|
|
||||||
def __init__(self, db_client: MariaDBClient, max_workers: int = None, batch_size: int = 50):
|
def __init__(self, db_client: MariaDBClient, max_workers: int = None):
|
||||||
self.db_client = db_client
|
self.db_client = db_client
|
||||||
self.max_workers = max_workers if max_workers is not None else os.cpu_count()
|
self.max_workers = max_workers if max_workers is not None else os.cpu_count()
|
||||||
self.batch_size = batch_size
|
|
||||||
self.temp_save_lock = threading.Lock() # 添加锁用于保护临时保存操作
|
self.temp_save_lock = threading.Lock() # 添加锁用于保护临时保存操作
|
||||||
|
|
||||||
logger.info(f"初始化会话处理器: max_workers={self.max_workers}, batch_size={self.batch_size}")
|
logger.info(f"初始化会话处理器: max_workers={self.max_workers}")
|
||||||
|
|
||||||
def process_sessions(self, sessions_df: pd.DataFrame) -> List[List[Dict[str, Any]]]:
|
def process_sessions(self, sessions_df: pd.DataFrame) -> List[List[Dict[str, Any]]]:
|
||||||
"""处理所有会话数据"""
|
"""处理所有会话数据"""
|
||||||
@@ -437,29 +419,41 @@ class SessionProcessor:
|
|||||||
total_sessions = len(sessions_df)
|
total_sessions = len(sessions_df)
|
||||||
logger.info(f"开始处理 {total_sessions} 个会话...")
|
logger.info(f"开始处理 {total_sessions} 个会话...")
|
||||||
|
|
||||||
# 分批处理
|
|
||||||
all_conversations = []
|
all_conversations = []
|
||||||
batch_count = (total_sessions + self.batch_size - 1) // self.batch_size
|
|
||||||
# 使用线程池处理批次
|
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
|
||||||
# 提交所有批次任务
|
|
||||||
future_to_batch = {}
|
|
||||||
|
|
||||||
for i in range(0, total_sessions, self.batch_size):
|
# 直接并发处理每个会话
|
||||||
batch = sessions_df.iloc[i:i + self.batch_size]
|
def process_single_session(session_row):
|
||||||
future = executor.submit(process_session_batch, self.db_client, batch)
|
try:
|
||||||
future_to_batch[future] = i // self.batch_size + 1
|
session_id = session_row['SESSION_ID']
|
||||||
|
messages_df = self.db_client.query_messages_by_session_id(session_id)
|
||||||
|
|
||||||
|
if messages_df is not None and not messages_df.empty:
|
||||||
|
conversation = self.db_client.data_processor.messages_df_to_list(messages_df)
|
||||||
|
if conversation:
|
||||||
|
return conversation
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"处理会话 {session_row.get('SESSION_ID', 'unknown')} 时出错: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# 使用线程池处理所有会话
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
||||||
|
# 提交所有会话处理任务
|
||||||
|
future_to_session = {
|
||||||
|
executor.submit(process_single_session, row): i
|
||||||
|
for i, row in sessions_df.iterrows()
|
||||||
|
}
|
||||||
|
|
||||||
# 收集结果
|
# 收集结果
|
||||||
with tqdm(total=batch_count, desc="处理批次进度") as pbar:
|
with tqdm(total=total_sessions, desc="处理会话进度") as pbar:
|
||||||
for future in concurrent.futures.as_completed(future_to_batch):
|
for future in concurrent.futures.as_completed(future_to_session):
|
||||||
try:
|
try:
|
||||||
batch_conversations = future.result()
|
conversation = future.result()
|
||||||
all_conversations.extend(batch_conversations)
|
if conversation:
|
||||||
|
all_conversations.append(conversation)
|
||||||
|
|
||||||
# 使用锁保护临时列表的操作
|
|
||||||
with self.temp_save_lock:
|
|
||||||
# 每处理100个对话临时保存一次
|
# 每处理100个对话临时保存一次
|
||||||
|
if len(all_conversations) % 100 == 0:
|
||||||
|
with self.temp_save_lock:
|
||||||
logger.info(f"临时保存 {len(all_conversations)} 个对话")
|
logger.info(f"临时保存 {len(all_conversations)} 个对话")
|
||||||
temp_output_file = self.db_client.export_to_excel(
|
temp_output_file = self.db_client.export_to_excel(
|
||||||
all_conversations,
|
all_conversations,
|
||||||
@@ -469,12 +463,9 @@ class SessionProcessor:
|
|||||||
if temp_output_file:
|
if temp_output_file:
|
||||||
logger.info(f"临时保存完成: {temp_output_file}")
|
logger.info(f"临时保存完成: {temp_output_file}")
|
||||||
|
|
||||||
batch_num = future_to_batch[future]
|
|
||||||
logger.debug(f"批次 {batch_num} 完成,获得 {len(batch_conversations)} 个对话")
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
batch_num = future_to_batch[future]
|
session_idx = future_to_session[future]
|
||||||
logger.error(f"处理批次 {batch_num} 时出错: {e}")
|
logger.error(f"处理会话索引 {session_idx} 时出错: {e}")
|
||||||
|
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
|
|
||||||
@@ -492,17 +483,17 @@ def main() -> None:
|
|||||||
# 创建数据库客户端
|
# 创建数据库客户端
|
||||||
with MariaDBClient(config, max_connections=12) as db_client:
|
with MariaDBClient(config, max_connections=12) as db_client:
|
||||||
# 查询会话数据
|
# 查询会话数据
|
||||||
start_date = '2025-01-01 00:00:00'
|
start_date = '2025-06-12 00:00:00'
|
||||||
end_date = '2025-06-12 00:00:00'
|
end_date = '2025-07-01 00:00:00'
|
||||||
|
|
||||||
logger.info(f"查询时间范围: {start_date} 到 {end_date}")
|
logger.info(f"查询时间范围: {start_date} 到 {end_date}")
|
||||||
# 创建会话处理器
|
# 创建会话处理器
|
||||||
processor = SessionProcessor(db_client, batch_size=100)
|
processor = SessionProcessor(db_client)
|
||||||
is_debug = hasattr(sys, 'gettrace') and sys.gettrace() is not None
|
# is_debug = hasattr(sys, 'gettrace') and sys.gettrace() is not None
|
||||||
if is_debug:
|
# if is_debug:
|
||||||
messages_df = db_client.query_messages_by_session_id("86c919e0-09f1-11f0-84ae-2daf59566989")
|
# messages_df = db_client.query_messages_by_session_id("86c919e0-09f1-11f0-84ae-2daf59566989")
|
||||||
print(db_client.data_processor.messages_df_to_list(messages_df))
|
# print(db_client.data_processor.messages_df_to_list(messages_df))
|
||||||
return []
|
# return []
|
||||||
|
|
||||||
sessions_df = db_client.query_sessions(start_date, end_date)
|
sessions_df = db_client.query_sessions(start_date, end_date)
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,116 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import json
|
||||||
|
|
||||||
|
from regex import search
|
||||||
|
|
||||||
|
import ijson
|
||||||
|
|
||||||
|
df = pd.read_excel("data/excel/已分析数据汇总(第一轮).xlsx")
|
||||||
|
df=df[df["评价"]=="dislike"]
|
||||||
|
|
||||||
|
msg_id_list = df["msg_id"].tolist()
|
||||||
|
msg_debug_list = {}
|
||||||
|
# 流式解析 JSON 数组
|
||||||
|
with open("data/excel/msg_debug_list.json", "r", encoding="utf-8") as f:
|
||||||
|
# 使用ijson.items直接获取顶层键值对
|
||||||
|
for msg_id, data in ijson.kvitems(f, ''):
|
||||||
|
if msg_id in msg_id_list:
|
||||||
|
msg_debug_list[msg_id] = data
|
||||||
|
|
||||||
|
def get_rewrite_query(intent_node_execution_info)->str:
|
||||||
|
outputs_result =json.loads(intent_node_execution_info['outputs'])
|
||||||
|
return outputs_result['optimize_query']
|
||||||
|
|
||||||
|
def judge_error_node_and_reason(intent_node_execution_info, knowledge_filter_node_execution_info_list, answer_wiki_name)->dict:
|
||||||
|
result = {"问题改写结果":None, "错误环节":None, "错误原因":None, "具体描述":None}
|
||||||
|
if answer_wiki_name is None or pd.isna(answer_wiki_name):
|
||||||
|
return result
|
||||||
|
|
||||||
|
outputs_result =json.loads(intent_node_execution_info['outputs'])
|
||||||
|
result["问题改写结果"] = outputs_result['optimize_query']
|
||||||
|
if outputs_result['is_complete'] == False:
|
||||||
|
result["错误环节"] = "槽点填充"
|
||||||
|
result["错误原因"] = f"槽点缺失"
|
||||||
|
result["具体描述"] = f"缺失内容:{outputs_result['missing_slots']}"
|
||||||
|
return result
|
||||||
|
|
||||||
|
if len(knowledge_filter_node_execution_info_list) == 0:
|
||||||
|
return result
|
||||||
|
|
||||||
|
knowledge_filter_node_execution_info=knowledge_filter_node_execution_info_list[0]
|
||||||
|
# 获取检索到的所有词条
|
||||||
|
knowledge_filter_outputs = json.loads(knowledge_filter_node_execution_info['outputs'])
|
||||||
|
source_knowledge = knowledge_filter_outputs['source_kno']
|
||||||
|
source_knowledge_title ="\n".join([item['title'] for item in source_knowledge])
|
||||||
|
if answer_wiki_name not in source_knowledge_title:
|
||||||
|
result["错误环节"] = "知识检索"
|
||||||
|
result["错误原因"] = f"未检索到对应词条"
|
||||||
|
|
||||||
|
# 获取词条名称及对应评分
|
||||||
|
result["具体描述"] = "检索到的词条如下:\n"
|
||||||
|
for index, item in enumerate(source_knowledge):
|
||||||
|
result["具体描述"] += f"词条名称:{item['title'].split('/')[-1]},重排评分:{item['metadata']['score']:.2f}\n"
|
||||||
|
return result
|
||||||
|
|
||||||
|
# 获取检索到的词条的metadata
|
||||||
|
knowledge_filter = knowledge_filter_outputs['knowledge_list_metadata']
|
||||||
|
knowledge_filter_title ="\n".join([item['title'] for item in knowledge_filter])
|
||||||
|
if answer_wiki_name not in knowledge_filter_title:
|
||||||
|
result["错误环节"] = "知识过滤"
|
||||||
|
result["错误原因"] = f"词条被过滤"
|
||||||
|
result["具体描述"] = "检索到的词条如下:\n"
|
||||||
|
for index, item in enumerate(source_knowledge):
|
||||||
|
result["具体描述"] += f"词条名称:{item['title'].split('/')[-1]},重排评分:{item['metadata']['score']:.2f}\n"
|
||||||
|
return result
|
||||||
|
|
||||||
|
# 检索正确,回答错误
|
||||||
|
result["错误环节"] = "生成错误"
|
||||||
|
result["错误原因"] = f""
|
||||||
|
result["具体描述"] = f""
|
||||||
|
return result
|
||||||
|
|
||||||
|
df["问题改写结果"] = None
|
||||||
|
df["错误环节"] = None
|
||||||
|
df["错误原因"] = None
|
||||||
|
df["具体描述"] = None
|
||||||
|
|
||||||
|
for index, row in df.iterrows():
|
||||||
|
try:
|
||||||
|
msg_id = row["msg_id"]
|
||||||
|
answer = row["回答"]
|
||||||
|
query = row["提问"]
|
||||||
|
rating = row["评价"]
|
||||||
|
class_type = row["问题分类"]
|
||||||
|
dislike_reason = row["点踩原因"]
|
||||||
|
if dislike_reason is None or pd.isna(dislike_reason):
|
||||||
|
continue
|
||||||
|
|
||||||
|
answer_wiki_name = row["关联词条"]
|
||||||
|
search_wiki = row["检索到的词条"]
|
||||||
|
node_executions_info = msg_debug_list[msg_id]
|
||||||
|
intent_node_execution_info = [node_execution_info for node_execution_info in node_executions_info
|
||||||
|
if node_execution_info["title"] == "意图识别结果解析"]
|
||||||
|
|
||||||
|
knowledge_filter_node_execution_info_list = [node_execution_info for node_execution_info in node_executions_info
|
||||||
|
if node_execution_info["title"] == "提取处理后的知识"]
|
||||||
|
if len(intent_node_execution_info) == 0:
|
||||||
|
print(f"msg_id: {msg_id} 缺少节点信息")
|
||||||
|
continue
|
||||||
|
|
||||||
|
rewrite_query = get_rewrite_query(intent_node_execution_info[0])
|
||||||
|
df.loc[index, "问题改写结果"] = rewrite_query
|
||||||
|
if "有词条" not in dislike_reason:
|
||||||
|
continue
|
||||||
|
result = judge_error_node_and_reason(intent_node_execution_info[0], knowledge_filter_node_execution_info_list, answer_wiki_name)
|
||||||
|
for key, value in result.items():
|
||||||
|
df.loc[index, key] = value
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"msg_id: {msg_id} 处理失败: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
df.to_excel("data/excel/已分析数据汇总(第一轮)_分析.xlsx", index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.append(os.getcwd())
|
||||||
|
import rag2_0.dify.dify_client.dify_api as DifyApi
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
pd_data = pd.read_excel("data/excel/2025年5月30日到6月10号对话记录_转工单.xlsx")
|
||||||
|
|
||||||
|
|
||||||
|
dify_api = DifyApi.DifyApi()
|
||||||
|
dataset_id = dify_api.get_or_create_dataset_by_name("工单问答数据")
|
||||||
|
document_id = dify_api.upload_text_to_document(text_name="5月30日到6月10号对话工单", text="", dataset_id=dataset_id)
|
||||||
|
|
||||||
|
segments_list=[]
|
||||||
|
for index, row in pd_data.iterrows():
|
||||||
|
query = row["客户问题"]
|
||||||
|
answer = row["解决方案"]
|
||||||
|
if "存在抱怨" in answer:
|
||||||
|
answer = answer.split("存在抱怨")[0]
|
||||||
|
|
||||||
|
content = f"问题:{query}\n解决方案:{answer}"
|
||||||
|
segments_list.append({
|
||||||
|
"content": str(content),
|
||||||
|
"answer": "",
|
||||||
|
"keywords": []
|
||||||
|
})
|
||||||
|
|
||||||
|
dify_api.add_document_segments(dataset_id=dataset_id, document_id=document_id, segments_list=segments_list)
|
||||||
@@ -402,7 +402,8 @@ class DifyApi:
|
|||||||
content: str,
|
content: str,
|
||||||
answer: str,
|
answer: str,
|
||||||
keywords: List[str],
|
keywords: List[str],
|
||||||
enabled: bool
|
enabled: bool,
|
||||||
|
regenerate_child_chunks: bool = True
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""
|
"""
|
||||||
更新指定文档的某个分段信息。
|
更新指定文档的某个分段信息。
|
||||||
@@ -430,7 +431,7 @@ class DifyApi:
|
|||||||
"answer": answer,
|
"answer": answer,
|
||||||
"keywords": keywords,
|
"keywords": keywords,
|
||||||
"enabled": enabled,
|
"enabled": enabled,
|
||||||
"regenerate_child_chunks": True
|
"regenerate_child_chunks": regenerate_child_chunks
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -37,9 +37,9 @@ class DifyExporter:
|
|||||||
self.query_log_dir = os.path.join(os.getcwd(), "data", "query_logs")
|
self.query_log_dir = os.path.join(os.getcwd(), "data", "query_logs")
|
||||||
self.query_log_file = query_log_file or os.path.join(self.query_log_dir, "answer_type_logs.json")
|
self.query_log_file = query_log_file or os.path.join(self.query_log_dir, "answer_type_logs.json")
|
||||||
|
|
||||||
# 设置日期过滤
|
# 设置日期过滤,转换为datetime对象
|
||||||
self.start_date = start_date
|
self.start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d %H") if start_date else None
|
||||||
self.end_date = end_date
|
self.end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d %H") if end_date else None
|
||||||
|
|
||||||
# 初始化工具类
|
# 初始化工具类
|
||||||
self.dify_pgsql = PgSql()
|
self.dify_pgsql = PgSql()
|
||||||
@@ -49,22 +49,22 @@ class DifyExporter:
|
|||||||
self.message_info_list = []
|
self.message_info_list = []
|
||||||
self.query_logs = {}
|
self.query_logs = {}
|
||||||
|
|
||||||
def load_query_logs(self):
|
def load_query_logs(self,path):
|
||||||
"""
|
"""
|
||||||
从文件加载查询日志
|
从文件加载查询日志
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
with open(self.query_log_file, 'r', encoding='utf-8') as f:
|
with open(path, 'r', encoding='utf-8') as f:
|
||||||
query_logs_list = json.load(f)
|
query_logs_list = json.load(f)
|
||||||
# 创建字典来存储每个查询的最新记录
|
# 创建字典来存储每个查询的最新记录workflow_run_id
|
||||||
for record in query_logs_list:
|
for record in query_logs_list:
|
||||||
query = record['query']
|
workflow_run_id = record['workflow_run_id']
|
||||||
timestamp = record.get('timestamp')
|
timestamp = record.get('timestamp')
|
||||||
# 如果查询不在字典中或者当前记录的时间戳更新,则更新字典
|
# 如果查询不在字典中或者当前记录的时间戳更新,则更新字典
|
||||||
if query not in self.query_logs or (timestamp and self.query_logs.get(query, {}).get('timestamp') and
|
if workflow_run_id not in self.query_logs or (timestamp and self.query_logs.get(workflow_run_id, {}).get('timestamp') and
|
||||||
datetime.datetime.fromisoformat(timestamp) >
|
datetime.datetime.fromisoformat(timestamp) >
|
||||||
datetime.datetime.fromisoformat(self.query_logs[query]['timestamp'])):
|
datetime.datetime.fromisoformat(self.query_logs[workflow_run_id]['timestamp'])):
|
||||||
self.query_logs[query] = record
|
self.query_logs[workflow_run_id] = record
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"加载查询日志失败: {e}")
|
print(f"加载查询日志失败: {e}")
|
||||||
@@ -103,6 +103,71 @@ class DifyExporter:
|
|||||||
message_chain_new.append(msg)
|
message_chain_new.append(msg)
|
||||||
return message_chain_new
|
return message_chain_new
|
||||||
|
|
||||||
|
def get_remark(self, msg_debug_info):
|
||||||
|
"""
|
||||||
|
获取备注
|
||||||
|
"""
|
||||||
|
intent_node_execution_info = [node_execution_info for node_execution_info in msg_debug_info['workflow_node_executions_info']
|
||||||
|
if node_execution_info["title"] == "意图识别结果解析"]
|
||||||
|
if len(intent_node_execution_info) == 0:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
if intent_node_execution_info[0]["outputs"] is None:
|
||||||
|
return ""
|
||||||
|
intent_result = json.loads(intent_node_execution_info[0]["outputs"])
|
||||||
|
vertical_classification = intent_result.get("vertical_classification", "")
|
||||||
|
sub_classification = intent_result.get("sub_classification", "")
|
||||||
|
if vertical_classification == "固定话术类":
|
||||||
|
return "使用固定话术"
|
||||||
|
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def get_node_info_by_title(self, workflow_node_executions_info:list, title:str) -> dict:
|
||||||
|
"""
|
||||||
|
获取指定标题的节点信息
|
||||||
|
"""
|
||||||
|
if workflow_node_executions_info is None:
|
||||||
|
return None
|
||||||
|
for node_execution in workflow_node_executions_info:
|
||||||
|
if node_execution["title"] == title:
|
||||||
|
return node_execution
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_wiki_list(self, msg_debug_info) -> list:
|
||||||
|
"""
|
||||||
|
获取检索到的词条列表
|
||||||
|
"""
|
||||||
|
wiki_list = []
|
||||||
|
if msg_debug_info['workflow_node_executions_info'] is None:
|
||||||
|
return []
|
||||||
|
node_execution = self.get_node_info_by_title(msg_debug_info['workflow_node_executions_info'], "提取处理后的知识")
|
||||||
|
if node_execution is not None:
|
||||||
|
if node_execution["outputs"] is None:
|
||||||
|
return []
|
||||||
|
source_kno = json.loads(node_execution["outputs"])["source_kno"]
|
||||||
|
knowledge_list_metadata = json.loads(node_execution["outputs"])["knowledge_list_metadata"]
|
||||||
|
for knowledge in knowledge_list_metadata:
|
||||||
|
document_name = knowledge['metadata']['document_name']
|
||||||
|
wiki_list.append(document_name.split("/")[-1])
|
||||||
|
return wiki_list
|
||||||
|
|
||||||
|
lock_node_execution = self.get_node_info_by_title(msg_debug_info['workflow_node_executions_info'], "软件锁知识")
|
||||||
|
if lock_node_execution is not None:
|
||||||
|
if lock_node_execution["outputs"] is None:
|
||||||
|
return []
|
||||||
|
source_kno = json.loads(lock_node_execution["outputs"])['json'][0]['retrieve_result']
|
||||||
|
for knowledge in source_kno:
|
||||||
|
document_name = knowledge['metadata']['document_name']
|
||||||
|
wiki_list.append(document_name.split("/")[-1])
|
||||||
|
|
||||||
|
wiki_list.append("锁信息查询")
|
||||||
|
wiki_list.append("软件锁注册、激活、查锁、试用锁延期")
|
||||||
|
return wiki_list
|
||||||
|
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
def extract_message_info(self, message):
|
def extract_message_info(self, message):
|
||||||
"""
|
"""
|
||||||
从消息中提取信息
|
从消息中提取信息
|
||||||
@@ -121,7 +186,7 @@ class DifyExporter:
|
|||||||
user_name = msg_inputs.get("user_name", "")
|
user_name = msg_inputs.get("user_name", "")
|
||||||
msg_query = message["query"]
|
msg_query = message["query"]
|
||||||
msg_answer = message["answer"]
|
msg_answer = message["answer"]
|
||||||
|
msg_answer = msg_answer.split("----------------------------------------")[0]
|
||||||
# 将UTC+0时间转换为UTC+8时间
|
# 将UTC+0时间转换为UTC+8时间
|
||||||
created_at_utc = message['created_at']
|
created_at_utc = message['created_at']
|
||||||
created_at_utc8 = created_at_utc + datetime.timedelta(hours=8)
|
created_at_utc8 = created_at_utc + datetime.timedelta(hours=8)
|
||||||
@@ -131,17 +196,9 @@ class DifyExporter:
|
|||||||
if not msg_debug_info:
|
if not msg_debug_info:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
wiki_list = []
|
wiki_list = self.get_wiki_list(msg_debug_info)
|
||||||
if msg_debug_info['workflow_node_executions_info'] is not None:
|
# 获取备注
|
||||||
for node_execution in msg_debug_info['workflow_node_executions_info']:
|
remark = self.get_remark(msg_debug_info)
|
||||||
if node_execution["title"] == "提取处理后的知识":
|
|
||||||
if node_execution["outputs"] is None:
|
|
||||||
break
|
|
||||||
source_kno = json.loads(node_execution["outputs"])["source_kno"]
|
|
||||||
knowledge_list_metadata = json.loads(node_execution["outputs"])["knowledge_list_metadata"]
|
|
||||||
for knowledge in knowledge_list_metadata:
|
|
||||||
document_name = knowledge['metadata']['document_name']
|
|
||||||
wiki_list.append(document_name.split("/")[-1])
|
|
||||||
|
|
||||||
wiki_list = list(set(wiki_list))
|
wiki_list = list(set(wiki_list))
|
||||||
wiki_list_str = "\n".join(wiki_list)
|
wiki_list_str = "\n".join(wiki_list)
|
||||||
@@ -149,7 +206,8 @@ class DifyExporter:
|
|||||||
wiki_list_str = "无"
|
wiki_list_str = "无"
|
||||||
rating = self.dify_pgsql.get_message_rating(msg_id)
|
rating = self.dify_pgsql.get_message_rating(msg_id)
|
||||||
# 直接通过字典键获取query_type
|
# 直接通过字典键获取query_type
|
||||||
query_type = self.query_logs.get(msg_query, {}).get('query_type', "")
|
workflow_run_id = message['workflow_run_id']
|
||||||
|
query_type = self.query_logs.get(workflow_run_id, {}).get('query_type', "")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"msg_id": msg_id,
|
"msg_id": msg_id,
|
||||||
@@ -159,7 +217,8 @@ class DifyExporter:
|
|||||||
"提问时间": created_at,
|
"提问时间": created_at,
|
||||||
"评价": rating,
|
"评价": rating,
|
||||||
"问题分类": query_type,
|
"问题分类": query_type,
|
||||||
"检索到的词条": wiki_list_str
|
"检索到的词条": wiki_list_str,
|
||||||
|
"备注": remark
|
||||||
}
|
}
|
||||||
|
|
||||||
def process_conversations(self):
|
def process_conversations(self):
|
||||||
@@ -184,13 +243,10 @@ class DifyExporter:
|
|||||||
created_at_utc = message['created_at']
|
created_at_utc = message['created_at']
|
||||||
created_at_utc8 = created_at_utc + datetime.timedelta(hours=8)
|
created_at_utc8 = created_at_utc + datetime.timedelta(hours=8)
|
||||||
|
|
||||||
# 提取消息的创建日期时间,精确到小时
|
|
||||||
created_at_hour = created_at_utc8.strftime("%Y-%m-%d %H")
|
|
||||||
|
|
||||||
# 应用日期时间过滤
|
# 应用日期时间过滤
|
||||||
if self.start_date and created_at_hour < self.start_date:
|
if self.start_date and created_at_utc8 < self.start_date:
|
||||||
continue
|
continue
|
||||||
if self.end_date and created_at_hour > self.end_date:
|
if self.end_date and created_at_utc8 > self.end_date:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
message_info = self.extract_message_info(message)
|
message_info = self.extract_message_info(message)
|
||||||
@@ -216,7 +272,7 @@ class DifyExporter:
|
|||||||
# 设置列的顺序
|
# 设置列的顺序
|
||||||
columns_order = [
|
columns_order = [
|
||||||
"msg_id", "提问", "回答", "提问人", "提问时间",
|
"msg_id", "提问", "回答", "提问人", "提问时间",
|
||||||
"评价", "问题分类", "检索到的词条"
|
"评价", "问题分类", "检索到的词条", "备注"
|
||||||
]
|
]
|
||||||
|
|
||||||
# 确保所有列都存在,如果不存在则添加空列
|
# 确保所有列都存在,如果不存在则添加空列
|
||||||
@@ -252,7 +308,8 @@ class DifyExporter:
|
|||||||
"提问时间": 15,
|
"提问时间": 15,
|
||||||
"评价": 10,
|
"评价": 10,
|
||||||
"问题分类": 20,
|
"问题分类": 20,
|
||||||
"检索到的词条": 40
|
"检索到的词条": 40,
|
||||||
|
"备注": 40
|
||||||
}
|
}
|
||||||
|
|
||||||
# 应用列宽设置
|
# 应用列宽设置
|
||||||
@@ -281,7 +338,8 @@ class DifyExporter:
|
|||||||
数据库中的时间是UTC+0时区,会自动转换为UTC+8时区进行过滤和显示
|
数据库中的时间是UTC+0时区,会自动转换为UTC+8时区进行过滤和显示
|
||||||
"""
|
"""
|
||||||
# 加载查询日志
|
# 加载查询日志
|
||||||
self.load_query_logs()
|
self.load_query_logs(self.query_log_file)
|
||||||
|
self.load_query_logs("data/query_logs/answer_type_logs_071409.json")
|
||||||
|
|
||||||
# 处理会话数据
|
# 处理会话数据
|
||||||
self.process_conversations()
|
self.process_conversations()
|
||||||
@@ -294,12 +352,12 @@ class DifyExporter:
|
|||||||
# 如果指定了日期范围,则在文件名中体现
|
# 如果指定了日期范围,则在文件名中体现
|
||||||
date_suffix = ""
|
date_suffix = ""
|
||||||
if self.start_date:
|
if self.start_date:
|
||||||
# 将空格替换为下划线,使文件名更规范
|
# 格式化日期对象为字符串
|
||||||
formatted_start = self.start_date.replace(" ", "_")
|
formatted_start = self.start_date.strftime("%Y-%m-%d_%H")
|
||||||
date_suffix += f"_from_{formatted_start}"
|
date_suffix += f"_from_{formatted_start}"
|
||||||
if self.end_date:
|
if self.end_date:
|
||||||
# 将空格替换为下划线,使文件名更规范
|
# 格式化日期对象为字符串
|
||||||
formatted_end = self.end_date.replace(" ", "_")
|
formatted_end = self.end_date.strftime("%Y-%m-%d_%H")
|
||||||
date_suffix += f"_to_{formatted_end}"
|
date_suffix += f"_to_{formatted_end}"
|
||||||
output_file = os.path.join(os.getcwd(), "data", "excel", f"dify_export{date_suffix}_{timestamp}.xlsx")
|
output_file = os.path.join(os.getcwd(), "data", "excel", f"dify_export{date_suffix}_{timestamp}.xlsx")
|
||||||
|
|
||||||
@@ -321,7 +379,7 @@ if __name__ == "__main__":
|
|||||||
help='Dify应用ID')
|
help='Dify应用ID')
|
||||||
parser.add_argument('--query_log_file', '-q', type=str, default="data/query_logs/answer_type_logs.json",
|
parser.add_argument('--query_log_file', '-q', type=str, default="data/query_logs/answer_type_logs.json",
|
||||||
help='查询日志文件路径')
|
help='查询日志文件路径')
|
||||||
parser.add_argument('--start_date', '-s', type=str, default="2025-07-09 13",
|
parser.add_argument('--start_date', '-s', type=str, default="2025-07-14 00",
|
||||||
help='开始日期时间,格式为YYYY-MM-DD HH,例如2025-07-08 14表示2025年7月8日14时(UTC+8时区)')
|
help='开始日期时间,格式为YYYY-MM-DD HH,例如2025-07-08 14表示2025年7月8日14时(UTC+8时区)')
|
||||||
parser.add_argument('--end_date', '-e', type=str, default=None,
|
parser.add_argument('--end_date', '-e', type=str, default=None,
|
||||||
help='结束日期时间,格式为YYYY-MM-DD HH,例如2025-07-08 18表示2025年7月8日18时(UTC+8时区)')
|
help='结束日期时间,格式为YYYY-MM-DD HH,例如2025-07-08 18表示2025年7月8日18时(UTC+8时区)')
|
||||||
|
|||||||
@@ -288,8 +288,10 @@ class AsyncIntentRecognizer:
|
|||||||
# 步骤2: 使用向量检索找到相似的专业名词
|
# 步骤2: 使用向量检索找到相似的专业名词
|
||||||
try:
|
try:
|
||||||
vector_start_time = time.time()
|
vector_start_time = time.time()
|
||||||
# 对matched_terms中的每个关键字进行向量检索
|
|
||||||
for current_key in query_keys:
|
# 创建并行任务列表
|
||||||
|
async def process_single_keyword(current_key: str) -> List[Term]:
|
||||||
|
"""处理单个关键词的向量检索和重排序"""
|
||||||
vector_results = await self._noun_retriever.query_async(current_key, top_k=5, use_intersection=False)
|
vector_results = await self._noun_retriever.query_async(current_key, top_k=5, use_intersection=False)
|
||||||
current_key_terms = set()
|
current_key_terms = set()
|
||||||
# 添加向量检索结果
|
# 添加向量检索结果
|
||||||
@@ -304,7 +306,17 @@ class AsyncIntentRecognizer:
|
|||||||
current_key_terms.add(term)
|
current_key_terms.add(term)
|
||||||
if len(current_key_terms) > 0:
|
if len(current_key_terms) > 0:
|
||||||
reranked_terms = await self._rerank_matched_terms_async(current_key, current_key_terms)
|
reranked_terms = await self._rerank_matched_terms_async(current_key, current_key_terms)
|
||||||
matched_terms.extend(reranked_terms)
|
return reranked_terms
|
||||||
|
return []
|
||||||
|
|
||||||
|
# 并行处理所有关键词
|
||||||
|
keyword_tasks = [process_single_keyword(current_key) for current_key in query_keys]
|
||||||
|
keyword_results = await asyncio.gather(*keyword_tasks)
|
||||||
|
|
||||||
|
# 合并所有结果
|
||||||
|
for result in keyword_results:
|
||||||
|
matched_terms.extend(result)
|
||||||
|
|
||||||
vector_end_time = time.time()
|
vector_end_time = time.time()
|
||||||
vector_time = vector_end_time - vector_start_time
|
vector_time = vector_end_time - vector_start_time
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -649,7 +661,7 @@ class AsyncIntentRecognizer:
|
|||||||
formatted_prompt = step_back_prompt.format(
|
formatted_prompt = step_back_prompt.format(
|
||||||
query=query,
|
query=query,
|
||||||
chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
|
chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
|
||||||
conversation_context=conversation_context,
|
# conversation_context=conversation_context,
|
||||||
output_format=step_back_parser.get_format_instructions()
|
output_format=step_back_parser.get_format_instructions()
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -688,7 +700,7 @@ class AsyncIntentRecognizer:
|
|||||||
formatted_prompt = follow_up_questions_prompt.format(
|
formatted_prompt = follow_up_questions_prompt.format(
|
||||||
query=query,
|
query=query,
|
||||||
chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
|
chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
|
||||||
conversation_context=conversation_context,
|
# conversation_context=conversation_context,
|
||||||
output_format=follow_up_parser.get_format_instructions()
|
output_format=follow_up_parser.get_format_instructions()
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -727,7 +739,7 @@ class AsyncIntentRecognizer:
|
|||||||
formatted_prompt = hyde_prompt.format(
|
formatted_prompt = hyde_prompt.format(
|
||||||
query=query,
|
query=query,
|
||||||
chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
|
chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
|
||||||
conversation_context=conversation_context,
|
# conversation_context=conversation_context,
|
||||||
output_format=hyde_parser.get_format_instructions()
|
output_format=hyde_parser.get_format_instructions()
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -766,7 +778,7 @@ class AsyncIntentRecognizer:
|
|||||||
formatted_prompt = multi_questions_prompt.format(
|
formatted_prompt = multi_questions_prompt.format(
|
||||||
query=query,
|
query=query,
|
||||||
chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
|
chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
|
||||||
conversation_context=conversation_context,
|
# conversation_context=conversation_context,
|
||||||
output_format=multi_questions_parser.get_format_instructions()
|
output_format=multi_questions_parser.get_format_instructions()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -226,30 +226,30 @@ step_back_prompt = """
|
|||||||
- 涵盖原始问题的核心主题
|
- 涵盖原始问题的核心主题
|
||||||
- 去除过于具体的限制条件(如时间、地点、特定版本、特定工程等)
|
- 去除过于具体的限制条件(如时间、地点、特定版本、特定工程等)
|
||||||
- 保持在同一领域和主题范围内
|
- 保持在同一领域和主题范围内
|
||||||
|
- 依次移除问题中的限定词或者修饰词
|
||||||
|
|
||||||
## 输入
|
## 输入
|
||||||
用户原始问题: {query}
|
用户原始问题: {query}
|
||||||
历史对话记录: {chat_history}
|
历史对话记录: {chat_history}
|
||||||
会话背景: {conversation_context}
|
|
||||||
|
|
||||||
## 输出格式
|
## 输出格式
|
||||||
{output_format}
|
{output_format}
|
||||||
|
|
||||||
## 示例
|
## 示例
|
||||||
原始问题: "配网D3软件2023版本如何在Windows 11系统上导入单位工程量清单?"
|
原始问题: "2023版本如何在Windows 11系统上导入单位工程量清单?"
|
||||||
后退问题:
|
后退问题:
|
||||||
{{
|
{{
|
||||||
"original_query": "配网D3软件2023版本如何在Windows 11系统上导入单位工程量清单?",
|
"original_query": "2023版本如何在Windows 11系统上导入单位工程量清单?",
|
||||||
"can_use_back_prompt": True,
|
"can_use_back_prompt": True,
|
||||||
"step_back_query": ["配网D3软件如何导入工程量清单?", "如何导入单位工程量清单?"]
|
"step_back_query": ["如何在Windows 11系统上导入单位工程量清单?", "如何导入单位工程量清单?"]
|
||||||
}}
|
}}
|
||||||
|
|
||||||
原始问题: "技改T1软件中的某个设备更换后,如何在系统中更新对应的定额?"
|
原始问题: "某个设备更换后,如何在系统中更新对应的定额?"
|
||||||
后退问题:
|
后退问题:
|
||||||
{{
|
{{
|
||||||
"original_query": "技改T1软件中的某个设备更换后,如何在系统中更新对应的定额?",
|
"original_query": "某个设备更换后,如何在系统中更新对应的定额?",
|
||||||
"can_use_back_prompt": True,
|
"can_use_back_prompt": True,
|
||||||
"step_back_query": ["技改T1软件中如何更新设备对应的定额?", "如何更新设备对应的定额?"]
|
"step_back_query": ["如何更新设备对应的定额?", "如何更新定额?"]
|
||||||
}}
|
}}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@@ -271,7 +271,6 @@ follow_up_questions_prompt = """
|
|||||||
## 输入
|
## 输入
|
||||||
历史对话记录: {chat_history}
|
历史对话记录: {chat_history}
|
||||||
当前用户问题: {query}
|
当前用户问题: {query}
|
||||||
会话背景: {conversation_context}
|
|
||||||
|
|
||||||
## 输出格式
|
## 输出格式
|
||||||
{output_format}
|
{output_format}
|
||||||
@@ -308,7 +307,6 @@ hyde_prompt = """
|
|||||||
## 输入
|
## 输入
|
||||||
用户问题: {query}
|
用户问题: {query}
|
||||||
历史对话记录: {chat_history}
|
历史对话记录: {chat_history}
|
||||||
会话背景: {conversation_context}
|
|
||||||
|
|
||||||
## 输出格式
|
## 输出格式
|
||||||
{output_format}
|
{output_format}
|
||||||
@@ -343,7 +341,6 @@ multi_questions_prompt = """
|
|||||||
## 输入
|
## 输入
|
||||||
用户原始问题: {query}
|
用户原始问题: {query}
|
||||||
历史对话记录: {chat_history}
|
历史对话记录: {chat_history}
|
||||||
会话背景: {conversation_context}
|
|
||||||
|
|
||||||
## 输出格式
|
## 输出格式
|
||||||
{output_format}
|
{output_format}
|
||||||
|
|||||||
@@ -275,16 +275,17 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
stats = instance.get_usage_stats()
|
stats = instance.get_usage_stats()
|
||||||
all_balance=0.0
|
all_balance=0.0
|
||||||
buy_balance=19 * 10 * 14 # 购买18次,一次10条api_key,每个api_key有14元
|
buy_balance=24 * 10 * 14 # 购买18次,一次10条api_key,每个api_key有14元
|
||||||
invalid_api_keys = []
|
invalid_api_keys = []
|
||||||
for key, data in stats.items():
|
for key, data in stats.items():
|
||||||
usage_stats = APIKeyManager.get_key_usage_stats(key)
|
usage_stats = APIKeyManager.get_key_usage_stats(key)
|
||||||
all_balance+=float(usage_stats['data']['balance'])
|
all_balance+=float(usage_stats['data']['balance'])
|
||||||
valid,err_info = APIKeyManager.get_valid_api_keys(key)
|
# valid,err_info = APIKeyManager.get_valid_api_keys(key)
|
||||||
if not valid:
|
# if not valid:
|
||||||
print(f"api_key:{key}---赠送余额:{usage_stats['data']['balance']}元---报错信息:{err_info}")
|
# print(f"api_key:{key}---赠送余额:{usage_stats['data']['balance']}元---报错信息:{err_info}")
|
||||||
# invalid_api_keys.append(key)
|
# # invalid_api_keys.append(key)
|
||||||
else:
|
# else:
|
||||||
|
# print(f"api_key:{key}---赠送余额:{usage_stats['data']['balance']}元")
|
||||||
print(f"api_key:{key}---赠送余额:{usage_stats['data']['balance']}元")
|
print(f"api_key:{key}---赠送余额:{usage_stats['data']['balance']}元")
|
||||||
if float(usage_stats['data']['balance']) == 0:
|
if float(usage_stats['data']['balance']) == 0:
|
||||||
invalid_api_keys.append(key)
|
invalid_api_keys.append(key)
|
||||||
|
|||||||
Reference in New Issue
Block a user