From b6b697efdb7028c3172374d61e14a1b6990237ad Mon Sep 17 00:00:00 2001 From: paituo <330435863@qq.com> Date: Wed, 9 Apr 2025 14:21:18 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E7=9F=A5=E8=AF=86=E5=BA=93?= =?UTF-8?q?=E8=8C=83=E5=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.template | 14 ++++++++------ agentic_rag.py | 27 ++++++++++++++++++++++++--- main.py | 41 +++++++++++++++++++++++++---------------- pyproject.toml | 1 + 4 files changed, 58 insertions(+), 25 deletions(-) diff --git a/.env.template b/.env.template index 96c7fea..fbbcd06 100644 --- a/.env.template +++ b/.env.template @@ -9,13 +9,15 @@ MODEL_LIST=Qwen2.5-72B=openai:Qwen2.5-72B-Instruct-GPTQ-Int8 MODEL_BASE_URL=http://172.20.0.145:9995/v1 # 文件路径配置 -KNOWLEDGE_SOURCE_DIR=data -MEMORY_DB_FILE=tmp/agent_memory.db -VECTOR_DB_PATH=tmp/lancedb -SESSION_STORAGE_PATH=tmp/agent_sessions_json +MINGCI_KNOWLEDGE_SOURCE_DIR=data/业务名词库 +MINGCI_VECTOR_DB_PATH=tmp/mingcidb -# 知识库加载控制 -LOAD_KNOWLEDGE=true +KNOWLEDGE_SOURCE_DIR=data/控件布局 +VECTOR_DB_PATH=tmp/knowledgedb + +MEMORY_DB_FILE=tmp/agent_memory.db + +SESSION_STORAGE_PATH=tmp/agent_sessions_json AGNO_MONITOR=true AGNO_TELEMETRY=true diff --git a/agentic_rag.py b/agentic_rag.py index 38bc10d..a5a9987 100644 --- a/agentic_rag.py +++ b/agentic_rag.py @@ -130,12 +130,21 @@ def initialize_memory(model) -> AgentMemory: def initialize_vector_db() -> LanceDb: """初始化并返回配置好的LanceDb实例""" return LanceDb( - table_name="recipes", - uri=os.getenv("VECTOR_DB_PATH", "tmp/lancedb"), + table_name="knowledge", + uri=os.getenv("VECTOR_DB_PATH", "tmp/knowledgedb"), search_type=SearchType.hybrid, embedder=OpenAIEmbedder(id=embedding_model, base_url=embedding_baseUrl, api_key=api_key) ) - + +def initialize_mingci_vector_db() -> LanceDb: + """初始化并返回配置好的LanceDb实例""" + return LanceDb( + table_name="mingci", + uri=os.getenv("MINGCI_VECTOR_DB_PATH", "tmp/mingcidb"), + search_type=SearchType.hybrid, + embedder=OpenAIEmbedder(id=embedding_model, base_url=embedding_baseUrl, api_key=api_key) + ) + def initialize_knowledge_base() -> AgentKnowledge: """初始化并返回配置好的AgentKnowledge实例""" return AgentKnowledge( @@ -149,6 +158,18 @@ def initialize_knowledge_base() -> AgentKnowledge: reader=TextReader(), # 默认文本读取器 ) +def initialize_mingci_knowledge_base() -> AgentKnowledge: + """初始化并返回配置好的AgentKnowledge实例""" + return AgentKnowledge( + vector_db=initialize_mingci_vector_db(), + num_documents=3, # 检索3个最相关的文档 + chunking_strategy=DocumentChunking( + chunk_size=500, + overlap=50, + ), # 固定大小分块 + optimize_on=1000, # 每1000条数据进行向量优化 + reader=TextReader(), # 默认文本读取器 + ) def get_agentic_rag_agent( model_id: str = "openai:gpt-4o", diff --git a/main.py b/main.py index 1a340df..3e11d35 100644 --- a/main.py +++ b/main.py @@ -5,7 +5,7 @@ from agno.document import Document from agno.utils.log import logger from dotenv import load_dotenv -from agentic_rag import initialize_knowledge_base, get_reader +from agentic_rag import initialize_knowledge_base, get_reader, initialize_mingci_knowledge_base # 加载.env文件 load_dotenv() @@ -14,25 +14,34 @@ import os def main(): print("Hello from agno-agentic-rag!") # 从.env加载知识库来源目录并初始化知识库 - load_knowledge = os.getenv("LOAD_KNOWLEDGE", "false").lower() == "true" + mingci_knowledge_source_dir = os.getenv("MINGCI_KNOWLEDGE_SOURCE_DIR") + if mingci_knowledge_source_dir and os.path.exists(mingci_knowledge_source_dir): + # 初始化知识库 + knowledge_base = initialize_mingci_knowledge_base() + + LoadKnowledgeToDatabase(knowledge_base, mingci_knowledge_source_dir) + knowledge_source_dir = os.getenv("KNOWLEDGE_SOURCE_DIR") - if load_knowledge and knowledge_source_dir and os.path.exists(knowledge_source_dir): + if knowledge_source_dir and os.path.exists(knowledge_source_dir): # 初始化知识库 knowledge_base = initialize_knowledge_base() - logger.info(f"加载知识库: {knowledge_source_dir}") - for root, _, files in os.walk(knowledge_source_dir): - for file in files: - file_path = os.path.join(root, file) - file_ext = os.path.splitext(file)[1][1:] # 获取文件扩展名 - reader = get_reader(file_ext) - if reader: - try: - filePath = Path(file_path) - docs: List[Document] = reader.read(filePath) - knowledge_base.load_documents(docs, upsert=True) - except Exception as e: - logger.warning(f"无法加载文档 {file_path}: {str(e)}") + LoadKnowledgeToDatabase(knowledge_base, knowledge_source_dir) + +def LoadKnowledgeToDatabase(knowledge_base, knowledge_source_dir): + logger.info(f"加载知识库: {knowledge_source_dir}") + for root, _, files in os.walk(knowledge_source_dir): + for file in files: + file_path = os.path.join(root, file) + file_ext = os.path.splitext(file)[1][1:] # 获取文件扩展名 + reader = get_reader(file_ext) + if reader: + try: + filePath = Path(file_path) + docs: List[Document] = reader.read(filePath) + knowledge_base.load_documents(docs, upsert=True) + except Exception as e: + logger.warning(f"无法加载文档 {file_path}: {str(e)}") if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 16ecdbb..e03e5a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ dependencies = [ "nest-asyncio>=1.6.0", "streamlit>=1.44.1", "openai", + "pylance", "extra-streamlit-components>=0.1.71", "sqlalchemy>=2.0.38", "websockets>=14.2",