Merge branch 'dev' of https://git.97id.com/ly/zjdataai-app into dev

2024-09-05 10:11:10 +08:00
parent aba6475c5a 03c4eb1af1
commit 75fde3598b
13 changed files with 5275 additions and 788 deletions
@@ -0,0 +1,3 @@
+[submodule "webapp"]
+	path = webapp
+	url = https://git.97id.com/ly/webapp.git
@@ -112,6 +112,7 @@ SYSTEM_PROMPT="You are a weather forecast agent. You help users to get the weath
 - You can install any pip package (if it exists) by running a cell with pip install.
 "

+
 PRJTOJSON_URL = 'http://10.1.6.60:8092'
 PROJECT_TITLE = "您好，我是博微工程理解小助手，您可以问我有关[线路工程]工程数据的相关问题！"
 CHAT_UPLOAD_FILECACHE = "./output/uploaded"
@@ -27,8 +27,10 @@ from llama_index.core.settings import Settings

 logger = logging.getLogger("uvicorn")

+
 v1_router = v = APIRouter()

+
 gEvent_handler = None


@@ -7,6 +7,21 @@ class BaseConfig(BaseModel):

 	def ParamterCfg(self,**args):
 		projectInfo = args.get('projectInfo')
+		questions = os.getenv("CONVERSATION_STARTERS", "dev")
+		return{
+			"opening_statement": self.projectInfo,
+			"suggested_questions": questions.split('\n'),
+			"suggested_questions_after_answer": {
+				"enabled": False
+			},
+			"speech_to_text": {
+				"enabled": False
+			},
+			"text_to_speech": {
+class BaseConfig(BaseModel):
+	projectInfo:str = os.getenv("PROJECT_TITLE","您好，我是博微工程理解小助手，您可以问我有关[线路工程]工程数据的相关问题！")
+
+	def ParamterCfg(self):
 		questions = os.getenv("CONVERSATION_STARTERS", "dev")
 		return{
 			"opening_statement": self.projectInfo,
@@ -55,6 +70,33 @@ class BaseConfig(BaseModel):
 					]
 				}
 			},
+			"system_parameters": {  # NOTE(review): a second "system_parameters" key appears further down this literal — confirm which one is intended
+				"image_file_size_limit": "10",
+				"language": "",
+				"voice": ""
+			},
+			"retriever_resource": {
+				"enabled": True
+			},
+			"annotation_reply": {
+				"enabled": False
+			},
+			"more_like_this": {
+				"enabled": False
+			},
+			"user_input_form": [],
+			"sensitive_word_avoidance": {
+				"enabled": False
+			},
+			"file_upload": {
+				"image": {
+					"enabled": False,
+					"number_limits": 3,
+					"transfer_methods": [
+						"remote_url"
+					]
+				}
+			},
 			"system_parameters": {
 				"image_file_size_limit": "10"
 			}
@@ -10,6 +10,7 @@ import os
 logger = logging.getLogger(__name__)

 def load_configs():
+
    with open("config/loaders.yaml",encoding='utf-8') as f:
        configs = yaml.safe_load(f)
    return configs
@@ -98,6 +99,7 @@ def get_documents(docType:str):
            loader_config = loader_config or []
            match loader_type:
                case "file":
+
                    document = get_file_documents(FileLoaderConfig(**loader_config),docType)
                case "web":
                    document = get_web_documents(WebLoaderConfig(**loader_config))
@@ -107,4 +109,4 @@ def get_documents(docType:str):
                    raise ValueError(f"Invalid loader type: {loader_type}")
            documents.extend(document)

-    return documents
+    return documents  # hand the accumulated loader output back to the caller
@@ -67,16 +67,15 @@ class ModelPlatform:
@register(ModelPlateCategory,'ollama')
 class OllamaPlatform(ModelPlatform):
    def model(self):
-        #from llama_index.embeddings.ollama import OllamaEmbedding
-        #from llama_index.llms.ollama.base import DEFAULT_REQUEST_TIMEOUT, Ollama
-        #
-        # base_url = os.getenv("OLLAMA_BASE_URL") or "http://127.0.0.1:11434"
-        # request_timeout = float(
-        #     os.getenv("OLLAMA_REQUEST_TIMEOUT", DEFAULT_REQUEST_TIMEOUT)
-        # )
-        # Settings.llm = Ollama(
-        #     base_url=base_url, model=os.getenv("MODEL"), request_timeout=request_timeout
-        # )
+        from llama_index.llms.ollama.base import DEFAULT_REQUEST_TIMEOUT, Ollama
+
+        base_url = os.getenv("OLLAMA_BASE_URL") or "http://127.0.0.1:11434"
+        request_timeout = float(
+            os.getenv("OLLAMA_REQUEST_TIMEOUT", DEFAULT_REQUEST_TIMEOUT)
+        )
+        Settings.llm = Ollama(
+            base_url=base_url, model=os.getenv("MODEL"), request_timeout=request_timeout
+        )
        pass

    def embedding(self):
@@ -35,7 +35,9 @@ chroma="^0.2.0"
 llama-index-vector-stores-chroma = "^0.1.10"
 llama-index-readers-json = "^0.1.5"
 llama-index-retrievers-bm25 = "^0.2.2"
-llama-index-experimental = "^0.2.0"
+llama-index-experimental = "^0.1.4"
+llama-index-llms-ollama = "^0.1.6"
+llama-index-embeddings-ollama = "^0.1.3"

 duckduckgo_search = "^6.2.6"

@@ -63,11 +65,22 @@ version = "^0.8"
 version = "0.0.7"


+[[tool.poetry.source]]
+name = "ali"
+url = "https://mirrors.aliyun.com/pypi/simple/"
+priority = "primary"
+

 [[tool.poetry.source]]
-name = "mirrors"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple/"
-priority = "default"
+name = "tencent"
+url = "https://mirrors.cloud.tencent.com/pypi/simple/"
+priority = "primary"
+
+
+[[tool.poetry.source]]
+name = "tsinghua"
+url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/"
+priority = "primary"

 [build-system]
 requires = [ "poetry-core" ]
@@ -0,0 +1,138 @@
+import nest_asyncio
+nest_asyncio.apply()
+from llama_index.core import SimpleDirectoryReader
+from llama_index.core.node_parser import SentenceSplitter
+from llama_index.core import VectorStoreIndex
+from llama_index.core.evaluation import (
+    FaithfulnessEvaluator, 
+    DatasetGenerator, 
+    CorrectnessEvaluator, 
+    SemanticSimilarityEvaluator,
+)
+from llama_index.experimental.param_tuner import ParamTuner
+from llama_index.experimental.param_tuner.base import RunResult
+from llama_index.llms.openai import OpenAI
+
+import asyncio
+
+# 初始化环境
+from app.observability import init_observability
+from app.settings import init_settings
+from dotenv import load_dotenv
+
+load_dotenv()
+init_settings()
+init_observability()
+
+# Load the documents
+# NOTE(review): absolute, machine-specific Windows path — confirm before running elsewhere
+documents = SimpleDirectoryReader("D:/LLM_model/text2sql/zjdataai-app-test/backend/data-test").load_data()
+
+# Parameter dictionary: candidate values for each tuned parameter
+param_dict = {
+    "chunk_size": [512, 1024],
+    "top_k": [1, 5],
+    "temperature": [0.1, 1.0]
+}
+
+# 辅助函数
+def _build_index(chunk_size, documents):
+    """Build a vector index over ``documents`` split at ``chunk_size``.
+
+    Args:
+        chunk_size: Value passed to ``SentenceSplitter(chunk_size=...)``.
+        documents: Documents handed to ``VectorStoreIndex.from_documents``.
+
+    Returns:
+        The index returned by ``VectorStoreIndex.from_documents``.
+    """
+    return VectorStoreIndex.from_documents(
+        documents,
+        transformations=[SentenceSplitter(chunk_size=chunk_size)],
+    )
+
+# 评估函数
+def evaluate_query_engine(query_engine, questions):
+    """Run the async evaluation to completion and return its result.
+
+    Args:
+        query_engine: Engine forwarded to ``_evaluate_query_engine_async``.
+        questions: Questions forwarded to ``_evaluate_query_engine_async``.
+
+    Returns:
+        A ``(correct, total)`` tuple.
+    """
+    event_loop = asyncio.get_event_loop()
+    outcome = event_loop.run_until_complete(
+        _evaluate_query_engine_async(query_engine, questions)
+    )
+    correct, total = outcome
+    return correct, total
+
+async def _evaluate_query_engine_async(query_engine, questions):
+    """Query every question concurrently and count faithful responses.
+
+    Args:
+        query_engine: Object exposing an awaitable ``aquery(question)``.
+        questions: Iterable of questions to send to the engine.
+
+    Returns:
+        ``(total_correct, total)``: the number of responses whose
+        faithfulness evaluation has a truthy ``passing``, and the number of
+        responses received.
+    """
+    results = await asyncio.gather(*[query_engine.aquery(q) for q in questions])
+    if not results:
+        # Nothing to judge, so no evaluator needs to be constructed.
+        return 0, 0
+
+    # One evaluator serves every response instead of a fresh one per response.
+    evaluator = FaithfulnessEvaluator()
+    total_correct = sum(
+        1 for r in results if evaluator.evaluate_response(response=r).passing
+    )
+    return total_correct, len(results)
+
+
+
+# Generate evaluation questions from the documents
+question_generator = DatasetGenerator.from_documents(documents)
+eval_questions = question_generator.generate_questions_from_nodes(1)  # NOTE(review): the original comment said "assume 10 questions are generated", but the argument is 1 — confirm the intended count
+
+# Print the generated questions
+for i, q in enumerate(eval_questions, start=1):
+    print(f"问题 {i}: {q}")
+
+# 目标函数
+def objective_function(params_dict, documents, questions):
+    """Score one parameter combination by faithfulness over ``questions``.
+
+    Args:
+        params_dict: Mapping with the keys ``"chunk_size"``, ``"top_k"`` and
+            ``"temperature"``.
+        documents: Documents to index for this run.
+        questions: Questions to ask the resulting query engine.
+
+    Returns:
+        A ``RunResult`` whose ``score`` is the fraction of questions with a
+        passing faithfulness evaluation (0 when there are no questions) and
+        whose ``metadata["question_answers"]`` holds the collected
+        ``(question, answer)`` pairs.
+    """
+    chunk_size = params_dict["chunk_size"]
+    top_k = params_dict["top_k"]
+    temperature = params_dict["temperature"]
+
+    vector_index = _build_index(chunk_size, documents)
+
+    # NOTE(review): ``temperature`` is forwarded as a query-engine keyword;
+    # confirm it actually reaches the LLM, otherwise this axis has no effect.
+    query_engine = vector_index.as_query_engine(
+        similarity_top_k=top_k, temperature=temperature
+    )
+
+    # One evaluator for the whole run instead of one per question.
+    evaluator = FaithfulnessEvaluator()
+    correct, total = 0, len(questions)
+    question_answers = []
+
+    for question in questions:
+        response = query_engine.query(question)
+        if response is None:
+            continue
+        question_answers.append((question, response.response))
+        verdict = evaluator.evaluate_response(response=response, query_str=question)
+        if verdict.passing:
+            correct += 1
+
+    score = correct / total if total > 0 else 0
+    # RunResult's declared fields are score/params/metadata, so the collected
+    # pairs travel in ``metadata`` rather than as an undeclared keyword.
+    return RunResult(
+        score=score,
+        params=params_dict,
+        metadata={"question_answers": question_answers},
+    )
+
+# Create the ParamTuner instance
+param_tuner = ParamTuner(
+    param_fn=lambda params_dict: objective_function(params_dict, documents, eval_questions),
+    param_dict=param_dict,
+    show_progress=True,
+)
+
+# Call the tune method and read out the best run
+results = param_tuner.tune()
+best_result = results.best_run_result
+best_top_k = best_result.params["top_k"]
+best_chunk_size = best_result.params["chunk_size"]
+best_temperature = best_result.params["temperature"]
+print(f"得分: {best_result.score}")
+print(f"Top-k: {best_top_k}")
+print(f"文本块大小: {best_chunk_size}")
+print(f"温度: {best_temperature}")
+
+# Run the query engine again with the best parameters and collect the questions and answers
+best_vector_index = _build_index(best_chunk_size, documents)
+best_query_engine = best_vector_index.as_query_engine(
+    similarity_top_k=best_top_k, temperature=best_temperature
+)
+
+best_question_answers = []
+for question in eval_questions:
+    response = best_query_engine.query(question)
+    if response is not None:
+        best_question_answers.append((question, response.response))
+
+# Print the questions and answers obtained with the best parameters
+for i, (question, answer) in enumerate(best_question_answers, start=1):
+    print(f"最佳参数 - 问题 {i}: {question}\n答案: {answer}\n")
@@ -0,0 +1,81 @@
+from app.observability import init_observability
+from app.settings import init_settings
+from dotenv import load_dotenv
+
+import nest_asyncio
+nest_asyncio.apply()
+
+load_dotenv()
+
+
+from llama_index.core.node_parser import SentenceSplitter
+from llama_index.core import (
+    VectorStoreIndex,
+    SimpleDirectoryReader,
+    Response,
+)
+from llama_index.core.evaluation import (
+    FaithfulnessEvaluator, 
+    DatasetGenerator, 
+    CorrectnessEvaluator, 
+    SemanticSimilarityEvaluator,)
+
+
+
+init_settings()
+init_observability()
+
+faith_evaluator_qwen = FaithfulnessEvaluator() # faithfulness evaluation
+corr_evaluator_qwen = CorrectnessEvaluator() # correctness evaluation
+Seman_evaluator_qwen = SemanticSimilarityEvaluator()# embedding-similarity evaluation
+
+# NOTE(review): absolute, machine-specific Windows path — confirm before running elsewhere
+documents = SimpleDirectoryReader("D:/LLM_model/text2sql/zjdataai-app-test/backend/data-test").load_data()
+
+splitter = SentenceSplitter(chunk_size=512)
+
+
+vector_index = VectorStoreIndex.from_documents(
+    documents, transformations=[splitter],
+)
+
+
+# # Run a single evaluation
+# NOTE(review): `evaluator_qwen` below is not defined in this script; the evaluator created above is `faith_evaluator_qwen`
+# query_engine = vector_index.as_query_engine()
+# response_vector = query_engine.query("工程监理费的金额是多少？")
+# eval_result = evaluator_qwen.evaluate_response(response=response_vector)
+
+# print(response_vector)
+# print(eval_result)
+
+
+# Generate evaluation questions from the documents and print them
+question_generator = DatasetGenerator.from_documents(documents)
+eval_questions = question_generator.generate_questions_from_nodes(5)
+print(eval_questions)
+
+import asyncio
+
+async def evaluate_query_engine_async(query_engine, questions):
+    """Query all questions concurrently and count faithful responses.
+
+    Args:
+        query_engine: Object exposing an awaitable ``aquery(question)``.
+        questions: Iterable of questions to send to the engine.
+
+    Returns:
+        ``(total_correct, total)``: how many responses the module-level
+        ``faith_evaluator_qwen`` marks as passing, and how many responses
+        were received.
+    """
+    pending = [query_engine.aquery(q) for q in questions]
+    responses = await asyncio.gather(*pending)
+
+    passed = 0
+    for response in responses:
+        if faith_evaluator_qwen.evaluate_response(response=response).passing:
+            passed += 1
+
+    return passed, len(responses)
+
+def evaluate_query_engine(query_engine, questions):
+    """Run ``evaluate_query_engine_async`` to completion on the event loop.
+
+    Args:
+        query_engine: Engine forwarded to ``evaluate_query_engine_async``.
+        questions: Questions forwarded to ``evaluate_query_engine_async``.
+
+    Returns:
+        A ``(correct, total)`` tuple.
+    """
+    event_loop = asyncio.get_event_loop()
+    outcome = event_loop.run_until_complete(
+        evaluate_query_engine_async(query_engine, questions)
+    )
+    correct, total = outcome
+    return correct, total
+
+# Evaluate the index's default query engine on the first five questions using evaluate_query_engine
+vector_query_engine = vector_index.as_query_engine()
+correct, total = evaluate_query_engine(vector_query_engine, eval_questions[:5])
+
+print(f"score: {correct}/{total}")
@@ -8,7 +8,8 @@ from llama_index.core import VectorStoreIndex, SQLDatabase
 from llama_index.core.indices.struct_store import SQLTableRetrieverQueryEngine
 from llama_index.core.objects import SQLTableNodeMapping, ObjectIndex
 from app.api.routers.chat import generate_filters
-from app.engine import get_index, makeDescriptionByEngine
+from app.engine import get_index
+from app.engine.engine import makeDescriptionByEngine
 from app.engine.loaders.db import CustomDatabaseReader
 from app.engine.vectordb import get_vector_store
 from app.observability import init_observability
@@ -7,7 +7,8 @@ from llama_index.core.objects import SQLTableNodeMapping, ObjectIndex
 from sqlalchemy import create_engine

 from app.api.routers.chat import generate_filters
-from app.engine import get_index, makeDescriptionByEngine
+from app.engine import get_index
+from app.engine.engine import makeDescriptionByEngine
 from app.engine.vectordb import get_vector_store
 from app.observability import init_observability
 from app.settings import init_settings