# zjdataai-app/backend/app/engine/generate.py

from dotenv import load_dotenv

load_dotenv()

import logging
import os

from app.engine.loaders import get_document_Types, get_documents,getProjectInfos
from app.engine.vectordb import get_vector_store,get_Neo4j_Graph_Store
from app.settings import init_settings
from app.engine.retriever.CHBM25Retriever import CHBM25Retriever
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter,MarkdownNodeParser
from llama_index.core.settings import Settings
from llama_index.core.storage import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import PropertyGraphIndex
from app.engine.graph.extractor import PrjGraphExtractor

# Configure root logging at INFO level for this script.
logging.basicConfig(level=logging.INFO)
# Module-wide logger; getLogger() without a name returns the root logger.
logger = logging.getLogger()

# Base directory for persisted stores; overridable through the STORAGE_DIR
# environment variable.
STORAGE_DIR = os.environ.get("STORAGE_DIR", "storage")

def get_doc_store(docType:str):
    """Return the document store for ``docType``.

    The store is loaded from ``STORAGE_DIR/<docType>`` when that directory
    exists; otherwise a new, empty ``SimpleDocumentStore`` is returned,
    because there is nothing on disk to load from.
    """
    persist_dir = os.path.join(STORAGE_DIR, docType)
    if not os.path.exists(persist_dir):
        # Nothing persisted at this path yet: start from an empty store.
        return SimpleDocumentStore()
    return SimpleDocumentStore.from_persist_dir(persist_dir)

def run_pipeline(docstore, vector_store, documents):
    """Run the ingestion pipeline over ``documents`` and return its result.

    Args:
        docstore: Document store handed to the pipeline (used with the
            ``"upserts_and_delete"`` docstore strategy).
        vector_store: Vector store handed to the pipeline.
        documents: Documents to ingest.

    Returns:
        Whatever ``IngestionPipeline.run`` returns for these documents.
    """
    # Splitting (SentenceSplitter / MarkdownNodeParser) is disabled here; the
    # embedding model is the only transformation applied.
    transformations = [Settings.embed_model]

    ingestion = IngestionPipeline(
        transformations=transformations,
        docstore=docstore,
        docstore_strategy="upserts_and_delete",
        vector_store=vector_store,
    )

    # Run the ingestion pipeline and hand the results back to the caller.
    return ingestion.run(show_progress=True, documents=documents)

def persist_storage(docstore, vector_store):
    """Persist ``docstore`` and ``vector_store`` under ``STORAGE_DIR``.

    Both stores are wrapped in a ``StorageContext`` which is then persisted
    to the module-level ``STORAGE_DIR``.
    """
    # NOTE(review): this writes to STORAGE_DIR itself, whereas get_doc_store
    # loads from STORAGE_DIR/<docType> -- confirm the intended on-disk layout.
    context = StorageContext.from_defaults(
        vector_store=vector_store,
        docstore=docstore,
    )
    context.persist(STORAGE_DIR)

def persist_BMRetriever(vector_store):
    """Build a ``CHBM25Retriever`` from the vector store's nodes and persist it.

    The target directory comes from the ``BM_RETRIEVER_PATH`` environment
    variable (default ``"storage_bm"``). ``similarity_top_k`` is the ``TOP_K``
    environment variable (default ``3``), capped at the number of nodes.
    """
    persist_dir = os.getenv("BM_RETRIEVER_PATH", "storage_bm")

    # presumably an empty id list means "all nodes" -- confirm against the
    # vector store implementation returned by get_vector_store.
    all_nodes = vector_store.get_nodes([])

    requested_top_k = int(os.getenv("TOP_K", "3"))
    # Never ask for more results than there are nodes.
    effective_top_k = min(requested_top_k, len(all_nodes))

    retriever = CHBM25Retriever.from_defaults(
        similarity_top_k=effective_top_k,
        nodes=all_nodes,
    )
    retriever.persist(persist_dir)

def generate_datasource():
    """Ingest the documents of every document type and persist the results.

    For each type returned by ``get_document_Types`` the documents are
    loaded, tagged as public, run through the ingestion pipeline, and the
    resulting stores plus a BM25 retriever are persisted.
    """
    logger.info("Generate index for the provided data")

    for doc_type in get_document_Types():
        docs = get_documents(doc_type)
        # Mark every document as public (private="false"); this metadata is
        # required for filtering.
        for document in docs:
            document.metadata["private"] = "false"

        # Get the stores for this type, or create new ones.
        store = get_doc_store(doc_type)
        vectors = get_vector_store(doc_type)

        # Run the ingestion pipeline; the returned nodes are not needed here.
        run_pipeline(store, vectors, docs)

        # Persist the stores and the BM25 retriever built from them.
        persist_storage(store, vectors)
        persist_BMRetriever(vectors)

    logger.info("Finished generating the index")

class PropertyGraphChache:
    """Builds a property-graph index for every project from ``getProjectInfos``.

    The backend is selected with the ``GRAPH_STORE_TYPE`` environment
    variable: ``"neo4j"`` uses a Neo4j graph store, any other value builds a
    simple index persisted to disk.
    """

    def generate(self):
        """Build the property graph of each project with the configured backend.

        Reads ``GRAPH_STORE_TYPE`` (default ``""``) and ``GRAPH_STORAGE_PATH``
        (default ``"storage_graph"``) from the environment. Each project's
        cache path is ``<GRAPH_STORAGE_PATH>/<flag>``.
        """
        graph_store_type = os.getenv("GRAPH_STORE_TYPE", "")
        graph_storage_dir = os.getenv("GRAPH_STORAGE_PATH", "storage_graph")
        for prj_info in getProjectInfos():
            prj_flag = prj_info['flag']
            prj_name = prj_info['name']
            cache_path = graph_storage_dir + f'/{prj_flag}'

            if graph_store_type == 'neo4j':
                # The neo4j builder takes the same arguments as the simple
                # one; calling it without them raised TypeError.
                self.neo4jProertyGraph(prj_name, prj_flag, cache_path)
            else:
                self.simplePropertyGraph(prj_name, prj_flag, cache_path)

    def simplePropertyGraph(self,prjName:str,prjFlag:str,filePath:str):
        """Build a property-graph index for one project and persist it to disk.

        Args:
            prjName: Project name passed to ``PrjGraphExtractor``.
            prjFlag: Project flag used to load the project's documents.
            filePath: Directory the index's storage context is persisted to;
                created if it does not exist.
        """
        documents = get_documents(prjFlag)
        index = PropertyGraphIndex(
            nodes=documents,
            kg_extractors=[PrjGraphExtractor(prjName)],
            embed_model=Settings.embed_model,
            show_progress=True,
        )
        os.makedirs(filePath, exist_ok=True)
        index.storage_context.persist(persist_dir=filePath)

    def neo4jProertyGraph(self,prjName:str,prjFlag:str,filePath:str):
        """Build a property-graph index for one project on a Neo4j graph store.

        Args:
            prjName: Project name passed to ``PrjGraphExtractor``.
            prjFlag: Project flag used to obtain the Neo4j store and to load
                the project's documents.
            filePath: Unused here; accepted so the signature matches
                ``simplePropertyGraph``.
        """
        neo4j_store = get_Neo4j_Graph_Store(prjFlag)
        documents = get_documents(prjFlag)
        # The index object is not kept; nothing is persisted to disk here.
        # NOTE(review): presumably constructing the index is what populates
        # the Neo4j store -- confirm against the PropertyGraphIndex docs.
        PropertyGraphIndex(
            nodes=documents,
            property_graph_store=neo4j_store,
            kg_extractors=[PrjGraphExtractor(prjName)],
            embed_model=Settings.embed_model,
            show_progress=True,
        )


if __name__ == "__main__":
    # Initialise application settings before any index is built
    # (presumably configures Settings.embed_model -- confirm in app.settings).
    init_settings()
    from phoenix.trace import using_project

    # os.getenv returns None when PHOENIX_PROJECT_NAME is unset, which made
    # the string concatenation raise TypeError; fall back to "default".
    project_name = os.getenv("PHOENIX_PROJECT_NAME", "default") + "_generate"
    with using_project(project_name):
        generate_datasource()
        PropertyGraphChache().generate()