diff --git a/backend/app/engine/generate.py b/backend/app/engine/generate.py index 605959c..423d2e3 100644 --- a/backend/app/engine/generate.py +++ b/backend/app/engine/generate.py @@ -5,8 +5,8 @@ load_dotenv() import logging import os -from app.engine.loaders import get_document_Types, get_documents -from app.engine.vectordb import get_vector_store +from app.engine.loaders import get_document_Types, get_documents,getProjectInfos +from app.engine.vectordb import get_vector_store,get_Neo4j_Graph_Store from app.settings import init_settings from app.engine.retriever.CHBM25Retriever import CHBM25Retriever from llama_index.core.ingestion import IngestionPipeline @@ -14,15 +14,15 @@ from llama_index.core.node_parser import SentenceSplitter,MarkdownNodeParser from llama_index.core.settings import Settings from llama_index.core.storage import StorageContext from llama_index.core.storage.docstore import SimpleDocumentStore +from llama_index.core import PropertyGraphIndex +from app.engine.graph.extractor import PrjGraphExtractor logging.basicConfig(level=logging.INFO) logger = logging.getLogger() STORAGE_DIR = os.getenv("STORAGE_DIR", "storage") - def get_doc_store(docType:str): - # If the storage directory is there, load the document store from it. # If not, set up an in-memory document store since we can't load from a directory that doesn't exist. storeDir = os.path.join(STORAGE_DIR,docType) @@ -31,7 +31,6 @@ def get_doc_store(docType:str): else: return SimpleDocumentStore() - def run_pipeline(docstore, vector_store, documents): pipeline = IngestionPipeline( transformations=[ @@ -49,10 +48,8 @@ def run_pipeline(docstore, vector_store, documents): # Run the ingestion pipeline and store the results nodes = pipeline.run(show_progress=True, documents=documents) - return nodes - def persist_storage(docstore, vector_store): storage_context = StorageContext.from_defaults( docstore=docstore, @@ -60,7 +57,6 @@ def persist_storage(docstore, vector_store): ) storage_context.persist(STORAGE_DIR) - def persist_BMRetriever(vector_store): STORAGE_DIR = os.getenv("BM_RETRIEVER_PATH", "storage_bm") nodes = vector_store.get_nodes([]) @@ -68,9 +64,7 @@ def persist_BMRetriever(vector_store): bmRetriver = CHBM25Retriever.from_defaults(similarity_top_k=top_k,nodes = nodes) bmRetriver.persist(STORAGE_DIR) - def generate_datasource(): - init_settings() logger.info("Generate index for the provided data") # Get the stores and documents or create new ones @@ -92,8 +86,47 @@ def generate_datasource(): logger.info("Finished generating the index") +class PropertyGraphChache: + def generate(self): + GRAPH_STORE_TYPE = os.getenv("GRAPH_STORE_TYPE", "") + GRAPH_STORAGE_DIR = os.getenv("GRAPH_STORAGE_DIR", "storage_graph") + prjInfos = getProjectInfos() + for prjInfo in prjInfos: + prjFlag = prjInfo['flag'] + prjName = prjInfo['name'] + chche_Path = GRAPH_STORAGE_DIR + f'/{prjFlag}' + + if GRAPH_STORE_TYPE == 'neo4j': + self.neo4jProertyGraph() + else: + self.simplePropertyGraph(prjName,prjFlag,chche_Path) + + def simplePropertyGraph(self,prjName:str,prjFlag:str,filePath:str): + documents = get_documents(prjFlag) + index = PropertyGraphIndex( + nodes =documents, + kg_extractors = [PrjGraphExtractor(prjName)], + embed_model = Settings.embed_model, + show_progress= True + ) + os.makedirs(filePath,exist_ok = True) + index.storage_context.persist(persist_dir = filePath) + + def neo4jProertyGraph(self,prjName:str,prjFlag:str,filePath:str): + neo4jStore =get_Neo4j_Graph_Store(prjFlag) + documents = get_documents(prjFlag) + PropertyGraphIndex( + nodes =documents, + property_graph_store = neo4jStore, + kg_extractors = [PrjGraphExtractor(prjName)], + embed_model = Settings.embed_model, + show_progress= True + ) + if __name__ == "__main__": + init_settings() from phoenix.trace import using_project with using_project(os.getenv("PHOENIX_PROJECT_NAME") + "_generate") as obj: generate_datasource() + PropertyGraphChache().generate() diff --git a/backend/app/engine/graph/extractor.py b/backend/app/engine/graph/extractor.py new file mode 100644 index 0000000..f5e8a8e --- /dev/null +++ b/backend/app/engine/graph/extractor.py @@ -0,0 +1,128 @@ +import os +from llama_index.core.schema import TransformComponent, BaseNode +from llama_index.core.graph_stores.types import ( + EntityNode, + Relation, + Triplet, + KG_NODES_KEY, + KG_RELATIONS_KEY, +) +from app.engine.loaders.projectJson import ProjectJson +from app.engine.loaders.markdownReader import ChunkMarkdownReader + +class PrjGraphExtractor(TransformComponent): + ProjectName:str + def __init__(self,PrjName:str): + super().__init__(ProjectName = PrjName) + + + def __call__( + self, llama_nodes: list[BaseNode], **kwargs + ) -> list[BaseNode]: + if len(llama_nodes) > 0: + self._addPrjNode(llama_nodes[0]) + + for llama_node in llama_nodes: + fileName = self._getFileName(llama_node) + if fileName == '工程属性': + self._dealAttributeNode(llama_node) + else: + self._dealCommonNode(llama_node) + return llama_nodes + + def _dealCommonNode(self,llama_node:BaseNode): + fileName = self._getFileName(llama_node) + existing_nodes:list = llama_node.metadata.pop(KG_NODES_KEY, []) + existing_relations:list = llama_node.metadata.pop(KG_RELATIONS_KEY, []) + + records:dict[str,list] = self._getRecordNode(llama_node) + fInfos = fileName.split('_') + if len(fInfos) == 1: + existing_nodes.append(EntityNode(name=fInfos[0], label=fInfos[0])) + elif len(fInfos) == 2: + existing_nodes.append(EntityNode(name=fileName, label=fInfos[1])) + else: + raise ValueError("文件名存在多个下划线") + + index = 0 + for record in records: + index = index + 1 + rcdName = self._getRecordName(fileName,record) + existing_nodes.append(EntityNode(name=rcdName, label=rcdName,properties = record)) + existing_relations.append( + Relation( + label="包含", + source_id= fileName, + target_id= rcdName, + properties={}, + ) + ) + + existing_relations.append( + Relation( + label="包含", + source_id= self.ProjectName, + target_id= fileName, + properties={}, + ) + ) + + + llama_node.metadata[KG_NODES_KEY] = existing_nodes + llama_node.metadata[KG_RELATIONS_KEY] = existing_relations + + def _dealAttributeNode(self,llama_node:BaseNode): + fileName = self._getFileName(llama_node) + existing_nodes:list = llama_node.metadata.pop(KG_NODES_KEY, []) + existing_relations:list = llama_node.metadata.pop(KG_RELATIONS_KEY, []) + records:dict[str,list] = self._getRecordNode(llama_node) + existing_nodes.append(EntityNode(name=fileName, label=fileName)) + + index = 0 + for record in records: + index = index + 1 + attName = self._getRecordName(fileName,record) + existing_nodes.append(EntityNode(name=attName, label='属性',properties = record)) + existing_relations.append( + Relation( + label="聚合", + source_id= fileName, + target_id= attName, + properties={}, + ) + ) + + existing_relations.append( + Relation( + label="包含", + source_id= self.ProjectName, + target_id= fileName, + properties={}, + ) + ) + + llama_node.metadata[KG_NODES_KEY] = existing_nodes + llama_node.metadata[KG_RELATIONS_KEY] = existing_relations + + def _getRecordNode(self,llama_node:BaseNode): + content = llama_node.get_content() + rd = ChunkMarkdownReader() + rd.markdown_to_tups(content) + records = rd.records() + return records + + def _getFileName(self,llama_node:BaseNode): + meta = llama_node.metadata + fileName:str = os.path.splitext(meta['file_name'])[0] + return fileName + + def _addPrjNode(self,llama_node:BaseNode): + existing_nodes:list = llama_node.metadata.pop(KG_NODES_KEY, []) + existing_nodes.append(EntityNode(name=self.ProjectName, label=self.ProjectName)) + llama_node.metadata[KG_NODES_KEY] = existing_nodes + + def _getRecordName(self,fileName:str,record:dict): + for name,value in record.items(): + if '名称' in name: + return value + raise ValueError('记录名称为空') \ No newline at end of file diff --git a/backend/app/engine/graph/propertyGraph.py b/backend/app/engine/graph/propertyGraph.py new file mode 100644 index 0000000..a76b3dd --- /dev/null +++ b/backend/app/engine/graph/propertyGraph.py @@ -0,0 +1,87 @@ +from llama_index.core.indices.property_graph import LLMSynonymRetriever,VectorContextRetriever,PGRetriever +from llama_index.core.indices.property_graph.transformations.schema_llm import * +from llama_index.core import SimpleDirectoryReader +from llama_index.core import settings +from llama_index.core import PropertyGraphIndex +from typing import List,Tuple,Literal +from app.settings import init_settings +import os +from llama_index.core.storage.storage_context import StorageContext +from llama_index.core import load_index_from_storage +from app.observability import init_observability +from app.engine.vectordb import get_Neo4j_Graph_Store +from llama_index.core.response_synthesizers import ResponseMode +from util.register import * +from llama_index.core.query_engine import RetrieverQueryEngine +from app.engine.prompt import text_qa_template, refine_template, summary_template, simple_template +from app.engine.engine import get_node_postprocessors + +class PropertyGraph: + def __init__(self,prjFlag:str) -> None: + self._prjFlag = prjFlag + + def create_query_engine(self,retriever): + postprocess = get_node_postprocessors() + query_engine = RetrieverQueryEngine.from_args( + retriever = retriever, + text_qa_template=text_qa_template, + refine_template=refine_template, + summary_template = summary_template, + simple_template = simple_template, + node_postprocessors=postprocess, + use_async=True, + streaming=False, + response_mode = ResponseMode.TREE_SUMMARIZE + ) + return query_engine + + def getPropertyGraphIndex(self): + GRAPH_STORE_TYPE = os.getenv("GRAPH_STORE_TYPE", "") + if GRAPH_STORE_TYPE == 'neo4j': + index = PropertyGraphIndex.from_existing(property_graph_store= get_Neo4j_Graph_Store(self._prjFlag)) + else: + GRAPH_STORAGE_DIR = os.getenv("GRAPH_STORAGE_DIR", "storage_graph") + prjCachePath = GRAPH_STORAGE_DIR + f"/{self._prjFlag}" + if not os.path.exists(prjCachePath): + return None + storeContext = StorageContext.from_defaults(persist_dir = prjCachePath) + index = load_index_from_storage(storeContext) + return index + + def query(self,query_str:str): + index = self.getPropertyGraphIndex() + synonym_retriver = LLMSynonymRetriever(index.property_graph_store, + llm=settings.Settings.llm, + max_keywords=10, + include_text=False + ) + if index.property_graph_store.supports_vector_queries: + vector_store = None + else: + vector_store = index.vector_store + vector_retriver = VectorContextRetriever(index.property_graph_store, + vector_store = vector_store, + embed_model=settings.Settings.embed_model, + similarity_top_k=5, + include_text=False + ) + + retriever = index.as_retriever(sub_retrievers=[synonym_retriver,vector_retriver]) + query_engine = self.create_query_engine(retriever) + + response = query_engine.query(query_str) + print(response) + return str(response) + + +if __name__ == "__main__": + init_settings() + init_observability() + # graph = PropertyGraph('projects_1b20bbf4-3243-4ac3-bcf0-8a91e9157521') + # graph.query('代码为XLBT的金额是') + + + + + + diff --git a/backend/app/engine/loaders/file.py b/backend/app/engine/loaders/file.py index b2990f4..dcacee4 100644 --- a/backend/app/engine/loaders/file.py +++ b/backend/app/engine/loaders/file.py @@ -100,6 +100,7 @@ class CustomFileMetadataFunc: def _is_default_fs(self,fs: fsspec.AbstractFileSystem) -> bool: return isinstance(fs, LocalFileSystem) and not fs.auto_mkdir + def llama_parse_parser(): if os.getenv("LLAMA_CLOUD_API_KEY") is None: raise ValueError( diff --git a/backend/app/engine/loaders/markDown.py b/backend/app/engine/loaders/markDown.py index 698266a..933a051 100644 --- a/backend/app/engine/loaders/markDown.py +++ b/backend/app/engine/loaders/markDown.py @@ -56,9 +56,9 @@ class MarkDown: return strTitle + "\n" + markdown_table -prjSon = ProjectJson('') -prjSon.parse() -tables = prjSon.tables() -for name,table in tables.items(): - mdObj = MarkDown(table,f'') - mdObj.build() \ No newline at end of file +# prjSon = ProjectJson('C:\\Users\\wanyaokun\\Desktop\\markdown\\Project\\110千伏思科变电站工程') +# prjSon.parse() +# tables = prjSon.tables() +# for name,table in tables.items(): +# mdObj = MarkDown(table,f'C:\\Users\\wanyaokun\\Desktop\\markdown\\data\\110千伏思科变电站工程\\{table.name()}.md') +# mdObj.build() \ No newline at end of file diff --git a/backend/app/engine/loaders/markdownReader.py b/backend/app/engine/loaders/markdownReader.py index d0688b2..a7b61f0 100644 --- a/backend/app/engine/loaders/markdownReader.py +++ b/backend/app/engine/loaders/markdownReader.py @@ -19,32 +19,38 @@ class ChunkMarkdownReader(MarkdownReader): def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]: markdown_tups: List[Tuple[Optional[str], str]] = [] - lines = markdown_text.split("\n") + lines = self._multi_char_split(markdown_text,'\r\n') + lines = [line for line in lines if line!=''] strTitle = '' tokensNum:int = 0 current_lines = [] strheader:str = '' headerSize:int = 0 + bAreadyJudgeTitle = False for line in lines: tokensNum += self._token_size(line) if tokensNum > self._chunkSize and len(current_lines) > 0: if len(markdown_tups) == 0: - markdown_tups.append((strTitle + strheader , "\n".join(current_lines))) + titleHead = strTitle + '\n' + strheader if strTitle!= '' else strheader + markdown_tups.append((titleHead, "\n".join(current_lines))) else: markdown_tups.append((strheader , "\n".join(current_lines))) tokensNum = headerSize current_lines.clear() - current_lines.append(line) - if strTitle!='' and strheader!='': + + if strheader!='': self._rows.append(line) - if line == '\n' or line == '\r': - if tokensNum > self._chunkSize: - raise ValueError('标题Token数大于chunkSize大小') - strTitle = "\n".join(current_lines) - #headerSize = headerSize + self._token_size(strTitle) - current_lines.clear() + if line.startswith('|') and strTitle == '' and not bAreadyJudgeTitle: + if len(current_lines) > 0: + if tokensNum > self._chunkSize: + raise ValueError('标题Token数大于chunkSize大小') + strTitle = "\n".join(current_lines) + current_lines.clear() + bAreadyJudgeTitle = True + + current_lines.append(line) if line.startswith("|---"): self._colheader = current_lines[0] @@ -55,10 +61,11 @@ class ChunkMarkdownReader(MarkdownReader): if len(current_lines) > 0: if len(markdown_tups) == 0: - markdown_tups.append((strTitle + strheader , "\n".join(current_lines))) + titleHead = strTitle + '\n' + strheader if strTitle!= '' else strheader + markdown_tups.append((titleHead, "\n".join(current_lines))) else: markdown_tups.append((strheader , "\n".join(current_lines))) - + return [ ( key if key is None else re.sub(r"#", "", key).strip(), @@ -86,4 +93,24 @@ class ChunkMarkdownReader(MarkdownReader): return gData[Field] return '' + def records(self): + cols = self._colheader.split('|') + cols = cols[1:-1] + records = [] + for row in self._rows: + rowtrs = row.split('|') + rowdatas = [item for item in rowtrs if (item!='\r' or item!='\n')] + rowdatas = rowdatas[1:-1] + if len(rowdatas) == 0: + continue + record = {} + for cName,rValue in zip(cols,rowdatas): + record[cName] = rValue + records.append(record) + return records + def _multi_char_split(self,string, separators): + # 将多个分隔符连成一个正则表达式 + pattern = '[' + re.escape(separators) + ']' + # 使用正则表达式进行分割 + return re.split(pattern, string) \ No newline at end of file diff --git a/backend/app/engine/vectordb.py b/backend/app/engine/vectordb.py index 7d30c69..0992883 100644 --- a/backend/app/engine/vectordb.py +++ b/backend/app/engine/vectordb.py @@ -2,6 +2,7 @@ import os from llama_index.vector_stores.chroma import ChromaVectorStore from llama_index.vector_stores.qdrant import QdrantVectorStore from qdrant_client import qdrant_client +from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore qclient = None @@ -69,4 +70,14 @@ def get_vector_store(docType:str): case _: raise ValueError(f"Invalid vector store type: {store_type}") - return store \ No newline at end of file + return store + + +def get_Neo4j_Graph_Store(docType:str): + neo4jStore = Neo4jPropertyGraphStore( + username= os.getenv('NEO4J_USERNAME'), + password= os.getenv('NEO4J_PASSWORD'), + url=os.getenv('NEO4J_URL'), + database= docType + ) + return neo4jStore \ No newline at end of file