工程名称下拉项获取兼容.md文件,同时新增自定义答案合成类
This commit is contained in:
@@ -3,7 +3,7 @@ import yaml
|
||||
from app.engine.loaders.db import DBLoaderConfig, get_db_documents
|
||||
from app.engine.loaders.file import FileLoaderConfig, get_file_documents
|
||||
from app.engine.loaders.web import WebLoaderConfig, get_web_documents
|
||||
from app.engine.loaders.projectJson import getProjectName
|
||||
from app.engine.loaders.file import getProjectName
|
||||
import os
|
||||
|
||||
|
||||
|
||||
@@ -6,6 +6,9 @@ from llama_index.core.readers.base import BaseReader
|
||||
from llama_index.core.readers.json import JSONReader
|
||||
from llama_parse import LlamaParse
|
||||
from pydantic import BaseModel, validator
|
||||
from app.engine.loaders.markdownReader import ChunkMarkdownReader
|
||||
from app.engine.loaders.projectJson import ProjectJson
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -20,7 +23,6 @@ class FileLoaderConfig(BaseModel):
|
||||
raise ValueError(f"Directory '{v}' does not exist")
|
||||
return v
|
||||
|
||||
|
||||
def llama_parse_parser():
|
||||
if os.getenv("LLAMA_CLOUD_API_KEY") is None:
|
||||
raise ValueError(
|
||||
@@ -35,7 +37,6 @@ def llama_parse_parser():
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
def llama_parse_extractor() -> Dict[str, LlamaParse]:
|
||||
from llama_parse.utils import SUPPORTED_FILE_TYPES
|
||||
|
||||
@@ -43,8 +44,11 @@ def llama_parse_extractor() -> Dict[str, LlamaParse]:
|
||||
return {file_type: parser for file_type in SUPPORTED_FILE_TYPES}
|
||||
|
||||
def llama_local_extractor() -> Dict[str, BaseReader]:
    """Build the extension-to-reader mapping used for locally parsed files.

    Returns:
        A dict keyed by file extension (including the leading dot):
        ``.json`` is handled by ``JSONReader`` and ``.md`` by
        ``ChunkMarkdownReader``.
    """
    # A single return with both readers: an early return of the JSON-only
    # mapping would leave the Markdown reader unreachable.
    return {
        ".json": JSONReader(clean_json=False, levels_back=0),
        ".md": ChunkMarkdownReader(),
    }
|
||||
|
||||
def get_file_documents(config: FileLoaderConfig,childPath: str):
|
||||
from llama_index.core.readers import SimpleDirectoryReader
|
||||
@@ -86,3 +90,32 @@ def get_file_documents(config: FileLoaderConfig,childPath: str):
|
||||
else:
|
||||
# Raise the error if it is not the case of empty data dir
|
||||
raise e
|
||||
|
||||
def prjFileSuffix(dir: str):
    """Return the extension of the first regular file directly inside ``dir``.

    Args:
        dir: Path of the directory to inspect (not searched recursively).

    Returns:
        The extension including the leading dot (e.g. ``'.json'``) of the
        first regular file in name order, or ``''`` when the directory
        contains no regular files. Sub-directories are ignored.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``dir`` cannot be read.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # chosen file (and therefore the suffix) deterministic when the
    # directory holds more than one kind of file.
    for entry in sorted(os.listdir(dir)):
        if os.path.isfile(os.path.join(dir, entry)):
            return os.path.splitext(entry)[1]
    return ''
|
||||
|
||||
def getProjectName(dir: str):
    """Read the project name ('工程名称') from the project files in ``dir``.

    The file type is chosen by ``prjFileSuffix(dir)``:

    * ``.json`` - the directory is parsed with ``ProjectJson`` and the
      '工程属性' table is scanned for the record whose '名称' is '工程名称'.
    * ``.md`` - the file named '工程属性.md' is loaded with
      ``ChunkMarkdownReader`` and queried for the same row.

    Args:
        dir: Directory holding the exported project files.

    Returns:
        The value of the '值' field of the matching row, or ``''`` when the
        suffix is neither ``.json`` nor ``.md`` or no matching row/file is
        found.
    """
    kind = prjFileSuffix(dir)

    if kind == '.json':
        project = ProjectJson(dir)
        project.parse()
        for rec in project.table('工程属性').records():
            if rec.value('名称') == '工程名称':
                return rec.value('值')
        return ''

    if kind == '.md':
        for md_name in (n for n in os.listdir(dir) if n.endswith('.md')):
            # Only the properties table carries the project name.
            if os.path.splitext(md_name)[0] != '工程属性':
                continue
            reader = ChunkMarkdownReader()
            reader.load_data(os.path.join(dir, md_name))
            return reader.findValue("名称=='工程名称'", '值')

    return ''
|
||||
@@ -13,6 +13,8 @@ class ChunkMarkdownReader(MarkdownReader):
|
||||
) -> None:
|
||||
self._chunkSize = chunkSize
|
||||
self._tokenizer = get_tokenizer()
|
||||
self._colheader = ''
|
||||
self._rows = []
|
||||
super().__init__(*args,**kwargs)
|
||||
|
||||
def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
|
||||
@@ -34,6 +36,8 @@ class ChunkMarkdownReader(MarkdownReader):
|
||||
tokensNum = headerSize
|
||||
current_lines.clear()
|
||||
current_lines.append(line)
|
||||
if strTitle!='' and strheader!='':
|
||||
self._rows.append(line)
|
||||
|
||||
if line == '\n' or line == '\r':
|
||||
if tokensNum > self._chunkSize:
|
||||
@@ -43,10 +47,12 @@ class ChunkMarkdownReader(MarkdownReader):
|
||||
current_lines.clear()
|
||||
|
||||
if line.startswith("|---"):
|
||||
self._colheader = current_lines[0]
|
||||
strheader = "\n".join(current_lines)
|
||||
headerSize= headerSize + self._token_size(strheader)
|
||||
current_lines.clear()
|
||||
|
||||
|
||||
if len(current_lines) > 0:
|
||||
if len(markdown_tups) == 0:
|
||||
markdown_tups.append((strTitle + strheader , "\n".join(current_lines)))
|
||||
@@ -62,4 +68,22 @@ class ChunkMarkdownReader(MarkdownReader):
|
||||
]
|
||||
|
||||
def _token_size(self, text: str) -> int:
    """Return the number of tokens ``self._tokenizer`` produces for ``text``."""
    # A second, identical return statement used to follow this one; it was
    # unreachable and has been dropped.
    return len(self._tokenizer(text))
|
||||
|
||||
def findValue(self, expression: str, Field: str):
    """Return the ``Field`` cell of the first stored row matching ``expression``.

    Column names are taken from the table line kept in ``self._colheader``
    and rows from ``self._rows``. For every row the cells are bound to
    variables named after the columns, so ``expression`` can be written
    like ``"名称=='工程名称'"``.

    Args:
        expression: Python expression evaluated once per row with the
            row's cells available under their column names.
        Field: Name of the column whose cell is returned for the first
            row on which ``expression`` is truthy.

    Returns:
        The cell text with surrounding whitespace removed, or ``''`` when
        there is no header, no row matches, or the matching row has no
        ``Field`` column.
    """

    def split_cells(line):
        # Remove the line ending and the outer pipes before splitting, so
        # empty interior cells are kept and later cells stay under the
        # right column; padding around each cell is not significant.
        text = line.strip()
        if text.startswith('|'):
            text = text[1:]
        if text.endswith('|'):
            text = text[:-1]
        if not text.strip():
            return []
        return [cell.strip() for cell in text.split('|')]

    cols = split_cells(self._colheader)
    if not cols:
        return ''

    for row in self._rows:
        cells = split_cells(row)
        if not cells:
            continue
        row_vars = dict(zip(cols, cells))
        # NOTE(security): eval() runs arbitrary Python. Only pass
        # expressions written by trusted code, never user input. A copy is
        # handed over because eval() inserts '__builtins__' into the
        # globals mapping it receives.
        if eval(expression, dict(row_vars)):
            return row_vars.get(Field, '')
    return ''
|
||||
|
||||
|
||||
|
||||
@@ -55,7 +55,6 @@ class JsonTable:
|
||||
def comment(self):
|
||||
return self._comment
|
||||
|
||||
|
||||
class ProjectJson:
|
||||
def __init__(self,dir:str) -> None:
|
||||
self._dir = dir
|
||||
@@ -76,14 +75,5 @@ class ProjectJson:
|
||||
def tables(self):
|
||||
return self._tables
|
||||
|
||||
def getProjectName(dir: str):
    """Read the project name ('工程名称') from the JSON project in ``dir``.

    Parses the directory with ``ProjectJson`` and scans the '工程属性'
    table for the record whose '名称' field equals '工程名称'.

    Args:
        dir: Directory holding the exported project JSON files.

    Returns:
        The '值' field of the first matching record, or ``''`` when no
        record matches.
    """
    project = ProjectJson(dir)
    project.parse()
    properties: JsonTable = project.table('工程属性')
    return next(
        (
            rec.value('值')
            for rec in properties.records()
            if rec.value('名称') == '工程名称'
        ),
        '',
    )
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user