新增MarkDown切片

This commit is contained in:
wanyaokun
2024-09-06 18:22:01 +08:00
parent 1c773924db
commit bc124c5513
34 changed files with 150 additions and 31 deletions
+64
View File
@@ -0,0 +1,64 @@
from app.engine.loaders.projectJson import *
class MarkDown:
    """Render one table as a Markdown document and write it to a file.

    The generated document has a title section (table name, optional table
    comment, one bullet per exported column with its optional comment),
    followed by a blank line and a pipe-delimited Markdown table that holds
    one row per record.
    """

    # Field keys that are never exported as table columns.
    _IGNORED_FIELDS = ('_id', 'nodeType', 'relTbId')

    def __init__(self, table: "JsonTable", path: str) -> None:
        """Store the table to export and the path of the output file."""
        self._table = table
        self._path = path

    def build(self):
        """Collect columns and rows from the table and write the Markdown file.

        Fields whose key is ``_id``, ``nodeType`` or ``relTbId`` are skipped.
        For every remaining field the ``name`` value becomes the column title
        and the ``alias`` value becomes the column comment. The result of
        ``convert`` is written to ``self._path`` as UTF-8, replacing any
        existing file.
        """
        columns = []
        colComments = []
        for key, fld in self._table.fields().items():
            if key in self._IGNORED_FIELDS:
                continue
            columns.append(fld.value('name'))
            colComments.append(fld.value('alias'))

        # `columns` already excludes the ignored fields, so every row gets
        # exactly one cell per header column and stays aligned with it.
        rowdatas = [
            [self._cell_text(record.value(col)) for col in columns]
            for record in self._table.records()
        ]

        content = self.convert(
            self._table.name(),
            self._table.comment(),
            columns,
            colComments,
            rowdatas,
        )
        with open(self._path, 'w', encoding='utf-8') as file:
            file.write(content)

    @staticmethod
    def _cell_text(value) -> str:
        """Return *value* as single-line text usable as a table cell.

        ``None`` becomes an empty string and non-string values are converted
        with ``str`` so that a missing or numeric record value cannot crash
        the export. Line feeds are replaced by spaces because a Markdown
        table row must stay on one line.
        """
        if value is None:
            return ''
        return str(value).replace('\n', ' ')

    def convert(self, tableName: str, tableComment: str, columns: list, colComments: list, rowdatas: list):
        """Build the Markdown text for one table.

        Args:
            tableName: Text placed in the level-1 heading.
            tableComment: Optional table remark; the remark line is emitted
                only when this is non-empty.
            columns: Column titles, in output order.
            colComments: Column remarks, parallel to ``columns``; empty or
                ``None`` entries produce no remark line.
            rowdatas: One list of cell strings per record, in column order.

        Returns:
            The title section, a blank line, and the Markdown table.
        """
        title_parts = [f"# {tableName}\n"]
        # The remark line depends on the comment, not on the table name.
        if tableComment:
            title_parts.append(f"备注:{tableComment}\n")
        for column, comment in zip(columns, colComments):
            title_parts.append(f"- 字段名称:{column}\n")
            if comment:
                title_parts.append(f" - 备注:{comment}\n")

        table_parts = [
            # Header row with the column titles.
            "|" + "|".join(columns) + "|\n",
            # Separator row required between header and data rows.
            "|" + "|".join('---' for _ in columns) + "|\n",
        ]
        # One line per record.
        table_parts.extend("|" + "|".join(row) + "|\n" for row in rowdatas)

        return "".join(title_parts) + "\n" + "".join(table_parts)
# Module-level export script (no __main__ guard): parse a project and write
# one Markdown file per table.
# NOTE(review): the project directory passed here is an empty string —
# presumably a placeholder; confirm the intended directory.
prjSon = ProjectJson('')
prjSon.parse()
tables = prjSon.tables()
for name,table in tables.items():
    # NOTE(review): the output path is an empty f-string with no placeholders,
    # so every table is given the same empty path — presumably it was meant to
    # be derived from `name`; confirm before running.
    mdObj = MarkDown(table,f'')
    mdObj.build()
@@ -0,0 +1,59 @@
from llama_index.readers.file.markdown import MarkdownReader
from typing import Any, Dict, List, Optional, Tuple
import re
from llama_index.core.utils import get_tokenizer
class ChunkMarkdownReader(MarkdownReader):
    """Markdown reader that cuts a document into token-bounded chunks.

    Every chunk is returned as a ``(header, text)`` tuple. ``header`` repeats
    the document's title section and the table header rows, so each chunk of
    table rows carries the context needed to interpret it.
    """

    def __init__(
        self,
        *args: Any,
        chunkSize: int = 2048,
        **kwargs: Any,
    ) -> None:
        """Create the reader.

        Args:
            *args: Forwarded unchanged to ``MarkdownReader``.
            chunkSize: Token budget used to decide where a chunk is cut.
            **kwargs: Forwarded unchanged to ``MarkdownReader``.
        """
        self._chunkSize = chunkSize
        self._tokenizer = get_tokenizer()
        super().__init__(*args, **kwargs)

    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
        """Split Markdown text into ``(header, text)`` chunks.

        The text is processed line by line:

        * Everything up to and including the first blank line that follows
          some content (and precedes any table separator) is the title
          section.
        * The lines collected up to and including a ``|---`` separator row
          are the table header.
        * Remaining lines are accumulated and flushed as one chunk whenever
          adding the next line would push the running token count (title +
          table header + accumulated lines) above ``chunkSize``.

        Returns:
            One tuple per chunk. The header has every ``#`` removed and is
            stripped; the text has ``<...>`` tags removed.

        Raises:
            ValueError: If the token count exceeds ``chunkSize`` at the
                moment the title section is captured.
        """
        chunks: List[Tuple[Optional[str], str]] = []
        title = ''
        table_header = ''
        title_captured = False
        title_tokens = 0
        header_tokens = 0  # tokens repeated in every chunk: title + table header
        running_tokens = 0
        pending: List[str] = []

        for line in markdown_text.split("\n"):
            line_tokens = self._token_size(line)
            running_tokens += line_tokens
            if running_tokens > self._chunkSize and pending:
                chunks.append((title + table_header, "\n".join(pending)))
                # The current line opens the next chunk, so it must be counted
                # together with the repeated header.
                running_tokens = header_tokens + line_tokens
                pending.clear()
            pending.append(line)

            # split("\n") never yields "\n" itself: a blank line arrives as ""
            # (or "\r" for CRLF text), hence the strip() comparison.
            if not title_captured and not table_header and line.strip() == '':
                candidate = "\n".join(pending)
                # Leading blank lines carry no title; keep waiting for content.
                if candidate.strip():
                    if running_tokens > self._chunkSize:
                        raise ValueError('标题Token数大于chunkSize大小')
                    title = candidate
                    title_captured = True
                    title_tokens = self._token_size(title)
                    header_tokens = title_tokens
                    pending.clear()
            elif line.startswith("|---"):
                table_header = "\n".join(pending)
                # Replace, rather than accumulate, the table-header share so a
                # later separator row does not inflate the repeated-header cost.
                header_tokens = title_tokens + self._token_size(table_header)
                pending.clear()

        if pending:
            chunks.append((title + table_header, "\n".join(pending)))

        return [
            (
                key if key is None else re.sub(r"#", "", key).strip(),
                re.sub(r"<.*?>", "", value),
            )
            for key, value in chunks
        ]

    def _token_size(self, text: str) -> int:
        """Return the number of tokens the configured tokenizer yields for *text*."""
        return len(self._tokenizer(text))
+16
View File
@@ -24,13 +24,16 @@ class JsonTable:
self._filePth = filePth
self._fields:Dict[str,Field] = {}
self._records:List[Record] = []
self._fileName = os.path.splitext(os.path.basename(filePth))[0]
self._name = ''
self._comment = ''
def parse(self):
with open(self._filePth, 'r',encoding='utf-8') as file:
jsObj = json.load(file)
data:dict = jsObj.get('table')
self._name = data.get('name')
self._comment = data.get('comment')
Jsfields = data.get('fields')
for jsfiled in Jsfields:
field = Field(jsfiled)
@@ -42,6 +45,16 @@ class JsonTable:
def records(self):
    """Return the list of records held in ``self._records``."""
    return self._records
def fields(self):
    """Return the field dictionary held in ``self._fields``."""
    return self._fields
def name(self):
    """Return the table's file name without directory or extension.

    NOTE(review): this returns ``self._fileName`` (derived from the file
    path), not ``self._name`` (read from the JSON ``name`` key in ``parse``)
    — confirm that the file name is the intended value.
    """
    return self._fileName
def comment(self):
    """Return the table comment: ``''`` until ``parse`` stores the JSON ``comment`` value."""
    return self._comment
class ProjectJson:
def __init__(self,dir:str) -> None:
@@ -59,6 +72,9 @@ class ProjectJson:
def table(self,tableName:str):
    """Return the entry of ``self._tables`` stored under ``tableName``.

    NOTE(review): ``self._tables`` is presumably a dict keyed by table name,
    in which case an unknown name raises ``KeyError`` — confirm against the
    code that fills it.
    """
    return self._tables[tableName]
def tables(self):
    """Return the ``self._tables`` mapping itself (not a copy)."""
    return self._tables
def getProjectName(dir:str):
prjJson = ProjectJson(dir)