新增MarkDown切片
This commit is contained in:
@@ -39,7 +39,7 @@ def run_pipeline(docstore, vector_store, documents):
|
||||
#chunk_size=Settings.chunk_size,
|
||||
#chunk_overlap=Settings.chunk_overlap,
|
||||
#),
|
||||
MarkdownNodeParser(),
|
||||
#MarkdownNodeParser(),
|
||||
Settings.embed_model,
|
||||
],
|
||||
docstore=docstore,
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
from app.engine.loaders.projectJson import *
|
||||
|
||||
class MarkDown:
    """Render one table as a Markdown document and write it to a file.

    The generated text has a title section (table name, optional table
    comment, and one bullet per exported column with its optional comment),
    a blank line, and then a pipe-delimited Markdown table with one row per
    record.
    """

    def __init__(self, table: "JsonTable", path: str) -> None:
        """Store the table to export and the output file path.

        Args:
            table: Object providing ``fields()``, ``records()``, ``name()``
                and ``comment()``.
            path: Path of the Markdown file that ``build`` writes.
        """
        self._table = table
        self._path = path

    def build(self):
        """Collect columns and rows from the table and write the Markdown file.

        Fields keyed ``_id``, ``nodeType`` or ``relTbId`` are not exported.
        Newlines inside a cell are replaced by a space so that every record
        stays on a single table row.

        Raises:
            OSError: If the output file cannot be opened for writing.
        """
        flds = self._table.fields()
        records = self._table.records()
        columns: list = []
        colComments: list = []
        ignores = []
        for name, fld in flds.items():
            if name in ('_id', 'nodeType', 'relTbId'):
                ignores.append(name)
                continue

            columns.append(fld.value('name'))
            colComments.append(fld.value('alias'))

        rowdatas = []
        for record in records:
            datas = []
            for col in columns:
                if col in ignores:
                    continue
                txt = record.value(col)
                # NOTE(review): Record.value is not visible here; a missing
                # or non-string value would crash on .replace, so coerce it.
                cell = '' if txt is None else str(txt)
                datas.append(cell.replace('\n', " "))
            rowdatas.append(datas)

        content = self.convert(
            self._table.name(),
            self._table.comment(),
            columns,
            colComments,
            rowdatas,
        )
        with open(self._path, 'w', encoding='utf-8') as file:
            file.write(content)

    def convert(self, tableName: str, tableComment: str, columns: list, colComments: list, rowdatas: list):
        """Build the Markdown text for a table.

        Args:
            tableName: Text placed in the top-level heading.
            tableComment: Table remark; the remark line is omitted when this
                is empty or ``None``.
            columns: Column names, used for the bullet list and table header.
            colComments: Per-column remarks, parallel to ``columns``; empty
                or ``None`` entries produce no remark line.
            rowdatas: One list of cell strings per record.

        Returns:
            The title section, a blank line, and the Markdown table, ending
            with a newline.
        """
        title_lines = ["# " + tableName]
        # The remark line depends on the comment itself, not on the name.
        if tableComment:
            title_lines.append(f"备注:{tableComment}")

        for column, comment in zip(columns, colComments):
            title_lines.append(f"- 字段名称:{column}")
            if comment:
                title_lines.append(f" - 备注:{comment}")

        # Header row, separator row, then one row per record.
        table_lines = [
            "|" + "|".join(columns) + "|",
            "|" + "|".join('---' for _ in columns) + "|",
        ]
        table_lines.extend("|" + "|".join(row) + "|" for row in rowdatas)

        return "\n".join(title_lines) + "\n\n" + "\n".join(table_lines) + "\n"
|
||||
|
||||
|
||||
# Export script: parse the project and write one Markdown file per table.
# NOTE(review): these statements run whenever this module is imported, and
# both the project directory and the output path are empty strings here;
# confirm the intended paths and consider an `if __name__ == "__main__":` guard.
prjSon = ProjectJson('')
prjSon.parse()
tables = prjSon.tables()
for name,table in tables.items():
    # Every table is written to the same (empty) path as the code stands.
    mdObj = MarkDown(table,f'')
    mdObj.build()
|
||||
@@ -0,0 +1,59 @@
|
||||
from llama_index.readers.file.markdown import MarkdownReader
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
import re
|
||||
from llama_index.core.utils import get_tokenizer
|
||||
|
||||
|
||||
class ChunkMarkdownReader(MarkdownReader):
    """Markdown reader that splits a table document into token-bounded chunks.

    Lines are accumulated until their token count would exceed ``chunkSize``;
    each chunk is emitted together with the title / table-header text seen so
    far, so every chunk can be paired with its header.
    """

    def __init__(
        self,
        *args: Any,
        chunkSize: int = 2048,
        **kwargs: Any,
    ) -> None:
        """Create the reader.

        Args:
            *args: Forwarded unchanged to ``MarkdownReader.__init__``.
            chunkSize: Token budget per chunk (header included).
            **kwargs: Forwarded unchanged to ``MarkdownReader.__init__``.
        """
        self._chunkSize = chunkSize
        self._tokenizer = get_tokenizer()
        super().__init__(*args, **kwargs)

    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
        """Split Markdown text into ``(header, body)`` tuples.

        The header is the concatenation of the captured title and the table
        header (everything up to and including the ``|---`` separator line);
        the body is a run of following lines within the token budget.

        Args:
            markdown_text: Full Markdown document text.

        Returns:
            A list of ``(header, body)`` tuples. ``#`` characters are removed
            from the header and ``<...>`` tags are removed from the body.

        Raises:
            ValueError: If the title alone exceeds ``chunkSize`` tokens.
        """
        markdown_tups: List[Tuple[Optional[str], str]] = []
        lines = markdown_text.split("\n")

        strTitle = ''
        tokensNum: int = 0
        current_lines: List[str] = []
        strheader: str = ''
        headerSize: int = 0
        for line in lines:
            line_tokens = self._token_size(line)
            tokensNum += line_tokens
            if tokensNum > self._chunkSize and current_lines:
                # Budget exceeded: flush what was collected before this line.
                markdown_tups.append((strTitle + strheader, "\n".join(current_lines)))
                # The new chunk starts with the header plus the current line,
                # so both must be counted; otherwise chunks can overshoot.
                tokensNum = headerSize + line_tokens
                current_lines.clear()

            current_lines.append(line)

            # NOTE(review): after split("\n") a line can never equal '\n';
            # only '\r' (a blank line in CRLF text) can match. For LF text the
            # title is therefore captured as part of the table header below.
            if line == '\n' or line == '\r':
                if tokensNum > self._chunkSize:
                    raise ValueError('标题Token数大于chunkSize大小')
                strTitle = "\n".join(current_lines)
                headerSize = headerSize + self._token_size(strTitle)
                current_lines.clear()

            if line.startswith("|---"):
                # Separator row: everything collected so far is the header.
                strheader = "\n".join(current_lines)
                headerSize = headerSize + self._token_size(strheader)
                current_lines.clear()

        if current_lines:
            markdown_tups.append((strTitle + strheader, "\n".join(current_lines)))

        return [
            (
                key if key is None else re.sub(r"#", "", key).strip(),
                re.sub(r"<.*?>", "", value),
            )
            for key, value in markdown_tups
        ]

    def _token_size(self, text: str) -> int:
        """Return the number of tokens the configured tokenizer yields for ``text``."""
        return len(self._tokenizer(text))
|
||||
@@ -24,13 +24,16 @@ class JsonTable:
|
||||
self._filePth = filePth
|
||||
self._fields:Dict[str,Field] = {}
|
||||
self._records:List[Record] = []
|
||||
self._fileName = os.path.splitext(os.path.basename(filePth))[0]
|
||||
self._name = ''
|
||||
self._comment = ''
|
||||
|
||||
def parse(self):
|
||||
with open(self._filePth, 'r',encoding='utf-8') as file:
|
||||
jsObj = json.load(file)
|
||||
data:dict = jsObj.get('table')
|
||||
self._name = data.get('name')
|
||||
self._comment = data.get('comment')
|
||||
Jsfields = data.get('fields')
|
||||
for jsfiled in Jsfields:
|
||||
field = Field(jsfiled)
|
||||
@@ -42,6 +45,16 @@ class JsonTable:
|
||||
|
||||
def records(self):
    """Return the record list stored on this table (the same object, not a copy)."""
    return self._records
|
||||
|
||||
def fields(self):
    """Return the field mapping stored on this table (the same object, not a copy)."""
    return self._fields
|
||||
|
||||
def name(self):
    """Return the table name, taken from the source file's base name without extension."""
    # NOTE(review): this returns ``_fileName``, not the ``_name`` value read
    # from the JSON in ``parse`` -- confirm that this is intended.
    return self._fileName
|
||||
|
||||
def comment(self):
    """Return the table comment: ``''`` until ``parse`` sets it from the JSON ``comment`` key."""
    return self._comment
|
||||
|
||||
|
||||
class ProjectJson:
|
||||
def __init__(self,dir:str) -> None:
|
||||
@@ -59,6 +72,9 @@ class ProjectJson:
|
||||
|
||||
def table(self, tableName: str):
    """Look up a single table by name in ``self._tables``.

    A missing name propagates whatever error the container raises
    (``KeyError`` if ``_tables`` is a dict -- presumably; its construction
    is not visible here).
    """
    return self._tables[tableName]
|
||||
|
||||
def tables(self):
    """Return the container of all tables stored on this project (the same object, not a copy)."""
    return self._tables
|
||||
|
||||
def getProjectName(dir:str):
|
||||
prjJson = ProjectJson(dir)
|
||||
|
||||
Reference in New Issue
Block a user