新增属性图谱
This commit is contained in:
@@ -19,32 +19,38 @@ class ChunkMarkdownReader(MarkdownReader):
|
||||
|
||||
def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
|
||||
markdown_tups: List[Tuple[Optional[str], str]] = []
|
||||
lines = markdown_text.split("\n")
|
||||
lines = self._multi_char_split(markdown_text,'\r\n')
|
||||
lines = [line for line in lines if line!='']
|
||||
|
||||
strTitle = ''
|
||||
tokensNum:int = 0
|
||||
current_lines = []
|
||||
strheader:str = ''
|
||||
headerSize:int = 0
|
||||
bAreadyJudgeTitle = False
|
||||
for line in lines:
|
||||
tokensNum += self._token_size(line)
|
||||
if tokensNum > self._chunkSize and len(current_lines) > 0:
|
||||
if len(markdown_tups) == 0:
|
||||
markdown_tups.append((strTitle + strheader , "\n".join(current_lines)))
|
||||
titleHead = strTitle + '\n' + strheader if strTitle!= '' else strheader
|
||||
markdown_tups.append((titleHead, "\n".join(current_lines)))
|
||||
else:
|
||||
markdown_tups.append((strheader , "\n".join(current_lines)))
|
||||
tokensNum = headerSize
|
||||
current_lines.clear()
|
||||
current_lines.append(line)
|
||||
if strTitle!='' and strheader!='':
|
||||
|
||||
if strheader!='':
|
||||
self._rows.append(line)
|
||||
|
||||
if line == '\n' or line == '\r':
|
||||
if tokensNum > self._chunkSize:
|
||||
raise ValueError('标题Token数大于chunkSize大小')
|
||||
strTitle = "\n".join(current_lines)
|
||||
#headerSize = headerSize + self._token_size(strTitle)
|
||||
current_lines.clear()
|
||||
if line.startswith('|') and strTitle == '' and not bAreadyJudgeTitle:
|
||||
if len(current_lines) > 0:
|
||||
if tokensNum > self._chunkSize:
|
||||
raise ValueError('标题Token数大于chunkSize大小')
|
||||
strTitle = "\n".join(current_lines)
|
||||
current_lines.clear()
|
||||
bAreadyJudgeTitle = True
|
||||
|
||||
current_lines.append(line)
|
||||
|
||||
if line.startswith("|---"):
|
||||
self._colheader = current_lines[0]
|
||||
@@ -55,10 +61,11 @@ class ChunkMarkdownReader(MarkdownReader):
|
||||
|
||||
if len(current_lines) > 0:
|
||||
if len(markdown_tups) == 0:
|
||||
markdown_tups.append((strTitle + strheader , "\n".join(current_lines)))
|
||||
titleHead = strTitle + '\n' + strheader if strTitle!= '' else strheader
|
||||
markdown_tups.append((titleHead, "\n".join(current_lines)))
|
||||
else:
|
||||
markdown_tups.append((strheader , "\n".join(current_lines)))
|
||||
|
||||
|
||||
return [
|
||||
(
|
||||
key if key is None else re.sub(r"#", "", key).strip(),
|
||||
@@ -86,4 +93,24 @@ class ChunkMarkdownReader(MarkdownReader):
|
||||
return gData[Field]
|
||||
return ''
|
||||
|
||||
def records(self):
    """Convert the collected table rows into a list of dicts.

    Reads ``self._colheader`` (a pipe-delimited header line) and
    ``self._rows`` (pipe-delimited body lines). Each row becomes one
    dict mapping column name to cell text; cell text is kept verbatim
    (no whitespace stripping).

    Returns:
        A list with one ``{column_name: cell_value}`` dict per row that
        has at least one cell. Rows that yield no cells are skipped. If a
        row has more or fewer cells than the header, the extra items are
        ignored (``zip`` stops at the shorter sequence).
    """
    # '|a|b|'.split('|') -> ['', 'a', 'b', '']: the first and last
    # fragments are the text outside the outer pipes, so drop them.
    cols = self._colheader.split('|')[1:-1]

    records = []
    for row in self._rows:
        # The previous code filtered cells with
        # `item != '\r' or item != '\n'`, which is always true and so
        # filtered nothing. It is removed rather than "fixed": dropping
        # fragments before the positional [1:-1] slice would shift the
        # cells and lose a real column.
        cells = row.split('|')[1:-1]
        if not cells:
            continue
        records.append(dict(zip(cols, cells)))
    return records
|
||||
|
||||
def _multi_char_split(self, string, separators):
    """Split ``string`` on every occurrence of any character in ``separators``.

    Args:
        string: The text to split.
        separators: A string whose individual characters are each treated
            as a delimiter (e.g. ``'\\r\\n'`` splits on CR and on LF).

    Returns:
        The list of fragments. Adjacent delimiters produce empty strings,
        exactly as ``re.split`` does. With no separators the input is
        returned unsplit as a one-element list.
    """
    # An empty character class ('[]') is not a valid regex and would
    # raise re.error, so treat "no separators" as "nothing to split on".
    if not separators:
        return [string]
    # Combine the separator characters into a single regex character class.
    pattern = '[' + re.escape(separators) + ']'
    # Split using that character class.
    return re.split(pattern, string)
|
||||
Reference in New Issue
Block a user