调整MarkDown文件读取的格式内容

This commit is contained in:
wanyaokun
2024-09-09 19:10:16 +08:00
parent bc124c5513
commit 54f19a20fc
+8 -2
View File
@@ -27,17 +27,19 @@ class ChunkMarkdownReader(MarkdownReader):
for line in lines: for line in lines:
tokensNum += self._token_size(line) tokensNum += self._token_size(line)
if tokensNum > self._chunkSize and len(current_lines) > 0: if tokensNum > self._chunkSize and len(current_lines) > 0:
if len(markdown_tups) == 0:
markdown_tups.append((strTitle + strheader , "\n".join(current_lines))) markdown_tups.append((strTitle + strheader , "\n".join(current_lines)))
else:
markdown_tups.append((strheader , "\n".join(current_lines)))
tokensNum = headerSize tokensNum = headerSize
current_lines.clear() current_lines.clear()
current_lines.append(line) current_lines.append(line)
if line == '\n' or line == '\r': if line == '\n' or line == '\r':
if tokensNum > self._chunkSize: if tokensNum > self._chunkSize:
raise ValueError('标题Token数大于chunkSize大小') raise ValueError('标题Token数大于chunkSize大小')
strTitle = "\n".join(current_lines) strTitle = "\n".join(current_lines)
headerSize = headerSize + self._token_size(strTitle) #headerSize = headerSize + self._token_size(strTitle)
current_lines.clear() current_lines.clear()
if line.startswith("|---"): if line.startswith("|---"):
@@ -46,7 +48,11 @@ class ChunkMarkdownReader(MarkdownReader):
current_lines.clear() current_lines.clear()
if len(current_lines) > 0: if len(current_lines) > 0:
if len(markdown_tups) == 0:
markdown_tups.append((strTitle + strheader , "\n".join(current_lines))) markdown_tups.append((strTitle + strheader , "\n".join(current_lines)))
else:
markdown_tups.append((strheader , "\n".join(current_lines)))
return [ return [
( (
key if key is None else re.sub(r"#", "", key).strip(), key if key is None else re.sub(r"#", "", key).strip(),