新增MarkDown切片
This commit is contained in:
@@ -0,0 +1,59 @@
|
||||
from llama_index.readers.file.markdown import MarkdownReader
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
import re
|
||||
from llama_index.core.utils import get_tokenizer
|
||||
|
||||
|
||||
class ChunkMarkdownReader(MarkdownReader):
    """Markdown reader that cuts a document into token-bounded chunks.

    Each chunk is emitted as a ``(prefix, body)`` tuple. The prefix is the
    most recently seen "title" (the lines that precede a blank line) followed
    by the most recently seen table header (the lines up to and including a
    ``|---`` separator row). The prefix is repeated for every chunk and its
    token count is charged against the chunk budget.
    """

    def __init__(
        self,
        *args: Any,
        chunkSize: int = 2048,
        **kwargs: Any,
    ) -> None:
        """Create the reader.

        Args:
            *args: Forwarded unchanged to ``MarkdownReader.__init__``.
            chunkSize: Token budget per chunk, as counted by the tokenizer
                returned from ``get_tokenizer()``.
            **kwargs: Forwarded unchanged to ``MarkdownReader.__init__``.
        """
        self._chunkSize = chunkSize
        self._tokenizer = get_tokenizer()
        super().__init__(*args, **kwargs)

    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
        """Split ``markdown_text`` into ``(prefix, body)`` chunks.

        Lines are accumulated until adding the next one would push the
        running token count (prefix tokens + accumulated lines) above
        ``chunkSize``; the accumulated lines are then emitted as one chunk
        and the next chunk starts with the line that did not fit.

        Args:
            markdown_text: Raw markdown content.

        Returns:
            One tuple per chunk. The first element is the title plus table
            header with every ``#`` removed and surrounding whitespace
            stripped (always a ``str``, possibly empty); the second is the
            chunk body with ``<...>`` spans removed.

        Raises:
            ValueError: If the running token count already exceeds
                ``chunkSize`` when a title is being closed.
        """
        chunks: List[Tuple[str, str]] = []
        pending: List[str] = []
        title = ""
        table_header = ""
        title_tokens = 0
        table_header_tokens = 0
        # Invariant: used_tokens == prefix tokens + tokens of lines in `pending`.
        used_tokens = 0

        for line in markdown_text.split("\n"):
            line_tokens = self._token_size(line)
            used_tokens += line_tokens

            if used_tokens > self._chunkSize and pending:
                chunks.append((title + table_header, "\n".join(pending)))
                pending = []
                # The line that overflowed opens the next chunk, so its tokens
                # must be counted together with the repeated prefix.
                used_tokens = title_tokens + table_header_tokens + line_tokens

            pending.append(line)

            # NOTE(review): the text is split on "\n", so a line can never be
            # "\n" itself; only the "\r" left over from a CRLF blank line
            # matches here. Blank lines in LF-only input are "" and are not
            # treated as title terminators. Widening this needs a decision on
            # how blank lines that follow table rows should be handled, so the
            # existing matching is kept as-is.
            if line == "\r":
                if used_tokens > self._chunkSize:
                    # Message: "title token count is larger than chunkSize".
                    raise ValueError('标题Token数大于chunkSize大小')
                title = "\n".join(pending)
                title_tokens = self._token_size(title)
                pending = []
                # Charge only the prefix that is actually in effect now,
                # instead of accumulating the sizes of every earlier title.
                used_tokens = title_tokens + table_header_tokens
            elif line.startswith("|---"):
                # Everything gathered up to the separator row is the table
                # header that gets repeated in front of each chunk of rows.
                table_header = "\n".join(pending)
                table_header_tokens = self._token_size(table_header)
                pending = []
                used_tokens = title_tokens + table_header_tokens

        if pending:
            chunks.append((title + table_header, "\n".join(pending)))

        return [
            (prefix.replace("#", "").strip(), re.sub(r"<.*?>", "", body))
            for prefix, body in chunks
        ]

    def _token_size(self, text: str) -> int:
        """Return the number of tokens the configured tokenizer yields for ``text``."""
        return len(self._tokenizer(text))
|
||||
Reference in New Issue
Block a user