Spaces:
Sleeping
Sleeping
File size: 728 Bytes
7fdb8e9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
from uuid import uuid4
from langchain.text_splitter import MarkdownTextSplitter
from rag_demo.preprocessing.base import Chunk
from rag_demo.preprocessing.base import Document
def chunk_text(
    document: Document, chunk_size: int = 500, chunk_overlap: int = 50
) -> list[Chunk]:
    """Split a document's text into ``Chunk`` objects.

    The text is split with ``MarkdownTextSplitter`` and each resulting piece
    is wrapped in a ``Chunk`` tied back to the source document.

    Args:
        document: Source document; its ``text``, ``document_id`` and
            ``metadata`` attributes are read.
        chunk_size: Passed to ``MarkdownTextSplitter`` as ``chunk_size``.
        chunk_overlap: Passed to ``MarkdownTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        One ``Chunk`` per piece returned by the splitter, in the order the
        splitter produced them.
    """
    splitter = MarkdownTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    # Every chunk gets its own freshly generated uuid4() as chunk_id, while
    # document_id and metadata are passed through from the document as-is
    # (the same metadata object is handed to every chunk, not a copy).
    return [
        Chunk(
            content=piece,
            document_id=document.document_id,
            chunk_id=uuid4(),
            metadata=document.metadata,
        )
        for piece in splitter.split_text(document.text)
    ]
|