File size: 728 Bytes
7fdb8e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from uuid import uuid4

from langchain.text_splitter import MarkdownTextSplitter
from rag_demo.preprocessing.base import Chunk
from rag_demo.preprocessing.base import Document


def chunk_text(
    document: Document, chunk_size: int = 500, chunk_overlap: int = 50
) -> list[Chunk]:
    """Split a document's text into ``Chunk`` objects.

    The text is split with a ``MarkdownTextSplitter`` configured with the
    given ``chunk_size`` and ``chunk_overlap``. One ``Chunk`` is built per
    piece the splitter returns, in the same order.

    Args:
        document: Source document; its ``text``, ``document_id`` and
            ``metadata`` attributes are read.
        chunk_size: Forwarded unchanged to ``MarkdownTextSplitter``.
        chunk_overlap: Forwarded unchanged to ``MarkdownTextSplitter``.

    Returns:
        A list of chunks. Every chunk carries the parent document's
        ``document_id`` and a freshly generated ``uuid4`` as ``chunk_id``.
    """
    splitter = MarkdownTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    pieces = splitter.split_text(document.text)
    return [
        Chunk(
            content=piece,
            document_id=document.document_id,
            chunk_id=uuid4(),
            # The document's metadata object is handed to each chunk as-is;
            # no copy is made at this call site.
            metadata=document.metadata,
        )
        for piece in pieces
    ]