File size: 953 Bytes
c55e75f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def chunk_document_to_dict(doc: str, doc_name: str, desired_chunk_size: int = 400, max_chunk_size: int = 500) -> dict:
    """Split a document into line-accumulated text chunks.

    Lines of ``doc`` (as produced by ``str.splitlines``) are appended to a
    buffer, each followed by a newline character. Once the buffer holds at
    least ``desired_chunk_size`` characters it is emitted as a chunk. No
    emitted chunk is longer than ``max_chunk_size`` characters: text beyond
    that limit is carried over into the next chunk instead of being dropped,
    so concatenating the returned values in order reproduces every line of
    the document.

    Args:
        doc: Full text of the document.
        doc_name: Prefix used to build chunk ids.
        desired_chunk_size: Buffer length (in characters) at which a chunk
            is emitted.
        max_chunk_size: Hard upper bound on the length of any chunk.

    Returns:
        A dict mapping ``f"{doc_name}_{n}"`` (``n`` starting at 1) to the
        chunk text, in document order. An empty ``doc`` yields an empty dict.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        # A non-positive limit cannot hold any text and would never let the
        # buffer drain.
        raise ValueError("max_chunk_size must be a positive integer")

    chunks = {}
    buffer = ''
    chunk_number = 1

    for line in doc.splitlines():
        buffer += line + '\n'
        # Emit while the buffer is large enough. A single long line may need
        # several passes. The `buffer` truthiness check stops the loop when
        # desired_chunk_size <= 0 and the buffer has been fully drained.
        while buffer and len(buffer) >= desired_chunk_size:
            chunks[f"{doc_name}_{chunk_number}"] = buffer[:max_chunk_size]
            # Keep the overflow for the next chunk rather than discarding it.
            buffer = buffer[max_chunk_size:]
            chunk_number += 1

    # Put the remainder into the last chunk(s), still honouring the hard limit
    # (it can only exceed it when desired_chunk_size > max_chunk_size).
    while buffer:
        chunks[f"{doc_name}_{chunk_number}"] = buffer[:max_chunk_size]
        buffer = buffer[max_chunk_size:]
        chunk_number += 1

    return chunks


def chunk_documents_to_dict(docs: dict, desired_chunk_size: int = 400, max_chunk_size: int = 500):
    """Chunk every document in ``docs`` and merge the results into one dict.

    Args:
        docs: Mapping of document name to document text.
        desired_chunk_size: Forwarded unchanged to ``chunk_document_to_dict``.
        max_chunk_size: Forwarded unchanged to ``chunk_document_to_dict``.

    Returns:
        A single dict holding the chunk-id -> chunk-text entries produced by
        ``chunk_document_to_dict`` for each document, in the iteration order
        of ``docs``. If two documents yield the same chunk id, the value from
        the later document wins.
    """
    return {
        chunk_id: chunk_text
        for name, text in docs.items()
        for chunk_id, chunk_text in chunk_document_to_dict(
            text, name, desired_chunk_size, max_chunk_size
        ).items()
    }