File size: 4,557 Bytes
826f9a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pdfplumber
import re
from transformers import AutoTokenizer
from typing import List, Dict
import pandas as pd

# Module-level DistilBERT tokenizer, used below for token-budget accounting
# when sizing chunks (downloads/loads the model vocab on first import).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Extract the section and subsection structure along with the text that will be appended to each chunk.
def extract_text_with_hierarchy(pdf_path: str) -> List[Dict]:
    """Extract text from a PDF, tagging it with section/subsection hierarchy.

    Lines matching literal ``\\section*{...}`` / ``\\subsection*{...}`` markers
    start a new hierarchy entry; every other non-empty line is appended to the
    most recent entry's text.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        A list of dicts with keys ``type`` ('section' | 'subsection' | 'text'),
        ``title`` and ``text``, in document order.
    """
    content: List[Dict] = []
    current_section = ""
    current_subsection = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # extract_text() returns None for image-only or empty pages;
            # the original code crashed on text.split here.
            if not text:
                continue

            for line in text.split('\n'):
                line = line.strip()
                if not line:
                    continue

                # Detect section/subsection headers (literal LaTeX-style markers).
                section_match = re.match(r'\\section\*{(.+?)}', line)
                subsection_match = re.match(r'\\subsection\*{(.+?)}', line)

                if section_match:
                    current_section = section_match.group(1)
                    current_subsection = ""  # a new section resets the subsection
                    content.append({
                        'type': 'section',
                        'title': current_section,
                        'text': ""
                    })
                elif subsection_match:
                    current_subsection = subsection_match.group(1)
                    content.append({
                        'type': 'subsection',
                        'title': current_subsection,
                        'text': ""
                    })
                else:
                    # Body text: append to the last hierarchy entry, or start a
                    # bare 'text' entry when no header has been seen yet.
                    if content:
                        content[-1]['text'] += line + " "
                    else:
                        content.append({
                            'type': 'text',
                            'title': "",
                            'text': line
                        })

    return content

def create_bert_chunks(file_name: str, content: List[Dict], max_tokens=450, overlap=50) -> List[Dict]:
    """Create token-bounded chunks for DistilBERT, keeping hierarchy context.

    Sentences are accumulated until ``max_tokens`` would be exceeded, at which
    point the current chunk is flushed and its last ``overlap`` tokens are
    carried into the next chunk for context continuity. Section/subsection
    entries contribute a ``[SECTION]``/``[SUBSECTION]`` header line to the
    first sentence that follows them.

    Args:
        file_name: Identifier stored on every chunk (here: the source path).
        content: Output of ``extract_text_with_hierarchy``.
        max_tokens: Token budget per chunk (450 leaves headroom under
            DistilBERT's 512 limit for [CLS]/[SEP] at encode time).
        overlap: Number of trailing tokens carried into the next chunk.

    Returns:
        A list of dicts with keys ``text``, ``section``, ``subsection``,
        ``tokens`` and ``file_name``.
    """
    chunks: List[Dict] = []
    current_chunk: List[str] = []
    current_tokens = 0
    current_section = ""
    current_subsection = ""

    for item in content:
        # Build the context header for the first sentence of this item.
        header = ""
        if item['type'] == 'section':
            current_section = item['title']
            current_subsection = ""
            header = f"[SECTION] {current_section}\n"
        elif item['type'] == 'subsection':
            current_subsection = item['title']
            header = f"[SUBSECTION] {current_subsection}\n"

        # Naive sentence split on terminal punctuation followed by whitespace.
        sentences = re.split(r'(?<=[.!?])\s+', item['text'])

        for sentence in sentences:
            full_text = header + sentence if header else sentence
            # Count raw tokens only: encode() would otherwise prepend [CLS]
            # and append [SEP] to EVERY sentence, inflating the budget by 2
            # per sentence and making 'tokens' meaningless for the chunk.
            tokens = tokenizer.encode(full_text, add_special_tokens=False)

            # Flush the current chunk before it would overflow the budget.
            # (A single over-long sentence still becomes its own chunk.)
            if current_tokens + len(tokens) > max_tokens and current_chunk:
                chunk_text = "\n".join(current_chunk)
                chunks.append({
                    'text': chunk_text,
                    'section': current_section,
                    'subsection': current_subsection,
                    'tokens': current_tokens,
                    'file_name': file_name
                })
                # Carry the last `overlap` tokens into the next chunk.
                # skip_special_tokens prevents literal '[SEP]'/'[CLS]' strings
                # from leaking into the next chunk's text on decode.
                overlap_tokens = tokenizer.encode(
                    chunk_text, add_special_tokens=False
                )[-overlap:]
                current_chunk = [tokenizer.decode(overlap_tokens, skip_special_tokens=True)]
                current_tokens = len(overlap_tokens)
                header = ""  # Reset header after overlap

            current_chunk.append(full_text)
            current_tokens += len(tokens)
            header = ""  # Header is consumed by the first sentence only

    # Flush whatever remains.
    if current_chunk:
        chunk_text = "\n".join(current_chunk)
        chunks.append({
            'text': chunk_text,
            'section': current_section,
            'subsection': current_subsection,
            'tokens': current_tokens,
            'file_name': file_name
        })

    return chunks

def process_pdf(pdf_path: str, max_tokens: int = 450, overlap: int = 50) -> List[Dict]:
    """Process a PDF into DistilBERT-optimized chunks.

    Convenience pipeline: extract hierarchy-tagged text, then chunk it. The
    chunking knobs are exposed here (with the same defaults as
    ``create_bert_chunks``) so callers can tune them without changing the
    lower-level functions.

    Args:
        pdf_path: Path to the PDF; also recorded as each chunk's file_name.
        max_tokens: Token budget per chunk.
        overlap: Number of trailing tokens carried between chunks.

    Returns:
        The list of chunk dicts produced by ``create_bert_chunks``.
    """
    structured_content = extract_text_with_hierarchy(pdf_path)
    return create_bert_chunks(pdf_path, structured_content,
                              max_tokens=max_tokens, overlap=overlap)