File size: 4,557 Bytes
826f9a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pdfplumber
import re
from transformers import AutoTokenizer
from typing import List, Dict
import pandas as pd

# Module-level DistilBERT tokenizer, used below for token-budget accounting
# when sizing chunks (downloads/loads the model vocab on first import).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Extract the section and subsection structure along with the text that will be appended to each chunk.
def extract_text_with_hierarchy(pdf_path: str) -> List[Dict]:
    """Extract text from a PDF, tagging it with section/subsection hierarchy.

    Lines matching literal ``\\section*{...}`` / ``\\subsection*{...}`` markers
    start a new hierarchy entry; every other non-empty line is appended to the
    most recent entry's text.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        A list of dicts with keys ``type`` ('section' | 'subsection' | 'text'),
        ``title`` and ``text``, in document order.
    """
    content: List[Dict] = []
    current_section = ""
    current_subsection = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # extract_text() returns None for image-only or empty pages;
            # the original code crashed on text.split here.
            if not text:
                continue

            for line in text.split('\n'):
                line = line.strip()
                if not line:
                    continue

                # Detect section/subsection headers (literal LaTeX-style markers).
                section_match = re.match(r'\\section\*{(.+?)}', line)
                subsection_match = re.match(r'\\subsection\*{(.+?)}', line)

                if section_match:
                    current_section = section_match.group(1)
                    current_subsection = ""  # a new section resets the subsection
                    content.append({
                        'type': 'section',
                        'title': current_section,
                        'text': ""
                    })
                elif subsection_match:
                    current_subsection = subsection_match.group(1)
                    content.append({
                        'type': 'subsection',
                        'title': current_subsection,
                        'text': ""
                    })
                else:
                    # Body text: append to the last hierarchy entry, or start a
                    # bare 'text' entry when no header has been seen yet.
                    if content:
                        content[-1]['text'] += line + " "
                    else:
                        content.append({
                            'type': 'text',
                            'title': "",
                            'text': line
                        })

    return content

def create_bert_chunks(file_name: str, content: List[Dict], max_tokens=450, overlap=50) -> List[Dict]:
    """Create token-bounded chunks for DistilBERT, keeping hierarchy context.

    Sentences are accumulated until ``max_tokens`` would be exceeded, at which
    point the current chunk is flushed and its last ``overlap`` tokens are
    carried into the next chunk for context continuity. Section/subsection
    entries contribute a ``[SECTION]``/``[SUBSECTION]`` header line to the
    first sentence that follows them.

    Args:
        file_name: Identifier stored on every chunk (here: the source path).
        content: Output of ``extract_text_with_hierarchy``.
        max_tokens: Token budget per chunk (450 leaves headroom under
            DistilBERT's 512 limit for [CLS]/[SEP] at encode time).
        overlap: Number of trailing tokens carried into the next chunk.

    Returns:
        A list of dicts with keys ``text``, ``section``, ``subsection``,
        ``tokens`` and ``file_name``.
    """
    chunks: List[Dict] = []
    current_chunk: List[str] = []
    current_tokens = 0
    current_section = ""
    current_subsection = ""

    for item in content:
        # Build the context header for the first sentence of this item.
        header = ""
        if item['type'] == 'section':
            current_section = item['title']
            current_subsection = ""
            header = f"[SECTION] {current_section}\n"
        elif item['type'] == 'subsection':
            current_subsection = item['title']
            header = f"[SUBSECTION] {current_subsection}\n"

        # Naive sentence split on terminal punctuation followed by whitespace.
        sentences = re.split(r'(?<=[.!?])\s+', item['text'])

        for sentence in sentences:
            full_text = header + sentence if header else sentence
            # Count raw tokens only: encode() would otherwise prepend [CLS]
            # and append [SEP] to EVERY sentence, inflating the budget by 2
            # per sentence and making 'tokens' meaningless for the chunk.
            tokens = tokenizer.encode(full_text, add_special_tokens=False)

            # Flush the current chunk before it would overflow the budget.
            # (A single over-long sentence still becomes its own chunk.)
            if current_tokens + len(tokens) > max_tokens and current_chunk:
                chunk_text = "\n".join(current_chunk)
                chunks.append({
                    'text': chunk_text,
                    'section': current_section,
                    'subsection': current_subsection,
                    'tokens': current_tokens,
                    'file_name': file_name
                })
                # Carry the last `overlap` tokens into the next chunk.
                # skip_special_tokens prevents literal '[SEP]'/'[CLS]' strings
                # from leaking into the next chunk's text on decode.
                overlap_tokens = tokenizer.encode(
                    chunk_text, add_special_tokens=False
                )[-overlap:]
                current_chunk = [tokenizer.decode(overlap_tokens, skip_special_tokens=True)]
                current_tokens = len(overlap_tokens)
                header = ""  # Reset header after overlap

            current_chunk.append(full_text)
            current_tokens += len(tokens)
            header = ""  # Header is consumed by the first sentence only

    # Flush whatever remains.
    if current_chunk:
        chunk_text = "\n".join(current_chunk)
        chunks.append({
            'text': chunk_text,
            'section': current_section,
            'subsection': current_subsection,
            'tokens': current_tokens,
            'file_name': file_name
        })

    return chunks

def process_pdf(pdf_path: str, max_tokens: int = 450, overlap: int = 50) -> List[Dict]:
    """Process a PDF into DistilBERT-optimized chunks.

    Convenience pipeline: extract hierarchy-tagged text, then chunk it. The
    chunking knobs are exposed here (with the same defaults as
    ``create_bert_chunks``) so callers can tune them without changing the
    lower-level functions.

    Args:
        pdf_path: Path to the PDF; also recorded as each chunk's file_name.
        max_tokens: Token budget per chunk.
        overlap: Number of trailing tokens carried between chunks.

    Returns:
        The list of chunk dicts produced by ``create_bert_chunks``.
    """
    structured_content = extract_text_with_hierarchy(pdf_path)
    return create_bert_chunks(pdf_path, structured_content,
                              max_tokens=max_tokens, overlap=overlap)