import re
from typing import Dict, List

import pdfplumber
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
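# distilbert-base-uncased accepts at most 512 tokens per sequence; the chunker
# below targets 450 tokens per chunk, leaving headroom for the [CLS]/[SEP]
# special tokens the tokenizer adds at encode time.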
# Extract section/subsection headers along with their text so the hierarchy
# can be prepended to each chunk later.
def extract_text_with_hierarchy(pdf_path: str) -> List[Dict]:
    """Extract text with section/subsection hierarchy."""
    content = []
    current_section = ""
    current_subsection = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:  # extract_text() returns None for image-only pages
                continue
            for line in text.split('\n'):
                line = line.strip()
                if not line:
                    continue

                # Detect LaTeX-style section headers left in the extracted text
                section_match = re.match(r'\\section\*\{(.+?)\}', line)
                subsection_match = re.match(r'\\subsection\*\{(.+?)\}', line)

                if section_match:
                    current_section = section_match.group(1)
                    current_subsection = ""
                    content.append({
                        'type': 'section',
                        'title': current_section,
                        'text': ""
                    })
                elif subsection_match:
                    current_subsection = subsection_match.group(1)
                    content.append({
                        'type': 'subsection',
                        'title': current_subsection,
                        'text': ""
                    })
                elif content:
                    # Body text: append to the most recent section/subsection
                    content[-1]['text'] += line + " "
                else:
                    # Text that appears before any header
                    content.append({
                        'type': 'text',
                        'title': "",
                        'text': line
                    })
    return content
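# Illustrative shape of the returned list (the titles and text here are
# made-up placeholders, not output from a real document):
#
#   [{'type': 'section',    'title': 'Introduction', 'text': 'Some intro text. '},
#    {'type': 'subsection', 'title': 'Background',   'text': 'More detail. '}]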
def create_bert_chunks(file_name: str, content: List[Dict],
                       max_tokens: int = 450, overlap: int = 50) -> List[Dict]:
    """Create chunks optimized for DistilBERT, with hierarchy context."""
    chunks = []
    current_chunk = []
    current_tokens = 0
    current_section = ""
    current_subsection = ""

    for item in content:
        # Build a context header so each chunk carries its place in the document
        header = ""
        if item['type'] == 'section':
            current_section = item['title']
            current_subsection = ""
            header = f"[SECTION] {current_section}\n"
        elif item['type'] == 'subsection':
            current_subsection = item['title']
            header = f"[SUBSECTION] {current_subsection}\n"

        # Split text into sentences on end-of-sentence punctuation
        sentences = re.split(r'(?<=[.!?])\s+', item['text'])
        for sentence in sentences:
            full_text = header + sentence if header else sentence
            # Count raw tokens only; [CLS]/[SEP] would inflate the budget and
            # leak into the decoded overlap text below
            tokens = tokenizer.encode(full_text, add_special_tokens=False)

            if current_tokens + len(tokens) > max_tokens:
                if current_chunk:
                    chunk_text = "\n".join(current_chunk)
                    chunks.append({
                        'text': chunk_text,
                        'section': current_section,
                        'subsection': current_subsection,
                        'tokens': current_tokens,
                        'file_name': file_name
                    })
                    # Carry the last `overlap` tokens into the next chunk
                    overlap_tokens = tokenizer.encode(
                        chunk_text, add_special_tokens=False)[-overlap:]
                    current_chunk = [tokenizer.decode(overlap_tokens)]
                    current_tokens = len(overlap_tokens)
                    header = ""  # Reset header after overlap

            current_chunk.append(full_text)
            current_tokens += len(tokens)
            header = ""  # Clear header after the first sentence of the item

    # Flush any remaining content as a final chunk
    if current_chunk:
        chunk_text = "\n".join(current_chunk)
        chunks.append({
            'text': chunk_text,
            'section': current_section,
            'subsection': current_subsection,
            'tokens': current_tokens,
            'file_name': file_name
        })
    return chunks
def process_pdf(pdf_path: str) -> List[Dict]:
    """Process a PDF into BERT-optimized chunks."""
    structured_content = extract_text_with_hierarchy(pdf_path)
    return create_bert_chunks(pdf_path, structured_content)
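# Minimal usage sketch. "paper.pdf" is a hypothetical path; point it at any
# text-based PDF whose headers survive extraction as \section*{...} markup.
if __name__ == "__main__":
    chunks = process_pdf("paper.pdf")
    for chunk in chunks[:3]:
        print(f"[{chunk['section']} / {chunk['subsection']}] "
              f"{chunk['tokens']} tokens")
        print(chunk['text'][:200], "...")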