import pdfplumber
import re
from transformers import AutoTokenizer
from typing import List, Dict
import pandas as pd

# Tokenizer is used only to count tokens when sizing chunks for DistilBERT.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Extract section/subsection headings along with their body text, so each
# chunk can later carry its hierarchy as context.
def extract_text_with_hierarchy(pdf_path: str) -> List[Dict]:
    """Extract text with section/subsection hierarchy."""
    content = []
    current_section = ""
    current_subsection = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:  # extract_text() returns None for image-only pages
                continue
            lines = text.split('\n')
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                # Detect headings written as literal LaTeX markers; this assumes
                # the source PDFs keep \section*{...} markup in their text layer
                section_match = re.match(r'\\section\*{(.+?)}', line)
                subsection_match = re.match(r'\\subsection\*{(.+?)}', line)
                if section_match:
                    current_section = section_match.group(1)
                    current_subsection = ""
                    content.append({
                        'type': 'section',
                        'title': current_section,
                        'text': ""
                    })
                elif subsection_match:
                    current_subsection = subsection_match.group(1)
                    content.append({
                        'type': 'subsection',
                        'title': current_subsection,
                        'text': ""
                    })
                else:
                    if content:
                        # Body text accumulates under the most recent heading
                        content[-1]['text'] += line + " "
                    else:
                        # Text before any heading becomes a plain text entry
                        content.append({
                            'type': 'text',
                            'title': "",
                            'text': line
                        })
    return content
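
# Illustrative shape of one returned entry (values are examples only; actual
# titles and text depend on the input PDF):
#   {'type': 'section', 'title': 'Introduction', 'text': 'First paragraph ... '}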
def create_bert_chunks(file_name: str, content: List[Dict], max_tokens=450, overlap=50) -> List[Dict]:
    """Create chunks optimized for DistilBERT with hierarchy context."""
    chunks = []
    current_chunk = []
    current_tokens = 0
    current_section = ""
    current_subsection = ""
    for item in content:
        # Build a context header so each chunk records where it came from
        header = ""
        if item['type'] == 'section':
            current_section = item['title']
            current_subsection = ""
            header = f"[SECTION] {current_section}\n"
        elif item['type'] == 'subsection':
            current_subsection = item['title']
            header = f"[SUBSECTION] {current_subsection}\n"
        # Split text into sentences on terminal punctuation
        sentences = re.split(r'(?<=[.!?])\s+', item['text'])
        for sentence in sentences:
            full_text = header + sentence if header else sentence
            # Count content tokens only; [CLS]/[SEP] are added later at encode time
            tokens = tokenizer.encode(full_text, add_special_tokens=False)
            if current_tokens + len(tokens) > max_tokens:
                if current_chunk:
                    chunk_text = "\n".join(current_chunk)
                    chunks.append({
                        'text': chunk_text,
                        'section': current_section,
                        'subsection': current_subsection,
                        'tokens': current_tokens,
                        'file_name': file_name
                    })
                    # Carry the last `overlap` tokens into the next chunk for continuity
                    overlap_tokens = tokenizer.encode(chunk_text, add_special_tokens=False)[-overlap:]
                    current_chunk = [tokenizer.decode(overlap_tokens)]
                    current_tokens = len(overlap_tokens)
                    header = ""  # Reset header after overlap
            current_chunk.append(full_text)
            current_tokens += len(tokens)
            header = ""  # Clear header after first use
    # Flush any remaining content as the final chunk
    if current_chunk:
        chunk_text = "\n".join(current_chunk)
        chunks.append({
            'text': chunk_text,
            'section': current_section,
            'subsection': current_subsection,
            'tokens': current_tokens,
            'file_name': file_name
        })
    return chunks
def process_pdf(pdf_path: str) -> List[Dict]:
    """Process PDF into BERT-optimized chunks."""
    structured_content = extract_text_with_hierarchy(pdf_path)
    return create_bert_chunks(pdf_path, structured_content)
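
# Example usage: a minimal sketch, assuming a local file "sample.pdf" (a
# placeholder path, not part of the original code). pandas is used here to
# inspect the resulting chunks as a DataFrame.
if __name__ == "__main__":
    chunks = process_pdf("sample.pdf")
    df = pd.DataFrame(chunks)
    print(f"Created {len(df)} chunks")
    print(df[['section', 'subsection', 'tokens', 'file_name']].head())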