import re
from typing import Dict, List

import pandas as pd
import pdfplumber
from transformers import AutoTokenizer

# DistilBERT tokenizer used for token-aware chunking
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Extract the section/subsection hierarchy along with the body text that
# will later be attached to each chunk.
def extract_text_with_hierarchy(pdf_path: str) -> List[Dict]:
    """Extract text with section/subsection hierarchy."""
    content = []
    current_section = ""
    current_subsection = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() can return None for image-only pages
            text = page.extract_text() or ""
            for line in text.split('\n'):
                line = line.strip()
                if not line:
                    continue
                # Detect LaTeX-style section headers embedded in the text
                section_match = re.match(r'\\section\*\{(.+?)\}', line)
                subsection_match = re.match(r'\\subsection\*\{(.+?)\}', line)
                if section_match:
                    current_section = section_match.group(1)
                    current_subsection = ""
                    content.append({
                        'type': 'section',
                        'title': current_section,
                        'text': ""
                    })
                elif subsection_match:
                    current_subsection = subsection_match.group(1)
                    content.append({
                        'type': 'subsection',
                        'title': current_subsection,
                        'text': ""
                    })
                elif content:
                    # Append body text to the most recent section/subsection
                    content[-1]['text'] += line + " "
                else:
                    # Text that appears before the first header
                    content.append({
                        'type': 'text',
                        'title': "",
                        'text': line
                    })
    return content
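
# A minimal sketch (hypothetical input) of how the header detection behaves:
#   >>> re.match(r'\\section\*\{(.+?)\}', r'\section*{Methods}').group(1)
#   'Methods'
# A page whose text reads "\section*{Methods}" followed by body lines would
# yield an entry like:
#   {'type': 'section', 'title': 'Methods', 'text': 'First line. Second line. '}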
def create_bert_chunks(file_name: str, content: List[Dict], max_tokens=450, overlap=50) -> List[Dict]:
    """Create chunks optimized for DistilBERT with hierarchy context."""
    chunks = []
    current_chunk = []
    current_tokens = 0
    current_section = ""
    current_subsection = ""
    for item in content:
        # Build a context header so each chunk carries its place in the hierarchy
        header = ""
        if item['type'] == 'section':
            current_section = item['title']
            current_subsection = ""
            header = f"[SECTION] {current_section}\n"
        elif item['type'] == 'subsection':
            current_subsection = item['title']
            header = f"[SUBSECTION] {current_subsection}\n"
        # Split text into sentences on terminal punctuation
        sentences = re.split(r'(?<=[.!?])\s+', item['text'])
        for sentence in sentences:
            full_text = header + sentence if header else sentence
            # Count raw tokens only; adding [CLS]/[SEP] per sentence would
            # inflate the running total
            tokens = tokenizer.encode(full_text, add_special_tokens=False)
            if current_tokens + len(tokens) > max_tokens:
                if current_chunk:
                    chunk_text = "\n".join(current_chunk)
                    chunks.append({
                        'text': chunk_text,
                        'section': current_section,
                        'subsection': current_subsection,
                        'tokens': current_tokens,
                        'file_name': file_name
                    })
                    # Carry the last `overlap` tokens into the next chunk
                    overlap_tokens = tokenizer.encode(chunk_text, add_special_tokens=False)[-overlap:]
                    current_chunk = [tokenizer.decode(overlap_tokens)]
                    current_tokens = len(overlap_tokens)
                    header = ""  # Reset header after overlap
            current_chunk.append(full_text)
            current_tokens += len(tokens)
            header = ""  # Clear header after the first sentence of an item
    # Flush whatever remains as the final chunk
    if current_chunk:
        chunk_text = "\n".join(current_chunk)
        chunks.append({
            'text': chunk_text,
            'section': current_section,
            'subsection': current_subsection,
            'tokens': current_tokens,
            'file_name': file_name
        })
    return chunks
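
# Hypothetical shape of one resulting chunk (values are illustrative only).
# Capping chunks at 450 raw tokens leaves headroom under DistilBERT's
# 512-token limit for the specials added at encoding time:
#   {'text': '[SECTION] Methods\nWe fine-tune DistilBERT ...',
#    'section': 'Methods', 'subsection': '', 'tokens': 412,
#    'file_name': 'paper.pdf'}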
def process_pdf(pdf_path: str) -> List[Dict]:
    """Process a PDF into BERT-optimized chunks."""
    structured_content = extract_text_with_hierarchy(pdf_path)
    return create_bert_chunks(pdf_path, structured_content)
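
# Minimal usage sketch; "paper.pdf" is a placeholder path, not a file from
# this project.
if __name__ == "__main__":
    chunks = process_pdf("paper.pdf")
    for chunk in chunks[:3]:
        print(f"{chunk['section']} / {chunk['subsection']} "
              f"({chunk['tokens']} tokens)")
        print(chunk['text'][:200])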