import pdfplumber
import re
from transformers import AutoTokenizer
from typing import List, Dict
import pandas as pd

# Tokenizer is used only to count tokens when sizing chunks for DistilBERT.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Extract section/subsection headings along with their body text, so each
# chunk can later carry its hierarchy as context.
def extract_text_with_hierarchy(pdf_path: str) -> List[Dict]:
    """Extract text with section/subsection hierarchy."""
    content = []
    current_section = ""
    current_subsection = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:  # extract_text() returns None for image-only pages
                continue
            lines = text.split('\n')
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                # Detect headings written as literal LaTeX markers; this assumes
                # the source PDFs keep \section*{...} markup in their text layer
                section_match = re.match(r'\\section\*{(.+?)}', line)
                subsection_match = re.match(r'\\subsection\*{(.+?)}', line)
                if section_match:
                    current_section = section_match.group(1)
                    current_subsection = ""
                    content.append({
                        'type': 'section',
                        'title': current_section,
                        'text': ""
                    })
                elif subsection_match:
                    current_subsection = subsection_match.group(1)
                    content.append({
                        'type': 'subsection',
                        'title': current_subsection,
                        'text': ""
                    })
                else:
                    if content:
                        # Body text accumulates under the most recent heading
                        content[-1]['text'] += line + " "
                    else:
                        # Text before any heading becomes a plain text entry
                        content.append({
                            'type': 'text',
                            'title': "",
                            'text': line
                        })
    return content
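
# Illustrative shape of one returned entry (values are examples only; actual
# titles and text depend on the input PDF):
#   {'type': 'section', 'title': 'Introduction', 'text': 'First paragraph ... '}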
def create_bert_chunks(file_name: str, content: List[Dict], max_tokens=450, overlap=50) -> List[Dict]:
    """Create chunks optimized for DistilBERT with hierarchy context."""
    chunks = []
    current_chunk = []
    current_tokens = 0
    current_section = ""
    current_subsection = ""
    for item in content:
        # Build a context header so each chunk records where it came from
        header = ""
        if item['type'] == 'section':
            current_section = item['title']
            current_subsection = ""
            header = f"[SECTION] {current_section}\n"
        elif item['type'] == 'subsection':
            current_subsection = item['title']
            header = f"[SUBSECTION] {current_subsection}\n"
        # Split text into sentences on terminal punctuation
        sentences = re.split(r'(?<=[.!?])\s+', item['text'])
        for sentence in sentences:
            full_text = header + sentence if header else sentence
            # Count content tokens only; [CLS]/[SEP] are added later at encode time
            tokens = tokenizer.encode(full_text, add_special_tokens=False)
            if current_tokens + len(tokens) > max_tokens:
                if current_chunk:
                    chunk_text = "\n".join(current_chunk)
                    chunks.append({
                        'text': chunk_text,
                        'section': current_section,
                        'subsection': current_subsection,
                        'tokens': current_tokens,
                        'file_name': file_name
                    })
                    # Carry the last `overlap` tokens into the next chunk for continuity
                    overlap_tokens = tokenizer.encode(chunk_text, add_special_tokens=False)[-overlap:]
                    current_chunk = [tokenizer.decode(overlap_tokens)]
                    current_tokens = len(overlap_tokens)
                    header = ""  # Reset header after overlap
            current_chunk.append(full_text)
            current_tokens += len(tokens)
            header = ""  # Clear header after first use
    # Flush any remaining content as the final chunk
    if current_chunk:
        chunk_text = "\n".join(current_chunk)
        chunks.append({
            'text': chunk_text,
            'section': current_section,
            'subsection': current_subsection,
            'tokens': current_tokens,
            'file_name': file_name
        })
    return chunks
def process_pdf(pdf_path: str) -> List[Dict]:
    """Process PDF into BERT-optimized chunks."""
    structured_content = extract_text_with_hierarchy(pdf_path)
    return create_bert_chunks(pdf_path, structured_content)
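
# Example usage: a minimal sketch, assuming a local file "sample.pdf" (a
# placeholder path, not part of the original code). pandas is used here to
# inspect the resulting chunks as a DataFrame.
if __name__ == "__main__":
    chunks = process_pdf("sample.pdf")
    df = pd.DataFrame(chunks)
    print(f"Created {len(df)} chunks")
    print(df[['section', 'subsection', 'tokens', 'file_name']].head())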