import spacy
import torch
import hashlib
import re
from tqdm import tqdm
from transformers import pipeline
from llmgaurdrails.custom_models.groundedness_checker.ungrounded_answer_generator import UngroundedAnswerGenerator
# A simple QA generator that produces a question and answer for a given context,
# based on a model fine-tuned on a QA dataset.
class SimpleQAGenerator:

    def __init__(self):
        self.qg_model = pipeline(
            "text2text-generation",
            model="valhalla/t5-base-qa-qg-hl",
            device=0 if torch.cuda.is_available() else -1
        )
        self.ungrounded_gen = UngroundedAnswerGenerator()
        self.nlp = spacy.load("en_core_web_sm")
    def _create_entry(self, context: str, question: str, answer: str, label: int) -> dict:
        """Create a standardized training entry with validation checks."""
        # Clean and validate inputs
        context = self._clean_text(context)
        question = self._clean_text(question).rstrip("?")
        answer = self._clean_answer(answer)

        # Validate before re-appending the question mark, so an empty question
        # is not masked by the appended "?"; also drop invalid answers.
        if not question or not answer or answer == "[INVALID]":
            return None
        question += "?"

        return {
            "context": context,
            "question": question,
            "answer": answer,
            "label": int(bool(label)),  # Force 0/1 encoding
            "meta": {
                # Stable content hash; the built-in hash() is salted per process
                "context_hash": hashlib.md5(context.encode("utf-8")).hexdigest(),
                "answer_type": self._classify_answer_type(answer),
                "question_type": self._classify_question(question)
            }
        }
    def _clean_text(self, text: str) -> str:
        """Basic text normalization"""
        return re.sub(r'\s+', ' ', text).strip()

    def _clean_answer(self, answer: str) -> str:
        """Answer-specific cleaning"""
        answer = self._clean_text(answer)
        if answer.lower() in ["", "n/a", "unknown"]:
            return "[INVALID]"
        return answer
    def _classify_answer_type(self, answer: str) -> str:
        """Categorize answers for analysis"""
        if "$" in answer: return "monetary"
        if "%" in answer: return "percentage"
        if any(c.isdigit() for c in answer): return "numeric"
        return "textual"

    def _classify_question(self, question: str) -> str:
        """Identify question types"""
        q = question.lower()
        if "how much" in q: return "quantity"
        if "when" in q: return "temporal"
        if "why" in q: return "reason"
        return "factual"
    def generate_dataset(self, chunks: list) -> list:
        dataset = []
        for chunk_dict in tqdm(chunks, desc="Generating QA pairs"):
            chunk = chunk_dict['text']
            if not chunk.strip():
                continue

            questions = self._generate_questions(chunk)
            for question in questions:
                if not question.strip():
                    continue

                grounded = self._get_grounded_answer(chunk, question)
                ungrounded = self.ungrounded_gen.generate(chunk, grounded)

                # _create_entry returns None for invalid pairs; skip those
                for entry in (self._create_entry(chunk, question, grounded, 1),
                              self._create_entry(chunk, question, ungrounded, 0)):
                    if entry is not None:
                        dataset.append(entry)
        return dataset
    def _generate_questions(self, context: str) -> list:
        """Sample candidate questions for a context from the QG model."""
        try:
            output = self.qg_model(
                f"generate questions: {context}",
                max_length=64,
                num_return_sequences=3,
                do_sample=True,
                temperature=0.9
            )
            return [q['generated_text'].strip() for q in output]
        except Exception:
            return []
    def _get_grounded_answer(self, context: str, question: str) -> str:
        """Generate an answer grounded in the context via greedy decoding."""
        try:
            answer = self.qg_model(
                f"answer: {context} question: {question}",
                max_length=64,
                num_beams=1
            )[0]['generated_text'].strip()
            return answer if answer else "[No Answer]"
        except Exception:
            return "[No Answer]"