# NOTE(review): Hugging Face Spaces page residue ("Spaces / Sleeping") removed;
# it was not part of the source module.
import random
import re

import numpy as np
import spacy
import torch
from tqdm import tqdm
from transformers import pipeline

from llmgaurdrails.custom_models.groundedness_checker.ungrounded_answer_generator import UngroundedAnswerGenerator
from llmgaurdrails.llms.openai_client import invoke_api

# A simple QA generator that produces a question and answer for a given
# context, based on a model fine-tuned on a QA dataset.
class SimpleQAGenerator:
    """Builds a QA training dataset of grounded/ungrounded answer pairs.

    For each text chunk, questions and grounded answers (label 1) are
    generated with the "valhalla/t5-base-qa-qg-hl" text2text pipeline;
    a mismatched, ungrounded answer (label 0) is produced for each by
    UngroundedAnswerGenerator.
    """

    def __init__(self):
        # QA / question-generation model; GPU 0 when available, else CPU.
        self.qg_model = pipeline(
            "text2text-generation",
            model="valhalla/t5-base-qa-qg-hl",
            device=0 if torch.cuda.is_available() else -1,
        )
        self.ungrounded_gen = UngroundedAnswerGenerator()
        self.nlp = spacy.load("en_core_web_sm")

    def _create_entry(self, context: str, question: str, answer: str, label: int) -> dict:
        """Create a standardized training entry, or None when inputs are invalid.

        The question is normalized to end with exactly one '?', the label is
        forced to 0/1, and invalid answers ("", "n/a", "unknown") now reject
        the entry instead of emitting an "[INVALID]" row.
        """
        context = self._clean_text(context)
        question = self._clean_text(question).rstrip("?")
        answer = self._clean_answer(answer)

        # Fix: the original emptiness check was dead code — _clean_answer
        # never returns a falsy value (invalid answers become the truthy
        # "[INVALID]" sentinel) and the question always gained a trailing
        # '?'. Explicitly reject blank questions and invalid answers.
        if not question or answer == "[INVALID]":
            return None
        question += "?"

        return {
            "context": context,
            "question": question,
            "answer": answer,
            "label": int(bool(label)),  # force 0/1 encoding
            "meta": {
                # NOTE: hash() is salted per process (PYTHONHASHSEED), so
                # this value is only stable within a single run.
                "context_hash": hash(context),
                "answer_type": self._classify_answer_type(answer),
                "question_type": self._classify_question(question),
            },
        }

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace runs to single spaces and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def _clean_answer(self, answer: str) -> str:
        """Normalize an answer; map known non-answers to the "[INVALID]" sentinel."""
        answer = self._clean_text(answer)
        if answer.lower() in ("", "n/a", "unknown"):
            return "[INVALID]"
        return answer

    def _classify_answer_type(self, answer: str) -> str:
        """Coarsely categorize an answer string for dataset analysis."""
        if "$" in answer:
            return "monetary"
        if "%" in answer:
            return "percentage"
        if any(c.isdigit() for c in answer):
            return "numeric"
        return "textual"

    def _classify_question(self, question: str) -> str:
        """Identify the question type from simple keyword cues."""
        q = question.lower()
        if "how much" in q:
            return "quantity"
        if "when" in q:
            return "temporal"
        if "why" in q:
            return "reason"
        return "factual"

    def generate_dataset(self, chunks: list) -> list:
        """Build labeled QA entries from a list of {'text': ...} chunk dicts.

        For every generated question, one grounded (label 1) and one
        ungrounded (label 0) entry is appended. Fix: entries rejected by
        _create_entry (None) are now skipped instead of being appended,
        so the returned list contains only well-formed dicts.
        """
        dataset = []
        for chunk_dict in tqdm(chunks, desc="Generating QA pairs"):
            chunk = chunk_dict['text']
            if not chunk.strip():
                continue
            for question in self._generate_questions(chunk):
                if not question.strip():
                    continue
                grounded = self._get_grounded_answer(chunk, question)
                ungrounded = self.ungrounded_gen.generate(chunk, grounded)
                for answer, label in ((grounded, 1), (ungrounded, 0)):
                    entry = self._create_entry(chunk, question, answer, label)
                    if entry is not None:
                        dataset.append(entry)
        return dataset

    def _generate_questions(self, context: str) -> list:
        """Sample up to 3 candidate questions for a context; [] on failure."""
        try:
            output = self.qg_model(
                f"generate questions: {context}",
                max_length=64,
                num_return_sequences=3,
                do_sample=True,
                temperature=0.9,
            )
            return [q['generated_text'].strip() for q in output]
        # Fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        except Exception:
            return []

    def _get_grounded_answer(self, context: str, question: str) -> str:
        """Extract an answer grounded in the context; "[No Answer]" on failure."""
        try:
            answer = self.qg_model(
                f"answer: {context} question: {question}",
                max_length=64,
                num_beams=1,
            )[0]['generated_text'].strip()
            return answer if answer else "[No Answer]"
        # Fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        except Exception:
            return "[No Answer]"