Spaces:
Sleeping
Sleeping
File size: 4,305 Bytes
826f9a4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
import spacy
import torch
import random
import numpy as np
import re
from tqdm import tqdm
from transformers import pipeline
from llmgaurdrails.custom_models.groundedness_checker.ungrounded_answer_generator import UngroundedAnswerGenerator
from llmgaurdrails.llms.openai_client import invoke_api
# A simple QA generator that produces question/answer pairs from a given context,
# using a model fine-tuned on a QA dataset.
class SimpleQAGenerator:
    """Generate labeled (context, question, answer) training entries.

    Questions and grounded answers are produced by a fine-tuned T5 QA/QG
    model; ungrounded (negative) answers come from
    ``UngroundedAnswerGenerator``. Each dataset entry carries label 1
    (grounded) or 0 (ungrounded).
    """

    def __init__(self):
        # One text2text model serves both question generation and QA.
        self.qg_model = pipeline(
            "text2text-generation",
            model="valhalla/t5-base-qa-qg-hl",
            device=0 if torch.cuda.is_available() else -1
        )
        self.ungrounded_gen = UngroundedAnswerGenerator()
        self.nlp = spacy.load("en_core_web_sm")

    def _create_entry(self, context: str, question: str, answer: str, label: int) -> dict:
        """Create a standardized training entry with validation checks.

        Returns ``None`` when the cleaned question or answer is empty;
        callers must filter out ``None`` results.
        """
        context = self._clean_text(context)
        question = self._clean_text(question)
        answer = self._clean_answer(answer)
        # Validate BEFORE appending '?': otherwise an empty question
        # becomes the truthy single character "?" and slips through.
        if not question or not answer:
            return None
        question = question.rstrip("?") + "?"
        return {
            "context": context,
            "question": question,
            "answer": answer,
            "label": int(bool(label)),  # force 0/1 encoding
            "meta": {
                # NOTE(review): built-in hash() is salted per process
                # (PYTHONHASHSEED), so this value is not stable across
                # runs — confirm whether downstream dedup relies on it.
                "context_hash": hash(context),
                "answer_type": self._classify_answer_type(answer),
                "question_type": self._classify_question(question)
            }
        }

    def _clean_text(self, text: str) -> str:
        """Collapse all whitespace runs to single spaces and strip ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def _clean_answer(self, answer: str) -> str:
        """Normalize an answer; map empty/placeholder answers to [INVALID]."""
        answer = self._clean_text(answer)
        if answer.lower() in ["", "n/a", "unknown"]:
            return "[INVALID]"
        return answer

    def _classify_answer_type(self, answer: str) -> str:
        """Categorize answers (monetary/percentage/numeric/textual) for analysis."""
        if "$" in answer: return "monetary"
        if "%" in answer: return "percentage"
        if any(c.isdigit() for c in answer): return "numeric"
        return "textual"

    def _classify_question(self, question: str) -> str:
        """Bucket questions into coarse types by keyword."""
        q = question.lower()
        if "how much" in q: return "quantity"
        if "when" in q: return "temporal"
        if "why" in q: return "reason"
        return "factual"

    def generate_dataset(self, chunks: list) -> list:
        """Build the training dataset from text chunks.

        Each chunk dict must have a 'text' key. For every generated
        question, one grounded (label 1) and one ungrounded (label 0)
        entry are produced.
        """
        dataset = []
        for chunk_dict in tqdm(chunks, desc="Generating QA pairs"):
            chunk = chunk_dict['text']
            if not chunk.strip():
                continue
            for question in self._generate_questions(chunk):
                if not question.strip():
                    continue
                grounded = self._get_grounded_answer(chunk, question)
                ungrounded = self.ungrounded_gen.generate(chunk, grounded)
                # _create_entry may return None on failed validation;
                # never let None leak into the dataset.
                for entry in (
                    self._create_entry(chunk, question, grounded, 1),
                    self._create_entry(chunk, question, ungrounded, 0),
                ):
                    if entry is not None:
                        dataset.append(entry)
        return dataset

    def _generate_questions(self, context: str) -> list:
        """Sample up to 3 questions for the context; empty list on model failure."""
        try:
            output = self.qg_model(
                f"generate questions: {context}",
                max_length=64,
                num_return_sequences=3,
                do_sample=True,
                temperature=0.9
            )
            return [q['generated_text'].strip() for q in output]
        except Exception:
            # Best-effort: a failed generation yields no questions rather
            # than aborting the whole dataset build. (Was a bare except,
            # which also swallowed KeyboardInterrupt/SystemExit.)
            return []

    def _get_grounded_answer(self, context: str, question: str) -> str:
        """Answer the question from the context; '[No Answer]' on failure/empty."""
        try:
            answer = self.qg_model(
                f"answer: {context} question: {question}",
                max_length=64,
                num_beams=1
            )[0]['generated_text'].strip()
            return answer if answer else "[No Answer]"
        except Exception:
            # Best-effort fallback; narrowed from a bare except.
            return "[No Answer]"
|