import hashlib
import re
from typing import Optional

import spacy
import torch
from tqdm import tqdm
from transformers import pipeline

from llmgaurdrails.custom_models.groundedness_checker.ungrounded_answer_generator import UngroundedAnswerGenerator

# A simple QA generator that produces question-answer pairs from a given
# context, using a model fine-tuned on a QA dataset. A usage sketch appears
# at the end of the file.
class SimpleQAGenerator:
    def __init__(self):
        self.qg_model = pipeline(
            "text2text-generation", 
            model="valhalla/t5-base-qa-qg-hl",
            device=0 if torch.cuda.is_available() else -1
        )
        self.ungrounded_gen = UngroundedAnswerGenerator()

        # spaCy pipeline, loaded for text processing (not used by this class yet)
        self.nlp = spacy.load("en_core_web_sm")
    
    def _create_entry(self, context: str, question: str, answer: str, label: int) -> Optional[dict]:
        """Create a standardized training entry; returns None when validation fails."""
        # Clean and validate inputs
        context = self._clean_text(context)
        question = self._clean_text(question).rstrip("?")
        answer = self._clean_answer(answer)

        # Validate before appending "?"; otherwise an empty question becomes
        # a non-empty "?" and slips past this check
        if not question or answer == "[INVALID]":
            return None

        return {
            "context": context,
            "question": question + "?",
            "answer": answer,
            "label": int(bool(label)),  # Force 0/1 encoding
            "meta": {
                # hashlib is stable across runs; the built-in hash() is salted per process
                "context_hash": hashlib.md5(context.encode("utf-8")).hexdigest(),
                "answer_type": self._classify_answer_type(answer),
                "question_type": self._classify_question(question)
            }
        }
    
    def _clean_text(self, text: str) -> str:
        """Basic text normalization"""
        return re.sub(r'\s+', ' ', text).strip()
    
    def _clean_answer(self, answer: str) -> str:
        """Answer-specific cleaning"""
        answer = self._clean_text(answer)
        if answer.lower() in ["", "n/a", "unknown"]:
            return "[INVALID]"
        return answer
    
    def _classify_answer_type(self, answer: str) -> str:
        """Categorize answers for analysis"""
        if "$" in answer: return "monetary"
        if "%" in answer: return "percentage"
        if any(c.isdigit() for c in answer): return "numeric"
        return "textual"
    
    def _classify_question(self, question: str) -> str:
        """Identify question types"""
        q = question.lower()
        if "how much" in q: return "quantity"
        if "when" in q: return "temporal"
        if "why" in q: return "reason"
        return "factual"

    def generate_dataset(self, chunks: list) -> list:
        """Build a labeled dataset: grounded answers (label 1) and ungrounded answers (label 0)."""
        dataset = []
        for chunk_dict in tqdm(chunks, desc="Generating QA pairs"):
            chunk = chunk_dict['text']

            if not chunk.strip():
                continue

            for question in self._generate_questions(chunk):
                if not question.strip():
                    continue

                grounded = self._get_grounded_answer(chunk, question)
                ungrounded = self.ungrounded_gen.generate(chunk, grounded)

                # _create_entry returns None for invalid pairs; don't let
                # None entries leak into the dataset
                for entry in (self._create_entry(chunk, question, grounded, 1),
                              self._create_entry(chunk, question, ungrounded, 0)):
                    if entry is not None:
                        dataset.append(entry)

        return dataset
    
    def _generate_questions(self, context: str) -> list:
        """Sample up to three candidate questions for a context."""
        try:
            output = self.qg_model(
                f"generate questions: {context}",
                max_length=64,
                num_return_sequences=3,
                do_sample=True,
                temperature=0.9
            )
            return [q['generated_text'].strip() for q in output]
        except Exception:
            # A failed generation on one chunk shouldn't abort the whole run
            return []

    def _get_grounded_answer(self, context: str, question: str) -> str:
        """Extract an answer for the question that is grounded in the context."""
        try:
            # valhalla/t5-base-qa-qg-hl expects the "question: ... context: ..."
            # format for extractive QA
            answer = self.qg_model(
                f"question: {question} context: {context}",
                max_length=64,
                num_beams=1
            )[0]['generated_text'].strip()
            return answer if answer else "[No Answer]"
        except Exception:
            return "[No Answer]"