File size: 5,371 Bytes
7847f4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
826f9a4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import hashlib
import json
import pickle
import re
from typing import Optional

import numpy as np
from tqdm import tqdm

from llmgaurdrails.llms.openai_client import invoke_api

class LLMBasedQAGenerator:
    """Generates a labeled QA training dataset from text chunks via an LLM.

    For every chunk the LLM produces questions with grounded answers
    (label 1) and subtly incorrect "ungrounded" answers (label 0),
    yielding paired positive/negative examples for groundedness models.
    """

    def _create_entry(self, context: str, question: str, answer: str,
                      label: int) -> Optional[dict]:
        """Create a standardized training entry with validation checks.

        Args:
            context:  Source passage the QA pair is based on.
            question: Question text; a single trailing '?' is enforced.
            answer:   Answer text; normalized by ``_clean_answer``.
            label:    Truthy -> 1 (grounded), falsy -> 0 (ungrounded).

        Returns:
            The entry dict, or ``None`` when the cleaned question or
            answer is empty.
        """
        context = self._clean_text(context)
        question = self._clean_text(question).rstrip("?")
        answer = self._clean_answer(answer)

        # Validate BEFORE re-appending '?': previously an empty question
        # became "?" and slipped past this emptiness check.
        if not question or not answer:
            return None

        question += "?"

        return {
            "context": context,
            "question": question,
            "answer": answer,
            "label": int(bool(label)),  # force 0/1 encoding
            "meta": {
                # sha256 instead of hash(): the builtin hash() is salted
                # per process, so it is useless for a persisted dataset.
                "context_hash": hashlib.sha256(context.encode("utf-8")).hexdigest(),
                "answer_type": self._classify_answer_type(answer),
                "question_type": self._classify_question(question),
            },
        }

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace runs to single spaces and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def _clean_answer(self, answer: str) -> str:
        """Normalize an answer; map empty/placeholder answers to a sentinel."""
        answer = self._clean_text(answer)
        if answer.lower() in ("", "n/a", "unknown"):
            return "[INVALID]"
        return answer

    def _classify_answer_type(self, answer: str) -> str:
        """Coarse answer category; used for dataset analysis only."""
        if "$" in answer:
            return "monetary"
        if "%" in answer:
            return "percentage"
        if any(c.isdigit() for c in answer):
            return "numeric"
        return "textual"

    def _classify_question(self, question: str) -> str:
        """Coarse question category; used for dataset analysis only."""
        q = question.lower()
        if "how much" in q:
            return "quantity"
        if "when" in q:
            return "temporal"
        if "why" in q:
            return "reason"
        return "factual"

    @staticmethod
    def _parse_json_response(raw: str) -> dict:
        """Parse an LLM reply that may be wrapped in ```json ... ``` fences.

        The previous ``.strip("```json")`` stripped a *character set*
        rather than the fence and could corrupt responses; this removes
        the fences explicitly before calling ``json.loads``.
        """
        text = raw.strip()
        if text.startswith("```"):
            text = text.strip("`").strip()
            if text.lower().startswith("json"):
                text = text[4:]
        return json.loads(text)

    def _generate_questions_and_grounded_answers(self, chunk, num_questions=3):
        """Ask the LLM for ``num_questions`` (question, grounded answer) pairs.

        Failed generations append empty strings so the two returned lists
        stay aligned; callers skip empty questions.
        """
        questions = []
        answers = []
        for _ in range(num_questions):
            try:
                grounded_system_prompt = (
                    "You are a helpful assistant that generates questions and "
                    "answers based on the given context. "
                    "The question and answer should not exceed 15 words each. "
                    "The response should be a json with 'question' and "
                    "'answer' as the keys."
                )
                grounded_message = (
                    f"Context: {chunk}\n\n"
                    "Generate a question and a grounded answer based on this context."
                )
                response = invoke_api(grounded_system_prompt, grounded_message,
                                      0.7, max_tokens=100)
                qa = self._parse_json_response(response.choices[0].message.content)
                questions.append(qa['question'])
                answers.append(qa['answer'])
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed; generation errors still degrade softly.
            except Exception:
                questions.append('')
                answers.append('')
        return questions, answers

    def _generate_ungrounded_answer(self, chunk, question, grounded_answer):
        """Ask the LLM for a plausible but factually incorrect answer.

        Returns '' on any failure so the caller can drop the pair.
        (``grounded_answer`` is accepted for interface compatibility but
        is not currently sent to the model.)
        """
        try:
            ungrounded_system_prompt = (
                "You are a helpful assistant that generates ungrounded answers "
                "that are based on the given context but factually or "
                "logically incorrect. "
                "The 'answer' part of the response should not exceed 15 words. "
                "The response should be a json with just one key 'answer'."
            )
            ungrounded_message = (
                f"Question: {question}\n\n"
                f"Generate an ungrounded answer based on the original context {chunk}. "
                "Make subtle changes to the actual answer to make it look plausible."
            )
            response = invoke_api(ungrounded_system_prompt, ungrounded_message,
                                  0.7, max_tokens=30)
            return self._parse_json_response(response.choices[0].message.content)['answer']
        except Exception:
            print("errored in answer")
            return ''

    def generate_dataset(self, chunks: list,
                         persist_dataset: bool = False,
                         presisted_file_path: str = "training_data") -> list:
        """Build labeled QA entries from a list of text chunks.

        Args:
            chunks: List of dicts, each holding the passage under 'text'.
            persist_dataset: When True, append the dataset to a pickle file.
            presisted_file_path: Pickle path (misspelled name kept for
                backward compatibility with existing callers).

        Returns:
            List of entry dicts: one grounded (label 1) and one ungrounded
            (label 0) entry per successfully generated question.
        """
        dataset = []

        for chunk_dict in tqdm(chunks, desc="Generating QA pairs"):
            chunk = chunk_dict['text']
            if not chunk.strip():
                continue

            questions, grounded_answers = self._generate_questions_and_grounded_answers(chunk)

            for question, grounded_answer in zip(questions, grounded_answers):
                if not question.strip():
                    continue

                ungrounded = self._generate_ungrounded_answer(chunk, question, grounded_answer)

                # _create_entry returns None for invalid pairs; previously
                # those None values were appended straight into the dataset.
                for answer, label in ((grounded_answer, 1), (ungrounded, 0)):
                    entry = self._create_entry(chunk, question, answer, label)
                    if entry is not None:
                        dataset.append(entry)

        if persist_dataset:
            # 'ab' appends one self-contained pickle record per call; the
            # handle is now closed deterministically (old code leaked it).
            with open(presisted_file_path, 'ab') as fh:
                pickle.dump(dataset, fh)

        return dataset