Sasidhar committed on
Commit
7847f4e
·
verified ·
1 Parent(s): 239fd97

Update custom_models/groundedness_checker/llm_based_qa_generator.py

Browse files
custom_models/groundedness_checker/llm_based_qa_generator.py CHANGED
@@ -1,127 +1,126 @@
1
- import numpy as np
2
- import re
3
- from tqdm import tqdm
4
- import json
5
- import pickle
6
- from llmgaurdrails.llms.openai_client import invoke_api
7
-
8
- class LLMBasedQAGenerator:
9
-
10
- def _create_entry(self, context: str, question: str, answer: str, label: int) -> dict:
11
- """Create standardized training entry with validation checks"""
12
- # Clean and validate inputs
13
- context = self._clean_text(context)
14
- question = self._clean_text(question).rstrip("?") + "?"
15
- answer = self._clean_answer(answer)
16
-
17
- if not question or not answer:
18
- return None
19
-
20
- return {
21
- "context": context,
22
- "question": question,
23
- "answer": answer,
24
- "label": int(bool(label)), # Force 0/1 encoding
25
- "meta": {
26
- "context_hash": hash(context),
27
- "answer_type": self._classify_answer_type(answer),
28
- "question_type": self._classify_question(question)
29
- }
30
- }
31
-
32
- def _clean_text(self, text: str) -> str:
33
- """Basic text normalization"""
34
- return re.sub(r'\s+', ' ', text).strip()
35
-
36
- def _clean_answer(self, answer: str) -> str:
37
- """Answer-specific cleaning"""
38
- answer = self._clean_text(answer)
39
- if answer.lower() in ["", "n/a", "unknown"]:
40
- return "[INVALID]"
41
- return answer
42
-
43
- def _classify_answer_type(self, answer: str) -> str:
44
- """Categorize answers for analysis"""
45
- if "$" in answer: return "monetary"
46
- if "%" in answer: return "percentage"
47
- if any(c.isdigit() for c in answer): return "numeric"
48
- return "textual"
49
-
50
- def _classify_question(self, question: str) -> str:
51
- """Identify question types"""
52
- q = question.lower()
53
- if "how much" in q: return "quantity"
54
- if "when" in q: return "temporal"
55
- if "why" in q: return "reason"
56
- return "factual"
57
-
58
-
59
- def _generate_questions_and_grounded_answers(self,chunk,num_questions=3):
60
-
61
- questions = []
62
- answers =[]
63
- # Generate a question and a grounded answer
64
- for i in range(num_questions):
65
- try:
66
- grounded_system_prompt = """You are a helpful assistant that generates questions and answers based on the given context.
67
- The question and answer should not exceed 15 words each.
68
- The response should ne a json with 'question' and 'answer as the key'"""
69
- grounded_message = f"Context: {chunk}\n\nGenerate a question and a grounded answer based on this context."
70
- grounded_qa_response = invoke_api(grounded_system_prompt,grounded_message,0.7,max_tokens=100)
71
-
72
- # print("Question:",grounded_qa_response)
73
- grounded_qa = json.loads(grounded_qa_response.choices[0].message.content.strip("```json"))
74
-
75
- questions.append(grounded_qa['question'])
76
- answers.append(grounded_qa['answer'])
77
- except:
78
- print("errored")
79
- questions.append('')
80
- answers.append('')
81
-
82
- return questions,answers
83
-
84
- def _generate_ungrounded_answer(self,chunk,question,grounded_answer):
85
-
86
- try:
87
- ungrounded_system_prompt = """You are a helpful assistant that generates questions and ungrounded answers that are based on the given context. But factually or logically incorrect.
88
- The 'answer' part of the response should not exceed 15 words each.
89
- The response should ne a json with just one key 'answer'"""
90
- ungrounded_message = f"Question: {question}\n\nGenerate an ungrounded answer based on the original context {chunk}. Make subtle changes to the actual answer to make it look plausible"
91
-
92
- ungrounded_answer_response = invoke_api(ungrounded_system_prompt,ungrounded_message,0.7,max_tokens=30)
93
- # print("answer:",ungrounded_answer_response)
94
- answer_json = json.loads(ungrounded_answer_response.choices[0].message.content.strip("```json"))
95
- return answer_json['answer']
96
- except:
97
- print("errored in answer")
98
- return ''
99
-
100
- def generate_dataset(self, chunks: list,
101
- persist_dataset:bool =False,
102
- presisted_file_path: str = "training_data") -> list:
103
-
104
- dataset = []
105
-
106
- for chunk_dict in tqdm(chunks, desc="Generating QA pairs"):
107
-
108
- chunk = chunk_dict['text']
109
-
110
- if not chunk.strip():
111
- continue
112
-
113
- questions,grounded_answers = self._generate_questions_and_grounded_answers(chunk)
114
-
115
- for question,grounded_answer in zip(questions,grounded_answers):
116
- if not question.strip():
117
- continue
118
-
119
- ungrounded = self._generate_ungrounded_answer(chunk, question,grounded_answer)
120
-
121
- dataset.append(self._create_entry(chunk, question, grounded_answer, 1))
122
- dataset.append(self._create_entry(chunk, question, ungrounded, 0))
123
-
124
- if persist_dataset:
125
- pickle.dump(dataset,open(presisted_file_path,'ab'))
126
-
127
  return dataset
 
1
+ import numpy as np
2
+ import re
3
+ from tqdm import tqdm
4
+ import json
5
+ import pickle
6
+ from llmgaurdrails.llms.openai_client import invoke_api
7
+
8
+ class LLMBasedQAGenerator:
9
+
10
+ def _create_entry(self, context: str, question: str, answer: str, label: int) -> dict:
11
+ """Create standardized training entry with validation checks"""
12
+ # Clean and validate inputs
13
+ context = self._clean_text(context)
14
+ question = self._clean_text(question).rstrip("?") + "?"
15
+ answer = self._clean_answer(answer)
16
+
17
+ if not question or not answer:
18
+ return None
19
+
20
+ return {
21
+ "context": context,
22
+ "question": question,
23
+ "answer": answer,
24
+ "label": int(bool(label)), # Force 0/1 encoding
25
+ "meta": {
26
+ "context_hash": hash(context),
27
+ "answer_type": self._classify_answer_type(answer),
28
+ "question_type": self._classify_question(question)
29
+ }
30
+ }
31
+
32
+ def _clean_text(self, text: str) -> str:
33
+ """Basic text normalization"""
34
+ return re.sub(r'\s+', ' ', text).strip()
35
+
36
+ def _clean_answer(self, answer: str) -> str:
37
+ """Answer-specific cleaning"""
38
+ answer = self._clean_text(answer)
39
+ if answer.lower() in ["", "n/a", "unknown"]:
40
+ return "[INVALID]"
41
+ return answer
42
+
43
+ def _classify_answer_type(self, answer: str) -> str:
44
+ """Categorize answers for analysis"""
45
+ if "$" in answer: return "monetary"
46
+ if "%" in answer: return "percentage"
47
+ if any(c.isdigit() for c in answer): return "numeric"
48
+ return "textual"
49
+
50
+ def _classify_question(self, question: str) -> str:
51
+ """Identify question types"""
52
+ q = question.lower()
53
+ if "how much" in q: return "quantity"
54
+ if "when" in q: return "temporal"
55
+ if "why" in q: return "reason"
56
+ return "factual"
57
+
58
+
59
+ def _generate_questions_and_grounded_answers(self,chunk,num_questions=3):
60
+
61
+ questions = []
62
+ answers =[]
63
+ # Generate a question and a grounded answer
64
+ for i in range(num_questions):
65
+ try:
66
+ grounded_system_prompt = """You are a helpful assistant that generates questions and answers based on the given context.
67
+ The question and answer should not exceed 15 words each.
68
+ The response should ne a json with 'question' and 'answer as the key'"""
69
+ grounded_message = f"Context: {chunk}\n\nGenerate a question and a grounded answer based on this context."
70
+ grounded_qa_response = invoke_api(grounded_system_prompt,grounded_message,0.7,max_tokens=100)
71
+
72
+ # print("Question:",grounded_qa_response)
73
+ grounded_qa = json.loads(grounded_qa_response.choices[0].message.content.strip("```json"))
74
+
75
+ questions.append(grounded_qa['question'])
76
+ answers.append(grounded_qa['answer'])
77
+ except:
78
+ questions.append('')
79
+ answers.append('')
80
+
81
+ return questions,answers
82
+
83
+ def _generate_ungrounded_answer(self,chunk,question,grounded_answer):
84
+
85
+ try:
86
+ ungrounded_system_prompt = """You are a helpful assistant that generates questions and ungrounded answers that are based on the given context. But factually or logically incorrect.
87
+ The 'answer' part of the response should not exceed 15 words each.
88
+ The response should ne a json with just one key 'answer'"""
89
+ ungrounded_message = f"Question: {question}\n\nGenerate an ungrounded answer based on the original context {chunk}. Make subtle changes to the actual answer to make it look plausible"
90
+
91
+ ungrounded_answer_response = invoke_api(ungrounded_system_prompt,ungrounded_message,0.7,max_tokens=30)
92
+ # print("answer:",ungrounded_answer_response)
93
+ answer_json = json.loads(ungrounded_answer_response.choices[0].message.content.strip("```json"))
94
+ return answer_json['answer']
95
+ except:
96
+ print("errored in answer")
97
+ return ''
98
+
99
+ def generate_dataset(self, chunks: list,
100
+ persist_dataset:bool =False,
101
+ presisted_file_path: str = "training_data") -> list:
102
+
103
+ dataset = []
104
+
105
+ for chunk_dict in tqdm(chunks, desc="Generating QA pairs"):
106
+
107
+ chunk = chunk_dict['text']
108
+
109
+ if not chunk.strip():
110
+ continue
111
+
112
+ questions,grounded_answers = self._generate_questions_and_grounded_answers(chunk)
113
+
114
+ for question,grounded_answer in zip(questions,grounded_answers):
115
+ if not question.strip():
116
+ continue
117
+
118
+ ungrounded = self._generate_ungrounded_answer(chunk, question,grounded_answer)
119
+
120
+ dataset.append(self._create_entry(chunk, question, grounded_answer, 1))
121
+ dataset.append(self._create_entry(chunk, question, ungrounded, 0))
122
+
123
+ if persist_dataset:
124
+ pickle.dump(dataset,open(presisted_file_path,'ab'))
125
+
 
126
  return dataset