Update custom_models/groundedness_checker/llm_based_qa_generator.py

custom_models/groundedness_checker/llm_based_qa_generator.py
CHANGED
@@ -1,127 +1,126 @@
import numpy as np
import re
from tqdm import tqdm
import json
import pickle
from llmgaurdrails.llms.openai_client import invoke_api


class LLMBasedQAGenerator:

    def _create_entry(self, context: str, question: str, answer: str, label: int) -> dict:
        """Create standardized training entry with validation checks"""
        # Clean and validate inputs
        context = self._clean_text(context)
        question = self._clean_text(question).rstrip("?") + "?"
        answer = self._clean_answer(answer)

        if not question or not answer:
            return None

        return {
            "context": context,
            "question": question,
            "answer": answer,
            "label": int(bool(label)),  # Force 0/1 encoding
            "meta": {
                "context_hash": hash(context),
                "answer_type": self._classify_answer_type(answer),
                "question_type": self._classify_question(question)
            }
        }

    def _clean_text(self, text: str) -> str:
        """Basic text normalization"""
        return re.sub(r'\s+', ' ', text).strip()

    def _clean_answer(self, answer: str) -> str:
        """Answer-specific cleaning"""
        answer = self._clean_text(answer)
        if answer.lower() in ["", "n/a", "unknown"]:
            return "[INVALID]"
        return answer

    def _classify_answer_type(self, answer: str) -> str:
        """Categorize answers for analysis"""
        if "$" in answer: return "monetary"
        if "%" in answer: return "percentage"
        if any(c.isdigit() for c in answer): return "numeric"
        return "textual"

    def _classify_question(self, question: str) -> str:
        """Identify question types"""
        q = question.lower()
        if "how much" in q: return "quantity"
        if "when" in q: return "temporal"
        if "why" in q: return "reason"
        return "factual"
    def _generate_questions_and_grounded_answers(self, chunk, num_questions=3):
        questions = []
        answers = []
        # Generate a question and a grounded answer on each iteration
        for i in range(num_questions):
            try:
                grounded_system_prompt = """You are a helpful assistant that generates questions and answers based on the given context.
                The question and answer should not exceed 15 words each.
                The response should be a JSON with 'question' and 'answer' as the keys"""
                grounded_message = f"Context: {chunk}\n\nGenerate a question and a grounded answer based on this context."
                grounded_qa_response = invoke_api(grounded_system_prompt, grounded_message, 0.7, max_tokens=100)

                # Crude fence removal: strip() drops any of the characters ` j s o n
                # from both ends, which peels off a ```json ... ``` wrapper
                grounded_qa = json.loads(grounded_qa_response.choices[0].message.content.strip("```json"))

                questions.append(grounded_qa['question'])
                answers.append(grounded_qa['answer'])
            except Exception:
                # Keep the two lists aligned; empty questions are skipped downstream
                questions.append('')
                answers.append('')

        return questions, answers

    def _generate_ungrounded_answer(self, chunk, question, grounded_answer):
        try:
            ungrounded_system_prompt = """You are a helpful assistant that generates answers that are based on the given context but factually or logically incorrect.
            The 'answer' part of the response should not exceed 15 words.
            The response should be a JSON with just one key 'answer'"""
            ungrounded_message = f"Question: {question}\n\nGenerate an ungrounded answer based on the original context {chunk}. Make subtle changes to the actual answer to make it look plausible"

            ungrounded_answer_response = invoke_api(ungrounded_system_prompt, ungrounded_message, 0.7, max_tokens=30)
            answer_json = json.loads(ungrounded_answer_response.choices[0].message.content.strip("```json"))
            return answer_json['answer']
        except Exception:
            print("errored in answer")
            return ''

    def generate_dataset(self, chunks: list,
                         persist_dataset: bool = False,
                         persisted_file_path: str = "training_data") -> list:

        dataset = []

        for chunk_dict in tqdm(chunks, desc="Generating QA pairs"):

            chunk = chunk_dict['text']

            if not chunk.strip():
                continue

            questions, grounded_answers = self._generate_questions_and_grounded_answers(chunk)

            for question, grounded_answer in zip(questions, grounded_answers):
                if not question.strip():
                    continue

                ungrounded = self._generate_ungrounded_answer(chunk, question, grounded_answer)

                # _create_entry returns None for invalid pairs; keep only real entries
                grounded_entry = self._create_entry(chunk, question, grounded_answer, 1)
                ungrounded_entry = self._create_entry(chunk, question, ungrounded, 0)
                if grounded_entry:
                    dataset.append(grounded_entry)
                if ungrounded_entry:
                    dataset.append(ungrounded_entry)

        if persist_dataset:
            # 'ab' appends one more pickle record to the file on every call
            pickle.dump(dataset, open(persisted_file_path, 'ab'))

        return dataset
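For reference, a minimal usage sketch of the updated class. It is hypothetical: the import path simply mirrors the file location above, the sample chunk is invented, and it assumes invoke_api is configured with working API credentials.

from custom_models.groundedness_checker.llm_based_qa_generator import LLMBasedQAGenerator

# One dict per passage; generate_dataset only reads the 'text' key.
chunks = [
    {"text": "Acme Corp reported revenue of $12.4M in Q3 2023, up 8% year over year."},
]

generator = LLMBasedQAGenerator()
dataset = generator.generate_dataset(chunks, persist_dataset=False)

# Each entry pairs a question with a grounded (label=1) or ungrounded (label=0) answer.
for entry in dataset:
    print(entry["label"], entry["question"], "->", entry["answer"])

Since num_questions defaults to 3 and each question yields a grounded and an ungrounded entry, a corpus of N chunks produces at most 6N labeled entries.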