Only accept one rule as a solution; we select the first one. Do not allow groundings
Browse files- .gitignore +1 -0
- README.md +7 -4
- VerifiableRewardsForScalableLogicalReasoning.py +30 -1
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.idea
|
README.md
CHANGED
@@ -18,16 +18,19 @@ description: >-
|
|
18 |
|
19 |
# Metric Card for Symbolic Judge: Verifiable Rewards for Scalable Logical Reasoning
|
20 |
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
|
|
25 |
### How it Works
|
26 |
- **Input:** The symbolic judge takes as input a candidate hypothesis (logic rule) and an executable validation program containing background knowledge and examples.
|
27 |
- **Execution:** The candidate rule is executed against the validation program using a Prolog interpreter.
|
28 |
- **Correctness Criteria:** The rule is considered correct if it entails all positive examples and rejects all negative examples.
|
29 |
- **Metrics:** The symbolic judge computes a range of evaluation metrics (detailed below).
|
|
|
30 |
**Note:** A local Prolog interpreter is required to execute validation programs.
|
|
|
31 |
---
|
32 |
|
33 |
### Inputs
|
|
|
18 |
|
19 |
# Metric Card for Symbolic Judge: Verifiable Rewards for Scalable Logical Reasoning
|
20 |
|
21 |
+
This metric is part of the SLR framework (AIML-TUDA/SLR-Bench) and provides rewards for logical reasoning tasks.
|
22 |
+
The reward model is grounded in the ILP (Inductive Logic Programming) paradigm, testing whether a given hypothesis (logic rule) solves a logical reasoning task.
|
23 |
+
To check for entailment, the logic rule is executed against a set of background knowledge and examples, ensuring automatic evaluation that is verifiable, transparent, and reproducible.
|
24 |
+
|
25 |
+
|
26 |
### How it Works
|
27 |
- **Input:** The symbolic judge takes as input a candidate hypothesis (logic rule) and an executable validation program containing background knowledge and examples.
|
28 |
- **Execution:** The candidate rule is executed against the validation program using a Prolog interpreter.
|
29 |
- **Correctness Criteria:** The rule is considered correct if it entails all positive examples and rejects all negative examples.
|
30 |
- **Metrics:** The symbolic judge computes a range of evaluation metrics (detailed below).
|
31 |
+
|
32 |
**Note:** A local Prolog interpreter is required to execute validation programs.
|
33 |
+
|
34 |
---
|
35 |
|
36 |
### Inputs
|
VerifiableRewardsForScalableLogicalReasoning.py
CHANGED
@@ -100,13 +100,41 @@ Returns:
|
|
100 |
"""
|
101 |
|
102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5):
|
104 |
"""
|
105 |
Evaluates a predicted rule against the validation program using Prolog.
|
106 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
# Extract configuration
|
108 |
positive_pred = eval_config.get("positive_predicate", "eastbound")
|
109 |
negative_pred = eval_config.get("negative_predicate", "westbound")
|
|
|
|
|
|
|
|
|
|
|
110 |
# extract predicate from rule_to_evaluate
|
111 |
rule_to_evaluate = extract_ilp_from_text_v2(prediction)
|
112 |
if positive_pred not in rule_to_evaluate:
|
@@ -234,6 +262,7 @@ def extract_ilp_from_text_v2(text, target_predicates=None):
|
|
234 |
if not statement.endswith('.'):
|
235 |
statement += '.'
|
236 |
p_code += statement + '\n'
|
|
|
237 |
return p_code.strip() # Ensure no trailing whitespace
|
238 |
|
239 |
|
@@ -315,7 +344,7 @@ class VerifiableRewardsForScalableLogicalReasoning(evaluate.Metric):
|
|
315 |
eval_inputs.append((prediction, validation_program, eval_config))
|
316 |
|
317 |
# if more than 1k predictions, we use multiprocessing to speed up the evaluation
|
318 |
-
if len(eval_inputs) >
|
319 |
# Process evaluations in parallel
|
320 |
num_cpus = max(1, mp.cpu_count() - 1) # Leave one CPU free
|
321 |
with mp.Pool(processes=num_cpus) as pool:
|
|
|
100 |
"""
|
101 |
|
102 |
|
103 |
+
def validate_rule_no_hardcoded_cars(prediction):
|
104 |
+
"""Reject rules that hardcode specific car identifiers"""
|
105 |
+
import re
|
106 |
+
|
107 |
+
# Look for has_car with a constant (lowercase) in second position
|
108 |
+
hardcoded_pattern = r'has_car\([^,]+,\s*([a-z][a-z0-9_]*)\)'
|
109 |
+
matches = re.findall(hardcoded_pattern, prediction)
|
110 |
+
|
111 |
+
if matches:
|
112 |
+
return False, f"Rule contains ground cars: {matches[0]}"
|
113 |
+
|
114 |
+
return True, "Rule is valid"
|
115 |
+
|
116 |
+
|
117 |
def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5):
|
118 |
"""
|
119 |
Evaluates a predicted rule against the validation program using Prolog.
|
120 |
"""
|
121 |
+
is_valid, validation_msg = validate_rule_no_hardcoded_cars(prediction)
|
122 |
+
if not is_valid:
|
123 |
+
return {
|
124 |
+
"is_correct": False,
|
125 |
+
"partial_score": 0.0,
|
126 |
+
"syntax_valid": False,
|
127 |
+
"error": f"Rule validation failed: {validation_msg}"
|
128 |
+
}
|
129 |
+
|
130 |
# Extract configuration
|
131 |
positive_pred = eval_config.get("positive_predicate", "eastbound")
|
132 |
negative_pred = eval_config.get("negative_predicate", "westbound")
|
133 |
+
|
134 |
+
|
135 |
+
validation_program = anonymize_entities(validation_program)
|
136 |
+
|
137 |
+
|
138 |
# extract predicate from rule_to_evaluate
|
139 |
rule_to_evaluate = extract_ilp_from_text_v2(prediction)
|
140 |
if positive_pred not in rule_to_evaluate:
|
|
|
262 |
if not statement.endswith('.'):
|
263 |
statement += '.'
|
264 |
p_code += statement + '\n'
|
265 |
+
print(p_code)
|
266 |
return p_code.strip() # Ensure no trailing whitespace
|
267 |
|
268 |
|
|
|
344 |
eval_inputs.append((prediction, validation_program, eval_config))
|
345 |
|
346 |
# if more than 500 predictions, we use multiprocessing to speed up the evaluation
|
347 |
+
if len(eval_inputs) > 500:
|
348 |
# Process evaluations in parallel
|
349 |
num_cpus = max(1, mp.cpu_count() - 1) # Leave one CPU free
|
350 |
with mp.Pool(processes=num_cpus) as pool:
|