import evaluate
import gradio as gr
import os
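

# Gradio front end for the Symbolic Judge metric: a single-example evaluation form
# wired to `module.compute()`, plus a Documentation tab that renders the metric's README.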
def create_interface(module):
    def evaluate_fn(prediction, references, pos_pred, neg_pred):
        # Reject empty inputs early; the message is shown in the "Syntax Details" box.
        if not prediction or prediction.strip() == "":
            return "", "", "", "Please provide a candidate hypothesis to evaluate."
        if not references or references.strip() == "":
            return "", "", "", "Please provide a validation program."
        if not pos_pred or pos_pred.strip() == "":
            return "", "", "", "Please specify the positive predicate name."
        if not neg_pred or neg_pred.strip() == "":
            return "", "", "", "Please specify the negative predicate name."

        pred = prediction.strip()

        # Single reference in the format expected by the metric: the validation program
        # plus the predicate names that mark positive and negative examples.
        ref = {
            "validation_program": references.strip(),
            "evaluation_config": {
                "positive_predicate": pos_pred,
                "negative_predicate": neg_pred
            }
        }

        results = module.compute(predictions=[pred], references=[ref])

        # Surface any syntax or execution error reported for this prediction.
        error_msg = ""
        if results["detailed_results"]:
            error = results["detailed_results"][0].get("error")
            if error:
                error_msg = error

        return (
            f"Accuracy score: {results['accuracy']:.4f}",
            f"Partial score: {results['partial_score']:.4f}",
            f"Syntax score: {results['syntax_score']:.4f}",
            error_msg
        )

    def load_example(example):
        return (
            example["rule"],
            example["validation"],
            example["pos_pred"],
            example["neg_pred"]
        )

    # Render only the "Metric Card" section of the README in the Documentation tab.
    readme_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md")
    with open(readme_path, 'r') as f:
        readme_content = f.read()
    readme_content = '# Metric Card ' + readme_content.split('# Metric Card ')[1]

    example_train = {
        "description": "Basic Train Problem",
        "validation": """eastbound(train0).
has_car(train0, car0_1).
car_num(car0_1, 1).
car_color(car0_1, white).
car_len(car0_1, short).
has_wall(car0_1, full).

westbound(train1).
has_car(train1, car1_1).
car_num(car1_1, 1).
car_color(car1_1, yellow).
car_len(car1_1, short).
has_wall(car1_1, full).
""",
        "rule": "eastbound(Train):- has_car(Train, Car1), car_color(Car1, white).",
        "pos_pred": "eastbound",
        "neg_pred": "westbound"
    }

    example_family = {
        "description": "Family Relationships",
        "validation": """% Custom problem
parent(john, mary).
parent(john, bob).
parent(alice, bob).
parent(susan, alice).

% Examples
grandparent(susan, bob).
not_grandparent(john, alice).""",
        "rule": "grandparent(X, Y) :- parent(X, Z), parent(Z, Y).",
        "pos_pred": "grandparent",
        "neg_pred": "not_grandparent"
    }

    with gr.Blocks(title="Symbolic Judge") as demo:
        with gr.Tab("Evaluation"):
            gr.Markdown("# Symbolic Judge: Verifiable Rewards for Scalable Logical Reasoning")
            gr.Markdown("""
Verifiable Rewards for Scalable Logical Reasoning (**SLR**) provides verifiable rewards via logic program execution.
It deterministically evaluates candidate hypotheses by executing them against the validation program and verifying that all positive examples ($E^+$) are entailed and all negative examples ($E^-$) are not entailed.
All evaluations are fully verifiable and grounded in formal logic, ensuring an automatic, transparent, and reproducible standard for evaluation and reward in both supervised and reinforcement learning settings.

How it Works:
- Input: A candidate hypothesis (logic rule) and an executable validation program containing background knowledge and examples.
- Execution: The candidate rule is executed against the validation program using a Prolog interpreter.
- Correctness Criteria: The rule is considered correct if it entails all positive examples and rejects all negative examples.
- Metrics: We provide a range of evaluation metrics (detailed below).
- Usage: see the **Documentation tab** for details on how to use Verifiable Rewards for Scalable Logical Reasoning in your own projects.

**Note:** A local Prolog interpreter is required to execute validation programs.
""")
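
            # A minimal sketch of the same evaluation done programmatically with the
            # `evaluate` Python API (it mirrors `evaluate_fn` above; the candidate rule
            # and Prolog facts are illustrative only):
            #
            #   judge = evaluate.load("LG-Anonym/VerifiableRewardsForScalableLogicalReasoning")
            #   results = judge.compute(
            #       predictions=["eastbound(T) :- has_car(T, C), car_color(C, white)."],
            #       references=[{
            #           "validation_program": "eastbound(train0).\nhas_car(train0, car0_1).\ncar_color(car0_1, white).\nwestbound(train1).\nhas_car(train1, car1_1).\ncar_color(car1_1, yellow).",
            #           "evaluation_config": {"positive_predicate": "eastbound",
            #                                 "negative_predicate": "westbound"},
            #       }],
            #   )
            #   # -> results["accuracy"], results["partial_score"], results["syntax_score"], results["detailed_results"]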

            with gr.Row():
                with gr.Column(scale=1):
                    with gr.Group():
                        gr.Markdown("### Model Output")
                        prediction_input = gr.Textbox(
                            label="Candidate hypothesis to be evaluated (the rule predicted by the model)",
                            placeholder="eastbound(T) :- has_car(T, C), short(C), open(C).",
                            lines=5
                        )

                    with gr.Group():
                        gr.Markdown("### Validation Program")
                        references_input = gr.Textbox(
                            label="The validation program contains background knowledge and examples for testing",
                            placeholder="% Background knowledge\ncar(car_1). car(car_2).\nlong(car_2). short(car_1).\nopen(car_1). closed(car_2).\n\n% Examples\neastbound(train_1).\nwestbound(train_2).\n\n% Train configurations\nhas_car(train_1, car_1).\nhas_car(train_2, car_2).",
                            lines=12
                        )

                    with gr.Row():
                        pos_pred_input = gr.Textbox(
                            label="Positive Validation Examples",
                            value="eastbound",
                            placeholder="eastbound",
                            info="The predicate name identifying positive examples in the validation program"
                        )
                        neg_pred_input = gr.Textbox(
                            label="Negative Validation Examples",
                            value="westbound",
                            placeholder="westbound",
                            info="The predicate name identifying negative examples in the validation program"
                        )

                    eval_button = gr.Button("Evaluate", variant="primary")

                with gr.Column(scale=1):
                    with gr.Group():
                        gr.Markdown("### Evaluation Metrics")
                        with gr.Group():
                            accuracy_output = gr.Textbox(
                                label="Overall Accuracy",
                                info="Proportion of hypotheses that fully solve their task",
                                container=True
                            )
                            partial_score_output = gr.Textbox(
                                label="Partial Score",
                                info="Proportion of validation examples that are correctly classified",
                                container=True
                            )
                            syntax_score_output = gr.Textbox(
                                label="Syntax Score",
                                info="Proportion of syntactically valid hypotheses",
                                container=True
                            )
                            error_output = gr.Textbox(
                                label="Syntax Details",
                                info="Error messages for syntax errors or execution failures",
                                container=True
                            )
                        gr.Markdown("Note: this interface evaluates a single hypothesis at a time; use the Python API for batch processing.")
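
                        # Batch evaluation via the Python API is a list-in/list-out call on the
                        # same metric (a sketch; `preds` and `refs` are hypothetical lists built
                        # in the format shown in `evaluate_fn`):
                        #
                        #   preds = ["rule_for_task_1 ...", "rule_for_task_2 ..."]
                        #   refs = [ref_for_task_1, ref_for_task_2]
                        #   batch_results = module.compute(predictions=preds, references=refs)
                        #   # aggregate scores plus one entry per task in batch_results["detailed_results"]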

            examples = [
                ["Train Problem", example_train],
                ["Family Relationships", example_family]
            ]

            with gr.Accordion("Example Logical Reasoning Tasks", open=True):
                example_radio = gr.Radio([ex[0] for ex in examples], label="Select an example", value="Train Problem")

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Selected Example Preview")
                        example_description = gr.Markdown("**Description**: " + example_train["description"])
                        with gr.Row():
                            with gr.Column():
                                gr.Markdown("#### Candidate Hypothesis")
                                example_rule = gr.Code(value=example_train["rule"])
                            with gr.Column():
                                gr.Markdown("#### Validation Program")
                                example_validation = gr.Code(value=example_train["validation"])

                        with gr.Row():
                            with gr.Column():
                                gr.Markdown("#### Validation Examples")
                                example_predicates = gr.Markdown(f"""
**Positive Examples**: `{example_train["pos_pred"]}`

**Negative Examples**: `{example_train["neg_pred"]}`
""")

            load_button = gr.Button("Load Selected Example", variant="secondary")

            gr.Markdown("### Citation")
            gr.Markdown("""
If you use Symbolic Judge in your work, please cite:
```
@misc{anonymous2025slr,
  title={Verifiable Rewards for Scalable Logical Reasoning},
  author={Anonymous},
  year={2025},
}
```
""")
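
            # Wire up the interactivity: preview updates for the selected example,
            # loading an example into the inputs, and running the evaluation.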
            def update_example_preview(selection):
                selected_example = next((ex[1] for ex in examples if ex[0] == selection), example_train)
                return (
                    "**Description**: " + selected_example["description"],
                    selected_example["rule"],
                    selected_example["validation"],
                    f"""
**Positive Examples**: `{selected_example["pos_pred"]}`

**Negative Examples**: `{selected_example["neg_pred"]}`
"""
                )

            example_radio.change(
                fn=update_example_preview,
                inputs=[example_radio],
                outputs=[example_description, example_rule, example_validation, example_predicates]
            )

            def load_selected_example(selection):
                selected_example = next((ex[1] for ex in examples if ex[0] == selection), example_train)
                return load_example(selected_example)

            load_button.click(
                fn=load_selected_example,
                inputs=[example_radio],
                outputs=[prediction_input, references_input, pos_pred_input, neg_pred_input]
            )

            eval_button.click(
                fn=evaluate_fn,
                inputs=[prediction_input, references_input, pos_pred_input, neg_pred_input],
                outputs=[accuracy_output, partial_score_output, syntax_score_output, error_output]
            )

        with gr.Tab("Documentation"):
            gr.Markdown(readme_content)

    return demo
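

# Load the Symbolic Judge metric from the Hugging Face Hub and launch the demo.
# Executing validation programs requires a local Prolog interpreter (e.g. SWI-Prolog).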
module = evaluate.load("LG-Anonym/VerifiableRewardsForScalableLogicalReasoning")
create_interface(module).launch()