# Hugging Face Space: rbiswasfc/single-fact-curation (status: Sleeping)
# File size: 7,104 Bytes

import json
import os
import random
from collections import Counter

from datasets import Dataset, load_dataset
from fasthtml.common import *
from fastlite import database
from huggingface_hub import create_repo, login

# Authenticate against the Hugging Face Hub with the token from the environment.
login(token=os.environ.get("HF_TOKEN"))

# Load the source dataset once at import time and tag every record with a
# position-based ``example_id`` so curation decisions can refer back to it.
fact_dataset = [
    {"example_id": i, **example}
    for i, example in enumerate(
        load_dataset("griffin/iclr2025_data_scores", split="train").to_list()
    )
]

# SQLite does not create missing parent directories, so make sure the folder
# holding the database file exists before opening it.
os.makedirs("data", exist_ok=True)
db = database("data/examples.db")
examples = db.t.examples
# Create the table that stores curation decisions if it is not there yet.
if examples not in db.t:
    examples.create(
        id=int,
        example_id=int,
        question_type=str,
        question=str,
        answer=str,
        decision=str,
        pk="id",
    )

# Ids of examples that already have a stored decision; they are never served again.
evaluated_ids = {row["example_id"] for row in examples.rows}
# Distinct question types in a stable (sorted) order for dropdowns and tables.
question_types = sorted({ex["question_type"] for ex in fact_dataset})


def get_stats():
    """Count total and curated examples for every question type.

    Returns:
        A dict keyed by question type (in ``question_types`` order). Each
        value is a dict with ``"total"`` (records in ``fact_dataset``) and
        ``"curated"`` (rows in the ``examples`` table); a type without
        curated rows reports 0.
    """
    totals = Counter()
    for record in fact_dataset:
        totals[record["question_type"]] += 1

    curated = Counter()
    for row in examples.rows:
        curated[row["question_type"]] += 1

    summary = {}
    for qtype in question_types:
        summary[qtype] = {"total": totals[qtype], "curated": curated[qtype]}
    return summary


def get_example(selected_type=None):
    """Pick a random example that has not been evaluated yet.

    Args:
        selected_type: When truthy, only examples whose ``question_type``
            equals this value are considered; otherwise all types qualify.

    Returns:
        A dict restricted to the display fields listed below (fields missing
        from the record are skipped), or ``None`` when no candidate is left.
    """
    display_fields = (
        "example_id",
        "question_type",
        "question",
        "rationale",
        "answer",
        "log_ll",
        "oracle_log_ll",
        "oracle_advantage",
        "prediction",
        "prediction_oracle",
        "accuracy",
        "accuracy_oracle",
        "accuracy_status",
    )

    candidates = []
    for record in fact_dataset:
        # Skip anything that already has a stored decision.
        if record["example_id"] in evaluated_ids:
            continue
        # Apply the optional type filter.
        if selected_type and record["question_type"] != selected_type:
            continue
        candidates.append(record)

    if not candidates:
        return None

    chosen = random.choice(candidates)
    return {field: chosen[field] for field in display_fields if field in chosen}


# app
# Dark-theme stylesheet injected into every page through ``hdrs``.
# The example wrapper rendered by the "/" route is identified by
# id="example-container" (it has no class), so the margin rule targets the id
# as well as the class; the class-only selector matched nothing.
style = Style("""
            body { background-color: #1e1e1e; color: #d4d4d4; font-family: Arial, sans-serif; }
            h1, h2, h3 { color: #61dafb; }
            .example-container, #example-container { margin-top: 20px; }
            .example-table { border-collapse: collapse; width: 100%; }
            .example-table th, .example-table td { border: 1px solid #3a3a3a; padding: 8px; text-align: left; }
            .example-table th { background-color: #2a2a2a; color: #61dafb; }
            .example-table td { color: #d4d4d4; }
            #evaluation-form { margin-top: 20px; }
            #evaluation-form button { margin-right: 10px; background-color: #0e639c; color: white; border: none; padding: 10px 20px; cursor: pointer; }
            #evaluation-form button:hover { background-color: #1177bb; }
            select { background-color: #2a2a2a; color: #d4d4d4; border: 1px solid #3a3a3a; padding: 5px; }
            a { color: #61dafb; text-decoration: none; }
            a:hover { text-decoration: underline; }
        """)

# ``rt`` is the route decorator used by the handlers below.
app, rt = fast_app(live=True, hdrs=(style,))


def render_stats(stats):
    """Render per-type curation progress as an HTML table.

    Args:
        stats: Mapping of question type to a dict with ``"total"`` and
            ``"curated"`` counts (the shape returned by ``get_stats``). It
            must contain an entry for every item of ``question_types``.

    Returns:
        A ``Table`` with class ``stats-table``: one header row plus one row
        per question type showing the curated count, its share of the total
        as a percentage, and the total.
    """
    header_row = Tr(Th("Question Type"), Th("Curated"), Th("Total"))

    body_rows = []
    for qt in question_types:
        curated = stats[qt]["curated"]
        total = stats[qt]["total"]
        body_rows.append(
            Tr(
                Td(qt),
                Td(f"{curated} ({curated / total:.1%})"),
                Td(total),
            )
        )

    return Table(header_row, *body_rows, cls="stats-table")


def render_example(example):
    """Render one example as a two-column key/value table.

    Args:
        example: Mapping of field name to value; values are shown via ``str``.

    Returns:
        A ``Div`` with id ``example-details`` holding a heading and a table
        (class ``example-table``) with one row per field, in mapping order.
    """
    field_rows = []
    for field, content in example.items():
        field_rows.append(Tr(Th(field), Td(str(content))))

    details_table = Table(*field_rows, cls="example-table")
    return Div(H3("Example Details"), details_table, id="example-details")


def upload_to_hf():
    """Push every row of the local ``examples`` table to the Hub.

    Ensures the private dataset repo exists (``exist_ok=True`` makes this a
    no-op when it already does), then uploads all stored rows as a dataset.
    """
    repo_id = "rbiswasfc/iclr-eval-examples"
    hf_token = os.environ.get("HF_TOKEN")

    create_repo(
        repo_id=repo_id,
        token=hf_token,
        private=True,
        repo_type="dataset",
        exist_ok=True,
    )

    # Calling the table object returns its rows.
    stored_rows = db.t.examples()
    Dataset.from_list(stored_rows).push_to_hub(repo_id, token=hf_token)


@rt("/")
async def get(question_type: str = None):
    """Serve the main curation page.

    Args:
        question_type: Optional question type used to filter the example
            that is offered for review.

    Returns:
        The titled page with a question-type dropdown, then either an example
        with Good/Bad buttons or, when nothing is left to review, a notice
        with the curation statistics, followed by a link to the stats page.
    """
    stats = get_stats()
    example = get_example(question_type)

    type_options = [
        Option(qt, value=qt, selected=qt == question_type) for qt in question_types
    ]
    # Changing the selection reloads the whole body for the chosen type.
    dropdown = Select(
        Option("Question Types", value="", selected=question_type is None),
        *type_options,
        name="question_type",
        hx_get="/",
        hx_target="body",
        hx_push_url="true",
    )

    def decision_button(label, value):
        # Both buttons post to /evaluate and swap the example details.
        return Button(
            label,
            name="decision",
            value=value,
            hx_post="/evaluate",
            hx_target="#example-details",
        )

    if example is not None:
        evaluation_form = Form(
            decision_button("Good Example", "good"),
            decision_button("Bad Example", "bad"),
            # Carries the displayed example to /evaluate as JSON.
            Hidden(name="example", value=json.dumps(example)),
            id="evaluation-form",
        )
        content = Div(
            H2("Example"),
            Div(
                render_example(example),
                evaluation_form,
                id="example-container",
            ),
        )
    else:
        content = Div(
            H2("All examples of this type have been evaluated!"), render_stats(stats)
        )

    stats_link = A("Curation Stats", href="/stats", cls="view-stats-link")

    return Titled(
        "Example Curation",
        H2("Question Type Selection"),
        dropdown,
        content,
        stats_link,
    )


@rt("/stats")
async def get():
    """Serve the statistics page: the curation table plus a link back home."""
    stats_table = render_stats(get_stats())
    back_link = A("Back to Curation", href="/", cls="back-link")

    return Titled(
        "Curation Statistics",
        Div(stats_table, back_link, cls="container"),
    )


@rt("/evaluate")
async def post(decision: str, example: str):
    """Store a curation decision and return the next example to review.

    Args:
        decision: Value of the clicked button (the main page sends ``"good"``
            or ``"bad"``).
        example: JSON-encoded example taken from the form's hidden field.

    Returns:
        A notice when no example of the same question type is left;
        otherwise the rendered next example together with an out-of-band
        replacement for the form's hidden ``example`` field.
    """
    example_dict = json.loads(example)
    example_id = example_dict["example_id"]

    # A stale form or a double click can resubmit an example that is already
    # stored; record every example at most once.
    if example_id not in evaluated_ids:
        # ``id`` is left out so SQLite assigns the integer primary key itself.
        # Deriving it from the current row count collides after a delete or
        # when two requests interleave.
        examples.insert(
            {
                "example_id": example_id,
                "question_type": example_dict["question_type"],
                "question": example_dict["question"],
                "answer": example_dict["answer"],
                "decision": decision,
            }
        )
        # Mark the example as evaluated before the network upload so a failed
        # upload cannot cause it to be served (and stored) a second time.
        evaluated_ids.add(example_id)
        upload_to_hf()

    # Get a new example of the same question type
    new_example = get_example(example_dict["question_type"])

    if new_example is None:
        return Div(H2("All examples of this type have been evaluated!"))

    # The buttons only swap #example-details, which leaves the form's hidden
    # ``example`` field pointing at the example that was just evaluated.
    # Replace that field out-of-band so the next click submits the example
    # that is actually on screen.
    refreshed_payload = Hidden(
        name="example",
        value=json.dumps(new_example),
        hx_swap_oob="outerHTML:#evaluation-form input[name=example]",
    )
    return render_example(new_example), refreshed_payload


# serve()
if __name__ == "__main__":
    import uvicorn

    # setup_hf_backup(app)
    # Listen on all interfaces; the port comes from $PORT and defaults to 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)