# Source: Hugging Face Space "PabloTJ/palindroms" (status: Running).
# File size: 5,157 bytes.
# Revisions contributing to this file (from the per-line blame column):
# 12a6276, 9d5d030, ad47898, 4963b4f, 4136261, ec2f5cd, 3fb2bff

import os
import re
import unicodedata

import gradio as gr
import numpy as np
import pandas as pd
from transformers import pipeline, set_seed

# Seed the random number generators through transformers' set_seed helper so
# that the sampled generations below (do_sample=True) are reproducible.
set_seed(42)

# Text-generation checkpoints to benchmark, given as Hugging Face Hub ids.
premium_models = [
    "mistralai/Mistral-7B-v0.1",
    "HuggingFaceH4/zephyr-7b-beta",
]

# Benchmark languages: two-letter code -> English display name.
# Covers English, German, Spanish, French and Portuguese.
languages = dict(
    en="English",
    de="German",
    es="Spanish",
    fr="French",
    pt="Portuguese",
)

# Checkpoints used as grammar evaluators.
# NOTE(review): both ids look like grammar-*correction* models; confirm they
# can actually answer a "rate from 0 to 100" style instruction.
grammar_model_names = [
    "vennify/t5-base-grammar-correction",
    "hassaanik/grammar-correction-model",
]

# Functions to load pipelines on demand.
def load_generation_pipeline(model_name):
    """Build a "text-generation" pipeline for ``model_name``.

    Any exception raised while building the pipeline is reported on stdout
    and turned into a ``None`` return value, so callers can skip the model
    instead of crashing.
    """
    generator = None
    try:
        generator = pipeline("text-generation", model=model_name)
    except Exception as err:
        # Best effort: report the failure and fall through to return None.
        print(f"Error loading generation model {model_name}: {err}")
    return generator

def load_grammar_pipeline(model_name):
    """Build a "text2text-generation" pipeline for ``model_name``.

    Any exception raised while building the pipeline is reported on stdout
    and turned into a ``None`` return value, so callers can skip the model
    instead of crashing.
    """
    rater = None
    try:
        rater = pipeline("text2text-generation", model=model_name)
    except Exception as err:
        # Best effort: report the failure and fall through to return None.
        print(f"Error loading grammar model {model_name}: {err}")
    return rater

# Load every grammar evaluator once at start-up. Checkpoints whose loader
# returned None (i.e. failed to load) are left out of the list.
rater_models = [
    rater
    for rater in map(load_grammar_pipeline, grammar_model_names)
    if rater is not None
]

def clean_text(text):
    """Reduce ``text`` to the characters that matter for a palindrome check.

    The text is case-folded, decomposed with Unicode NFKD, and every
    character that is not alphanumeric is dropped. Decomposition splits an
    accented letter into its base letter plus combining marks; the marks are
    not alphanumeric, so "É" and "e" compare equal and "ß" becomes "ss".

    The previous ASCII-only filter deleted accented letters outright, which
    produced wrong lengths and wrong palindrome verdicts for German, Spanish,
    French and Portuguese text. Pure ASCII input yields the same result as
    before.

    Args:
        text: The raw string to normalize.

    Returns:
        A string containing only case-folded letters and digits.
    """
    decomposed = unicodedata.normalize("NFKD", text.casefold())
    return "".join(ch for ch in decomposed if ch.isalnum())

def is_palindrome(text):
    """Return True when ``text`` reads the same in both directions.

    The comparison is made on the output of ``clean_text``, so whatever that
    helper strips or folds is ignored here as well.
    """
    normalized = clean_text(text)
    # Walk the string from both ends at once; every pair must match.
    return all(
        front == back
        for front, back in zip(normalized, reversed(normalized))
    )

# Updated prompt that instructs the model to output ONLY the palindrome.
def build_prompt(lang):
    """Return the generation prompt asking for one palindrome in ``lang``.

    The prompt ends with "Palindrome: " (including the trailing space) so the
    model's continuation is expected to be the palindrome itself.
    """
    prompt_lines = [
        f"Instruction: Generate a single original palindrome in {lang}.",
        "Output only the palindrome. The palindrome should be a continuous text that reads the same forward and backward.",
        "Do not output any additional text, commentary, or the prompt itself.",
        "Palindrome: ",
    ]
    return "\n".join(prompt_lines)

def grammar_prompt(pal, lang):
    """Return the rating prompt for palindrome ``pal`` written in ``lang``.

    The prompt asks for a bare 0-100 number and then shows the palindrome in
    double quotes on a line of its own.
    """
    template = (
        "Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
        "Return only a number with no explanation.\n\n"
        '"{pal}"\n'
    )
    return template.format(lang=lang, pal=pal)

def extract_score(text):
    """Pull a 0-100 score out of a rater's free-form reply.

    The first run of one to three digits in ``text`` is taken as the score
    and clamped to the range 0..100. A reply without any digit scores 0.
    """
    digit_runs = re.findall(r"\d{1,3}", text)
    if not digit_runs:
        return 0
    first_number = int(digit_runs[0])
    return max(0, min(first_number, 100))

# Main benchmark function that runs all tests at once and saves results automatically.
# Column order of the results table and of the CSV file.
_RESULT_COLUMNS = [
    "Model",
    "Language",
    "Palindrome",
    "Valid",
    "Length",
    "Grammar Score",
    "Final Score",
]


def _generate_candidate(gen_pipeline, prompt):
    """Run one generation and return ``(text, ok)``.

    ``ok`` is False when the pipeline raised; ``text`` then holds the error
    message so it can still be shown in the results table.
    """
    try:
        raw = gen_pipeline(
            prompt,
            max_new_tokens=100,
            do_sample=True,
            # Without this the pipeline returns prompt + continuation, and the
            # prompt would be counted as part of the palindrome.
            return_full_text=False,
        )[0]["generated_text"]
    except Exception as e:
        return f"Error generating text: {e}", False
    # Defensive: drop the prompt if the pipeline echoed it anyway.
    if raw.startswith(prompt):
        raw = raw[len(prompt):]
    return raw.strip(), True


def _average_grammar_score(text, lang):
    """Return the mean 0-100 grammar score over all loaded raters (0.0 if none)."""
    rprompt = grammar_prompt(text, lang)  # identical for every rater
    scores = []
    for rater in rater_models:
        try:
            rtext = rater(rprompt, max_new_tokens=10)[0]["generated_text"]
            scores.append(extract_score(rtext))
        except Exception as e:
            # A failing rater counts as a 0, but the failure is reported.
            print(f"Error rating text with grammar model: {e}")
            scores.append(0)
    return float(np.mean(scores)) if scores else 0.0


# Main benchmark function that runs all tests at once and saves results automatically.
def run_benchmark_all():
    """Benchmark every generation model in every language and save the results.

    For each (model, language) pair one palindrome candidate is generated,
    checked with ``is_palindrome``, measured with ``clean_text`` and rated by
    the loaded grammar raters. The final score is
    ``length * grammar_score / 100``, halved when the candidate is not a
    valid palindrome. Models that fail to load are skipped. A failed
    generation is kept as a row with the error message and a score of 0.
    An empty candidate is never counted as a valid palindrome.

    Side effects:
        Writes ``benchmark_results.csv`` to the current working directory and
        prints its absolute path.

    Returns:
        A tuple ``(gr.Dataframe, csv_path)``: the results sorted by
        "Final Score" in descending order, and the path of the CSV file.
    """
    results = []

    for model_name in premium_models:
        gen_pipeline = load_generation_pipeline(model_name)
        if gen_pipeline is None:
            continue

        for lang in languages.values():
            gen_output, generated = _generate_candidate(gen_pipeline, build_prompt(lang))

            if generated:
                cleaned_len = len(clean_text(gen_output))
                # An empty candidate trivially equals its reverse; do not
                # report it as a valid palindrome.
                valid = cleaned_len > 0 and is_palindrome(gen_output)
                avg_score = _average_grammar_score(gen_output, lang)
            else:
                # Do not score an error message as if it were a palindrome.
                valid, cleaned_len, avg_score = False, 0, 0.0

            weight = avg_score / 100
            if not valid:
                weight = weight * 0.5
            final_score = round(cleaned_len * weight, 2)

            results.append({
                "Model": model_name,
                "Language": lang,
                "Palindrome": gen_output,
                "Valid": "✅" if valid else "❌",
                "Length": cleaned_len,
                "Grammar Score": avg_score,
                "Final Score": final_score
            })

        # Drop the reference so this model can be reclaimed before the next
        # one is loaded.
        del gen_pipeline

    # Explicit columns keep the sort working even when no model produced a row.
    df = pd.DataFrame(results, columns=_RESULT_COLUMNS)
    df = df.sort_values(by="Final Score", ascending=False).reset_index(drop=True)

    # Automatically save results to a CSV file.
    csv_path = "benchmark_results.csv"
    df.to_csv(csv_path, index=False)
    print(f"CSV file saved to {os.path.abspath(csv_path)}")

    # Return both the DataFrame and the CSV file path for download.
    return gr.Dataframe(df), csv_path

# Build the Gradio UI using Blocks for a canvas layout.
# Layout: title and description, one button that triggers the full benchmark,
# then the results table and the downloadable CSV.
with gr.Blocks(title="Premium Model Palindrome Benchmark") as demo:
    gr.Markdown("# Premium Model Palindrome Benchmark")
    gr.Markdown("This benchmark runs automatically over 2 premium text-generation models across 5 languages (English, German, Spanish, French, Portuguese) and saves the results to a CSV file when done.")

    with gr.Row():
        run_button = gr.Button("Run All Benchmarks")

    output_table = gr.Dataframe(label="Benchmark Results")
    output_file = gr.File(label="Download CSV Results")

    # run_benchmark_all takes no inputs and returns (table, csv path), matching
    # the two output components in order.
    run_button.click(fn=run_benchmark_all, inputs=[], outputs=[output_table, output_file])

# Start the server only when this file is executed as a script, so importing
# the module (e.g. to reuse `demo` or the helpers) does not launch the app.
if __name__ == "__main__":
    demo.launch()