import gradio as gr
from transformers import pipeline, set_seed
import re
import numpy as np
import pandas as pd

# Set a seed for reproducibility
set_seed(42)

# Premium generation models (drawn from the Vellum AI leaderboard)
generation_model_names = [
    "mistralai/Mistral-7B-v0.1",
    "mistralai/Mixtral-8x7B-v0.1",
    "meta-llama/Llama-4-Scout",
    "meta-llama/Llama-4-Maverick",
    "Qwen/Qwen2.5-72B",
    "HuggingFaceH4/zephyr-7b-beta",
    "01-ai/Yi-34B",
    "deepseek-ai/deepseek-llm-67b-base",
    "HuggingFaceH4/zephyr-7b-alpha",
    "microsoft/Marcoroni-7B-v3",
]

# Cost-effective grammar evaluation models
grammar_model_names = [
    "vennify/t5-base-grammar-correction",
    "hassaanik/grammar-correction-model",
]


# Load a generation pipeline for the given model name.
def load_generation_pipeline(model_name):
    try:
        return pipeline("text-generation", model=model_name)
    except Exception as e:
        print(f"Error loading generation model {model_name}: {e}")
        return None


# Load a grammar evaluation pipeline (text2text-generation).
def load_grammar_pipeline(model_name):
    try:
        return pipeline("text2text-generation", model=model_name)
    except Exception as e:
        print(f"Error loading grammar model {model_name}: {e}")
        return None


# Pre-load the grammar evaluator models (assumed to be cost-effective and stable)
rater_models = []
for model_name in grammar_model_names:
    p = load_grammar_pipeline(model_name)
    if p is not None:
        rater_models.append(p)

# Languages to benchmark
languages = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "pt": "Portuguese",
    "ru": "Russian",
    "ar": "Arabic",
    "hi": "Hindi",
    "ja": "Japanese",
}


def clean_text(text):
    # Keep only letters and digits. isalnum() is Unicode-aware, so non-Latin
    # scripts (Russian, Arabic, Hindi, Japanese, ...) are not stripped out.
    return "".join(ch for ch in text.lower() if ch.isalnum())


def is_palindrome(text):
    cleaned = clean_text(text)
    # An empty cleaned string does not count as a valid palindrome.
    return len(cleaned) > 0 and cleaned == cleaned[::-1]


def grammar_prompt(pal, lang):
    return (
        f"Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
        f'Only return a number with no explanation:\n\n"{pal}"\n'
    )


def extract_score(text):
    # Take the first 1-3 digit number in the response and clamp it to [0, 100].
    match = re.search(r"\d{1,3}", text)
    if match:
        score = int(match.group())
        return min(max(score, 0), 100)
    return 0


def run_benchmark(selected_model):
    # Load the selected premium generation pipeline
    gen_model = load_generation_pipeline(selected_model)
    if gen_model is None:
        # Return a one-row table so the Dataframe output can display the error.
        return pd.DataFrame([{"Error": f"Could not load generation model {selected_model}."}])

    results = []
    for lang in languages.values():
        prompt = (
            f"Write the longest original palindrome you can in {lang}. "
            "It should be creative and not a known palindrome. "
            "If it is not a correct palindrome, you will lose points according to how correct it is."
        )
        try:
            # return_full_text=False keeps only the model's continuation, not the prompt.
            gen_output = gen_model(
                prompt, max_new_tokens=100, do_sample=True, return_full_text=False
            )[0]['generated_text'].strip()
        except Exception as e:
            gen_output = f"Error generating text: {e}"

        valid = is_palindrome(gen_output)
        cleaned_len = len(clean_text(gen_output))

        scores = []
        for rater in rater_models:
            rprompt = grammar_prompt(gen_output, lang)
            try:
                # The text2text evaluator is expected to return a number (0-100).
                rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
                scores.append(extract_score(rtext))
            except Exception:
                scores.append(0)

        avg_score = np.mean(scores) if scores else 0
        # Valid palindromes keep the full grammar weight; invalid ones keep only half.
        penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
        final_score = round(cleaned_len * penalty, 2)

        results.append({
            "Language": lang,
            "Palindrome": gen_output,
            "Valid": "✅" if valid else "❌",
            "Length": cleaned_len,
            "Grammar Score": avg_score,
            "Final Score": final_score,
        })

    df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
    return df


# Build the Gradio UI using Blocks
with gr.Blocks(title="LLM Palindrome Benchmark - Premium Generation Models") as demo:
    gr.Markdown("# LLM Palindrome Benchmark")
    gr.Markdown(
        "Select one of the premium generation models below (for non-commercial, "
        "educational usage) and run the benchmark."
    )
    with gr.Row():
        model_dropdown = gr.Dropdown(choices=generation_model_names, label="Select Premium Generation Model")
        run_button = gr.Button("Run Benchmark")
    output_table = gr.Dataframe(label="Benchmark Results")
    run_button.click(fn=run_benchmark, inputs=model_dropdown, outputs=output_table)

if __name__ == "__main__":
    demo.launch()
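

# Optional sanity check for the scoring helpers (a sketch, not part of the app
# flow above; `quick_score_check` is a name introduced here purely for
# illustration). It exercises clean_text / is_palindrome / extract_score on a
# few hard-coded strings so the scoring logic can be verified without loading
# any model.
def quick_score_check():
    samples = ["A man, a plan, a canal: Panama", "Not a palindrome", ""]
    for s in samples:
        print(f"{s!r}: cleaned={clean_text(s)!r} palindrome={is_palindrome(s)}")
    print("extract_score('Score: 87/100') ->", extract_score("Score: 87/100"))
    print("extract_score('no digits here') ->", extract_score("no digits here"))

# Example (from a Python session, after importing this module):
#   quick_score_check()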