import gradio as gr
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# ------------------------------
# Sentiment classification models
# ------------------------------
classifier_1 = pipeline(
    "text-classification",
    model=AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest"),
    tokenizer=AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest"),
    device=0 if torch.cuda.is_available() else -1
)

classifier_2 = pipeline(
    "text-classification",
    model=AutoModelForSequenceClassification.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis"),
    tokenizer=AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis"),
    device=0 if torch.cuda.is_available() else -1
)

# ------------------------------
# Judge (text2text model)
# ------------------------------
arbitro = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device=0 if torch.cuda.is_available() else -1
)

# ------------------------------
# Classification helpers
# ------------------------------
def classifier_1_predict(text):
    # cardiffnlp model returns one of: negative, neutral, positive
    result = classifier_1(text)[0]['label']
    return result

def classifier_2_predict(text):
    # bertweet model returns one of: NEG, NEU, POS
    result = classifier_2(text)[0]['label']
    return result

# ------------------------------
# The judge decides which model was right and reports the weights it used
# ------------------------------
def judge_sentiment(text, result_1, result_2):
    prompt = (
        f"Sentence: \"{text}\"\n"
        f"Model A prediction: {result_1} (uses labels: negative, neutral, positive)\n"
        f"Model B prediction: {result_2} (uses labels: neg, neu, pos)\n\n"
        "Interpret the real sentiment expressed in the sentence.\n"
        "Judge whether each prediction is good or bad, and explain the parameters (weights) that influenced your decision.\n\n"
        "Your judgment criteria (weights from 0 to 1):\n"
        "- Semantic match (meaning alignment with the sentence)\n"
        "- Tone match (emotional consistency)\n"
        "- Label accuracy (correct label among known sentiment labels)\n\n"
        "Respond ONLY in this format:\n"
        "Model A: good | Model B: bad\n"
        "Weights used:\n"
        "- Semantic match: 0.4\n"
        "- Tone match: 0.4\n"
        "- Label accuracy: 0.2\n"
        "Explanation: [your reasoning here]"
    )
    output = arbitro(prompt, max_new_tokens=150)[0]['generated_text'].strip()
    return f"Model A: {result_1} | Model B: {result_2}\n🤖 Judge:\n{output}"

# ------------------------------
# Main pipeline
# ------------------------------
def process_input(text):
    result_1 = classifier_1_predict(text)
    result_2 = classifier_2_predict(text)
    decision = judge_sentiment(text, result_1, result_2)
    return decision

iface = gr.Interface(
    fn=process_input,
    inputs="text",
    outputs="text",
    title="AI Sentiment Duel: Sentiment Classifier",
    description=(
        "Compare two AIs on the task of identifying the sentiment of a sentence. "
        "A third AI, the **judge**, decides which answer is more accurate and now **explains the criteria (weights) used to judge**.\n\n"
        "**The judge's criteria:**\n"
        "- **Semantic match**: how well the answer matches the overall meaning of the sentence.\n"
        "- **Tone match**: how well the answer matches the emotional tone.\n"
        "- **Label accuracy**: whether the label is among the most appropriate ones.\n\n"
        "⚠️ Best results with English sentences.\n\n"
        "💡 Examples:\n"
        "- 'I absolutely loved the movie!'\n"
        "- 'Not bad, but could be better.'"
    )
)

iface.launch(share=True)
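
# ------------------------------
# Quick sanity check (sketch): how process_input() could be called directly,
# without launching the Gradio UI. Kept commented out so the script's behavior
# is unchanged. The judge's text comes from flan-t5-large's generation, so the
# output shown below is illustrative only and not guaranteed to follow the
# requested format.
# ------------------------------
# print(process_input("I absolutely loved the movie!"))
# Illustrative shape of the result:
#   Model A: positive | Model B: POS
#   🤖 Judge:
#   Model A: good | Model B: good
#   Weights used: ...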