File size: 3,038 Bytes
e07e824
6687c9a
 
 
 
 
 
 
870d31e
 
 
 
e07e824
6687c9a
 
 
 
 
e07e824
6687c9a
 
 
 
 
 
 
 
 
 
 
 
 
417c147
6687c9a
417c147
 
 
f5b14b4
ca3c933
f5b14b4
6687c9a
 
 
 
 
ba7dd0b
 
6687c9a
ba7dd0b
 
 
 
6687c9a
 
 
ba7dd0b
6687c9a
 
 
f227fd1
dbda7cf
 
6687c9a
 
 
 
 
f227fd1
 
 
 
 
6687c9a
 
 
 
 
 
06dcc43
6687c9a
 
 
 
 
 
f227fd1
c200368
6687c9a
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
import spacy
from tqdm import tqdm
import gc
import os

# Download the SpaCy model
# NOTE(review): shells out on every startup; a no-op once the model is
# installed, but the first run needs network access.
os.system("python -m spacy download en_core_web_lg")

# Load models
# Four pre-trained Keras classifiers, ensembled later in classify_question().
# Requires model_1.h5 .. model_4.h5 in the working directory.
model_1 = tf.keras.models.load_model("model_1.h5")
model_2 = tf.keras.models.load_model("model_2.h5")
model_3 = tf.keras.models.load_model("model_3.h5")
model_4 = tf.keras.models.load_model("model_4.h5")

# Load dictionaries
# word_dict: token text -> integer vocabulary index used by the models.
# lemma_dict: loaded here but not referenced elsewhere in this file's
# visible code — presumably kept for parity with the training pipeline.
with open('word_dict.pkl', 'rb') as f:
    word_dict = pickle.load(f)

with open('lemma_dict.pkl', 'rb') as f:
    lemma_dict = pickle.load(f)

# Load SpaCy NLP model
# Parser/NER/tagger are disabled for speed since only tokenization is needed.
# NOTE(review): with 'tagger' disabled, token.pos_ (read in preprocess_text)
# may not be populated, so punctuation filtering might silently not apply —
# confirm against the spaCy version in use.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
# Flag stop words on the vocabulary so token.is_stop works without the tagger.
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)

def preprocess_text(text):
    """Convert raw question text into a sequence of vocabulary indices.

    Tokenizes *text* with the module-level spaCy pipeline, drops punctuation
    tokens, and maps each remaining token to its index in ``word_dict``.
    Tokens absent from the vocabulary map to index 0 (the OOV index).

    Args:
        text: The question string to encode.

    Returns:
        list[int]: Word indices ready for padding and model input.
    """
    OOV_INDEX = 0
    # A single text does not need the batch-oriented nlp.pipe() API.
    doc = nlp(text)
    word_seq = []
    for token in doc:
        # Skip punctuation. NOTE(review): relies on token.pos_ being set,
        # which requires the tagger component — confirm it is enabled.
        if token.pos_ != "PUNCT":
            # Bug fix: the original permanently inserted every unseen token
            # into the shared global word_dict (an unbounded side effect).
            # A read-only lookup with an OOV fallback yields the same indices
            # without mutating module state.
            word_seq.append(word_dict.get(token.text, OOV_INDEX))
    return word_seq

def classify_question(text):
    """Classify a question as sincere or insincere with a 4-model ensemble.

    Each model produces a probability; the ensemble score is their weighted
    sum, and the label is decided by thresholding that score.

    Args:
        text: Raw question string entered by the user.

    Returns:
        tuple[str, dict]: The label ("Insincere" or "Sincere") and a dict
        containing the ensemble probability, each model's raw probability,
        and the encoded input sequence.
    """
    BATCH_SIZE = 512
    THRESHOLD = 0.35  # decision boundary — presumably tuned on validation data; TODO confirm
    # Ensemble weights (sum to 1.0); models 2 and 4 are weighted more heavily.
    weights = (0.15, 0.35, 0.15, 0.35)
    models = (model_1, model_2, model_3, model_4)

    # Preprocess the text and pad to the length the models were trained on.
    seq = preprocess_text(text)
    padded_seq = tf.keras.preprocessing.sequence.pad_sequences([seq], maxlen=55)  # Adjust maxlen if needed

    # Raw (unweighted) probability from each model.
    raw_preds = [
        float(np.squeeze(m.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2)))
        for m in models
    ]

    # Weighted ensemble score; same value the original computed as
    # 0.15*p1 + 0.35*p2 + 0.15*p3 + 0.35*p4.
    ensemble_pred = sum(w * p for w, p in zip(weights, raw_preds))
    label = "Insincere" if ensemble_pred > THRESHOLD else "Sincere"

    probs = {
        "Probability": float(ensemble_pred),
        # Bug fix: report each model's actual probability. The original put
        # the *weighted contributions* (e.g. 0.15 * p1) under this key,
        # which mislabels them as per-model probabilities.
        "Model Probabilities": {
            f"Model {i}": p for i, p in enumerate(raw_preds, start=1)
        },
        "Sequence": seq,
    }

    return label, probs

# Example questions shown beneath the input box for one-click demo use.
examples = [
    "How do you train a pigeon to send messages?",
    "Is USA a shithole country owing to a shithole president?",
    "Why is Indian educationa total bullshit?",
    "Which person has given the least f**ks and still turned out successful?"
]

# Build the Gradio UI: one free-text question in, a text label plus a JSON
# payload with the probability breakdown out.
question_box = gr.Textbox(lines=2, placeholder="Enter your question here...")

interface = gr.Interface(
    fn=classify_question,
    inputs=[question_box],
    outputs=["text", "json"],  # predicted label, then probability details
    title="Quora Insincere Questions Classifier",
    description="Enter your question to classify it as sincere or insincere. Select an example question below.",
    examples=examples,
)

interface.launch()