from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import gradio as gr
import torch
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
# Global cache and thread lock for thread-safe caching
CACHE_SIZE = 100
prediction_cache = {}
cache_lock = Lock()
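# Design note: the cache is a plain dict evicted first-in-first-out (dicts keep
# insertion order on Python 3.7+); the Lock keeps lookups and evictions
# consistent when Gradio serves concurrent requests from worker threads.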
def load_model(model_name):
    """
    Load a text-classification pipeline.

    With a GPU, the model is loaded in 8-bit (requires the bitsandbytes
    package) and placed automatically via device_map="auto". On CPU, the
    full-precision model is loaded instead.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if torch.cuda.is_available():
        # 8-bit quantization with automatic device placement. Because accelerate
        # dispatches the quantized model itself, the pipeline must not be given
        # an explicit device argument (moving an 8-bit model raises an error).
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, load_in_8bit=True, device_map="auto"
        )
        return pipeline("text-classification", model=model, tokenizer=tokenizer)
    # CPU fallback: full precision, no quantization.
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return pipeline("text-classification", model=model, tokenizer=tokenizer, device=-1)
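# Note: newer transformers releases prefer an explicit quantization config over
# the bare load_in_8bit flag. A minimal sketch of the equivalent call (an
# alternative, not what this Space originally used):
#
#   from transformers import BitsAndBytesConfig
#   model = AutoModelForSequenceClassification.from_pretrained(
#       model_name,
#       quantization_config=BitsAndBytesConfig(load_in_8bit=True),
#       device_map="auto",
#   )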
# Load both models concurrently at startup.
with ThreadPoolExecutor() as executor:
    sentiment_future = executor.submit(load_model, "cardiffnlp/twitter-roberta-base-sentiment")
    emotion_future = executor.submit(load_model, "bhadresh-savani/bert-base-uncased-emotion")
    sentiment_pipeline = sentiment_future.result()
    emotion_pipeline = emotion_future.result()
def analyze_text(text):
    # Check the cache first (thread-safe).
    with cache_lock:
        if text in prediction_cache:
            return prediction_cache[text]
    try:
        # Run both model inferences in parallel.
        with ThreadPoolExecutor() as executor:
            future_sentiment = executor.submit(sentiment_pipeline, text)
            future_emotion = executor.submit(emotion_pipeline, text)
            sentiment_result = future_sentiment.result()[0]
            emotion_result = future_emotion.result()[0]
        # Format the output with rounded scores.
        result = {
            "Sentiment": {sentiment_result['label']: round(sentiment_result['score'], 4)},
            "Emotion": {emotion_result['label']: round(emotion_result['score'], 4)}
        }
    except Exception as e:
        result = {"error": str(e)}
    # Update the cache, evicting the oldest entry once it is full.
    with cache_lock:
        if len(prediction_cache) >= CACHE_SIZE:
            prediction_cache.pop(next(iter(prediction_cache)))
        prediction_cache[text] = result
    return result
# Define the Gradio interface.
demo = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(placeholder="Enter your text here...", label="Input Text"),
    outputs=gr.JSON(label="Analysis Results"),
    title="🚀 Fast Sentiment & Emotion Analysis",
    description="An optimized application using quantized models (when available) and parallel processing for fast inference.",
    examples=[
        ["I'm thrilled to start this new adventure!"],
        ["This situation is making me really frustrated."],
        ["I feel so heartbroken and lost."]
    ],
    theme="soft",
    allow_flagging="never"
)
# Warm up the models to reduce first-call latency.
_ = analyze_text("Warming up models...")
if __name__ == "__main__":
    # On Spaces the app must listen on 0.0.0.0:7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
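# Example of calling the app programmatically once it is running — a sketch
# assuming the gradio_client package and a local launch on port 7860:
#
#   from gradio_client import Client
#   client = Client("http://localhost:7860")
#   print(client.predict("I'm thrilled to start this new adventure!", api_name="/predict"))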