import gradio as gr  # Import Gradio for creating web interfaces
import torch  # Import PyTorch for deep learning
from PIL import Image  # Import PIL for image processing
from transformers import pipeline, CLIPProcessor, CLIPModel  # Import necessary classes from Hugging Face Transformers
import requests  # Import requests for making HTTP requests
from bs4 import BeautifulSoup  # Import BeautifulSoup for web scraping
from gtts import gTTS  # Import gTTS for text-to-speech conversion
# Define the device to use (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the BLIP model for image captioning
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
# Load the CLIP model for image classification
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
# Load the English summarization model
summarization_pipeline = pipeline("summarization", model="google/pegasus-xsum")
# Load the Arabic summarization model
arabic_summarization_pipeline = pipeline("summarization", model="abdalrahmanshahrour/auto-arabic-summarization")
# Load the translation model
translation_pipeline = pipeline("translation", model="facebook/nllb-200-distilled-600M")
# Function to fetch long texts from Wikipedia
def get_wikipedia_summary(landmark_name, language='en'):
    url = f"https://{language}.wikipedia.org/wiki/{landmark_name.replace(' ', '_')}"  # Construct the article URL
    response = requests.get(url)  # Make an HTTP GET request to fetch the page
    soup = BeautifulSoup(response.content, 'html.parser')  # Parse the HTML content with BeautifulSoup
    paragraphs = soup.find_all('p')  # Extract all paragraph elements
    summary_text = ' '.join([para.get_text() for para in paragraphs if para.get_text()])  # Join text from all paragraphs
    return summary_text[:2000]  # Return the first 2000 characters of the summary
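# For illustration, a call such as get_wikipedia_summary("Eiffel Tower") scrapes
# https://en.wikipedia.org/wiki/Eiffel_Tower and returns roughly the opening
# paragraphs of the article, truncated to 2000 characters.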
# Function to load landmarks from an external file
def load_landmarks(filename):
    landmarks = {}
    with open(filename, 'r', encoding='utf-8') as file:  # Open the file in read mode
        for line in file:
            if line.strip():
                english_name, arabic_name = line.strip().split('|')  # Split by the delimiter
                landmarks[english_name] = arabic_name  # Add to the dictionary
    return landmarks  # Return the dictionary of landmarks
# Load landmarks from the file
landmarks_dict = load_landmarks("landmarks.txt")
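# landmarks.txt is expected to hold one "English name|Arabic name" pair per line.
# The entries below are illustrative, not the Space's actual file contents:
#   Eiffel Tower|برج إيفل
#   Great Wall of China|سور الصين العظيم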
# Function to convert text to speech
def text_to_speech(text, language='en'):
    tts = gTTS(text=text, lang=language)  # Create a gTTS object for text-to-speech
    audio_file = "summary.mp3"  # Define the audio file name
    tts.save(audio_file)  # Save the audio file
    return audio_file  # Return the path to the audio file
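# Note: gTTS calls Google's text-to-speech service over the network, and every
# request writes to the same summary.mp3, so only the most recent summary is kept.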
# Function to generate a caption for the image
def generate_caption(image):
    return caption_image(image)[0]['generated_text']  # Get the generated caption from the model
# Function to classify the image using the CLIP model
def classify_image(image, labels):
    inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True).to(device)  # Prepare inputs and move them to the same device as the model
    with torch.no_grad():  # No gradients are needed for inference
        outputs = clip_model(**inputs)  # Get model outputs
    logits_per_image = outputs.logits_per_image  # Get image-to-text similarity logits
    probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]  # Convert logits to probabilities
    top_label = labels[probs.argmax()]  # Get the label with the highest probability
    top_prob = probs.max()  # Get the highest probability value
    return top_label, top_prob  # Return the top label and its probability
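# CLIP performs zero-shot classification here: it embeds the image and every
# landmark name into a shared space, and logits_per_image holds the resulting
# similarity scores. For example (hypothetical values),
# classify_image(img, ["Eiffel Tower", "Taj Mahal"]) might return ("Eiffel Tower", 0.97).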
# Function to summarize the description
def summarize_description(full_description, language):
    if language == 'ar':
        return arabic_summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text']  # Summarize in Arabic
    else:
        return summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text']  # Summarize in English
# Function to translate the caption and build the classification result
def translate_results(caption, top_label, top_prob, landmarks_dict, language):
    if language == 'ar':
        caption_translated = translation_pipeline(caption, src_lang='eng_Latn', tgt_lang='arb_Arab')[0]['translation_text']  # Translate the caption to Arabic
        classification_result = f"أفضل مطابقة: {landmarks_dict[top_label]} باحتمالية {top_prob:.4f}"  # Build the result directly in Arabic ("Best match: ... with probability ..."); no translation needed since the Arabic name comes from landmarks_dict
    else:
        caption_translated = caption  # Keep the caption in English
        classification_result = f"Best match: {top_label} with probability {top_prob:.4f}"  # Create the English classification result
    return caption_translated, classification_result  # Return the translated results
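# For example (hypothetical output), translate_results("a large iron tower",
# "Eiffel Tower", 0.97, landmarks_dict, 'en') returns
# ("a large iron tower", "Best match: Eiffel Tower with probability 0.9700").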
# Function to process the image and generate results
def process_image(image, language='en'):
    try:
        # Generate a caption for the image
        caption = generate_caption(image)  # Call the caption generation function
        # Classify the image against the known landmark names
        top_label, top_prob = classify_image(image, list(landmarks_dict.keys()))  # Use dictionary keys as candidate labels
        # Determine the appropriate name to use based on the language
        landmark_name = top_label if language == 'en' else landmarks_dict[top_label]
        full_description = get_wikipedia_summary(landmark_name, language)  # Get the Wikipedia text for the top label
        # Summarize the full description
        summarized_description = summarize_description(full_description, language)  # Call the summarization function
        # Translate the caption and build the classification result
        caption_translated, classification_result = translate_results(caption, top_label, top_prob, landmarks_dict, language)  # Call the translation function
        # Convert the summarized description to speech
        audio_file = text_to_speech(summarized_description, language)  # Convert the summary to audio
        # Return results right-aligned for Arabic
        if language == 'ar':
            return f"<div style='text-align: right;'>{caption_translated}</div>", \
                   f"<div style='text-align: right;'>{classification_result}</div>", \
                   f"<div style='text-align: right;'>{summarized_description}</div>", \
                   audio_file  # Return formatted results for Arabic
        else:
            return caption_translated, classification_result, summarized_description, audio_file  # Return results for English
    except Exception as e:
        return "Error processing the image.", str(e), "", None  # Return an error message; None leaves the audio output empty
# Create Gradio interface for English | |
english_interface = gr.Interface( | |
fn=lambda image: process_image(image, language='en'), # Function to call on image upload | |
inputs=gr.Image(type="pil", label="Upload Image"), # Input field for image upload | |
outputs=[ # Define output fields | |
gr.Textbox(label="Generated Caption"), # Output for generated caption | |
gr.Textbox(label="Classification Result"), # Output for classification result | |
gr.Textbox(label="Summarized Description", lines=10), # Output for summarized description | |
gr.Audio(label="Summary Audio", type="filepath") # Output for audio summary | |
], | |
title="Landmark Recognition", # Title of the interface | |
description="Upload an image of a landmark, and we will generate a description, classify it, and provide simple information.", # Description of the tool | |
examples=[ # Examples for user | |
["SOL.jfif"], | |
["OIP.jfif"] | |
] | |
) | |
# Create the Gradio interface for Arabic
arabic_interface = gr.Interface(
    fn=lambda image: process_image(image, language='ar'),  # Function to call on image upload
    inputs=gr.Image(type="pil", label="تحميل صورة"),  # Input field for image upload ("Upload Image")
    outputs=[  # Define the output fields
        gr.HTML(label="التعليق المولد"),  # Output for the generated caption ("Generated Caption")
        gr.HTML(label="نتيجة التصنيف"),  # Output for the classification result ("Classification Result")
        gr.HTML(label="الوصف الملخص"),  # Output for the summarized description ("Summarized Description")
        gr.Audio(label="صوت الملخص", type="filepath")  # Output for the audio summary ("Summary Audio")
    ],
    title="التعرف على المعالم",  # Title of the interface ("Landmark Recognition")
    description="قم بتحميل صورة لمعلم، وسنعمل على إنشاء وصف له وتصنيفه وتوفير معلومات بسيطة",  # Description of the tool ("Upload an image of a landmark, and we will generate a description, classify it, and provide simple information")
    examples=[  # Examples for the user
        ["SOL.jfif"],
        ["OIP.jfif"]
    ]
)
# Merge all interfaces into a tabbed interface
demo = gr.TabbedInterface(
    [english_interface, arabic_interface],  # List of interfaces to include
    ["English", "العربية"]  # Names of the tabs ("English", "Arabic")
)
# Launch the interface
demo.launch()  # Start the Gradio application
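# When hosted on Hugging Face Spaces the app is served automatically; run locally,
# demo.launch() starts a server on http://127.0.0.1:7860 by default.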