import gradio as gr # Import Gradio for creating web interfaces
import torch # Import PyTorch for deep learning
from PIL import Image # Import PIL for image processing
from transformers import pipeline, CLIPProcessor, CLIPModel # Import necessary classes from Hugging Face Transformers
import requests # Import requests for making HTTP requests
from bs4 import BeautifulSoup # Import BeautifulSoup for web scraping
from gtts import gTTS # Import gTTS for text-to-speech conversion
# Define the device to use (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the BLIP model for image captioning
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
# Load CLIP model for image classification
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
# Load the English summarization model
summarization_pipeline = pipeline("summarization", model="google/pegasus-xsum")
# Load the Arabic summarization model
arabic_summarization_pipeline = pipeline("summarization", model="abdalrahmanshahrour/auto-arabic-summarization")
# Load the translation model
translation_pipeline = pipeline("translation", model="facebook/nllb-200-distilled-600M")
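# Note: NLLB models address languages with FLORES-200 codes, which is why the calls below use
# "eng_Latn" (English, Latin script) and "arb_Arab" (Modern Standard Arabic, Arabic script).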
# Function to fetch long texts from Wikipedia
def get_wikipedia_summary(landmark_name, language='en'):
    url = f"https://{language}.wikipedia.org/wiki/{landmark_name.replace(' ', '_')}"  # Construct the article URL
    response = requests.get(url, timeout=10)  # Fetch the page (a timeout keeps a slow response from hanging the app)
    soup = BeautifulSoup(response.content, 'html.parser')  # Parse the HTML content with BeautifulSoup
    paragraphs = soup.find_all('p')  # Extract all paragraph elements
    summary_text = ' '.join([para.get_text() for para in paragraphs if para.get_text()])  # Join text from all paragraphs
    return summary_text[:2000]  # Return the first 2000 characters of the summary
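# Note: because this scrapes the raw article HTML, footnote markers such as "[1]" in the
# Wikipedia text will survive into the summarization input.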
# Function to load landmarks from an external file
def load_landmarks(filename):
    landmarks = {}
    with open(filename, 'r', encoding='utf-8') as file:  # Open the file in read mode
        for line in file:
            if line.strip():
                english_name, arabic_name = line.strip().split('|')  # Split by the delimiter
                landmarks[english_name] = arabic_name  # Add to the dictionary
    return landmarks  # Return the dictionary of landmarks
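# Expected landmarks.txt format: one "English|Arabic" pair per line. Hypothetical example entries
# (the real file ships alongside this script):
#   Eiffel Tower|برج إيفل
#   Great Pyramid of Giza|هرم الجيزة الأكبر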
# Load landmarks from the file
landmarks_dict = load_landmarks("landmarks.txt")
# Function to convert text to speech
def text_to_speech(text, language='en'):
    tts = gTTS(text=text, lang=language)  # Create a gTTS object for text-to-speech
    audio_file = "summary.mp3"  # Define the audio file name
    tts.save(audio_file)  # Save the audio file
    return audio_file  # Return the path to the audio file
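# Note: gTTS synthesizes speech through Google Translate's TTS endpoint, so this function needs
# network access at runtime; both 'en' and 'ar' are supported language codes.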
# Function to generate a caption for the image
def generate_caption(image):
    return caption_image(image)[0]['generated_text']  # Get the generated caption from the model
# Function to classify the image using the CLIP model
def classify_image(image, labels):
    inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True).to(device)  # Prepare inputs and move them to the model's device
    with torch.no_grad():  # Inference only, so skip gradient tracking
        outputs = clip_model(**inputs)  # Get model outputs
    logits_per_image = outputs.logits_per_image  # Get image-text similarity logits
    probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]  # Convert logits to probabilities
    top_label = labels[probs.argmax()]  # Get the label with the highest probability
    top_prob = probs.max()  # Get the highest probability value
    return top_label, top_prob  # Return top label and probability
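# Minimal usage sketch (the image path here is hypothetical):
#   img = Image.open("eiffel.jpg")
#   label, prob = classify_image(img, list(landmarks_dict.keys()))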
# Function to summarize the description
def summarize_description(full_description, language):
    if language == 'ar':
        return arabic_summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text']  # Summarize in Arabic
    else:
        return summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text']  # Summarize in English
# Function to translate the caption and classification result
def translate_results(caption, top_label, top_prob, landmarks_dict, language):
    if language == 'ar':
        caption_translated = translation_pipeline(caption, src_lang='eng_Latn', tgt_lang='arb_Arab')[0]['translation_text']  # Translate caption to Arabic
        # landmarks_dict already stores the Arabic name, so build the Arabic result string directly
        # rather than round-tripping already-Arabic text through the translation model
        classification_result = f"أفضل مطابقة: {landmarks_dict[top_label]} باحتمالية {top_prob:.4f}"
    else:
        caption_translated = caption  # Keep caption in English
        classification_result = f"Best match: {top_label} with probability {top_prob:.4f}"  # Create English classification result
    return caption_translated, classification_result  # Return translated results
# Function to process the image and generate results
def process_image(image, language='en'):
    try:
        # Generate caption for the image
        caption = generate_caption(image)
        # Classify the image against the known landmark names
        top_label, top_prob = classify_image(image, list(landmarks_dict.keys()))
        # Determine the appropriate name to use based on the language
        landmark_name = top_label if language == 'en' else landmarks_dict[top_label]
        full_description = get_wikipedia_summary(landmark_name, language)  # Get the Wikipedia text for the top label
        # Summarize the full description
        summarized_description = summarize_description(full_description, language)
        # Translate the caption and classification result
        caption_translated, classification_result = translate_results(caption, top_label, top_prob, landmarks_dict, language)
        # Convert the summarized description to speech
        audio_file = text_to_speech(summarized_description, language)
        # Wrap results in right-aligned HTML for Arabic
        if language == 'ar':
            return f"<div style='text-align: right;'>{caption_translated}</div>", \
                   f"<div style='text-align: right;'>{classification_result}</div>", \
                   f"<div style='text-align: right;'>{summarized_description}</div>", \
                   audio_file
        else:
            return caption_translated, classification_result, summarized_description, audio_file
    except Exception as e:
        return "Error processing the image.", str(e), "", None  # Return an error message; None leaves the Audio output empty
# Create Gradio interface for English
english_interface = gr.Interface(
    fn=lambda image: process_image(image, language='en'),  # Function to call on image upload
    inputs=gr.Image(type="pil", label="Upload Image"),  # Input field for image upload
    outputs=[  # Define output fields
        gr.Textbox(label="Generated Caption"),  # Output for generated caption
        gr.Textbox(label="Classification Result"),  # Output for classification result
        gr.Textbox(label="Summarized Description", lines=10),  # Output for summarized description
        gr.Audio(label="Summary Audio", type="filepath")  # Output for audio summary
    ],
    title="Landmark Recognition",  # Title of the interface
    description="Upload an image of a landmark, and we will generate a description, classify it, and provide simple information.",  # Description of the tool
    examples=[  # Examples for the user
        ["SOL.jfif"],
        ["OIP.jfif"]
    ]
)
# Create Gradio interface for Arabic
arabic_interface = gr.Interface(
    fn=lambda image: process_image(image, language='ar'),  # Function to call on image upload
    inputs=gr.Image(type="pil", label="تحميل صورة"),  # Input field for image upload in Arabic
    outputs=[  # Define output fields
        gr.HTML(label="التعليق المولد"),  # Output for generated caption in Arabic
        gr.HTML(label="نتيجة التصنيف"),  # Output for classification result in Arabic
        gr.HTML(label="الوصف الملخص"),  # Output for summarized description in Arabic
        gr.Audio(label="صوت الملخص", type="filepath")  # Output for audio summary in Arabic
    ],
    title="التعرف على المعالم",  # Title of the interface in Arabic
    description="قم بتحميل صورة لمعلم، وسنعمل على إنشاء وصف له وتصنيفه وتوفير معلومات بسيطة",  # Description of the tool in Arabic
    examples=[  # Examples for the user
        ["SOL.jfif"],
        ["OIP.jfif"]
    ]
)
# Merge all interfaces into a tabbed interface
demo = gr.TabbedInterface(
    [english_interface, arabic_interface],  # List of interfaces to include
    ["English", "العربية"]  # Names of the tabs
)
# Launch the interface
demo.launch()  # Start the Gradio application.
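# Passing share=True to launch() would additionally expose a temporary public URL (a standard
# Gradio option); the default call above serves the app locally.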