Spaces:

Alyaboelnasr
/

Landmark_Recognition

Sleeping

App Files Files Community

Alyaboelnasr commited on Feb 13

Commit

1b0abdc

verified ·

1 Parent(s): 103dae6

Create app.py

Browse files

Files changed (1) hide show

app.py +167 -0

app.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import gradio as gr  # Import Gradio for creating web interfaces
+import torch  # Import PyTorch for deep learning
+from PIL import Image  # Import PIL for image processing
+from transformers import pipeline, CLIPProcessor, CLIPModel  # Import necessary classes from Hugging Face Transformers
+import requests  # Import requests for making HTTP requests
+from bs4 import BeautifulSoup  # Import BeautifulSoup for web scraping
+from gtts import gTTS  # Import gTTS for text-to-speech conversion
+# Define the device to use (CPU or GPU)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Load the BLIP model for image captioning
+caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
+# Load CLIP model for image classification
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+# Load the English summarization model
+summarization_pipeline = pipeline("summarization", model="google/pegasus-xsum")
+# Load the Arabic summarization model
+arabic_summarization_pipeline = pipeline("summarization", model="abdalrahmanshahrour/auto-arabic-summarization")
+# Load the translation model
+translation_pipeline = pipeline("translation", model="facebook/nllb-200-distilled-600M")
+# Function to fetch long texts from Wikipedia
+def get_wikipedia_summary(landmark_name, language='en'):
+    url = f"https://{language}.wikipedia.org/wiki/{landmark_name.replace(' ', '_')}"  # Construct the URL
+    response = requests.get(url)  # Make an HTTP GET request to fetch the page
+    soup = BeautifulSoup(response.content, 'html.parser')  # Parse the HTML content with BeautifulSoup
+    paragraphs = soup.find_all('p')  # Extract all paragraph elements
+    summary_text = ' '.join([para.get_text() for para in paragraphs if para.get_text()])  # Join text from all paragraphs
+    return summary_text[:2000]  # Return the first 2000 characters of the summary
+# Function to load landmarks from an external file
+def load_landmarks(filename):
+    landmarks = {}
+    with open(filename, 'r', encoding='utf-8') as file:  # Open the file in read mode
+        for line in file:
+            if line.strip():
+                english_name, arabic_name = line.strip().split('|')  # Split by the delimiter
+                landmarks[english_name] = arabic_name  # Add to the dictionary
+    return landmarks  # Return the dictionary of landmarks
+# Load landmarks from the file
+landmarks_dict = load_landmarks("landmarks.txt")
+# Function to convert text to speech
+def text_to_speech(text, language='en'):
+    tts = gTTS(text=text, lang=language)  # Create a gTTS object for text-to-speech
+    audio_file = "summary.mp3"  # Define the audio file name
+    tts.save(audio_file)  # Save the audio file
+    return audio_file  # Return the path to the audio file
+# Function to generate a caption for the image
+def generate_caption(image):
+    return caption_image(image)[0]['generated_text']  # Get generated caption from the model
+# Function to classify the image using the CLIP model
+def classify_image(image, labels):
+    inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True)  # Prepare inputs for CLIP model
+    outputs = clip_model(**inputs)  # Get model outputs
+    logits_per_image = outputs.logits_per_image  # Get logits for images
+    probs = logits_per_image.softmax(dim=1).cpu().detach().numpy()[0]  # Compute probabilities
+    top_label = labels[probs.argmax()]  # Get the label with the highest probability
+    top_prob = probs.max()  # Get the highest probability value
+    return top_label, top_prob  # Return top label and probability
+# Function to summarize the description
+def summarize_description(full_description, language):
+    if language == 'ar':
+        return arabic_summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text']  # Summarize in Arabic
+    else:
+        return summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text']  # Summarize in English
+# Function to translate the caption and classification result
+def translate_results(caption, top_label, top_prob, landmarks_dict, language):
+    if language == 'ar':
+        caption_translated = translation_pipeline(caption, src_lang='eng_Latn', tgt_lang='arb_Arab')[0]['translation_text']  # Translate caption to Arabic
+        classification_result = translation_pipeline(f"أفضل مطابقة: {landmarks_dict[top_label]} باحتمالية {top_prob:.4f}", src_lang='eng_Latn', tgt_lang='arb_Arab')[0]['translation_text']  # Translate classification result
+    else:
+        caption_translated = caption  # Keep caption in English
+        classification_result = f"Best match: {top_label} with probability {top_prob:.4f}"  # Create English classification result
+    return caption_translated, classification_result  # Return translated results
+# Function to process the image and generate results
+def process_image(image, language='en'):
+    try:
+        # Generate caption for the image
+        caption = generate_caption(image)  # Call the caption generation function
+        # Classify the image
+        top_label, top_prob = classify_image(image, list(landmarks_dict.keys()))  # Use keys for classification
+        # Determine the appropriate name to use based on the language
+        landmark_name = top_label if language == 'en' else landmarks_dict[top_label]
+        full_description = get_wikipedia_summary(landmark_name, language)  # Get the Wikipedia summary for the top label
+        # Summarize the full description
+        summarized_description = summarize_description(full_description, language)  # Call the summarization function
+        # Translate caption and classification result
+        caption_translated, classification_result = translate_results(caption, top_label, top_prob, landmarks_dict, language)  # Call the translation function
+        # Convert the summarized description to speech
+        audio_file = text_to_speech(summarized_description, language)  # Convert summary to audio
+        # Return results formatted for Arabic
+        if language == 'ar':
+            return f"<div style='text-align: right;'>{caption_translated}</div>", \
+                   f"<div style='text-align: right;'>{classification_result}</div>", \
+                   f"<div style='text-align: right;'>{summarized_description}</div>", \
+                   audio_file  # Return formatted results for Arabic
+        else:
+            return caption_translated, classification_result, summarized_description, audio_file  # Return results for English
+    except Exception as e:
+        return "Error processing the image.", str(e), "", ""  # Return error message if any exception occurs
+# Create Gradio interface for English
+english_interface = gr.Interface(
+    fn=lambda image: process_image(image, language='en'),  # Function to call on image upload
+    inputs=gr.Image(type="pil", label="Upload Image"),  # Input field for image upload
+    outputs=[  # Define output fields
+        gr.Textbox(label="Generated Caption"),  # Output for generated caption
+        gr.Textbox(label="Classification Result"),  # Output for classification result
+        gr.Textbox(label="Summarized Description", lines=10),  # Output for summarized description
+        gr.Audio(label="Summary Audio", type="filepath")  # Output for audio summary
+    ],
+    title="Landmark Recognition",  # Title of the interface
+    description="Upload an image of a landmark, and we will generate a description, classify it, and provide simple information.",  # Description of the tool
+    examples=[  # Examples for user
+        ["SOL.jfif"],
+        ["OIP.jfif"]
+    ]
+)
+# Create Gradio interface for Arabic
+arabic_interface = gr.Interface(
+    fn=lambda image: process_image(image, language='ar'),  # Function to call on image upload
+    inputs=gr.Image(type="pil", label="تحميل صورة"),  # Input field for image upload in Arabic
+    outputs=[  # Define output fields
+        gr.HTML(label="التعليق المولد"),  # Output for generated caption in Arabic
+        gr.HTML(label="نتيجة التصنيف"),  # Output for classification result in Arabic
+        gr.HTML(label="الوصف الملخص"),  # Output for summarized description in Arabic
+        gr.Audio(label="صوت الملخص", type="filepath")  # Output for audio summary in Arabic
+    ],
+    title="التعرف على المعالم",  # Title of the interface in Arabic
+    description="قم بتحميل صورة لمعلم، وسنعمل على إنشاء وصف له وتصنيفه وتوفير معلومات بسيطة",  # Description of the tool in Arabic
+    examples=[  # Examples for user
+        ["SOL.jfif"],
+        ["OIP.jfif"]
+    ]
+)
+# Merge all interfaces into a tabbed interface
+demo = gr.TabbedInterface(
+    [english_interface, arabic_interface],  # List of interfaces to include
+    ["English", "العربية"]  # Names of the tabs
+)
+# Launch the interface
+demo.launch()  # Start the Gradio application.