import gradio as gr # Import Gradio for creating web interfaces
import torch # Import PyTorch for deep learning
from PIL import Image # Import PIL for image processing
from transformers import pipeline, CLIPProcessor, CLIPModel # Import necessary classes from Hugging Face Transformers
import requests # Import requests for making HTTP requests
from bs4 import BeautifulSoup # Import BeautifulSoup for web scraping
from gtts import gTTS # Import gTTS for text-to-speech conversion
# Define the device to use (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the BLIP model for image captioning
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
# Load CLIP model for image classification
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
# Load the English summarization model
summarization_pipeline = pipeline("summarization", model="google/pegasus-xsum")
# Load the Arabic summarization model
arabic_summarization_pipeline = pipeline("summarization", model="abdalrahmanshahrour/auto-arabic-summarization")
# Load the translation model
translation_pipeline = pipeline("translation", model="facebook/nllb-200-distilled-600M")
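# Note: NLLB models address languages with FLORES-200 codes, which is why the calls below use
# "eng_Latn" (English, Latin script) and "arb_Arab" (Modern Standard Arabic, Arabic script).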
# Function to fetch long texts from Wikipedia
def get_wikipedia_summary(landmark_name, language='en'):
    url = f"https://{language}.wikipedia.org/wiki/{landmark_name.replace(' ', '_')}"  # Construct the article URL
    response = requests.get(url, timeout=10)  # Fetch the page (a timeout keeps a slow response from hanging the app)
    soup = BeautifulSoup(response.content, 'html.parser')  # Parse the HTML content with BeautifulSoup
    paragraphs = soup.find_all('p')  # Extract all paragraph elements
    summary_text = ' '.join([para.get_text() for para in paragraphs if para.get_text()])  # Join text from all paragraphs
    return summary_text[:2000]  # Return the first 2000 characters of the summary
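# Note: because this scrapes the raw article HTML, footnote markers such as "[1]" in the
# Wikipedia text will survive into the summarization input.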
# Function to load landmarks from an external file
def load_landmarks(filename):
    landmarks = {}
    with open(filename, 'r', encoding='utf-8') as file:  # Open the file in read mode
        for line in file:
            if line.strip():
                english_name, arabic_name = line.strip().split('|')  # Split by the delimiter
                landmarks[english_name] = arabic_name  # Add to the dictionary
    return landmarks  # Return the dictionary of landmarks
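# Expected landmarks.txt format: one "English|Arabic" pair per line. Hypothetical example entries
# (the real file ships alongside this script):
#   Eiffel Tower|برج إيفل
#   Great Pyramid of Giza|هرم الجيزة الأكبر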
# Load landmarks from the file
landmarks_dict = load_landmarks("landmarks.txt")
# Function to convert text to speech
def text_to_speech(text, language='en'):
    tts = gTTS(text=text, lang=language)  # Create a gTTS object for text-to-speech
    audio_file = "summary.mp3"  # Define the audio file name
    tts.save(audio_file)  # Save the audio file
    return audio_file  # Return the path to the audio file
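# Note: gTTS synthesizes speech through Google Translate's TTS endpoint, so this function needs
# network access at runtime; both 'en' and 'ar' are supported language codes.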
# Function to generate a caption for the image
def generate_caption(image):
    return caption_image(image)[0]['generated_text']  # Get the generated caption from the model
# Function to classify the image using the CLIP model
def classify_image(image, labels):
    inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True).to(device)  # Prepare inputs and move them to the model's device
    with torch.no_grad():  # Inference only, so skip gradient tracking
        outputs = clip_model(**inputs)  # Get model outputs
    logits_per_image = outputs.logits_per_image  # Get image-text similarity logits
    probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]  # Convert logits to probabilities
    top_label = labels[probs.argmax()]  # Get the label with the highest probability
    top_prob = probs.max()  # Get the highest probability value
    return top_label, top_prob  # Return top label and probability
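# Minimal usage sketch (the image path here is hypothetical):
#   img = Image.open("eiffel.jpg")
#   label, prob = classify_image(img, list(landmarks_dict.keys()))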
# Function to summarize the description
def summarize_description(full_description, language):
    if language == 'ar':
        return arabic_summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text']  # Summarize in Arabic
    else:
        return summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text']  # Summarize in English
# Function to translate the caption and classification result
def translate_results(caption, top_label, top_prob, landmarks_dict, language):
    if language == 'ar':
        caption_translated = translation_pipeline(caption, src_lang='eng_Latn', tgt_lang='arb_Arab')[0]['translation_text']  # Translate caption to Arabic
        # landmarks_dict already stores the Arabic name, so build the Arabic result string directly
        # rather than round-tripping already-Arabic text through the translation model
        classification_result = f"أفضل مطابقة: {landmarks_dict[top_label]} باحتمالية {top_prob:.4f}"
    else:
        caption_translated = caption  # Keep caption in English
        classification_result = f"Best match: {top_label} with probability {top_prob:.4f}"  # Create English classification result
    return caption_translated, classification_result  # Return translated results
# Function to process the image and generate results
def process_image(image, language='en'):
    try:
        # Generate caption for the image
        caption = generate_caption(image)
        # Classify the image against the known landmark names
        top_label, top_prob = classify_image(image, list(landmarks_dict.keys()))
        # Determine the appropriate name to use based on the language
        landmark_name = top_label if language == 'en' else landmarks_dict[top_label]
        full_description = get_wikipedia_summary(landmark_name, language)  # Get the Wikipedia text for the top label
        # Summarize the full description
        summarized_description = summarize_description(full_description, language)
        # Translate the caption and classification result
        caption_translated, classification_result = translate_results(caption, top_label, top_prob, landmarks_dict, language)
        # Convert the summarized description to speech
        audio_file = text_to_speech(summarized_description, language)
        # Wrap results in right-aligned HTML for Arabic
        if language == 'ar':
            return f"<div style='text-align: right;'>{caption_translated}</div>", \
                   f"<div style='text-align: right;'>{classification_result}</div>", \
                   f"<div style='text-align: right;'>{summarized_description}</div>", \
                   audio_file
        else:
            return caption_translated, classification_result, summarized_description, audio_file
    except Exception as e:
        return "Error processing the image.", str(e), "", None  # Return an error message; None leaves the Audio output empty
# Create Gradio interface for English
english_interface = gr.Interface(
    fn=lambda image: process_image(image, language='en'),  # Function to call on image upload
    inputs=gr.Image(type="pil", label="Upload Image"),  # Input field for image upload
    outputs=[  # Define output fields
        gr.Textbox(label="Generated Caption"),  # Output for generated caption
        gr.Textbox(label="Classification Result"),  # Output for classification result
        gr.Textbox(label="Summarized Description", lines=10),  # Output for summarized description
        gr.Audio(label="Summary Audio", type="filepath")  # Output for audio summary
    ],
    title="Landmark Recognition",  # Title of the interface
    description="Upload an image of a landmark, and we will generate a description, classify it, and provide simple information.",  # Description of the tool
    examples=[  # Examples for the user
        ["SOL.jfif"],
        ["OIP.jfif"]
    ]
)
# Create Gradio interface for Arabic
arabic_interface = gr.Interface(
    fn=lambda image: process_image(image, language='ar'),  # Function to call on image upload
    inputs=gr.Image(type="pil", label="تحميل صورة"),  # Input field for image upload in Arabic
    outputs=[  # Define output fields
        gr.HTML(label="التعليق المولد"),  # Output for generated caption in Arabic
        gr.HTML(label="نتيجة التصنيف"),  # Output for classification result in Arabic
        gr.HTML(label="الوصف الملخص"),  # Output for summarized description in Arabic
        gr.Audio(label="صوت الملخص", type="filepath")  # Output for audio summary in Arabic
    ],
    title="التعرف على المعالم",  # Title of the interface in Arabic
    description="قم بتحميل صورة لمعلم، وسنعمل على إنشاء وصف له وتصنيفه وتوفير معلومات بسيطة",  # Description of the tool in Arabic
    examples=[  # Examples for the user
        ["SOL.jfif"],
        ["OIP.jfif"]
    ]
)
# Merge all interfaces into a tabbed interface
demo = gr.TabbedInterface(
    [english_interface, arabic_interface],  # List of interfaces to include
    ["English", "العربية"]  # Names of the tabs
)
# Launch the interface
demo.launch()  # Start the Gradio application.
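# Passing share=True to launch() would additionally expose a temporary public URL (a standard
# Gradio option); the default call above serves the app locally.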