import gradio as gr  # Import Gradio for creating web interfaces
import torch  # Import PyTorch for deep learning
from PIL import Image  # Import PIL for image processing
from transformers import pipeline, CLIPProcessor, CLIPModel  # Import necessary classes from Hugging Face Transformers
import requests  # Import requests for making HTTP requests
from bs4 import BeautifulSoup  # Import BeautifulSoup for web scraping
from gtts import gTTS  # Import gTTS for text-to-speech conversion
# Define the device to use (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the BLIP model for image captioning
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
# Load the CLIP model for image classification
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
# Load the English summarization model
summarization_pipeline = pipeline("summarization", model="google/pegasus-xsum")
# Load the Arabic summarization model
arabic_summarization_pipeline = pipeline("summarization", model="abdalrahmanshahrour/auto-arabic-summarization")
# Load the translation model
translation_pipeline = pipeline("translation", model="facebook/nllb-200-distilled-600M")
# Function to fetch long texts from Wikipedia
def get_wikipedia_summary(landmark_name, language='en'):
    url = f"https://{language}.wikipedia.org/wiki/{landmark_name.replace(' ', '_')}"  # Construct the article URL
    response = requests.get(url)  # Make an HTTP GET request to fetch the page
    soup = BeautifulSoup(response.content, 'html.parser')  # Parse the HTML content with BeautifulSoup
    paragraphs = soup.find_all('p')  # Extract all paragraph elements
    summary_text = ' '.join([para.get_text() for para in paragraphs if para.get_text()])  # Join text from all paragraphs
    return summary_text[:2000]  # Return the first 2000 characters of the summary
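# For illustration, a call such as get_wikipedia_summary("Eiffel Tower") scrapes
# https://en.wikipedia.org/wiki/Eiffel_Tower and returns roughly the opening
# paragraphs of the article, truncated to 2000 characters.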
# Function to load landmarks from an external file
def load_landmarks(filename):
    landmarks = {}
    with open(filename, 'r', encoding='utf-8') as file:  # Open the file in read mode
        for line in file:
            if line.strip():
                english_name, arabic_name = line.strip().split('|')  # Split by the delimiter
                landmarks[english_name] = arabic_name  # Add to the dictionary
    return landmarks  # Return the dictionary of landmarks
# Load landmarks from the file
landmarks_dict = load_landmarks("landmarks.txt")
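# landmarks.txt is expected to hold one "English name|Arabic name" pair per line.
# The entries below are illustrative, not the Space's actual file contents:
#   Eiffel Tower|برج إيفل
#   Great Wall of China|سور الصين العظيم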
# Function to convert text to speech
def text_to_speech(text, language='en'):
    tts = gTTS(text=text, lang=language)  # Create a gTTS object for text-to-speech
    audio_file = "summary.mp3"  # Define the audio file name
    tts.save(audio_file)  # Save the audio file
    return audio_file  # Return the path to the audio file
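# Note: gTTS calls Google's text-to-speech service over the network, and every
# request writes to the same summary.mp3, so only the most recent summary is kept.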
# Function to generate a caption for the image
def generate_caption(image):
    return caption_image(image)[0]['generated_text']  # Get the generated caption from the model
# Function to classify the image using the CLIP model
def classify_image(image, labels):
    inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True).to(device)  # Prepare inputs and move them to the same device as the model
    with torch.no_grad():  # No gradients are needed for inference
        outputs = clip_model(**inputs)  # Get model outputs
    logits_per_image = outputs.logits_per_image  # Get image-to-text similarity logits
    probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]  # Convert logits to probabilities
    top_label = labels[probs.argmax()]  # Get the label with the highest probability
    top_prob = probs.max()  # Get the highest probability value
    return top_label, top_prob  # Return the top label and its probability
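# CLIP performs zero-shot classification here: it embeds the image and every
# landmark name into a shared space, and logits_per_image holds the resulting
# similarity scores. For example (hypothetical values),
# classify_image(img, ["Eiffel Tower", "Taj Mahal"]) might return ("Eiffel Tower", 0.97).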
# Function to summarize the description
def summarize_description(full_description, language):
    if language == 'ar':
        return arabic_summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text']  # Summarize in Arabic
    else:
        return summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False)[0]['summary_text']  # Summarize in English
# Function to translate the caption and build the classification result
def translate_results(caption, top_label, top_prob, landmarks_dict, language):
    if language == 'ar':
        caption_translated = translation_pipeline(caption, src_lang='eng_Latn', tgt_lang='arb_Arab')[0]['translation_text']  # Translate the caption to Arabic
        classification_result = f"أفضل مطابقة: {landmarks_dict[top_label]} باحتمالية {top_prob:.4f}"  # Build the result directly in Arabic ("Best match: ... with probability ..."); no translation needed since the Arabic name comes from landmarks_dict
    else:
        caption_translated = caption  # Keep the caption in English
        classification_result = f"Best match: {top_label} with probability {top_prob:.4f}"  # Create the English classification result
    return caption_translated, classification_result  # Return the translated results
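# For example (hypothetical output), translate_results("a large iron tower",
# "Eiffel Tower", 0.97, landmarks_dict, 'en') returns
# ("a large iron tower", "Best match: Eiffel Tower with probability 0.9700").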
# Function to process the image and generate results
def process_image(image, language='en'):
    try:
        # Generate a caption for the image
        caption = generate_caption(image)  # Call the caption generation function
        # Classify the image against the known landmark names
        top_label, top_prob = classify_image(image, list(landmarks_dict.keys()))  # Use dictionary keys as candidate labels
        # Determine the appropriate name to use based on the language
        landmark_name = top_label if language == 'en' else landmarks_dict[top_label]
        full_description = get_wikipedia_summary(landmark_name, language)  # Get the Wikipedia text for the top label
        # Summarize the full description
        summarized_description = summarize_description(full_description, language)  # Call the summarization function
        # Translate the caption and build the classification result
        caption_translated, classification_result = translate_results(caption, top_label, top_prob, landmarks_dict, language)  # Call the translation function
        # Convert the summarized description to speech
        audio_file = text_to_speech(summarized_description, language)  # Convert the summary to audio
        # Return results right-aligned for Arabic
        if language == 'ar':
            return f"<div style='text-align: right;'>{caption_translated}</div>", \
                   f"<div style='text-align: right;'>{classification_result}</div>", \
                   f"<div style='text-align: right;'>{summarized_description}</div>", \
                   audio_file  # Return formatted results for Arabic
        else:
            return caption_translated, classification_result, summarized_description, audio_file  # Return results for English
    except Exception as e:
        return "Error processing the image.", str(e), "", None  # Return an error message; None leaves the audio output empty
# Create Gradio interface for English | |
english_interface = gr.Interface( | |
fn=lambda image: process_image(image, language='en'), # Function to call on image upload | |
inputs=gr.Image(type="pil", label="Upload Image"), # Input field for image upload | |
outputs=[ # Define output fields | |
gr.Textbox(label="Generated Caption"), # Output for generated caption | |
gr.Textbox(label="Classification Result"), # Output for classification result | |
gr.Textbox(label="Summarized Description", lines=10), # Output for summarized description | |
gr.Audio(label="Summary Audio", type="filepath") # Output for audio summary | |
], | |
title="Landmark Recognition", # Title of the interface | |
description="Upload an image of a landmark, and we will generate a description, classify it, and provide simple information.", # Description of the tool | |
examples=[ # Examples for user | |
["SOL.jfif"], | |
["OIP.jfif"] | |
] | |
) | |
# Create the Gradio interface for Arabic
arabic_interface = gr.Interface(
    fn=lambda image: process_image(image, language='ar'),  # Function to call on image upload
    inputs=gr.Image(type="pil", label="تحميل صورة"),  # Input field for image upload ("Upload Image")
    outputs=[  # Define the output fields
        gr.HTML(label="التعليق المولد"),  # Output for the generated caption ("Generated Caption")
        gr.HTML(label="نتيجة التصنيف"),  # Output for the classification result ("Classification Result")
        gr.HTML(label="الوصف الملخص"),  # Output for the summarized description ("Summarized Description")
        gr.Audio(label="صوت الملخص", type="filepath")  # Output for the audio summary ("Summary Audio")
    ],
    title="التعرف على المعالم",  # Title of the interface ("Landmark Recognition")
    description="قم بتحميل صورة لمعلم، وسنعمل على إنشاء وصف له وتصنيفه وتوفير معلومات بسيطة",  # Description of the tool ("Upload an image of a landmark, and we will generate a description, classify it, and provide simple information")
    examples=[  # Examples for the user
        ["SOL.jfif"],
        ["OIP.jfif"]
    ]
)
# Merge all interfaces into a tabbed interface
demo = gr.TabbedInterface(
    [english_interface, arabic_interface],  # List of interfaces to include
    ["English", "العربية"]  # Names of the tabs ("English", "Arabic")
)
# Launch the interface
demo.launch()  # Start the Gradio application
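# When hosted on Hugging Face Spaces the app is served automatically; run locally,
# demo.launch() starts a server on http://127.0.0.1:7860 by default.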