# Install the necessary libraries first (shell command, not Python):
# pip install torch transformers gradio Pillow scikit-learn requests

import numpy as np
import gradio as gr
from sklearn.cluster import KMeans
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
)
from PIL import Image, ImageDraw
import requests
from io import BytesIO

# Load the Hugging Face models globally so they are initialized only once
print("Loading Hugging Face models...")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
translation_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
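
# Optional speed-up (a sketch, not part of the original script): if a CUDA GPU
# is available, moving both models onto it makes captioning and translation
# noticeably faster. The processor/tokenizer outputs would then also need a
# matching .to(device) inside generate_caption() and translate_to_arabic(),
# so these lines are left commented out by default.
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"
# caption_model.to(device)
# translation_model.to(device)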

# Download example images for the Gradio examples gallery
def download_example_images():
    image_urls = [
        ("Sunset over Mountains", "https://images.unsplash.com/photo-1501785888041-af3ef285b470"),
        ("Forest Path", "https://images.unsplash.com/photo-1502082553048-f009c37129b9"),
        ("City Skyline", "https://images.unsplash.com/photo-1498598453737-8913e843c47b"),
        ("Beach and Ocean", "https://images.unsplash.com/photo-1507525428034-b723cf961d3e"),
        ("Desert Dunes", "https://images.unsplash.com/photo-1501594907352-04cda38ebc29"),
    ]
    example_images = []
    for idx, (description, url) in enumerate(image_urls, start=1):
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            img.save(f"example{idx}.jpg")
            example_images.append([f"example{idx}.jpg"])
        except requests.RequestException as e:
            print(f"Failed to download image from {url}: {e}")
    return example_images

# Download the example images and prepare the examples list
examples = download_example_images()

# Load the image and resize it to a fixed 300x300 working size
def load_image(image):
    resized_image = image.resize((300, 300), resample=Image.LANCZOS)
    return np.array(resized_image)

# Extract the dominant colors from the image with k-means clustering
def extract_colors(image, k=8):
    # Flatten to one RGB pixel per row and normalize to [0, 1] for clustering
    pixels = image.reshape(-1, 3) / 255.0
    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10, max_iter=300)
    kmeans.fit(pixels)
    # The cluster centers are the dominant colors; scale back to 0-255
    return (kmeans.cluster_centers_ * 255).astype(int)
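
# Quick illustrative check (an assumption about usage, meant for an interactive
# session rather than the app itself): on a synthetic half-red, half-black
# image the two cluster centers should come back as (255, 0, 0) and (0, 0, 0),
# in no guaranteed order.
# test_img = np.zeros((10, 10, 3), dtype=np.uint8)
# test_img[:, :5] = (255, 0, 0)
# print(extract_colors(test_img, k=2))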

# Create an image strip for the color palette (one 100x100 swatch per color)
def create_palette_image(colors):
    num_colors = len(colors)
    palette_image = Image.new("RGB", (100 * num_colors, 100))
    draw = ImageDraw.Draw(palette_image)
    for i, color in enumerate(colors):
        # Cast to plain Python ints so PIL accepts the fill tuple
        color = tuple(int(c) for c in np.clip(color, 0, 255))
        draw.rectangle([i * 100, 0, (i + 1) * 100, 100], fill=color)
    return palette_image

# Display the color palette as hex codes
def display_palette(colors):
    return [
        "#{:02x}{:02x}{:02x}".format(*(int(c) for c in np.clip(color, 0, 255)))
        for color in colors
    ]
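
# For example (illustrative only), a pure-red dominant color maps to its hex
# code like so:
# display_palette(np.array([[255, 0, 0]]))  # -> ["#ff0000"]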

# Generate an image caption using Hugging Face BLIP
def generate_caption(image):
    inputs = processor(images=image, return_tensors="pt")
    output = caption_model.generate(**inputs)
    return processor.decode(output[0], skip_special_tokens=True)

# Translate the caption to Arabic using mBART-50
def translate_to_arabic(text):
    tokenizer.src_lang = "en_XX"
    encoded = tokenizer(text, return_tensors="pt")
    generated_tokens = translation_model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.lang_code_to_id["ar_AR"],
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
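
# The target language is controlled entirely by the forced BOS token. A more
# general helper (hypothetical, not part of the original app) could accept any
# mBART-50 language code, e.g. "fr_XX" (French) or "es_XX" (Spanish):
# def translate(text, target_lang="fr_XX"):
#     tokenizer.src_lang = "en_XX"
#     encoded = tokenizer(text, return_tensors="pt")
#     tokens = translation_model.generate(
#         **encoded, forced_bos_token_id=tokenizer.lang_code_to_id[target_lang]
#     )
#     return tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]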

# Gradio interface function tying all of the pieces together
def process_image(image):
    # Gradio may hand over a NumPy array; normalize to a PIL RGB image
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    image_rgb = image.convert("RGB")

    resized_image_np = load_image(image_rgb)
    resized_image_pil = Image.fromarray(resized_image_np)

    caption = generate_caption(image_rgb)
    caption_arabic = translate_to_arabic(caption)

    colors = extract_colors(resized_image_np, k=8)
    color_palette = display_palette(colors)
    palette_image = create_palette_image(colors)

    bilingual_caption = f"English: {caption}\nArabic: {caption_arabic}"
    return bilingual_caption, ", ".join(color_palette), palette_image, resized_image_pil

# Create the Gradio interface
with gr.Blocks(css=".gradio-container { height: 1000px !important; }") as demo:
    gr.Markdown("# Palette Generator from Image with Image Captioning")
    gr.Markdown(
        "Upload an image or select one of the example images below to generate "
        "a color palette and a description of the image in both English and Arabic."
    )
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="Upload your image or select an example below")
            submit_button = gr.Button("Submit")
            gr.Examples(examples=examples, inputs=image_input, label="Example Images", examples_per_page=5)
        with gr.Column(scale=1):
            caption_output = gr.Textbox(label="Bilingual Caption", lines=5, max_lines=10)
            palette_hex_output = gr.Textbox(label="Color Palette Hex Codes", lines=2)
            palette_image_output = gr.Image(type="pil", label="Color Palette")
            resized_image_output = gr.Image(type="pil", label="Resized Image")

    submit_button.click(
        fn=process_image,
        inputs=image_input,
        outputs=[caption_output, palette_hex_output, palette_image_output, resized_image_output],
    )

# Launch the Gradio interface (serves on http://127.0.0.1:7860 by default)
demo.launch()