from PIL import Image
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
# Load the model and processor
# (GIT checkpoints such as microsoft/git-base are loaded with AutoProcessor and
# AutoModelForCausalLM, not with VisionEncoderDecoderModel)
processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
# Define the captioning function
def caption_images(image):
    # Preprocess the PIL image into pixel values
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    # Generate a caption with beam search and decode it to text
    generated_ids = model.generate(pixel_values=pixel_values, num_beams=5)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_caption[0].strip()
# Define Gradio interface components
# (gr.Image / gr.Textbox replace the removed gr.inputs / gr.outputs API)
inputs = [
    gr.Image(type='pil', label='Original Image')
]
outputs = [
    gr.Textbox(label='Caption')
]
# Define Gradio app properties
title = "Simple Image Captioning Application"
description = "Upload an image to see the generated caption"
examples = ['messi.jpg']  # Replace with a valid path to an example image
# Create and launch the Gradio interface
gr.Interface(
    fn=caption_images,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    examples=examples,
).launch(debug=True)
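# Optional local sanity check (a sketch, not part of the original Space). It assumes an
# image exists at the hypothetical path 'test.jpg'. Flip the flag to True to try it; the
# block only runs after the blocking launch(debug=True) call above returns.
RUN_LOCAL_TEST = False
if RUN_LOCAL_TEST:
    test_image = Image.open('test.jpg').convert('RGB')  # a PIL image, as Gradio would pass
    print(caption_images(test_image))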