# cosine-match / app.py
# (Hugging Face file-viewer header preserved as a comment: uploaded by
#  iamrobotbear, commit message "Update app.py", revision 893a3e5, 3.94 kB)
import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import torch
from PIL import Image
from transformers import GitProcessor, GitModel, GitConfig, ImageFeatureProcessor
# Load models and processors used throughout the app.
# Config object for the GIT checkpoint. NOTE(review): it is never passed to
# the processor/model below (each loads its own config) — confirm it is needed.
git_config = GitConfig.from_pretrained("microsoft/git-large-r")
git_processor_large_textcaps = GitProcessor.from_pretrained("microsoft/git-large-r")
# NOTE(review): GitModel is the bare encoder without a language-model head,
# but generate_caption() below reads `outputs.logits` — GitForCausalLM looks
# like the intended class. Confirm against the transformers GIT docs.
git_model_large_textcaps = GitModel.from_pretrained("microsoft/git-large-r")
# Image-text matching model from TF-Hub.
# NOTE(review): verify this TF-Hub handle exists and what input format
# (batched/normalized tensors vs raw images) the module expects.
itm_model = hub.load("https://tfhub.dev/google/LaViT/1")
# Universal Sentence Encoder (large, v5) for text embeddings.
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
# Read statements from the external file 'statements.txt', one per line.
with open('statements.txt', 'r') as file:
    statements = file.read().splitlines()
# Function to generate an image caption with a GIT processor/model pair.
def generate_caption(processor, model, image):
    """Generate a caption for `image` and return it as a string.

    Args:
        processor: GitProcessor-like object supporting
            `processor(images=..., return_tensors="pt")` and `batch_decode`.
        model: callable whose forward pass returns an object exposing a
            `.logits` tensor of shape (batch, seq, vocab).
            NOTE(review): the bare `GitModel` loaded at module level exposes
            `last_hidden_state`, not `.logits` — `GitForCausalLM` is likely
            the intended class. Confirm before shipping.
        image: image input accepted by the processor (e.g. a PIL image).

    Returns:
        str: decoded caption for the first (only) batch item.
    """
    inputs = processor(images=image, return_tensors="pt")
    # Inference only: disable autograd so no graph/activations are kept for a
    # forward pass whose gradients are never used.
    with torch.no_grad():
        outputs = model(**inputs)
    # Greedy decode: take the highest-scoring token at each position.
    token_ids = outputs.logits.argmax(-1)
    caption = processor.batch_decode(token_ids, skip_special_tokens=True)
    return caption[0]
# Function to compute textual similarity between a caption and a statement.
def compute_textual_similarity(caption, statement):
    """Return the cosine similarity between `caption` and `statement`.

    Both strings are embedded with the Universal Sentence Encoder
    (`use_model`, loaded at module level).

    Returns:
        float in [-1.0, 1.0]; 0.0 if either embedding is all zeros.

    Bug fixed: `np.inner` on two 1-D vectors yields a 0-D scalar, so the
    original `similarity_score[0]` raised an IndexError on every call.
    """
    caption_vec = use_model([caption])[0].numpy()
    statement_vec = use_model([statement])[0].numpy()
    # Normalize explicitly so the score is a true cosine even if the encoder's
    # outputs are only approximately unit-length.
    denom = np.linalg.norm(caption_vec) * np.linalg.norm(statement_vec)
    if denom == 0.0:
        return 0.0
    return float(np.inner(caption_vec, statement_vec) / denom)
# Function to compute an image-text matching (ITM) score.
def compute_itm_score(image, statement):
    """Score how well `image` matches `statement` via feature inner product."""
    # NOTE(review): `itm_model` is a raw TF-Hub module and `image` arrives as
    # a PIL image from the caller — confirm the module accepts PIL input
    # without preprocessing (resize / normalize / batch dimension).
    image_features = itm_model(image)
    # USE embedding of the statement (first row of the batched output).
    statement_features = use_model([statement])[0].numpy()
    # Inner product between image features and the statement embedding.
    similarity_score = np.inner(image_features, statement_features)
    # NOTE(review): the [0][0] indexing assumes a 2-D result; a (1, D)·(D,)
    # inner product is 1-D, which would make the second [0] fail — verify the
    # actual feature shapes at runtime.
    return similarity_score[0][0]
# Serialize a results DataFrame to CSV text.
def save_dataframe_to_csv(df):
    """Return `df` rendered as CSV text, without the index column."""
    return df.to_csv(index=False)
# Main function to perform image captioning and image-text matching for multiple images
def process_images_and_statements(files):
    """Caption each uploaded image and score it against every statement.

    Args:
        files: Gradio upload payload — either a list of file objects (keyed by
            their `.name`) or a mapping of file name -> file object.

    Returns:
        (pandas.DataFrame, str): one row per (image, statement) pair, and the
        same table serialized as CSV text.
        NOTE(review): the CSV is returned as a *string*, but the interface
        wires it to a gr.outputs.File component, which expects a file path —
        confirm this renders correctly.
    """
    columns = ['Image File Name', 'Statement', 'Generated Caption',
               'Textual Similarity Score', 'ITM Score', 'Final Combined Score']
    all_results_list = []
    # Gradio may hand us a bare list of file objects; key them by name.
    if isinstance(files, list):
        files = {f.name: f for f in files}
    for file_name, image_file in files.items():
        # Convert the uploaded file to a PIL image.
        image = Image.open(image_file)
        caption = generate_caption(git_processor_large_textcaps, git_model_large_textcaps, image)
        # Score the one caption/image against every statement.
        for statement in statements:
            textual_similarity_score = compute_textual_similarity(caption, statement) * 100
            itm_score_statement = compute_itm_score(image, statement) * 100
            # Equal-weight blend of the two scores.
            final_score = 0.5 * textual_similarity_score + 0.5 * itm_score_statement
            all_results_list.append({
                'Image File Name': file_name,  # Include the image file name
                'Statement': statement,
                'Generated Caption': caption,
                'Textual Similarity Score': f"{textual_similarity_score:.2f}%",
                'ITM Score': f"{itm_score_statement:.2f}%",
                'Final Combined Score': f"{final_score:.2f}%"
            })
    # Build the frame in one shot from the list of row dicts. Unlike the
    # original per-row pd.concat, this is O(n) and does not raise
    # ValueError("No objects to concatenate") on an empty upload; passing
    # `columns` keeps the schema stable even when there are no rows.
    results_df = pd.DataFrame(all_results_list, columns=columns)
    csv_results = save_dataframe_to_csv(results_df)
    return results_df, csv_results
# Gradio interface with File input to receive multiple images and file names.
# NOTE(review): gr.inputs.* / gr.outputs.* and `capture_session` are Gradio
# 3.x-era APIs that were removed in Gradio 4+ — pin the gradio version in
# requirements or migrate to gr.File / gr.Dataframe components.
image_input = gr.inputs.File(file_count="multiple", type="file", label="Upload Images")
output_df = gr.outputs.Dataframe(type="pandas", label="Results")
# NOTE(review): process_images_and_statements returns the CSV as a string,
# while gr.outputs.File expects a path on disk — confirm the download works.
output_csv = gr.outputs.File(label="Download CSV")
iface = gr.Interface(
    fn=process_images_and_statements,
    inputs=image_input,
    outputs=[output_df, output_csv],
    title="Image Captioning and Image-Text Matching",
    theme='sudeepshouche/minimalist',
    css=".output { flex-direction: column; } .output .outputs { width: 100%; }",  # Custom CSS
    capture_session=True,  # Capture errors and exceptions in Gradio interface
)
iface.launch(debug=True)