File size: 3,941 Bytes
8e34f80
 
 
893a3e5
 
 
 
8e34f80
893a3e5
 
 
 
 
 
8e34f80
0bb8ce2
 
 
 
893a3e5
8e34f80
893a3e5
 
 
 
8e34f80
893a3e5
 
 
 
 
 
15b96ac
893a3e5
 
 
 
 
 
15b96ac
893a3e5
 
 
 
f51ad44
74fc255
 
 
1e2e099
 
 
 
 
893a3e5
 
 
 
74fc255
 
1e2e099
 
 
74fc255
 
 
 
1e2e099
 
 
74fc255
 
e981e7f
bbf9e08
15b96ac
893a3e5
74fc255
 
080099f
 
8e34f80
ee4f4a6
 
 
bbf9e08
ee4f4a6
e17785f
893a3e5
 
ee4f4a6
4c0fb4c
141404c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import tempfile

import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from PIL import Image
from transformers import GitProcessor, GitModel, GitConfig, ImageFeatureProcessor

# Load models and processors.
# Every load below runs at import time, before the interface is built.
# NOTE(review): `git_config` is not referenced anywhere else in the visible
# code — confirm whether it is still needed.
git_config = GitConfig.from_pretrained("microsoft/git-large-r")
# Processor/model pair that is passed to `generate_caption`.
# NOTE(review): the variable names say "textcaps" but the checkpoint id is
# "microsoft/git-large-r" — confirm which checkpoint is intended.
git_processor_large_textcaps = GitProcessor.from_pretrained("microsoft/git-large-r")
git_model_large_textcaps = GitModel.from_pretrained("microsoft/git-large-r")
# Image model called by `compute_itm_score`.
itm_model = hub.load("https://tfhub.dev/google/LaViT/1")
# Sentence encoder called by `compute_textual_similarity` and `compute_itm_score`.
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

# Read statements from the external file 'statements.txt': one list entry per
# line, with line terminators removed by `splitlines`.
# NOTE(review): no explicit `encoding` is passed, so the platform default applies.
with open('statements.txt', 'r') as file:
    statements = file.read().splitlines()

# Function to generate image caption
def generate_caption(processor, model, image):
    """Produce a caption string for ``image``.

    The image is preprocessed with ``processor`` (requesting "pt" tensors),
    the result is passed to ``model`` as keyword arguments, and the
    highest-scoring id along the last axis of ``outputs.logits`` is decoded
    back to text.

    Args:
        processor: Callable preprocessor that also provides ``batch_decode``.
        model: Callable whose output exposes a ``logits`` attribute.
        image: The image to caption.

    Returns:
        The first decoded sequence, with special tokens skipped.
    """
    # NOTE(review): this decodes the argmax of a single forward pass rather
    # than calling a generation routine — confirm that is the intended
    # captioning strategy for the model in use.
    model_inputs = processor(images=image, return_tensors="pt")
    token_ids = model(**model_inputs).logits.argmax(-1)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

# Function to compute textual similarity
def compute_textual_similarity(caption, statement, encoder=None):
    """Return the inner product of the sentence embeddings of two texts.

    Args:
        caption: First text (the generated image caption).
        statement: Second text to compare against the caption.
        encoder: Optional callable mapping a list of strings to a sequence of
            embeddings, each exposing ``.numpy()``. Defaults to the
            module-level ``use_model``.

    Returns:
        The similarity as a plain Python ``float``.
    """
    if encoder is None:
        encoder = use_model
    # Each text is encoded as a batch of one; `[0]` picks its embedding.
    caption_embedding = np.asarray(encoder([caption])[0].numpy())
    statement_embedding = np.asarray(encoder([statement])[0].numpy())
    similarity = np.inner(caption_embedding, statement_embedding)
    # With 1-D embeddings np.inner yields a scalar, so indexing the result
    # with `[0]` (as was done before) raises IndexError. Flatten first so both
    # scalar and array results collapse to a single float.
    return float(np.asarray(similarity).reshape(-1)[0])

# Function to compute ITM score
def compute_itm_score(image, statement):
    """Score how well ``statement`` matches ``image``.

    The image is passed to the module-level ``itm_model`` and the statement is
    embedded with the module-level ``use_model``; the score is the ``[0][0]``
    element of the inner product of the two results.

    Args:
        image: The image, forwarded to ``itm_model`` unchanged.
        statement: The text to match against the image.

    Returns:
        The ``[0][0]`` entry of ``np.inner(image_features, statement_features)``.
    """
    # NOTE(review): the `[0][0]` index assumes the inner product has at least
    # two dimensions, and `image` is not converted before reaching
    # `itm_model` — confirm both against the model's expected input/output.
    image_embedding = itm_model(image)
    text_embedding = use_model([statement])[0].numpy()
    return np.inner(image_embedding, text_embedding)[0][0]

# Function to save DataFrame to CSV
def save_dataframe_to_csv(df):
    """Serialize ``df`` to CSV text, omitting the index column.

    Despite the name, nothing is written to disk here: the CSV content is
    returned to the caller as a string.
    """
    return df.to_csv(index=False)

# Main function to perform image captioning and image-text matching for multiple images
def process_images_and_statements(files):
    """Caption every uploaded image and score it against every statement.

    A caption is generated once per image. Then, for every entry in the
    module-level ``statements`` list, the caption/statement textual similarity
    and the image/statement ITM score are computed (both multiplied by 100)
    and averaged with equal weights into a final combined score.

    Args:
        files: Uploaded files, either a list of objects exposing ``.name`` or
            a mapping of file name to file. ``None`` is treated as no files.

    Returns:
        A tuple ``(results_df, csv_path)``: the results table with one row per
        image/statement pair, and the path of a temporary ``.csv`` file that
        holds the same rows (without the index column).
    """
    columns = [
        'Image File Name',
        'Statement',
        'Generated Caption',
        'Textual Similarity Score',
        'ITM Score',
        'Final Combined Score',
    ]

    # Normalise the upload into a {name: file} mapping.
    if files is None:
        files = {}
    elif isinstance(files, list):
        files = {f.name: f for f in files}

    rows = []
    for file_name, image_file in files.items():
        # Convert the image file to a PIL image; the context manager releases
        # the image once every statement has been scored against it.
        with Image.open(image_file) as image:
            caption = generate_caption(git_processor_large_textcaps, git_model_large_textcaps, image)
            for statement in statements:
                textual_similarity_score = compute_textual_similarity(caption, statement) * 100
                itm_score_statement = compute_itm_score(image, statement) * 100
                final_score = 0.5 * textual_similarity_score + 0.5 * itm_score_statement
                rows.append({
                    'Image File Name': file_name,  # Include the image file name
                    'Statement': statement,
                    'Generated Caption': caption,
                    'Textual Similarity Score': f"{textual_similarity_score:.2f}%",
                    'ITM Score': f"{itm_score_statement:.2f}%",
                    'Final Combined Score': f"{final_score:.2f}%",
                })

    # Build the table in one step; passing `columns` keeps the header stable
    # even when there are no rows (pd.concat on an empty list would raise).
    results_df = pd.DataFrame(rows, columns=columns)

    # The file-download output needs a path on disk rather than CSV text, so
    # persist the table to a temporary file that outlives this call.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline="", encoding="utf-8"
    ) as csv_file:
        results_df.to_csv(csv_file, index=False)
    return results_df, csv_file.name

# Gradio interface with File input to receive multiple images and file names.
# The input accepts multiple uploads (file_count="multiple", type="file").
image_input = gr.inputs.File(file_count="multiple", type="file", label="Upload Images")
# Two outputs: the results table and a file component labelled for CSV download.
output_df = gr.outputs.Dataframe(type="pandas", label="Results")
output_csv = gr.outputs.File(label="Download CSV")

# Wire `process_images_and_statements` to the components above; its two return
# values correspond, in order, to `output_df` and `output_csv`.
iface = gr.Interface(
    fn=process_images_and_statements,
    inputs=image_input,
    outputs=[output_df, output_csv],
    title="Image Captioning and Image-Text Matching",
    theme='sudeepshouche/minimalist',
    css=".output { flex-direction: column; } .output .outputs { width: 100%; }", # Custom CSS: column layout, full-width outputs
    capture_session=True,  # NOTE(review): presumably a legacy option unrelated to error capture — confirm against the installed Gradio version
)

# Runs at module level (no `__main__` guard), with debug mode enabled.
iface.launch(debug=True)