File size: 6,835 Bytes
5a7d5c7
9af5fdb
5a7d5c7
 
9af5fdb
5a7d5c7
 
 
 
 
9af5fdb
 
5a7d5c7
 
 
 
9af5fdb
 
 
 
 
 
 
 
 
 
 
 
 
5a7d5c7
 
9af5fdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a7d5c7
 
 
 
9af5fdb
5a7d5c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d798ab
5a7d5c7
6d798ab
5a7d5c7
 
 
6d798ab
 
5a7d5c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9af5fdb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
import easyocr
from fastapi import FastAPI
from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
import tempfile
import os
from gtts import gTTS
from fpdf import FPDF
import datetime
from PIL import Image
import torch

# Initialize components
app = FastAPI()  # Gradio UI is mounted onto this app further below

# Load models - Using microsoft/git-large-coco
# Module-level side effect: downloads/loads model weights at import time,
# which can take a while on first run.
try:
    # Try loading the better model first
    processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
    git_model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")
    print("Successfully loaded microsoft/git-large-coco model")
    USE_GIT = True  # flag read by generate_caption() to pick the model path
except Exception as e:
    # Any load failure (network, disk, memory) falls back to the smaller
    # vit-gpt2 captioning pipeline rather than aborting startup.
    print(f"Failed to load GIT model: {e}. Falling back to smaller model")
    captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
    USE_GIT = False

# Initialize EasyOCR
# NOTE(review): Reader construction may also download weights on first
# run — confirm that is acceptable at import time.
reader = easyocr.Reader(['en', 'fr'])  # English and French OCR

def generate_caption(image_path: str) -> str:
    """Generate a caption for the image at *image_path*.

    Uses the GIT model when it loaded successfully (module flag USE_GIT),
    otherwise the vit-gpt2 fallback pipeline. Returns a fixed error string
    on any failure rather than raising.
    """
    try:
        if USE_GIT:
            # Close the image handle deterministically (the original leaked
            # the file descriptor until GC), and skip autograd bookkeeping
            # since this is inference only.
            with Image.open(image_path) as image:
                inputs = processor(images=image, return_tensors="pt")
            with torch.no_grad():
                outputs = git_model.generate(**inputs, max_length=50)
            return processor.batch_decode(outputs, skip_special_tokens=True)[0]
        else:
            result = captioner(image_path)
            return result[0]['generated_text']
    except Exception as e:
        print(f"Caption generation error: {e}")
        return "Could not generate caption"

def analyze_image(image_path):
    """Run captioning and OCR over one image.

    Returns a dict with keys "caption" and "extracted_text" on success,
    or {"error": <message>} if anything raises.
    """
    try:
        caption = generate_caption(image_path)

        # detail=0 makes EasyOCR return plain strings instead of boxes.
        text_lines = reader.readtext(image_path, detail=0)
        extracted = "\n".join(text_lines) if text_lines else "No text detected"

        return {"caption": caption, "extracted_text": extracted}
    except Exception as e:
        return {"error": str(e)}

def text_to_speech(text: str) -> str:
    """Convert *text* to an MP3 with gTTS and return the temp file path.

    Returns "" on failure. Caller is responsible for eventually deleting
    the file (it is served later via the /files endpoint).
    """
    try:
        # mkstemp + close instead of NamedTemporaryFile(delete=False):
        # the original held the file open while gTTS re-opened the same
        # path, which leaks a descriptor and fails outright on Windows.
        fd, audio_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        tts = gTTS(text)
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        print(f"Text-to-speech error: {e}")
        return ""

def _latin1_safe(text: str) -> str:
    """Replace characters the classic FPDF core fonts (latin-1 only)
    cannot encode, so multi_cell does not raise UnicodeEncodeError on
    arbitrary OCR output."""
    return text.encode("latin-1", "replace").decode("latin-1")

def create_pdf(content: dict, original_filename: str) -> str:
    """Create a PDF report from an analyze_image() result.

    content must contain "caption" and "extracted_text". Returns the
    path of the generated temp PDF, or "" on failure.
    """
    try:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", size=12)

        # Title
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(200, 10, txt="Image Analysis Report", ln=1, align='C')
        pdf.set_font("Arial", size=12)

        # Metadata
        pdf.cell(200, 10, txt=_latin1_safe(f"Original file: {original_filename}"), ln=1)
        pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
        pdf.ln(10)

        # Caption (explicit family instead of "" = "keep current")
        pdf.set_font("Arial", 'B')
        pdf.cell(200, 10, txt="Image Caption:", ln=1)
        pdf.set_font("Arial")
        pdf.multi_cell(0, 10, txt=_latin1_safe(content['caption']))
        pdf.ln(5)

        # Extracted Text
        pdf.set_font("Arial", 'B')
        pdf.cell(200, 10, txt="Extracted Text:", ln=1)
        pdf.set_font("Arial")
        pdf.multi_cell(0, 10, txt=_latin1_safe(content['extracted_text']))

        # mkstemp + close avoids holding an open handle while pdf.output
        # re-opens the same path (Windows failure / fd leak in original).
        fd, pdf_path = tempfile.mkstemp(suffix=".pdf")
        os.close(fd)
        pdf.output(pdf_path)
        return pdf_path
    except Exception as e:
        print(f"PDF creation error: {e}")
        return ""

def process_image(file_path: str, enable_tts: bool):
    """Gradio click handler: analyze an uploaded image.

    Returns a 4-tuple (result_text, status, audio_path_or_None,
    pdf_path_or_None) matching the interface's output components.
    """
    if not file_path:
        return "Please upload an image first", "Ready", None, None

    try:
        analysis = analyze_image(file_path)
        if "error" in analysis:
            return analysis["error"], "Error", None, None

        caption = analysis['caption']
        extracted = analysis['extracted_text']
        summary = f"📷 Image Caption:\n{caption}\n\n✍️ Extracted Text:\n{extracted}"

        # Audio is optional; only synthesize when the checkbox is on.
        audio_path = None
        if enable_tts:
            audio_path = text_to_speech(
                f"Image caption: {caption}. Extracted text: {extracted}"
            )

        # PDF report is always produced.
        report_path = create_pdf(analysis, os.path.basename(file_path))

        return summary, "Analysis complete", audio_path, report_path
    except Exception as e:
        return f"Analysis error: {str(e)}", "Error", None, None

# Gradio Interface
with gr.Blocks(title="Image Analysis Service", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🖼️ Image Analysis Service")
    gr.Markdown("Upload an image to get automatic captioning and text extraction")
    
    with gr.Row():
        with gr.Column():
            # type="filepath" makes process_image receive a path string.
            image_input = gr.Image(label="Upload Image", type="filepath")
            tts_checkbox = gr.Checkbox(
                label="Enable Text-to-Speech",
                value=False
            )
            analyze_btn = gr.Button("Analyze Image", variant="primary")
        
        with gr.Column():
            output = gr.Textbox(label="Analysis Results", lines=10)
            status = gr.Textbox(label="Status", interactive=False)
            # Both start hidden; revealed by the handlers below.
            audio_output = gr.Audio(label="Audio Summary", visible=False)
            pdf_download = gr.File(label="Download Report", visible=False)
    
    def toggle_audio_visibility(enable_tts):
        """Show/hide the audio player as the TTS checkbox is toggled."""
        return gr.Audio(visible=enable_tts)
    
    def update_ui(result, status, audio_path, pdf_path):
        """Post-process step: reveal audio/PDF components only when a
        path was produced by process_image.

        NOTE(review): in the .then() chain below this receives the
        component *values* (audio as a path-like value, or None) — works
        because visibility only needs a None check; confirm against the
        installed Gradio version's value types.
        """
        return (
            result,
            status,
            gr.Audio(visible=audio_path is not None, value=audio_path),
            gr.File(visible=pdf_path is not None, value=pdf_path)
        )
    
    tts_checkbox.change(
        fn=toggle_audio_visibility,
        inputs=tts_checkbox,
        outputs=audio_output
    )
    
    # Two-step chain: run the analysis, then adjust component visibility
    # based on which artifacts (audio/PDF) were actually produced.
    analyze_btn.click(
        fn=process_image,
        inputs=[image_input, tts_checkbox],
        outputs=[output, status, audio_output, pdf_download]
    ).then(
        fn=update_ui,
        inputs=[output, status, audio_output, pdf_download],
        outputs=[output, status, audio_output, pdf_download]
    )

# FastAPI setup
@app.get("/files/{file_name}")
async def get_file(file_name: str):
    """Serve a previously generated temp file (audio/PDF) by bare name.

    Security: the original joined the raw client-supplied name into the
    temp dir, allowing path traversal (e.g. '../../etc/passwd'). Reject
    any name containing path components.
    """
    if os.path.basename(file_name) != file_name or file_name in ("", ".", ".."):
        return JSONResponse({"error": "Invalid file name"}, status_code=400)
    file_path = os.path.join(tempfile.gettempdir(), file_name)
    if os.path.exists(file_path):
        return FileResponse(file_path)
    return JSONResponse({"error": "File not found"}, status_code=404)

# Mount the Gradio UI at the root path of the FastAPI app.
app = gr.mount_gradio_app(app, demo, path="/")

# NOTE(review): this route is registered AFTER the Gradio mount already
# claims "/", so it is most likely shadowed (dead code) — and if it were
# reachable, redirecting "/" to "/" would loop forever. Confirm intent;
# it can probably be removed or pointed at a different path.
@app.get("/")
def redirect_to_interface():
    return RedirectResponse(url="/")