# NOTE(review): the lines previously here were HuggingFace Spaces web-page
# residue (status text, file size, commit hashes, a line-number gutter) —
# not Python source — and broke the module at import time. Removed.
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
import easyocr
from fastapi import FastAPI
from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
import tempfile
import os
from gtts import gTTS
from fpdf import FPDF
import datetime
from PIL import Image
import torch
# Initialize components
app = FastAPI()  # FastAPI app; the Gradio UI is mounted onto it further down

# Load models - Using microsoft/git-large-coco
try:
    # Try loading the better model first
    processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
    git_model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")
    print("Successfully loaded microsoft/git-large-coco model")
    USE_GIT = True  # flag read by generate_caption() to pick the model path
except Exception as e:
    # Any load failure (download, memory, ...) falls back to the lighter pipeline
    print(f"Failed to load GIT model: {e}. Falling back to smaller model")
    captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
    USE_GIT = False

# Initialize EasyOCR
reader = easyocr.Reader(['en', 'fr'])  # English and French OCR
def generate_caption(image_path):
    """Generate a caption for the image at *image_path*.

    Uses the GIT model when it loaded successfully (``USE_GIT``), otherwise
    falls back to the ViT-GPT2 captioning pipeline.

    Returns:
        The caption string, or a fixed error message on failure.
    """
    try:
        if USE_GIT:
            # Context manager releases the file handle promptly (the original
            # leaked it); convert to RGB so palette/RGBA/grayscale images
            # don't break the processor, which expects 3-channel input.
            with Image.open(image_path) as image:
                inputs = processor(images=image.convert("RGB"), return_tensors="pt")
            outputs = git_model.generate(**inputs, max_length=50)
            return processor.batch_decode(outputs, skip_special_tokens=True)[0]
        else:
            result = captioner(image_path)
            return result[0]['generated_text']
    except Exception as e:
        print(f"Caption generation error: {e}")
        return "Could not generate caption"
def analyze_image(image_path):
    """Run captioning and OCR over the image at *image_path*.

    Returns:
        dict with ``caption`` and ``extracted_text`` keys on success,
        or a dict with a single ``error`` key if anything fails.
    """
    try:
        # Captioning first, then plain-text OCR (detail=0 -> strings only).
        caption = generate_caption(image_path)
        text_lines = reader.readtext(image_path, detail=0)
        extracted = "\n".join(text_lines) if text_lines else "No text detected"
        return {"caption": caption, "extracted_text": extracted}
    except Exception as exc:
        return {"error": str(exc)}
def text_to_speech(text: str) -> str:
    """Convert *text* to an MP3 file via gTTS.

    Returns:
        The path of the generated audio file, or "" on failure.
    """
    try:
        tts = gTTS(text)
        # Create the temp file, then close our handle BEFORE gTTS writes to
        # it: keeping it open leaks a file descriptor and, on Windows, locks
        # the file so tts.save() cannot reopen it.
        temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        temp_audio.close()
        tts.save(temp_audio.name)
        return temp_audio.name
    except Exception as e:
        print(f"Text-to-speech error: {e}")
        return ""
def create_pdf(content: dict, original_filename: str) -> str:
    """Create a PDF report from an analysis result.

    Args:
        content: dict with ``caption`` and ``extracted_text`` keys.
        original_filename: name of the uploaded image, shown in the report.

    Returns:
        Path to the generated temp PDF, or "" on failure.
    """
    def _latin1(text: str) -> str:
        # FPDF's built-in Arial font only supports latin-1; replace anything
        # outside it (emoji, CJK, ...) instead of crashing pdf.output().
        return text.encode("latin-1", "replace").decode("latin-1")

    try:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", size=12)
        # Title
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(200, 10, txt="Image Analysis Report", ln=1, align='C')
        pdf.set_font("Arial", size=12)
        # Metadata
        pdf.cell(200, 10, txt=_latin1(f"Original file: {original_filename}"), ln=1)
        pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
        pdf.ln(10)
        # Caption section ("" keeps the current family, Arial)
        pdf.set_font("", 'B')
        pdf.cell(200, 10, txt="Image Caption:", ln=1)
        pdf.set_font("")
        pdf.multi_cell(0, 10, txt=_latin1(content['caption']))
        pdf.ln(5)
        # Extracted-text section
        pdf.set_font("", 'B')
        pdf.cell(200, 10, txt="Extracted Text:", ln=1)
        pdf.set_font("")
        pdf.multi_cell(0, 10, txt=_latin1(content['extracted_text']))
        # Close our handle before FPDF writes, so the fd isn't leaked and the
        # file isn't locked on Windows.
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        temp_pdf.close()
        pdf.output(temp_pdf.name)
        return temp_pdf.name
    except Exception as e:
        print(f"PDF creation error: {e}")
        return ""
def process_image(file_path: str, enable_tts: bool):
    """Gradio handler: analyze an uploaded image end-to-end.

    Args:
        file_path: path of the uploaded image (falsy when nothing uploaded).
        enable_tts: whether to also synthesize an audio summary.

    Returns:
        Tuple of (result text, status text, audio path or None, pdf path or None).
    """
    # Guard clause: nothing uploaded yet.
    if not file_path:
        return "Please upload an image first", "Ready", None, None

    try:
        uploaded_name = os.path.basename(file_path)

        analysis = analyze_image(file_path)
        if "error" in analysis:
            return analysis["error"], "Error", None, None

        # Human-readable summary shown in the results textbox.
        summary = "\n".join([
            "📷 Image Caption:",
            analysis["caption"],
            "",
            "✍️ Extracted Text:",
            analysis["extracted_text"],
        ])

        # Optional spoken summary.
        audio_file = None
        if enable_tts:
            audio_file = text_to_speech(
                f"Image caption: {analysis['caption']}. Extracted text: {analysis['extracted_text']}"
            )

        report_file = create_pdf(analysis, uploaded_name)
        return summary, "Analysis complete", audio_file, report_file
    except Exception as exc:
        return f"Analysis error: {str(exc)}", "Error", None, None
# Gradio Interface
with gr.Blocks(title="Image Analysis Service", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🖼️ Image Analysis Service")
    gr.Markdown("Upload an image to get automatic captioning and text extraction")
    with gr.Row():
        with gr.Column():
            # Left column: inputs.
            image_input = gr.Image(label="Upload Image", type="filepath")
            tts_checkbox = gr.Checkbox(
                label="Enable Text-to-Speech",
                value=False
            )
            analyze_btn = gr.Button("Analyze Image", variant="primary")
        with gr.Column():
            # Right column: outputs; audio and PDF widgets start hidden and
            # are revealed by update_ui() once files exist.
            output = gr.Textbox(label="Analysis Results", lines=10)
            status = gr.Textbox(label="Status", interactive=False)
            audio_output = gr.Audio(label="Audio Summary", visible=False)
            pdf_download = gr.File(label="Download Report", visible=False)

    def toggle_audio_visibility(enable_tts):
        # Show/hide the audio player immediately when the checkbox changes.
        return gr.Audio(visible=enable_tts)

    def update_ui(result, status, audio_path, pdf_path):
        # Second step of the click chain: reveal the audio/PDF widgets only
        # when a file was actually produced.
        # NOTE(review): the `status` parameter shadows the gr.Textbox defined
        # above; harmless inside this function, but worth renaming.
        return (
            result,
            status,
            gr.Audio(visible=audio_path is not None, value=audio_path),
            gr.File(visible=pdf_path is not None, value=pdf_path)
        )

    tts_checkbox.change(
        fn=toggle_audio_visibility,
        inputs=tts_checkbox,
        outputs=audio_output
    )
    # Click chain: process_image fills the outputs, then update_ui toggles
    # visibility based on which file paths are non-None.
    analyze_btn.click(
        fn=process_image,
        inputs=[image_input, tts_checkbox],
        outputs=[output, status, audio_output, pdf_download]
    ).then(
        fn=update_ui,
        inputs=[output, status, audio_output, pdf_download],
        outputs=[output, status, audio_output, pdf_download]
    )
# FastAPI setup
@app.get("/files/{file_name}")
async def get_file(file_name: str):
    """Serve a generated artifact (audio/PDF) from the temp directory.

    ``file_name`` comes straight from the URL, so it is untrusted: reduce it
    to its basename to block path traversal (e.g. ``..%2F..%2Fetc%2Fpasswd``
    escaping the temp dir).
    """
    safe_name = os.path.basename(file_name)
    file_path = os.path.join(tempfile.gettempdir(), safe_name)
    # isfile (not exists) so directories are never handed to FileResponse.
    if os.path.isfile(file_path):
        return FileResponse(file_path)
    return JSONResponse({"error": "File not found"}, status_code=404)
# Mount the Gradio UI at the application root.
app = gr.mount_gradio_app(app, demo, path="/")

# NOTE(review): this route is registered AFTER the Gradio mount on "/", so
# the mount takes precedence and this handler appears unreachable; if it ever
# did run, redirecting "/" to "/" would loop forever. Consider removing it,
# or mounting Gradio under a sub-path and redirecting there instead.
@app.get("/")
def redirect_to_interface():
    return RedirectResponse(url="/")