# Spaces: Running
import datetime
import os
import tempfile

import easyocr
import gradio as gr
from fastapi import FastAPI
from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
from fpdf import FPDF
from gtts import gTTS
from transformers import pipeline
# Initialize components: the FastAPI application that later hosts the UI.
app = FastAPI()

# Load models once at import time so every request reuses the same instances.
captioner = pipeline(
    task="image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
)
reader = easyocr.Reader(lang_list=['en', 'fr'])  # English and French OCR
def analyze_image(image_path):
    """Caption an image and extract its text.

    Runs the module-level ``captioner`` first and the module-level OCR
    ``reader`` second on the same path.

    Args:
        image_path: Path of the image handed to both models.

    Returns:
        ``{"caption": ..., "extracted_text": ...}`` on success, where the
        extracted text is the OCR lines joined by newlines, or
        ``"No text detected"`` when OCR returns nothing. Any exception is
        reported as ``{"error": <message>}`` instead of being raised.
    """
    try:
        # Captioning: keep the generated text of the first result only.
        first_caption = captioner(image_path)[0]
        caption = first_caption['generated_text']

        # OCR: detail=0 is requested and the returned items are joined as text.
        text_lines = reader.readtext(image_path, detail=0)
        if text_lines:
            extracted_text = "\n".join(text_lines)
        else:
            extracted_text = "No text detected"
    except Exception as err:
        return {"error": str(err)}

    return {"caption": caption, "extracted_text": extracted_text}
def text_to_speech(text: str) -> str:
    """Convert text to speech and return the path of the generated MP3.

    Args:
        text: The text to synthesize.

    Returns:
        Path of a temporary ``.mp3`` file, or ``""`` when the text is blank
        or synthesis fails. The caller owns the returned file (it is created
        with ``delete=False`` and is not removed here on success).
    """
    # Blank input cannot produce audio; skip the synthesis attempt entirely.
    if not text or not text.strip():
        return ""

    audio_path = ""
    try:
        tts = gTTS(text)
        # Reserve a unique path, then close the handle before writing to it:
        # leaving it open leaks a file descriptor and can block the write on
        # platforms that lock open files.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            audio_path = temp_audio.name
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        print(f"Text-to-speech error: {e}")
        # Do not leave an empty or partial temp file behind on failure.
        if audio_path:
            try:
                os.remove(audio_path)
            except OSError:
                pass
        return ""
def _latin1_safe(text) -> str:
    """Return *text* as a string with non-Latin-1 characters replaced by '?'."""
    return str(text).encode("latin-1", "replace").decode("latin-1")


def create_pdf(content: dict, original_filename: str) -> str:
    """Create a PDF report for one analyzed image.

    Args:
        content: Analysis result; must contain the keys ``'caption'`` and
            ``'extracted_text'``.
        original_filename: Name of the analyzed file, printed in the report
            metadata.

    Returns:
        Path of a temporary ``.pdf`` file, or ``""`` if the report could not
        be created. The caller owns the returned file (``delete=False``).
    """
    pdf_path = ""
    try:
        pdf = FPDF()
        pdf.add_page()

        # Title
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(200, 10, txt="Image Analysis Report", ln=1, align='C')
        pdf.set_font("Arial", size=12)

        # Metadata
        # The built-in Arial font is limited to Latin-1, so every piece of
        # dynamic text is sanitized; otherwise a single unsupported character
        # (e.g. from OCR output) would abort the whole report.
        pdf.cell(200, 10, txt=_latin1_safe(f"Original file: {original_filename}"), ln=1)
        pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
        pdf.ln(10)

        # Caption
        pdf.set_font("", 'B')
        pdf.cell(200, 10, txt="Image Caption:", ln=1)
        pdf.set_font("")
        pdf.multi_cell(0, 10, txt=_latin1_safe(content['caption']))
        pdf.ln(5)

        # Extracted Text
        pdf.set_font("", 'B')
        pdf.cell(200, 10, txt="Extracted Text:", ln=1)
        pdf.set_font("")
        pdf.multi_cell(0, 10, txt=_latin1_safe(content['extracted_text']))

        # Reserve a unique path and close the handle before FPDF writes to it,
        # so no file descriptor is leaked.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            pdf_path = temp_pdf.name
        pdf.output(pdf_path)
        return pdf_path
    except Exception as e:
        print(f"PDF creation error: {e}")
        # Do not leave an empty or partial temp file behind on failure.
        if pdf_path:
            try:
                os.remove(pdf_path)
            except OSError:
                pass
        return ""
def process_image(file, enable_tts: bool):
    """Handle image processing for the Gradio interface.

    Args:
        file: The uploaded image. Either a filesystem path (``str`` or
            ``os.PathLike``, which is what an image input configured with
            ``type="filepath"`` supplies) or an object exposing the path as
            ``.name``. ``None`` means nothing was uploaded.
        enable_tts: Whether to also generate an audio summary.

    Returns:
        A 4-tuple ``(result_text, status, audio_path, pdf_path)``. The two
        paths are ``None`` when the corresponding artifact was not requested
        or could not be generated.
    """
    if file is None:
        return "Please upload an image first", "Ready", None, None

    # Accept a plain path as well as a file-like object; calling ``.name`` on
    # a path string would raise AttributeError.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name
    original_filename = os.path.basename(file_path)

    try:
        # Analyze image
        result = analyze_image(file_path)
        if "error" in result:
            return result["error"], "Error", None, None

        # Format output
        output_text = f"📷 Image Caption:\n{result['caption']}\n\n✍️ Extracted Text:\n{result['extracted_text']}"

        # Generate audio. The helper signals failure with "", which must not
        # reach the UI as if it were a file path, so it is mapped to None.
        audio_path = None
        if enable_tts:
            audio_path = text_to_speech(
                f"Image caption: {result['caption']}. Extracted text: {result['extracted_text']}"
            ) or None

        # Generate PDF (same "" -> None mapping as for the audio).
        pdf_path = create_pdf(result, original_filename) or None

        return output_text, "Analysis complete", audio_path, pdf_path
    except Exception as e:
        return f"Analysis error: {str(e)}", "Error", None, None
# Gradio Interface: inputs in the first column, results in the second.
with gr.Blocks(title="Image Analysis Service", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🖼️ Image Analysis Service")
    gr.Markdown("Upload an image to get automatic captioning and text extraction")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(label="Upload Image", type="filepath")
            tts_checkbox = gr.Checkbox(label="Enable Text-to-Speech", value=False)
            analyze_btn = gr.Button("Analyze Image", variant="primary")
        with gr.Column():
            output = gr.Textbox(label="Analysis Results", lines=10)
            status = gr.Textbox(label="Status", interactive=False)
            # Both artifact widgets start hidden and are revealed by the
            # handlers below.
            audio_output = gr.Audio(label="Audio Summary", visible=False)
            pdf_download = gr.File(label="Download Report", visible=False)

    def toggle_audio_visibility(enable_tts):
        """Show the audio widget only while text-to-speech is enabled."""
        audio_update = gr.Audio(visible=enable_tts)
        return audio_update

    def update_ui(result, status, audio_path, pdf_path):
        """Pass the texts through and show each artifact widget only if it has a value."""
        has_audio = audio_path is not None
        has_pdf = pdf_path is not None
        audio_update = gr.Audio(visible=has_audio, value=audio_path)
        pdf_update = gr.File(visible=has_pdf, value=pdf_path)
        return result, status, audio_update, pdf_update

    # Checkbox toggles the audio widget's visibility.
    tts_checkbox.change(toggle_audio_visibility, tts_checkbox, audio_output)

    # Run the analysis first, then a follow-up step that adjusts visibility
    # of the audio and PDF widgets based on the produced values.
    analysis_event = analyze_btn.click(
        process_image,
        [image_input, tts_checkbox],
        [output, status, audio_output, pdf_download],
    )
    analysis_event.then(
        update_ui,
        [output, status, audio_output, pdf_download],
        [output, status, audio_output, pdf_download],
    )
# FastAPI setup
async def get_file(file_name: str):
    """Serve a file from the system temporary directory by its bare name.

    NOTE(review): no route decorator is visible on this handler in this
    view — confirm it is registered on ``app``.

    Args:
        file_name: Name of a file located directly inside the temp directory.
            Values containing a directory component are rejected.

    Returns:
        A ``FileResponse`` for an existing regular file, otherwise a JSON
        ``{"error": "File not found"}`` response with status 404.
    """
    # Only accept a bare file name so a crafted value such as
    # "../../etc/passwd" cannot escape the temporary directory.
    if os.path.basename(file_name) != file_name:
        return JSONResponse({"error": "File not found"}, status_code=404)

    file_path = os.path.join(tempfile.gettempdir(), file_name)
    # isfile (not exists) so directories, including "", "." and "..", are
    # answered with 404 instead of being handed to FileResponse.
    if os.path.isfile(file_path):
        return FileResponse(file_path)
    return JSONResponse({"error": "File not found"}, status_code=404)
# Attach the Gradio UI to the FastAPI application at the root path.
app = gr.mount_gradio_app(app=app, blocks=demo, path="/")
def redirect_to_interface():
    """Return a redirect response pointing at the root path ("/").

    NOTE(review): no route decorator is visible on this handler in this
    view — confirm how it is registered.
    """
    interface_redirect = RedirectResponse(url="/")
    return interface_redirect