import gradio as gr
from transformers import pipeline
import easyocr
from fastapi import FastAPI
from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
import tempfile
import os
from gtts import gTTS
from fpdf import FPDF
import datetime

# Initialize components
app = FastAPI()

# Load models
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
reader = easyocr.Reader(['en', 'fr'])  # English and French OCR


def analyze_image(image_path):
    """Process image with both captioning and OCR"""
    try:
        # Generate image caption
        caption_result = captioner(image_path)
        caption = caption_result[0]['generated_text']

        # Extract text with EasyOCR
        ocr_result = reader.readtext(image_path, detail=0)
        extracted_text = "\n".join(ocr_result) if ocr_result else "No text detected"

        return {
            "caption": caption,
            "extracted_text": extracted_text
        }
    except Exception as e:
        return {"error": str(e)}


def text_to_speech(text: str) -> str:
    """Convert text to speech"""
    try:
        tts = gTTS(text)
        temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        tts.save(temp_audio.name)
        return temp_audio.name
    except Exception as e:
        print(f"Text-to-speech error: {e}")
        return ""


def create_pdf(content: dict, original_filename: str) -> str:
    """Create PDF report"""
    try:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", size=12)

        # Title
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(200, 10, txt="Image Analysis Report", ln=1, align='C')
        pdf.set_font("Arial", size=12)

        # Metadata
        pdf.cell(200, 10, txt=f"Original file: {original_filename}", ln=1)
        pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
        pdf.ln(10)

        # Caption
        pdf.set_font("", 'B')
        pdf.cell(200, 10, txt="Image Caption:", ln=1)
        pdf.set_font("")
        pdf.multi_cell(0, 10, txt=content['caption'])
        pdf.ln(5)

        # Extracted Text
        pdf.set_font("", 'B')
        pdf.cell(200, 10, txt="Extracted Text:", ln=1)
        pdf.set_font("")
        pdf.multi_cell(0, 10, txt=content['extracted_text'])

        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        pdf.output(temp_pdf.name)
        return temp_pdf.name
    except Exception as e:
        print(f"PDF creation error: {e}")
        return ""


def process_image(file_path: str, enable_tts: bool):
    """Handle image processing for Gradio interface"""
    if not file_path:
        return "Please upload an image first", "Ready", None, None
    try:
        original_filename = os.path.basename(file_path)

        # Analyze image
        result = analyze_image(file_path)
        if "error" in result:
            return result["error"], "Error", None, None

        # Format output
        output_text = f"📷 Image Caption:\n{result['caption']}\n\n✍️ Extracted Text:\n{result['extracted_text']}"

        # Generate audio (only when TTS is enabled)
        audio_path = (
            text_to_speech(
                f"Image caption: {result['caption']}. "
                f"Extracted text: {result['extracted_text']}"
            )
            if enable_tts
            else None
        )

        # Generate PDF
        pdf_path = create_pdf(result, original_filename)

        return output_text, "Analysis complete", audio_path, pdf_path
    except Exception as e:
        return f"Analysis error: {str(e)}", "Error", None, None


# Gradio Interface
with gr.Blocks(title="Image Analysis Service", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🖼️ Image Analysis Service")
    gr.Markdown("Upload an image to get automatic captioning and text extraction")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(label="Upload Image", type="filepath")
            tts_checkbox = gr.Checkbox(
                label="Enable Text-to-Speech",
                value=False
            )
            analyze_btn = gr.Button("Analyze Image", variant="primary")
        with gr.Column():
            output = gr.Textbox(label="Analysis Results", lines=10)
            status = gr.Textbox(label="Status", interactive=False)
            audio_output = gr.Audio(label="Audio Summary", visible=False)
            pdf_download = gr.File(label="Download Report", visible=False)

    def toggle_audio_visibility(enable_tts):
        return gr.Audio(visible=enable_tts)

    def update_ui(result, status, audio_path, pdf_path):
        # Show the audio player and download link only when files were produced
        return (
            result,
            status,
            gr.Audio(visible=audio_path is not None, value=audio_path),
            gr.File(visible=pdf_path is not None, value=pdf_path)
        )

    tts_checkbox.change(
        fn=toggle_audio_visibility,
        inputs=tts_checkbox,
        outputs=audio_output
    )

    analyze_btn.click(
        fn=process_image,
        inputs=[image_input, tts_checkbox],
        outputs=[output, status, audio_output, pdf_download]
    ).then(
        fn=update_ui,
        inputs=[output, status, audio_output, pdf_download],
        outputs=[output, status, audio_output, pdf_download]
    )


# FastAPI setup
@app.get("/files/{file_name}")
async def get_file(file_name: str):
    file_path = os.path.join(tempfile.gettempdir(), file_name)
    if os.path.exists(file_path):
        return FileResponse(file_path)
    return JSONResponse({"error": "File not found"}, status_code=404)


# Mount the Gradio UI under /gradio so the root path can redirect to it
# (mounting at "/" and redirecting "/" to itself would loop).
app = gr.mount_gradio_app(app, demo, path="/gradio")


@app.get("/")
def redirect_to_interface():
    return RedirectResponse(url="/gradio")