# Hugging Face Space: image analysis service (captioning + OCR + TTS + PDF report)
# (The "Spaces: Sleeping" lines were page-scrape residue, not code.)
# Standard library
import datetime
import os
import tempfile

# Third-party
import easyocr
import gradio as gr
from fastapi import FastAPI
from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
from fpdf import FPDF
from gtts import gTTS
from transformers import pipeline
| # Initialize components | |
| app = FastAPI() | |
| # Load models | |
| captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning") | |
| reader = easyocr.Reader(['en', 'fr']) # English and French OCR | |
def analyze_image(image_path):
    """Run captioning and OCR over the image at *image_path*.

    Returns
    -------
    dict
        ``{"caption": str, "extracted_text": str}`` on success, or
        ``{"error": str}`` if either model call raises.
    """
    try:
        # Vision-to-text caption from the ViT-GPT2 pipeline.
        caption = captioner(image_path)[0]['generated_text']

        # detail=0 makes EasyOCR return bare strings instead of boxes+confidences.
        lines = reader.readtext(image_path, detail=0)
        extracted = "\n".join(lines) if lines else "No text detected"

        return {"caption": caption, "extracted_text": extracted}
    except Exception as exc:
        # Surface the failure to the caller as data rather than raising.
        return {"error": str(exc)}
def text_to_speech(text: str) -> str:
    """Synthesize *text* to an MP3 with gTTS.

    Returns the path of the generated temp file, or "" on any failure
    (gTTS needs network access and raises on empty input).
    """
    try:
        tts = gTTS(text)
        # mkstemp + close instead of NamedTemporaryFile(delete=False):
        # the original kept the fd open while gTTS wrote to the same path,
        # which leaks the descriptor and fails outright on Windows.
        fd, audio_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        print(f"Text-to-speech error: {e}")
        return ""
def create_pdf(content: dict, original_filename: str) -> str:
    """Render the analysis *content* into a one-page PDF report.

    Parameters
    ----------
    content : dict with "caption" and "extracted_text" keys.
    original_filename : shown in the report metadata line.

    Returns the temp-file path of the PDF, or "" on failure.
    """
    def _latin1(text: str) -> str:
        # Core FPDF fonts (Arial) are Latin-1 only; un-sanitized caption/OCR
        # text with other glyphs makes pdf.output() raise UnicodeEncodeError.
        return text.encode("latin-1", "replace").decode("latin-1")

    try:
        pdf = FPDF()
        pdf.add_page()

        # Title
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(200, 10, txt="Image Analysis Report", ln=1, align='C')
        pdf.set_font("Arial", size=12)

        # Metadata
        pdf.cell(200, 10, txt=_latin1(f"Original file: {original_filename}"), ln=1)
        pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
        pdf.ln(10)

        # Caption section
        pdf.set_font("Arial", 'B', 12)
        pdf.cell(200, 10, txt="Image Caption:", ln=1)
        pdf.set_font("Arial", size=12)
        pdf.multi_cell(0, 10, txt=_latin1(content['caption']))
        pdf.ln(5)

        # Extracted-text section
        pdf.set_font("Arial", 'B', 12)
        pdf.cell(200, 10, txt="Extracted Text:", ln=1)
        pdf.set_font("Arial", size=12)
        pdf.multi_cell(0, 10, txt=_latin1(content['extracted_text']))

        # mkstemp + close: fpdf reopens the path itself, so holding the
        # NamedTemporaryFile handle open (as before) just leaked an fd.
        fd, pdf_path = tempfile.mkstemp(suffix=".pdf")
        os.close(fd)
        pdf.output(pdf_path)
        return pdf_path
    except Exception as e:
        print(f"PDF creation error: {e}")
        return ""
def process_image(file, enable_tts: bool):
    """Gradio click handler: caption + OCR the upload, optional TTS, PDF report.

    Returns a 4-tuple: (result_text, status, audio_path_or_None, pdf_path_or_None).
    """
    if file is None:
        return "Please upload an image first", "Ready", None, None

    # BUG FIX: gr.Image(type="filepath") delivers a plain str path, so the
    # original `file.name` raised AttributeError. Accept both a str path and
    # legacy file-like objects that carry a .name attribute.
    file_path = file if isinstance(file, str) else file.name
    original_filename = os.path.basename(file_path)

    try:
        result = analyze_image(file_path)
        if "error" in result:
            return result["error"], "Error", None, None

        output_text = f"📷 Image Caption:\n{result['caption']}\n\n✍️ Extracted Text:\n{result['extracted_text']}"

        # Only synthesize audio when the user asked for it.
        audio_path = None
        if enable_tts:
            audio_path = text_to_speech(
                f"Image caption: {result['caption']}. Extracted text: {result['extracted_text']}"
            )

        pdf_path = create_pdf(result, original_filename)
        return output_text, "Analysis complete", audio_path, pdf_path
    except Exception as e:
        return f"Analysis error: {str(e)}", "Error", None, None
# Gradio Interface: two-column layout — inputs on the left, results on the right.
with gr.Blocks(title="Image Analysis Service", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🖼️ Image Analysis Service")
    gr.Markdown("Upload an image to get automatic captioning and text extraction")
    with gr.Row():
        with gr.Column():
            # type="filepath" hands process_image a path string, not pixels.
            image_input = gr.Image(label="Upload Image", type="filepath")
            tts_checkbox = gr.Checkbox(
                label="Enable Text-to-Speech",
                value=False
            )
            analyze_btn = gr.Button("Analyze Image", variant="primary")
        with gr.Column():
            output = gr.Textbox(label="Analysis Results", lines=10)
            status = gr.Textbox(label="Status", interactive=False)
            # Audio/PDF widgets start hidden; they are revealed once a result exists.
            audio_output = gr.Audio(label="Audio Summary", visible=False)
            pdf_download = gr.File(label="Download Report", visible=False)

    def toggle_audio_visibility(enable_tts):
        # Show/hide the audio player as the checkbox is toggled.
        return gr.Audio(visible=enable_tts)

    def update_ui(result, status, audio_path, pdf_path):
        # Second pass over the outputs: flip visibility on based on whether
        # an audio/PDF path was actually produced by process_image.
        return (
            result,
            status,
            gr.Audio(visible=audio_path is not None, value=audio_path),
            gr.File(visible=pdf_path is not None, value=pdf_path)
        )

    tts_checkbox.change(
        fn=toggle_audio_visibility,
        inputs=tts_checkbox,
        outputs=audio_output
    )
    # Chain: run the analysis, then feed its outputs back through update_ui
    # so the audio/file components get their visibility updated.
    analyze_btn.click(
        fn=process_image,
        inputs=[image_input, tts_checkbox],
        outputs=[output, status, audio_output, pdf_download]
    ).then(
        fn=update_ui,
        inputs=[output, status, audio_output, pdf_download],
        outputs=[output, status, audio_output, pdf_download]
    )
# FastAPI setup
async def get_file(file_name: str):
    """Serve a previously generated temp file (audio/PDF) by bare name.

    NOTE(review): this handler has no route decorator (e.g.
    @app.get("/file/{file_name}")), so it is never registered — confirm
    the intended wiring. FileResponse/JSONResponse were also used here
    without being imported (NameError); the imports are now at file top.
    """
    # Restrict to the basename so "../"-style names cannot escape tempdir.
    safe_name = os.path.basename(file_name)
    file_path = os.path.join(tempfile.gettempdir(), safe_name)
    if os.path.exists(file_path):
        return FileResponse(file_path)
    return JSONResponse({"error": "File not found"}, status_code=404)

# Mount the Gradio UI at the application root.
app = gr.mount_gradio_app(app, demo, path="/")

def redirect_to_interface():
    """Redirect to the Gradio UI root (also unregistered — no decorator)."""
    return RedirectResponse(url="/")