Spaces:

awacke1
/

TorchTransformers-CV-SFT

Running

File size: 30,362 Bytes

#!/usr/bin/env python
"""
Combined Multimodal AI Suite
- TorchTransformers-Diffusion-CV-SFT functionality (Camera, PDF, OCR, diffusion image gen, etc.)
- GPT-4o Omni: Text, Audio, Image, Video processing with chat and paper search
- Python Code Interpreter for code generation and execution

This app integrates all modalities and adds an “Integrated Workflow” tab that enables you to:
  • Upload documents (e.g. double-page papers)
  • Extract text via OCR and image processing
  • Prompt GPT to generate Python code based on the extracted text
  • Display and execute the generated code

Developed with Streamlit.
"""

import aiofiles
import asyncio
import base64
import fitz
import glob
import logging
import os
import pandas as pd
import pytz
import random
import re
import requests
import shutil
import streamlit as st
import time
import torch
import zipfile

from dataclasses import dataclass
from datetime import datetime
from diffusers import StableDiffusionPipeline
from io import BytesIO
from openai import OpenAI
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from typing import Optional

# --- Additional Imports from GPT-4o Omni ---
import cv2
import json
import streamlit.components.v1 as components
import textract
from audio_recorder_streamlit import audio_recorder
from bs4 import BeautifulSoup
from collections import deque
from dotenv import load_dotenv
from gradio_client import Client, handle_file
from huggingface_hub import InferenceClient
from moviepy import VideoFileClip
from urllib.parse import quote
from xml.etree import ElementTree as ET
import openai

# --- Code Interpreter Imports ---
import io
import sys
from contextlib import redirect_stdout
import mistune

# Load environment variables
load_dotenv()

# ------------------ Global Configuration ------------------
# Entries shown in the app's top-right menu.
_PAGE_MENU_ITEMS = {
    'Get Help': 'https://huggingface.co/awacke1',
    'Report a Bug': 'https://huggingface.co/spaces/awacke1',
    'About': "Combined Multimodal AI Suite: Camera, OCR, Chat, Code Generation & Execution"
}

# Page title/icon, wide layout and an initially expanded sidebar.
st.set_page_config(
    page_title="Combined Multimodal AI Suite 🚀",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items=_PAGE_MENU_ITEMS,
)

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Records captured during the current script run; shown in the sidebar.
log_records = []


class LogCaptureHandler(logging.Handler):
    """Logging handler that stores every emitted record in ``log_records``."""

    def emit(self, record):
        log_records.append(record)


# The logger returned by logging.getLogger() outlives a single execution of
# this script, so adding a handler unconditionally stacks one more capture
# handler each time the script runs again (each one keeping its old record
# list alive). The class object is recreated per run, hence the match by name
# rather than isinstance.
for _stale_handler in list(logger.handlers):
    if type(_stale_handler).__name__ == "LogCaptureHandler":
        logger.removeHandler(_stale_handler)
logger.addHandler(LogCaptureHandler())

# ------------------ Session State Defaults ------------------
# Initial value for every session key the app uses. The table is rebuilt on
# each script run, so the list/dict defaults are fresh objects; a key that is
# already present in the session keeps its current value.
_SESSION_DEFAULTS = {
    'history': [],
    'messages': [],
    'gallery_files': [],
    'builder': None,
    'model_loaded': False,
    'processing': {},
    'asset_checkboxes': {},
    'downloaded_pdfs': {},
    'unique_counter': 0,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

# ------------------ Utility Functions ------------------
def generate_filename(prompt, file_type):
    """Build a timestamped filename from ``prompt``.

    The name is ``<MMDD_HHMM>_<slug>.<file_type>`` where the timestamp is the
    current US/Central time and the slug is ``prompt`` with spaces and
    newlines turned into underscores, every character that is neither
    alphanumeric nor an underscore dropped, and the result cut to 90
    characters.
    """
    stamp = datetime.now(pytz.timezone('US/Central')).strftime("%m%d_%H%M")
    underscored = prompt.replace(" ", "_").replace("\n", "_")
    kept_chars = [ch for ch in underscored if ch == "_" or ch.isalnum()]
    slug = "".join(kept_chars)[:90]
    return f"{stamp}_{slug}.{file_type}"

def get_download_link(file_path, mime_type="application/octet-stream", label="Download"):
    """Return an HTML ``<a>`` tag that downloads ``file_path`` via a data URI.

    Args:
        file_path: File whose bytes are base64-embedded in the link.
        mime_type: MIME type placed in the data URI.
        label: Link text, inserted as-is (callers may pass markup).

    Returns:
        The anchor element as a string. The suggested download name is the
        file's basename.
    """
    import html  # local import: html is not among the file's top-level imports

    with open(file_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    # The basename lands inside a double-quoted attribute; a quote or angle
    # bracket in the name would otherwise break out of it.
    safe_name = html.escape(os.path.basename(file_path), quote=True)
    return f'<a href="data:{mime_type};base64,{b64}" download="{safe_name}">{label}</a>'

def zip_directory(directory_path, zip_path):
    """Write every file under ``directory_path`` into a deflated zip at ``zip_path``.

    Archive member names are relative to the parent of ``directory_path``, so
    they start with the directory's own name.
    """
    archive_root = os.path.dirname(directory_path)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for folder, _subdirs, names in os.walk(directory_path):
            for name in names:
                full_path = os.path.join(folder, name)
                archive.write(full_path, os.path.relpath(full_path, archive_root))

def get_gallery_files(file_types=("png", "pdf", "md")):
    """Return the sorted, de-duplicated files in the current directory with the given extensions.

    Args:
        file_types: Iterable of extensions without the leading dot. The
            default is an immutable tuple so it cannot be altered between
            calls.

    Returns:
        A sorted list of matching filenames (as returned by ``glob``).
    """
    matches = {path for ext in file_types for path in glob.glob(f"*.{ext}")}
    return sorted(matches)

def download_pdf(url, output_path):
    """Stream ``url`` to ``output_path``.

    Args:
        url: Address to fetch (10 second timeout).
        output_path: Destination file, written in 8 KiB chunks.

    Returns:
        True when the server answered 200 and the body was written completely;
        False otherwise. Failures are logged, and a file left half-written by
        this call is removed.
    """
    started_writing = False
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                logger.error(f"Failed to download {url}: HTTP {response.status_code}")
                return False
            with open(output_path, "wb") as f:
                started_writing = True
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return True
    except (requests.RequestException, OSError) as e:
        logger.error(f"Failed to download {url}: {e}")
        # Only delete what this call created; a pre-existing file at
        # output_path is untouched when the request failed before writing.
        if started_writing and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
    return False

# ------------------ Model & Diffusion Builders ------------------
@dataclass
class ModelConfig:
    """Settings describing a language model: name, base model, size, optional domain and model type."""

    # Required fields.
    name: str
    base_model: str
    size: str
    # Optional fields.
    domain: Optional[str] = None
    model_type: str = "causal_lm"

    @property
    def model_path(self):
        """Path ``models/<name>`` derived from this config's name."""
        return "models/{}".format(self.name)

@dataclass
class DiffusionConfig:
    """Settings describing a diffusion model: name, base model, size and optional domain."""

    # Required fields.
    name: str
    base_model: str
    size: str
    # Optional field.
    domain: Optional[str] = None

    @property
    def model_path(self):
        """Path ``diffusion_models/<name>`` derived from this config's name."""
        return "diffusion_models/{}".format(self.name)

class ModelBuilder:
    """Holds a causal language model and its tokenizer, with load/save helpers that report progress in the UI."""

    def __init__(self):
        self.config = None
        self.model = None
        self.tokenizer = None
        # One of these is shown at random after a successful load.
        self.jokes = [
            "Why did the AI go to therapy? Too many layers to unpack! 😂",
            "Training complete! Time for a binary coffee break. ☕",
            "I told my neural network a joke; it couldn't stop dropping bits! 🤖"
        ]

    def load_model(self, model_path: str, config: Optional[ModelConfig] = None):
        """Load model and tokenizer from ``model_path`` and move the model to CUDA when available.

        Args:
            model_path: Identifier or directory passed to ``from_pretrained``.
            config: Optional config to remember on the builder; an existing
                config is kept when this is omitted.

        Returns:
            ``self``, so calls can be chained.
        """
        with st.spinner(f"Loading model from {model_path}..."):
            self.model = AutoModelForCausalLM.from_pretrained(model_path)
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            # Fall back to the EOS token when the tokenizer defines no pad token.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            if config:
                self.config = config
            device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model.to(device)
        st.success(f"Model loaded! {random.choice(self.jokes)}")
        return self

    def save_model(self, path: str):
        """Save the loaded model and tokenizer under ``path``.

        The parent directory of ``path`` is created when there is one. A bare
        name such as ``"my_model"`` has an empty parent, which must not be
        handed to ``os.makedirs`` (it raises ``FileNotFoundError``).
        """
        with st.spinner("Saving model..."):
            parent_dir = os.path.dirname(path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            self.model.save_pretrained(path)
            self.tokenizer.save_pretrained(path)
        st.success(f"Model saved at {path}!")
        
class DiffusionBuilder:
    """Holds a Stable Diffusion pipeline on CPU, with load/save/generate helpers."""

    def __init__(self):
        self.config = None
        self.pipeline = None

    def load_model(self, model_path: str, config: Optional[DiffusionConfig] = None):
        """Load the pipeline from ``model_path`` in float32 on the CPU.

        Args:
            model_path: Identifier or directory passed to ``from_pretrained``.
            config: Optional config to remember on the builder; an existing
                config is kept when this is omitted.

        Returns:
            ``self``, so calls can be chained.
        """
        with st.spinner(f"Loading diffusion model from {model_path}..."):
            self.pipeline = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float32).to("cpu")
            if config:
                self.config = config
        st.success("Diffusion model loaded!")
        return self

    def save_model(self, path: str):
        """Save the loaded pipeline under ``path``.

        The parent directory of ``path`` is created when there is one. A bare
        name has an empty parent, which must not be handed to ``os.makedirs``
        (it raises ``FileNotFoundError``).
        """
        with st.spinner("Saving diffusion model..."):
            parent_dir = os.path.dirname(path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            self.pipeline.save_pretrained(path)
        st.success(f"Diffusion model saved at {path}!")

    def generate(self, prompt: str):
        """Run the pipeline for 20 inference steps and return the first image."""
        return self.pipeline(prompt, num_inference_steps=20).images[0]

# ------------------ OCR & Image Processing Functions ------------------
async def process_ocr(image, output_file):
    """Run GOT-OCR2_0 on ``image`` and write the recognised text to ``output_file``.

    Args:
        image: Object with a ``save(path)`` method (callers pass PIL images).
        output_file: Path of the text file that receives the OCR result.

    Returns:
        The text returned by the model's ``chat`` call.
    """
    start_time = time.time()
    status = st.empty()
    status.text("Processing OCR... (0s)")
    tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
    model = AutoModel.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True, torch_dtype=torch.float32).to("cpu").eval()
    # The model is given a file path, so the image goes through a temp file.
    temp_file = f"temp_{int(time.time())}.png"
    try:
        image.save(temp_file)
        result = model.chat(tokenizer, temp_file, ocr_type='ocr')
    finally:
        # Remove the temp image even when saving or inference fails.
        if os.path.exists(temp_file):
            os.remove(temp_file)
    elapsed = int(time.time() - start_time)
    status.text(f"OCR completed in {elapsed}s!")
    # Explicit UTF-8 so non-ASCII OCR text does not depend on the platform's
    # default encoding.
    async with aiofiles.open(output_file, "w", encoding="utf-8") as f:
        await f.write(result)
    return result

async def process_image_gen(prompt, output_file):
    """Generate an image for ``prompt`` with Stable Diffusion and save it to ``output_file``.

    The pipeline of the ``DiffusionBuilder`` stored in the session is used
    when it has one loaded; otherwise the default
    ``OFA-Sys/small-stable-diffusion-v0`` pipeline is loaded on the CPU.

    Returns:
        The generated image (first image of the pipeline output).
    """
    start_time = time.time()
    status = st.empty()
    status.text("Generating image... (0s)")
    builder = st.session_state.get('builder')
    # A builder that was created but never loaded has pipeline=None; calling
    # that would fail, so it is treated the same as having no builder.
    if isinstance(builder, DiffusionBuilder) and builder.pipeline is not None:
        pipeline = builder.pipeline
    else:
        pipeline = StableDiffusionPipeline.from_pretrained("OFA-Sys/small-stable-diffusion-v0", torch_dtype=torch.float32).to("cpu")
    gen_image = pipeline(prompt, num_inference_steps=20).images[0]
    elapsed = int(time.time() - start_time)
    status.text(f"Image generation completed in {elapsed}s!")
    gen_image.save(output_file)
    return gen_image

def process_image_with_prompt(image, prompt, model="gpt-4o-mini", detail="auto"):
    """Send ``image`` (as a base64 PNG data URI) together with ``prompt`` to a chat model.

    Returns:
        The content of the first choice, or ``"Error: <message>"`` when the
        request raises.
    """
    png_buffer = BytesIO()
    image.save(png_buffer, format="PNG")
    encoded_png = base64.b64encode(png_buffer.getvalue()).decode("utf-8")

    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_png}", "detail": detail},
    }
    messages = [{"role": "user", "content": [text_part, image_part]}]

    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
    try:
        completion = client.chat.completions.create(model=model, messages=messages, max_tokens=300)
        answer = completion.choices[0].message.content
        return answer
    except Exception as e:
        return f"Error: {str(e)}"

def process_text_with_prompt(text, prompt, model="gpt-4o-mini"):
    """Ask a chat model to apply ``prompt`` to ``text``.

    The single user message is the prompt, a blank line, then the text.

    Returns:
        The content of the first choice, or ``"Error: <message>"`` when the
        request raises.
    """
    user_message = {"role": "user", "content": f"{prompt}\n\n{text}"}
    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
    try:
        completion = client.chat.completions.create(model=model, messages=[user_message], max_tokens=300)
        answer = completion.choices[0].message.content
        return answer
    except Exception as e:
        return f"Error: {str(e)}"

# ------------------ PDF Processing Functions ------------------
async def process_pdf_snapshot(pdf_path, mode="single"):
    """Render pages of a PDF to PNG files at 2x zoom.

    Args:
        pdf_path: PDF to open.
        mode: ``"single"`` renders page 0, ``"twopage"`` the first two pages
            (fewer if the document is shorter), ``"allpages"`` every page.
            Any other value renders nothing.

    Returns:
        The list of PNG filenames written, or ``[]`` when an error occurred
        (the error is shown in the status placeholder).
    """
    start_time = time.time()
    status = st.empty()
    status.text(f"Processing PDF Snapshot ({mode})... (0s)")
    try:
        doc = fitz.open(pdf_path)
        try:
            page_count = len(doc)
            # (page index, filename stem) pairs to render for the chosen mode.
            if mode == "single":
                targets = [(0, "single_snapshot")]
            elif mode == "twopage":
                targets = [(i, f"twopage_{i}") for i in range(min(2, page_count))]
            elif mode == "allpages":
                targets = [(i, f"page_{i}") for i in range(page_count)]
            else:
                targets = []
            output_files = []
            for page_index, stem in targets:
                pix = doc[page_index].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
                output_file = generate_filename(stem, "png")
                pix.save(output_file)
                output_files.append(output_file)
        finally:
            # Close the document even when rendering a page fails.
            doc.close()
        elapsed = int(time.time() - start_time)
        status.text(f"PDF Snapshot ({mode}) completed in {elapsed}s!")
        return output_files
    except Exception as e:
        status.error(f"Error: {str(e)}")
        return []

# ------------------ GPT & Chat Functions ------------------
def process_text(text_input):
    """Send ``text_input`` to GPT-4o as a new user turn and render both sides of the exchange.

    The user message and the assistant reply are appended to
    ``st.session_state.messages``; the whole history is sent with the request.

    Returns:
        The assistant's reply, or ``None`` when ``text_input`` is empty.
    """
    if not text_input:
        return None

    history = st.session_state.messages
    history.append({"role": "user", "content": text_input})
    with st.chat_message("user"):
        st.markdown(text_input)

    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
    with st.chat_message("assistant"):
        completion = client.chat.completions.create(
            model="gpt-4o-2024-05-13",
            messages=st.session_state.messages,
            stream=False,
        )
        reply = completion.choices[0].message.content
        st.write("Assistant: " + reply)
        st.session_state.messages.append({"role": "assistant", "content": reply})
        return reply

def process_text2(text_input, model="gpt-4o-2024-05-13"):
    """Request a reply for the existing chat history and record it.

    ``text_input`` only gates the call; the request body is
    ``st.session_state.messages``. The reply is written to the page and
    appended to the history as an assistant message.

    Returns:
        The assistant's reply, or ``None`` when ``text_input`` is empty.
    """
    if not text_input:
        return None

    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
    completion = client.chat.completions.create(
        model=model,
        messages=st.session_state.messages,
        stream=False,
    )
    reply = completion.choices[0].message.content
    st.write("Assistant: " + reply)
    st.session_state.messages.append({"role": "assistant", "content": reply})
    return reply

# ------------------ Audio & Video Processing Functions ------------------
def SpeechSynthesis(result):
    """Embed an HTML page with a textarea holding ``result`` and a button that reads it aloud.

    The page's script passes the textarea's value to the browser's
    ``speechSynthesis.speak``.

    Args:
        result: Text to place in the textarea.
    """
    import html  # local import: html is not among the file's top-level imports

    # The text comes from transcriptions/model output. Escaping keeps a
    # "</textarea>" or "<script>" inside it from being parsed as markup;
    # the browser decodes the entities again for the textarea's value.
    safe_text = html.escape(str(result))
    documentHTML5 = f'''
    <!DOCTYPE html>
    <html>
    <head>
        <title>Read It Aloud</title>
        <script type="text/javascript">
            function readAloud() {{
                const text = document.getElementById("textArea").value;
                const speech = new SpeechSynthesisUtterance(text);
                window.speechSynthesis.speak(speech);
            }}
        </script>
    </head>
    <body>
        <h1>🔊 Read It Aloud</h1>
        <textarea id="textArea" rows="10" cols="80">{safe_text}</textarea>
        <br>
        <button onclick="readAloud()">🔊 Read Aloud</button>
    </body>
    </html>
    '''
    components.html(documentHTML5, width=1280, height=300)

def process_audio(audio_input, text_input=''):
    """Transcribe recorded audio with Whisper, show it, and save it as markdown.

    Args:
        audio_input: Either raw audio bytes or an object exposing
            ``getvalue()`` that returns them.
        text_input: Unused; kept for interface compatibility.

    Returns:
        The transcription text, or ``None`` when ``audio_input`` is empty.
    """
    if not audio_input:
        return None
    # The Audio tab passes the recorder's output straight in; accept plain
    # bytes as well as buffer-like objects instead of assuming getvalue().
    if hasattr(audio_input, "getvalue"):
        audio_data = audio_input.getvalue()
    else:
        audio_data = bytes(audio_input)
    with open("temp_audio.wav", "wb") as file:
        file.write(audio_data)
    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
    # Close the upload handle once the request has completed.
    with open("temp_audio.wav", "rb") as audio_file:
        transcription = client.audio.transcriptions.create(model="whisper-1", file=audio_file)
    st.session_state.messages.append({"role": "user", "content": transcription.text})
    with st.chat_message("assistant"):
        st.markdown(transcription.text)
        SpeechSynthesis(transcription.text)
    filename = generate_filename(transcription.text, "md")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(transcription.text)
    return transcription.text

def process_video_and_audio(video_input):
    """Show roughly one frame per second of an uploaded video plus a Whisper transcript of its audio.

    Args:
        video_input: Uploaded file object with ``name`` and ``getbuffer()``.

    Returns:
        The transcript text, ``"No transcript available."`` when the audio
        could not be extracted or transcribed, or ``None`` when
        ``video_input`` is empty.
    """
    if not video_input:
        return None
    # Keep only the final path component so the upload's name cannot point
    # outside the working directory.
    video_path = os.path.basename(video_input.name)
    with open(video_path, "wb") as f:
        f.write(video_input.getbuffer())

    # Extract frames
    base64Frames = []
    video = cv2.VideoCapture(video_path)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = video.get(cv2.CAP_PROP_FPS)
    # Step about one second of video per sampled frame. A reported fps of 0
    # (or NaN) would give a step of 0 and the loop below would never advance.
    frames_to_skip = int(fps) if fps and fps > 0 else 0
    frames_to_skip = max(1, frames_to_skip)
    curr_frame = 0
    while curr_frame < total_frames - 1:
        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
        curr_frame += frames_to_skip
    video.release()

    # Audio transcription from video
    transcript_text = "No transcript available."
    try:
        clip = VideoFileClip(video_path)
        try:
            # clip.audio is None for a video without an audio track.
            if clip.audio is not None:
                audio_path = f"{os.path.splitext(video_path)[0]}.mp3"
                clip.audio.write_audiofile(audio_path, bitrate="32k")
                client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
                # Upload a real file handle: unlike an anonymous BytesIO it
                # carries the .mp3 filename along with the bytes.
                with open(audio_path, "rb") as audio_file:
                    transcription = client.audio.transcriptions.create(model="whisper-1", file=audio_file)
                transcript_text = transcription.text
        finally:
            clip.close()
    except Exception as e:
        # Best effort: the frames are still shown, but record why the
        # transcript is missing instead of hiding the failure.
        logger.error(f"Audio transcription failed for {video_path}: {e}")

    # Display frames and transcript
    st.markdown("### Video Frames")
    for frame_b64 in base64Frames:
        st.image(f"data:image/jpg;base64,{frame_b64}", use_container_width=True)
    st.markdown("### Audio Transcription")
    st.write(transcript_text)
    return transcript_text

# ------------------ Python Code Executor Functions ------------------
def extract_python_code(markdown_text):
    """Return the bodies of all ```` ```python ```` fenced blocks in ``markdown_text``.

    Whitespace directly inside the fences is stripped; blocks are returned in
    document order. An input without such blocks yields an empty list.
    """
    fenced_python = re.compile(r"```python\s*(.*?)\s*```", re.DOTALL)
    return fenced_python.findall(markdown_text)

def execute_code(code):
    """Execute ``code`` and capture what it prints.

    Args:
        code: Python source to run.

    Returns:
        ``(output, None)`` with the captured stdout on success, or
        ``(None, message)`` with the exception text on failure.

    NOTE(security): this runs the code with ``exec`` inside the app's own
    process. It is not a sandbox; the code has the same access as the app.
    """
    buffer = io.StringIO()
    # One dict serves as both globals and locals. With separate dicts,
    # top-level names land in the locals dict while functions defined by the
    # snippet resolve names through globals, so they cannot see each other or
    # the snippet's own imports.
    namespace = {}
    try:
        with redirect_stdout(buffer):
            exec(code, namespace)
        output = buffer.getvalue()
        return output, None
    except Exception as e:
        return None, str(e)
    finally:
        buffer.close()

def create_and_save_file(filename, prompt, response, should_save=True):
    """Write ``response`` to ``<stem of filename>.md`` when saving is enabled.

    Nothing is written unless ``should_save`` is true and ``filename`` ends in
    ``.txt``, ``.htm`` or ``.md``. ``prompt`` is accepted but not used.
    """
    if should_save:
        stem, extension = os.path.splitext(filename)
        if extension in ('.txt', '.htm', '.md'):
            target = f"{stem}.md"
            with open(target, 'w', encoding='utf-8') as handle:
                handle.write(response)

# ------------------ Integrated Workflow Function ------------------
def integrated_workflow():
    """Render the paper-to-code workflow: upload, OCR, code generation, execution.

    A Streamlit button is true only during the rerun its own click triggers,
    so a button nested inside another button's branch can never be acted on.
    The OCR text, the generated code and the saved code filename are therefore
    kept in ``st.session_state`` and each later step is rendered from that
    stored state rather than from inside the previous button's branch.
    """
    st.header("Integrated Workflow: From Paper to Code")
    st.markdown("""
    1. **Upload a PDF or Image** of a paper (double-page images work best).
    2. **Run OCR** to extract text.
    3. **Generate Python Code** based on the extracted text using GPT.
    4. **Review and Execute** the generated code.
    """)
    uploaded_file = st.file_uploader("Upload PDF or Image", type=["pdf", "png", "jpg", "jpeg"], key="integrated_file")
    if not uploaded_file:
        return

    state = st.session_state
    # Results belonging to a previously uploaded file must not be shown for
    # a different one.
    if state.get("integrated_source") != uploaded_file.name:
        state["integrated_source"] = uploaded_file.name
        for stale_key in ("integrated_ocr_result", "integrated_code", "integrated_code_file"):
            if stale_key in state:
                del state[stale_key]

    # Save the uploaded file
    file_path = f"uploaded_{uploaded_file.name}"
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getvalue())
    st.success(f"Uploaded file saved as {file_path}")

    # If PDF, show page snapshots; if image, load directly.
    is_pdf = uploaded_file.type == "application/pdf"
    if is_pdf:
        mode = st.selectbox("Snapshot Mode", ["single", "twopage", "allpages"])
        snapshots = asyncio.run(process_pdf_snapshot(file_path, mode))
        for snapshot in snapshots:
            st.image(Image.open(snapshot), caption=f"Snapshot: {snapshot}", use_container_width=True)
    else:
        st.image(Image.open(file_path), caption="Uploaded Image", use_container_width=True)

    # Step 2: OCR on the first PDF page or on the image itself.
    if st.button("Run OCR on File"):
        if is_pdf:
            doc = fitz.open(file_path)
            try:
                pix = doc[0].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
                temp_img = f"ocr_{os.path.basename(file_path)}.png"
                pix.save(temp_img)
            finally:
                doc.close()
            image = Image.open(temp_img)
        else:
            image = Image.open(file_path)
        ocr_output_file = generate_filename("ocr_output", "txt")
        state["integrated_ocr_result"] = asyncio.run(process_ocr(image, ocr_output_file))
        # Fresh OCR text invalidates code generated from the old text.
        for stale_key in ("integrated_code", "integrated_code_file"):
            if stale_key in state:
                del state[stale_key]

    if "integrated_ocr_result" not in state:
        return
    ocr_result = state["integrated_ocr_result"]
    st.text_area("OCR Output", ocr_result, height=200)

    # Step 3: use the extracted OCR text as prompt to generate Python code.
    st.markdown("### Generate Python Code from OCR Text")
    code_prompt = st.text_area("Edit Prompt for Code Generation", value=f"Generate a Python script that processes the following scientific text:\n\n{ocr_result}", height=200)
    if st.button("Generate Code"):
        code_generated = process_text_with_prompt(ocr_result, code_prompt, model="gpt-4o-mini")
        # Save the generated code once, at generation time.
        code_filename = generate_filename("generated_code", "py")
        with open(code_filename, "w", encoding="utf-8") as f:
            f.write(code_generated)
        state["integrated_code"] = code_generated
        state["integrated_code_file"] = code_filename

    if "integrated_code" not in state:
        return
    code_generated = state["integrated_code"]
    st.code(code_generated, language="python")
    code_filename = state["integrated_code_file"]
    if os.path.exists(code_filename):
        st.markdown(get_download_link(code_filename, "text/plain", "Download Generated Code"), unsafe_allow_html=True)

    # Step 4: optionally execute the generated code.
    if st.button("Execute Generated Code"):
        # The model may wrap its answer in a ```python fence; run the first
        # fenced block when there is one, otherwise the reply as-is.
        fenced_blocks = extract_python_code(code_generated)
        runnable = fenced_blocks[0] if fenced_blocks else code_generated
        output, error = execute_code(runnable)
        if error:
            st.error(f"Error executing code:\n{error}")
        else:
            st.success("Code executed successfully. Output:")
            st.code(output)

# ------------------ Sidebar: Asset Gallery & Logs ------------------
def update_gallery():
    """Render the sidebar asset gallery with a delete button per file.

    Shows up to ``st.session_state['gallery_size']`` files (default 5) from
    ``get_gallery_files()`` in two columns: PNGs as images, other files by
    name. Deleting a file removes it from disk and reruns the script.
    """
    all_files = get_gallery_files()
    if not all_files:
        return
    # A container can hold several elements. An st.empty() placeholder holds
    # only one, so the columns would replace the heading written before them.
    container = st.sidebar.container()
    container.markdown("### Asset Gallery")
    cols = container.columns(2)
    for idx, file in enumerate(all_files[:st.session_state.get('gallery_size', 5)]):
        with cols[idx % 2]:
            if file.endswith('.png'):
                st.image(Image.open(file), caption=os.path.basename(file), use_container_width=True)
            else:
                st.markdown(os.path.basename(file))
            if st.button("Delete "+os.path.basename(file), key="del_"+file):
                os.remove(file)
                # st.rerun is the supported replacement for the removed
                # st.experimental_rerun.
                st.rerun()

update_gallery()
st.sidebar.subheader("Action Logs")
# Format each captured record here instead of reading record.asctime and
# record.message: those attributes exist only after some other handler's
# formatter has already processed the record.
_sidebar_log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
for record in log_records:
    st.sidebar.write(_sidebar_log_formatter.format(record))

# ------------------ Main App Navigation ------------------
# Page title shown above the tab bar.
st.title("Combined Multimodal AI Suite")

# One tab per feature area; the sections below address them by index in this order.
tabs = st.tabs(["Home", "Camera & Images", "PDF & Documents", "Multimodal Chat", "Code Executor", "Integrated Workflow"])

# --- Home Tab ---
# Static overview of what each tab offers.
with tabs[0]:
    st.header("Welcome to the Combined Multimodal AI Suite")
    st.markdown("""
    This application integrates multiple AI functionalities:
    
    - **Camera & Image Processing:** Capture images, generate new images using diffusion models.
    - **PDF & Document Processing:** Download PDFs, perform OCR, and generate markdown summaries.
    - **Multimodal Chat:** Chat with GPT-4o using text, audio, image, and video inputs.
    - **Code Executor:** Write, generate, and execute Python code interactively.
    - **Integrated Workflow:** Seamlessly extract text from papers and generate & run Python code.
    
    Use the tabs above to explore each modality.
    """)

# --- Camera & Images Tab ---
with tabs[1]:
    st.header("Camera & Image Processing")
    st.subheader("Capture and Process Images")
    # Two side-by-side camera widgets (Cam 0 and Cam 1). Each capture is
    # written to disk, displayed, and noted in the session history.
    for cam_index, cam_column in enumerate(st.columns(2)):
        with cam_column:
            shot = st.camera_input(f"Take a picture - Cam {cam_index}", key=f"cam{cam_index}")
            if not shot:
                continue
            snapshot_path = generate_filename(f"cam{cam_index}_snapshot", "png")
            with open(snapshot_path, "wb") as snapshot_file:
                snapshot_file.write(shot.getvalue())
            st.image(Image.open(snapshot_path), caption=f"Camera {cam_index} Snapshot", use_container_width=True)
            st.session_state.history.append(f"Captured {snapshot_path}")
    st.markdown("---")
    st.subheader("Generate New Image with Diffusion")
    prompt_img = st.text_input("Enter prompt for image generation", "A neon futuristic cityscape")
    if st.button("Generate Image"):
        # The generated image is saved under a timestamped name and shown.
        generated_path = generate_filename("gen_output", "png")
        generated_image = asyncio.run(process_image_gen(prompt_img, generated_path))
        st.image(generated_image, caption="Generated Image", use_container_width=True)

# --- PDF & Documents Tab ---
with tabs[2]:
    st.header("PDF & Document Processing")
    st.subheader("Download and Process PDFs")
    url_input = st.text_area("Enter PDF URLs (one per line)", height=100)
    if st.button("Download PDFs"):
        urls = [u.strip() for u in url_input.splitlines() if u.strip()]
        progress_bar = st.progress(0)
        for idx, url in enumerate(urls):
            output_path = generate_filename(url, "pdf")
            if download_pdf(url, output_path):
                st.session_state.downloaded_pdfs[url] = output_path
                st.success(f"Downloaded: {output_path}")
            else:
                # Tell the user which URL failed instead of skipping it silently.
                st.error(f"Failed to download: {url}")
            progress_bar.progress((idx + 1) / len(urls))
    st.markdown("---")
    st.subheader("OCR & PDF Snapshot")
    # Offer only PNG and PDF assets: the non-PNG branch below opens the
    # selection as a document and rasterises its first page, which is not
    # meaningful for the markdown files the default gallery listing includes.
    all_assets = get_gallery_files(["png", "pdf"])
    selected_asset = st.selectbox("Select an asset", all_assets) if all_assets else None
    if selected_asset and st.button("Run OCR on Selected"):
        if selected_asset.endswith('.png'):
            image = Image.open(selected_asset)
        else:
            doc = fitz.open(selected_asset)
            try:
                pix = doc[0].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            finally:
                # Close the document even when rendering fails.
                doc.close()
        output_file = generate_filename("ocr_output", "txt")
        ocr_result = asyncio.run(process_ocr(image, output_file))
        st.text_area("OCR Result", ocr_result, height=200)
    st.markdown("---")
    st.subheader("Markdown Gallery")
    # List every markdown file in the working directory with a download link.
    md_files = sorted(glob.glob("*.md"))
    if md_files:
        for md in md_files:
            st.markdown(f"**{md}**")
            st.markdown(get_download_link(md, "text/markdown", "Download MD"), unsafe_allow_html=True)

# --- Multimodal Chat Tab ---
with tabs[3]:
    st.header("Multimodal Chat")
    st.markdown("Chat with GPT-4o using text, audio, image, or video inputs.")
    mode = st.selectbox("Select Mode", ["Text", "Image", "Audio", "Video"])
    if mode == "Text":
        text_input = st.text_input("Enter your text prompt")
        if st.button("Send Text"):
            if text_input:
                response = process_text(text_input)
                st.markdown(response)
            else:
                # process_text returns None for an empty prompt; rendering
                # that would print the literal text "None".
                st.warning("Please enter a prompt first.")
    elif mode == "Image":
        text_prompt = st.text_input("Enter prompt for image analysis", "Describe this image and list 10 facts.")
        image_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"], key="chat_image")
        if image_file:
            image = Image.open(image_file)
            st.image(image, caption="Uploaded Image", use_container_width=True)
            response = process_image_with_prompt(image, text_prompt)
            st.markdown(response)
    elif mode == "Audio":
        st.markdown("Record or upload an audio file for transcription.")
        audio_bytes = audio_recorder()
        if audio_bytes:
            st.audio(audio_bytes, format="audio/wav")
            transcription = process_audio(audio_bytes)
            st.markdown(transcription)
    elif mode == "Video":
        video_file = st.file_uploader("Upload a video file", type=["mp4", "webm"], key="chat_video")
        if video_file:
            transcript = process_video_and_audio(video_file)
            st.markdown("Video Transcript:")
            st.write(transcript)

    st.markdown("---")
    st.subheader("Chat History")
    # Replay every stored message under its role.
    for msg in st.session_state.messages:
        with st.chat_message(msg["role"]):
            st.markdown(msg["content"])

# --- Code Executor Tab ---
def _clear_code_editor():
    """Button callback: blank the stored snippet and the editor widget's own state.

    A keyed widget keeps its value under its key in the session state, so
    resetting ``code`` alone leaves the editor unchanged. Callbacks run before
    the script reruns, which is when a widget's key may still be assigned.
    """
    st.session_state.code = ""
    st.session_state.code_editor = ""


with tabs[4]:
    st.header("Python Code Executor")
    # NOTE(review): the text below says "sandboxed", but execute_code() runs
    # the snippet with exec() inside this process; confirm the wording.
    st.markdown("Enter Python code below or upload a .py/.md file. The code will be executed in a sandboxed environment.")
    uploaded_file = st.file_uploader("Upload Python (.py) or Markdown (.md) file", type=["py", "md"], key="code_file")
    if 'code' not in st.session_state:
        st.session_state.code = """import streamlit as st
st.write("Hello from the Python Code Executor!")"""
    if uploaded_file is None:
        code_input = st.text_area("Python Code Editor:", value=st.session_state.code, height=400, key="code_editor")
    else:
        content = uploaded_file.getvalue().decode()
        # Recognise markdown by extension as well as by reported MIME type,
        # since the type reported for a .md upload is not always
        # "text/markdown".
        is_markdown = uploaded_file.type == "text/markdown" or uploaded_file.name.lower().endswith(".md")
        if is_markdown:
            code_blocks = extract_python_code(content)
            if code_blocks:
                code_input = code_blocks[0]
            else:
                st.error("No Python code block found in the markdown file!")
                code_input = ""
        else:
            code_input = content
        st.code(code_input, language='python')
    col1, col2 = st.columns([1,1])
    with col1:
        if st.button("▶️ Run Code"):
            if code_input:
                output, error = execute_code(code_input)
                if error:
                    st.error(f"Error:\n{error}")
                elif output:
                    st.code(output)
                else:
                    st.success("Code executed with no output.")
            else:
                st.warning("Please enter some code!")
    with col2:
        # The click itself triggers the rerun; the callback has already
        # cleared the editor by then, so no explicit rerun call is needed
        # (st.experimental_rerun no longer exists).
        st.button("🗑️ Clear Code", on_click=_clear_code_editor)
    with st.expander("How to use the Code Executor"):
        st.markdown("""
        - Enter or upload Python code.
        - Click **Run Code** to execute.
        - The output (or any errors) will be displayed below.
        """)

# --- Integrated Workflow Tab ---
# Render the whole upload -> OCR -> code generation -> execution flow inside the last tab.
with tabs[5]:
    integrated_workflow()

# ------------------ Chat Input at Bottom ------------------
if prompt := st.chat_input("GPT-4o Multimodal ChatBot - How can I help you?"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)
    with st.chat_message("assistant"):
        # process_text2 writes the reply to the page and appends it to
        # st.session_state.messages itself; appending it here as well would
        # store every assistant turn twice.
        response = process_text2(prompt)