awacke1 committed on
Commit
a7ab48f
·
verified ·
1 Parent(s): e82eace

Create app.py

Files changed (1)
app.py +539 -0
app.py ADDED
@@ -0,0 +1,539 @@
+ import base64
+ import cv2
+ import glob
+ import json
+ import math
+ import os
+ import pytz
+ import random
+ import re
+ import requests
+ import streamlit as st
+ import streamlit.components.v1 as components
+ import textract
+ import time
+ import zipfile
+ from concurrent.futures import ThreadPoolExecutor
+ from tqdm import tqdm
+ import concurrent
+
+ from audio_recorder_streamlit import audio_recorder
+ from bs4 import BeautifulSoup
+ from collections import deque
+ from datetime import datetime
+ from dotenv import load_dotenv
+ from gradio_client import Client
+ from io import BytesIO
+ from moviepy import VideoFileClip
+ from PIL import Image
+ from PyPDF2 import PdfReader
+ from templates import bot_template, css, user_template
+ from urllib.parse import quote
+ from xml.etree import ElementTree as ET
+
+ import openai
+ from openai import OpenAI
+ import pandas as pd
+
+ # Configuration
+ load_dotenv()  # load API keys from a local .env file, if present
+ Site_Name = 'Scholarly-Article-Document-Search-With-Memory'
+ title = "🔬🧠ScienceBrain.AI"
+ helpURL = 'https://huggingface.co/awacke1'
+ bugURL = 'https://huggingface.co/spaces/awacke1'
+ icons = Image.open("icons.ico")
+ st.set_page_config(
+     page_title=title,
+     page_icon=icons,
+     layout="wide",
+     initial_sidebar_state="auto",
+     menu_items={'Get Help': helpURL, 'Report a bug': bugURL, 'About': title}
+ )
+
+ # API Configuration
+ API_KEY = os.getenv('API_KEY')
+ HF_KEY = os.getenv('HF_KEY')
+ headers = {"Authorization": f"Bearer {HF_KEY}", "Content-Type": "application/json"}
+ key = os.getenv('OPENAI_API_KEY')
+ client = OpenAI(api_key=key, organization=os.getenv('OPENAI_ORG_ID'))
+ MODEL = "gpt-4o-2024-05-13"
+ if "openai_model" not in st.session_state:
+     st.session_state["openai_model"] = MODEL
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+ if st.button("Clear Session"):
+     st.session_state.messages = []
+
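+ # Expected environment variables (per the os.getenv calls above): OPENAI_API_KEY,
+ # and optionally OPENAI_ORG_ID, HF_KEY, and API_KEY.
+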
+ # Sidebar Options
+ should_save = st.sidebar.checkbox("💾 Save", value=True, help="Save your session data.")
+
+ # HTML5 Speech Synthesis
+ def SpeechSynthesis(result):
+     # Renders an HTML5 page with a textarea and a button that reads the text
+     # aloud via the browser's SpeechSynthesis API. Not cached, since the
+     # component must render on every call.
+     documentHTML5 = '''
+     <!DOCTYPE html>
+     <html>
+     <head>
+         <title>Read It Aloud</title>
+         <script type="text/javascript">
+             function readAloud() {
+                 const text = document.getElementById("textArea").value;
+                 const speech = new SpeechSynthesisUtterance(text);
+                 window.speechSynthesis.speak(speech);
+             }
+         </script>
+     </head>
+     <body>
+         <h1>🔊 Read It Aloud</h1>
+         <textarea id="textArea" rows="10" cols="80">
+     '''
+     documentHTML5 += result + '''
+     </textarea>
+     <br>
+     <button onclick="readAloud()">🔊 Read Aloud</button>
+     </body>
+     </html>
+     '''
+     components.html(documentHTML5, width=1280, height=300)
+
+ # File Naming and Saving
+ def generate_filename(prompt, file_type, original_name=None):
+     central = pytz.timezone('US/Central')
+     safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
+     if original_name and file_type == "md":  # For images
+         base_name = os.path.splitext(original_name)[0]
+         safe_prompt = re.sub(r'[<>:"/\\|?*\n]', ' ', prompt).strip()[:100]
+         return f"{safe_date_time}_{safe_prompt}_{base_name}.{file_type}"
+     safe_prompt = re.sub(r'[<>:"/\\|?*\n]', ' ', prompt).strip()[:240]
+     return f"{safe_date_time}_{safe_prompt}.{file_type}"
+
+ def create_and_save_file(content, file_type="md", prompt=None, original_name=None, should_save=True):
+     if not should_save:
+         return None
+     filename = generate_filename(prompt, file_type, original_name)
+     with open(filename, "w", encoding="utf-8") as f:
+         f.write(content if not prompt else prompt + "\n\n" + content)
+     return filename
+
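+ # Illustrative example (hypothetical prompt and clock time):
+ #   generate_filename("What is a buckyball?", "md")
+ #   -> "0613_1430_What is a buckyball.md"
+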
+ # Text Processing
+ def process_text(text_input):
+     if text_input:
+         st.session_state.messages.append({"role": "user", "content": text_input})
+         with st.chat_message("user"):
+             st.markdown(text_input)
+         with st.chat_message("assistant"):
+             completion = client.chat.completions.create(
+                 model=st.session_state["openai_model"],
+                 messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages],
+                 stream=False
+             )
+             response = completion.choices[0].message.content
+             st.markdown(response)
+         filename = generate_filename(text_input, "md")
+         create_and_save_file(response, "md", text_input, should_save=should_save)
+         st.session_state.messages.append({"role": "assistant", "content": response})
+         st.rerun()
+
+ # Image Processing
+ def process_image(image_input, user_prompt):
+     original_name = image_input.name
+     image_bytes = image_input.read()
+     with open(original_name, "wb") as f:
+         f.write(image_bytes)  # Save original image
+     base64_image = base64.b64encode(image_bytes).decode("utf-8")
+     response = client.chat.completions.create(
+         model=st.session_state["openai_model"],
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant that responds in Markdown."},
+             {"role": "user", "content": [
+                 {"type": "text", "text": user_prompt},
+                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
+             ]}
+         ],
+         temperature=0.0
+     )
+     image_response = response.choices[0].message.content
+     filename = generate_filename(user_prompt, "md", original_name)
+     create_and_save_file(image_response, "md", user_prompt, original_name, should_save=should_save)
+     # No st.rerun() here: rerunning would discard the return value before the
+     # caller can display it.
+     return image_response
+
+ # Audio Processing
+ def process_audio(audio_input, text_input=''):
+     if audio_input:
+         audio_bytes = audio_input if isinstance(audio_input, bytes) else audio_input.read()
+         supported_formats = ['flac', 'm4a', 'mp3', 'mp4', 'mpeg', 'mpga', 'oga', 'ogg', 'wav', 'webm']
+         file_ext = "wav" if isinstance(audio_input, bytes) else os.path.splitext(audio_input.name)[1][1:].lower()
+         if file_ext not in supported_formats:
+             st.error(f"Unsupported format: {file_ext}. Supported formats: {supported_formats}")
+             return
+         if len(audio_bytes) > 200 * 1024 * 1024:  # 200MB limit
+             st.error("File exceeds 200MB limit.")
+             return
+         with st.spinner("Transcribing audio..."):
+             try:
+                 audio_file = BytesIO(audio_bytes)
+                 audio_file.name = f"audio.{file_ext}"  # the API infers the format from the filename
+                 transcription = client.audio.transcriptions.create(
+                     model="whisper-1",
+                     file=audio_file
+                 ).text
+                 st.session_state.messages.append({"role": "user", "content": transcription})
+                 with st.chat_message("user"):
+                     st.markdown(transcription)
+                 with st.chat_message("assistant"):
+                     completion = client.chat.completions.create(
+                         model=st.session_state["openai_model"],
+                         messages=[{"role": "user", "content": text_input + "\n\nTranscription: " + transcription}]
+                     )
+                     response = completion.choices[0].message.content
+                     st.markdown(response)
+                 filename = generate_filename(transcription, "md")
+                 create_and_save_file(response, "md", text_input, should_save=should_save)
+                 st.session_state.messages.append({"role": "assistant", "content": response})
+                 st.rerun()
+             except openai.BadRequestError as e:
+                 st.error(f"Audio processing error: {str(e)}")
+
+ # Video Processing
+ def save_video(video_input):
+     # Persist an uploaded file to disk and return its path (also reused for PDFs below).
+     with open(video_input.name, "wb") as f:
+         f.write(video_input.read())
+     return video_input.name
+
+ def process_video(video_path, seconds_per_frame=2):
+     # Sample one frame every `seconds_per_frame` seconds as base64 JPEGs and
+     # extract the audio track to MP3, if one exists.
+     base64Frames = []
+     base_video_path, _ = os.path.splitext(video_path)
+     video = cv2.VideoCapture(video_path)
+     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+     fps = video.get(cv2.CAP_PROP_FPS)
+     frames_to_skip = max(1, int(fps * seconds_per_frame))  # at least 1 to guarantee progress
+     curr_frame = 0
+     while curr_frame < total_frames - 1:
+         video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
+         success, frame = video.read()
+         if not success:
+             break
+         _, buffer = cv2.imencode(".jpg", frame)
+         base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
+         curr_frame += frames_to_skip
+     video.release()
+     audio_path = f"{base_video_path}.mp3"
+     try:
+         clip = VideoFileClip(video_path)
+         if clip.audio:
+             clip.audio.write_audiofile(audio_path, bitrate="32k")
+             clip.audio.close()
+         else:
+             audio_path = None  # no audio track to transcribe
+         clip.close()
+     except Exception as e:
+         st.warning(f"No audio track found or error: {str(e)}")
+         audio_path = None
+     return base64Frames, audio_path
+
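+ # Example (illustrative): at 30 fps with seconds_per_frame=2, frames_to_skip is
+ # 60, so roughly one frame per two seconds of video is captured.
+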
+ def process_audio_and_video(video_input):
+     if video_input:
+         video_path = save_video(video_input)
+         with st.spinner("Extracting frames and audio..."):
+             base64Frames, audio_path = process_video(video_path)
+         if audio_path:
+             with st.spinner("Transcribing video audio..."):
+                 try:
+                     with open(audio_path, "rb") as audio_file:
+                         transcript = client.audio.transcriptions.create(
+                             model="whisper-1",
+                             file=audio_file
+                         ).text
+                     with st.chat_message("user"):
+                         st.markdown(f"Video Transcription: {transcript}")
+                     with st.chat_message("assistant"):
+                         response = client.chat.completions.create(
+                             model=st.session_state["openai_model"],
+                             messages=[
+                                 {"role": "system", "content": "Summarize the video and its transcript in Markdown."},
+                                 {"role": "user", "content": [
+                                     {"type": "text", "text": "Video frames:"},
+                                     *map(lambda x: {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{x}"}}, base64Frames),
+                                     {"type": "text", "text": f"Transcription: {transcript}"}
+                                 ]}
+                             ]
+                         )
+                         result = response.choices[0].message.content
+                         st.markdown(result)
+                     filename = generate_filename(transcript, "md")
+                     create_and_save_file(result, "md", "Video summary", should_save=should_save)
+                     st.rerun()
+                 except openai.BadRequestError as e:
+                     st.error(f"Video audio processing error: {str(e)}")
+         else:
+             st.warning("No audio to transcribe.")
+
+ # ArXiv Search
+ def search_arxiv(query):
+     arxiv_client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
+     response = arxiv_client.predict(
+         message=query,
+         llm_results_use=5,
+         database_choice="Semantic Search",
+         llm_model_picked="mistralai/Mistral-7B-Instruct-v0.2",
+         api_name="/update_with_rag_md"
+     )
+     result = response[0] + response[1]
+     filename = generate_filename(query, "md")
+     create_and_save_file(result, "md", query, should_save=should_save)
+     st.session_state.messages.append({"role": "assistant", "content": result})
+     st.rerun()  # the result is shown via the chat history after the rerun
+
+ # RAG PDF Gallery
+ def upload_pdf_files_to_vector_store(vector_store_id, pdf_files):
+     stats = {"total_files": len(pdf_files), "successful_uploads": 0, "failed_uploads": 0, "errors": []}
+     def upload_single_pdf(file_path):
+         file_name = os.path.basename(file_path)
+         try:
+             with open(file_path, "rb") as f:
+                 file_response = client.files.create(file=f, purpose="assistants")
+             client.vector_stores.files.create(vector_store_id=vector_store_id, file_id=file_response.id)
+             return {"file": file_name, "status": "success"}
+         except Exception as e:
+             return {"file": file_name, "status": "failed", "error": str(e)}
+     with ThreadPoolExecutor(max_workers=5) as executor:
+         futures = [executor.submit(upload_single_pdf, f) for f in pdf_files]
+         for future in tqdm(concurrent.futures.as_completed(futures), total=len(pdf_files)):
+             result = future.result()
+             if result["status"] == "success":
+                 stats["successful_uploads"] += 1
+             else:
+                 stats["failed_uploads"] += 1
+                 stats["errors"].append(result)
+     return stats
+
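+ # Uploads fan out across up to 5 worker threads; each PDF goes to the Files API
+ # first, then is attached to the vector store by file id.
+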
+ def create_vector_store(store_name):
+     vector_store = client.vector_stores.create(name=store_name)
+     return {"id": vector_store.id, "name": vector_store.name, "created_at": vector_store.created_at, "file_count": vector_store.file_counts.completed}
+
+ def generate_questions(pdf_path):
+     text = ""
+     with open(pdf_path, "rb") as f:
+         pdf = PdfReader(f)
+         for page in pdf.pages:
+             text += page.extract_text() or ""
+     prompt = f"Can you generate a question that can only be answered from this document?:\n{text[:2000]}\n\n"
+     response = client.chat.completions.create(
+         model="gpt-4o-2024-05-13",
+         messages=[{"role": "user", "content": prompt}]
+     )
+     return response.choices[0].message.content
+
+ def process_rag_query(query, vector_store_id):
+     # file_search is not a chat.completions tool, so this uses the Responses API,
+     # which accepts a file_search tool bound to a vector store.
+     try:
+         response = client.responses.create(
+             model="gpt-4o-2024-05-13",
+             input=query,
+             tools=[{"type": "file_search", "vector_store_ids": [vector_store_id]}],
+             include=["file_search_call.results"]
+         )
+         search_calls = [item for item in response.output if item.type == "file_search_call"]
+         return response.output_text, search_calls
+     except openai.BadRequestError as e:
+         st.error(f"RAG query error: {str(e)}")
+         return None, []
+
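+ # Each returned file_search_call is expected to carry a .results list (because
+ # of the include parameter above), whose items have .filename, .file_id, .score,
+ # and .text for the retrieved chunk.
+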
+ def evaluate_rag(vector_store_id, questions_dict):
+     # Retrieval metrics: recall@k, mean reciprocal rank (MRR), and mean average
+     # precision (MAP), scored against the file each question was generated from.
+     k = 5
+     total_queries = len(questions_dict)
+     correct_retrievals_at_k = 0
+     reciprocal_ranks = []
+     average_precisions = []
+
+     for filename, query in questions_dict.items():
+         expected_file = filename
+         response, search_calls = process_rag_query(query, vector_store_id)
+         if not search_calls:
+             continue
+         retrieved_files = [r.filename for call in search_calls for r in (call.results or [])][:k]
+         if expected_file in retrieved_files:
+             rank = retrieved_files.index(expected_file) + 1
+             correct_retrievals_at_k += 1
+             reciprocal_ranks.append(1 / rank)
+             precisions = [1 if f == expected_file else 0 for f in retrieved_files[:rank]]
+             average_precisions.append(sum(precisions) / len(precisions))
+         else:
+             reciprocal_ranks.append(0)
+             average_precisions.append(0)
+
+     recall_at_k = correct_retrievals_at_k / total_queries if total_queries else 0
+     mrr = sum(reciprocal_ranks) / total_queries if total_queries else 0
+     map_score = sum(average_precisions) / total_queries if total_queries else 0
+     return {"recall@k": recall_at_k, "mrr": mrr, "map": map_score, "k": k}
+
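+ # Worked example (illustrative): if the expected file is retrieved at rank 2,
+ # its reciprocal rank is 1/2 and its average precision over the top-2 results
+ # is (0 + 1) / 2 = 0.5, matching the precisions list computed above.
+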
+ def rag_pdf_gallery():
+     st.subheader("RAG PDF Gallery")
+     pdf_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
+     if pdf_files:
+         # Cache the expensive setup in session state so it is not redone on every rerun.
+         if "vector_store_details" not in st.session_state:
+             pdf_paths = [save_video(f) for f in pdf_files]  # Reuse save_video; it just writes bytes to disk
+             with st.spinner("Creating vector store..."):
+                 vector_store_details = create_vector_store("PDF_Gallery_Store")
+                 stats = upload_pdf_files_to_vector_store(vector_store_details["id"], pdf_paths)
+                 st.json(stats)
+             with st.spinner("Generating evaluation questions..."):
+                 questions_dict = {os.path.basename(p): generate_questions(p) for p in pdf_paths}
+                 st.json(questions_dict)
+             st.session_state["vector_store_details"] = vector_store_details
+             st.session_state["questions_dict"] = questions_dict
+         vector_store_details = st.session_state["vector_store_details"]
+         questions_dict = st.session_state["questions_dict"]
+
+         query = st.text_input("Ask a question about the PDFs:")
+         if query:
+             with st.spinner("Processing RAG query..."):
+                 response, search_calls = process_rag_query(query, vector_store_details["id"])
+             if response:
+                 st.markdown(response)
+                 st.write("Retrieved chunks:")
+                 for call in search_calls:
+                     for r in (call.results or []):
+                         st.write(f"{r.filename} (score: {r.score})")
+
+         if st.button("Evaluate RAG Performance"):
+             with st.spinner("Evaluating..."):
+                 metrics = evaluate_rag(vector_store_details["id"], questions_dict)
+                 st.json(metrics)
+
+ # File Sidebar
+ def FileSidebar():
+     st.sidebar.title("File Operations")
+     default_types = [".md", ".png", ".pdf"]
+     file_types = st.sidebar.multiselect("Filter by type", [".md", ".wav", ".png", ".mp4", ".mp3", ".pdf"], default=default_types)
+     all_files = [f for f in glob.glob("*.*") if os.path.splitext(f)[1] in file_types and len(os.path.splitext(f)[0]) >= 10]
+     all_files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
+
+     if st.sidebar.button("🗑 Delete All Filtered"):
+         for file in all_files:
+             os.remove(file)
+         st.rerun()
+
+     if st.sidebar.button("⬇️ Download All Filtered"):
+         zip_file = create_zip_of_files(all_files)
+         st.sidebar.markdown(get_zip_download_link(zip_file), unsafe_allow_html=True)
+
+     for file in all_files:
+         ext = os.path.splitext(file)[1]
+         col1, col2, col3, col4, col5 = st.sidebar.columns([1, 6, 1, 1, 1])
+         with col1:
+             icon = "📜" if ext == ".md" else "📄" if ext == ".pdf" else "🖼️" if ext == ".png" else "🎵" if ext in [".wav", ".mp3"] else "🎥"
+             if st.button(icon, key=f"view_{file}"):
+                 with open(file, "rb") as f:
+                     content = f.read()
+                 if ext == ".md":
+                     st.markdown(content.decode("utf-8"))
+                     SpeechSynthesis(content.decode("utf-8"))
+                 elif ext == ".pdf":
+                     st.download_button("Download PDF", content, file, "application/pdf")
+                     st.write("PDF Viewer not natively supported; download to view.")
+                 elif ext == ".png":
+                     st.image(content, use_column_width=True)
+         with col2:
+             st.markdown(get_table_download_link(file), unsafe_allow_html=True)
+         with col3:
+             if st.button("📂", key=f"open_{file}"):
+                 st.session_state.update({'filename': file, 'filetext': open(file, "r", encoding="utf-8").read()})
+         with col4:
+             if st.button("▶️", key=f"run_{file}"):
+                 process_text(open(file, "r", encoding="utf-8").read())
+         with col5:
+             if st.button("🗑", key=f"delete_{file}"):
+                 os.remove(file)
+                 st.rerun()
+
+ def create_zip_of_files(files):
+     zip_name = "Files.zip"
+     with zipfile.ZipFile(zip_name, 'w') as zipf:
+         for file in files:
+             zipf.write(file)
+     return zip_name
+
+ def get_zip_download_link(zip_file):
+     with open(zip_file, 'rb') as f:
+         data = f.read()
+     b64 = base64.b64encode(data).decode()
+     return f'<a href="data:application/zip;base64,{b64}" download="{zip_file}">Download All</a>'
+
+ @st.cache_data
+ def get_table_download_link(file_path):
+     # cache_data (not cache_resource) since this returns a serializable string
+     with open(file_path, 'rb') as f:
+         data = f.read()
+     b64 = base64.b64encode(data).decode()
+     file_name = os.path.basename(file_path)
+     ext = os.path.splitext(file_name)[1]
+     mime_type = {
+         ".md": "text/markdown",
+         ".pdf": "application/pdf",
+         ".png": "image/png",
+         ".wav": "audio/wav",
+         ".mp3": "audio/mpeg",
+         ".mp4": "video/mp4",
+     }.get(ext, "application/octet-stream")
+     return f'<a href="data:{mime_type};base64,{b64}" download="{file_name}">{file_name}</a>'
+
+ # Main Function
+ def main():
+     st.markdown("##### GPT-4o Omni Model: Text, Audio, Image, Video & RAG")
+     model_options = ["gpt-4o-2024-05-13", "gpt-3.5-turbo"]
+     st.session_state["openai_model"] = st.selectbox("Select GPT Model", model_options, index=0)
+
+     option = st.selectbox("Select Input Type", ("Text", "Image", "Audio", "Video", "ArXiv Search", "RAG PDF Gallery"))
+
+     if option == "Text":
+         default_text = "Emojis in markdown. Maybe a buckyball feature rating comparing them against each other in a markdown emoji outline or tables."
+         text_input = st.text_input("Enter your text:", value=default_text)
+         # Guard so the same prompt is not re-processed on every rerun.
+         if text_input and text_input != st.session_state.get("last_text_input"):
+             st.session_state["last_text_input"] = text_input
+             with st.spinner("Processing..."):
+                 process_text(text_input)
+
+     elif option == "Image":
+         col1, col2 = st.columns(2)
+         with col1:
+             if st.button("📝 Describe"):
+                 st.session_state["image_prompt"] = "Describe this image and list ten facts in a markdown outline with emojis."
+         with col2:
+             if st.button("🔍 OCR"):
+                 st.session_state["image_prompt"] = "Show electronic text of text in the image."
+         text_input = st.text_input("Image Prompt:", value=st.session_state.get("image_prompt", "Describe this image and list ten facts in a markdown outline with emojis."))
+         image_input = st.file_uploader("Upload an image (max 200MB)", type=["png", "jpg", "jpeg"], accept_multiple_files=False)
+         if image_input and text_input:
+             if image_input.size > 200 * 1024 * 1024:
+                 st.error("Image exceeds 200MB limit.")
+             else:
+                 with st.spinner("Processing..."):
+                     image_response = process_image(image_input, text_input)
+                 with st.chat_message("ai", avatar="🦖"):
+                     st.markdown(image_response)
+
+     elif option == "Audio":
+         text_input = st.text_input("Audio Prompt:", value="Summarize this audio transcription in Markdown.")
+         audio_input = st.file_uploader("Upload an audio file (max 200MB)", type=["mp3", "wav", "flac", "m4a"], accept_multiple_files=False)
+         audio_bytes = audio_recorder()
+         # Guard so the same recording is not re-transcribed on every rerun.
+         if audio_bytes and audio_bytes != st.session_state.get("last_recorded_audio"):
+             st.session_state["last_recorded_audio"] = audio_bytes
+             with open("recorded_audio.wav", "wb") as f:
+                 f.write(audio_bytes)
+             process_audio(audio_bytes, text_input)
+         elif audio_input and text_input:
+             with st.spinner("Processing..."):
+                 process_audio(audio_input, text_input)
+
+     elif option == "Video":
+         text_input = st.text_input("Video Prompt:", value="Summarize this video and its transcription in Markdown.")
+         video_input = st.file_uploader("Upload a video file (max 200MB)", type=["mp4"], accept_multiple_files=False)
+         if video_input and text_input:
+             if video_input.size > 200 * 1024 * 1024:
+                 st.error("Video exceeds 200MB limit.")
+             else:
+                 with st.spinner("Processing..."):
+                     process_audio_and_video(video_input)
+
+     elif option == "ArXiv Search":
+         query = st.text_input("AI Search ArXiv Scholarly Articles:")
+         # Guard so the same query is not re-submitted on every rerun.
+         if query and query != st.session_state.get("last_arxiv_query"):
+             st.session_state["last_arxiv_query"] = query
+             with st.spinner("Searching ArXiv..."):
+                 search_arxiv(query)
+
+     elif option == "RAG PDF Gallery":
+         rag_pdf_gallery()
+
+     # Chat Display and Input
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+
+     if prompt := st.chat_input("GPT-4o Multimodal ChatBot - What can I help you with?"):
+         process_text(prompt)
+
+ FileSidebar()
+ main()