maguid28 committed on
Commit 13d3de7 · 1 Parent(s): 024a632

Implemented smolagent tool

README.md CHANGED
@@ -1,14 +1,83 @@
  ---
- title: TranscriptTool
- emoji: 🌖
- colorFrom: gray
- colorTo: gray
  sdk: gradio
- sdk_version: 5.13.0
  app_file: app.py
  pinned: false
  license: apache-2.0
- short_description: 'smolagent tool to transcribe audio/video to text '
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: AudioTranscriptSmolagentTool
+ emoji: 💬
+ colorFrom: yellow
+ colorTo: purple
  sdk: gradio
+ sdk_version: 5.12.0
  app_file: app.py
  pinned: false
  license: apache-2.0
+ short_description: smolagent tool to transcribe audio & video files
  ---
 
+ # TranscriptTool: A SmolAgent Tool for Audio/Video Transcription
+
+ ## Overview
+
+ `TranscriptTool` is a SmolAgent tool that transcribes audio and video files into text. Built on OpenAI's Whisper and `ffmpeg`, it lets agents process multimedia inputs efficiently. It supports robust file handling, including format conversion to WAV, dynamic device selection (CPU or GPU), and straightforward use within smolagents via the Hugging Face Hub.
+
+ The repository contains three main components:
+ - **`transcription_tool.py`**: The core smolagent tool for transcription.
+ - **`app.py`**: A Gradio-powered web app to test and use the tool interactively.
+ - **`example_smolagent.py`**: A toy demonstration of how the tool operates within a smolagent framework.
+
+ ---
+
+ ## Installation
+
+ 1. Clone this repository:
+ ```bash
+ git clone https://huggingface.co/spaces/maguid28/TranscriptTool
+ cd TranscriptTool
+ ```
+ 2. Install dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ---
+
+ ## Usage
+
+ ### Testing with Gradio (app.py)
+
+ To quickly test the transcription tool, run the provided Gradio app:
+ ```bash
+ python app.py
+ ```
+
+ This launches a local Gradio interface. Upload an audio or video file to transcribe it directly.
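+
+ The same transcription can also be run directly in Python. The minimal sketch below mirrors what `app.py` does under the hood; the file name is only an illustrative placeholder:
+
+ ```python
+ from transcription_tool import TranscriptTool
+
+ tool = TranscriptTool()
+ # forward() converts the input to 16 kHz mono WAV with ffmpeg, then runs Whisper on it
+ text = tool.forward("example_audio.opus")
+ print(text)
+ ```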
+
+ ### Running the example SmolAgent (example_smolagent.py)
+
+ To see how TranscriptTool operates within a smolagent framework:
+ ```bash
+ python example_smolagent.py
+ ```
+
+ ### Access via the Hugging Face Hub
+
+ `TranscriptTool` is also available as a shared tool on the Hugging Face Hub.
+
+ #### How to Use the Tool from the Hub
+
+ 1. **Install smolagents**
+
+ Ensure the smolagents library is installed:
+ ```bash
+ pip install smolagents
+ ```
+ 2. **Load the Tool from the Hugging Face Hub**
+
+ You can load the tool directly from the Hub:
+
+ ```python
+ from smolagents import load_tool
+ transcription_tool = load_tool("maguid28/TranscriptTool", trust_remote_code=True)
+ ```
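+
+ Once loaded, the tool can be passed to an agent like any other smolagents tool. A minimal sketch is shown below; the model ID and task follow `example_smolagent.py` and are only illustrative:
+
+ ```python
+ from smolagents import CodeAgent, HfApiModel, load_tool
+
+ transcription_tool = load_tool("maguid28/TranscriptTool", trust_remote_code=True)
+ agent = CodeAgent(tools=[transcription_tool], model=HfApiModel("HuggingFaceH4/zephyr-7b-alpha"))
+ print(agent.run("Transcribe example_audio.opus and summarise it in one sentence."))
+ ```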
+ ---
+
+ ## License
+ This project is licensed under the Apache-2.0 License. See the LICENSE file for more details.
+
+ ---
+
+ ## Contributing
+ Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.
app.py ADDED
@@ -0,0 +1,40 @@
+ import gradio as gr
+ from logging_config import log_buffer
+ from transcription_tool import TranscriptTool  # TranscriptTool is defined in transcription_tool.py
+
+ # smolagent transcription tool
+ transcript_tool = TranscriptTool()
+
+
+ def transcribe_and_stream_logs(file):
+     # Use the path to the uploaded file
+     temp_file_path = file.name
+
+     # Perform transcription
+     transcription_result = transcript_tool.forward(temp_file_path)
+
+     # Stream logs
+     log_buffer.seek(0)
+     logs = log_buffer.read()
+
+     return transcription_result, logs
+
+
+ with gr.Blocks() as app:
+     gr.Markdown("# TranscriptTool: Transcribe Audio/Video")
+     gr.Markdown("TranscriptTool is a smolagent tool used to transcribe audio and video files into text. Leveraging OpenAI's Whisper and `ffmpeg`, this tool empowers agents to process multimedia inputs efficiently. It supports robust file handling, dynamic device selection (CPU or GPU), and easy use within smolagents via the Hugging Face API.")
+
+     file_input = gr.File(label="Upload Audio/Video File", file_types=["audio", "video"])
+     transcribe_button = gr.Button("Transcribe")
+
+     transcription_output = gr.Textbox(label="Transcription", lines=10)
+     log_output = gr.Textbox(label="Logs", lines=15)
+
+     transcribe_button.click(
+         fn=transcribe_and_stream_logs,
+         inputs=file_input,
+         outputs=[transcription_output, log_output]
+     )
+
+ if __name__ == "__main__":
+     app.launch()
ffmpeg_setup.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ import stat
+ import shutil
+ import subprocess
+ import imageio_ffmpeg
+ from logging_config import logger
+
+
+ def is_ffmpeg_in_path() -> bool:
+     try:
+         subprocess.run(
+             ["ffmpeg", "-version"],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             check=True
+         )
+         return True
+     except (subprocess.CalledProcessError, FileNotFoundError):
+         return False
+
+
+ def ensure_ffmpeg_in_path():
+
+     if is_ffmpeg_in_path():
+         logger.info("FFmpeg is already available in PATH.")
+         return
+
+     try:
+         ffmpeg_path_original = imageio_ffmpeg.get_ffmpeg_exe()
+         ffmpeg_dir = os.path.dirname(ffmpeg_path_original)
+         binary_name = os.path.basename(ffmpeg_path_original)
+
+         logger.info(f"imageio-ffmpeg reported path: {ffmpeg_path_original}")
+         logger.info(f"Directory contents: {os.listdir(ffmpeg_dir)}")
+         logger.info(f"Binary name: {binary_name}")
+
+         expected_binary_name = "ffmpeg"
+         copied_path = os.path.join(ffmpeg_dir, expected_binary_name)
+
+         if not os.path.exists(copied_path):
+             logger.info(f"Copying {binary_name} to {expected_binary_name} in {ffmpeg_dir}.")
+             shutil.copy2(ffmpeg_path_original, copied_path)
+             st = os.stat(copied_path)
+             os.chmod(copied_path, st.st_mode | stat.S_IEXEC)
+         else:
+             logger.info(f"'{copied_path}' already exists; skipping copy.")
+
+         # Add directory to PATH
+         os.environ["PATH"] = ffmpeg_dir + os.pathsep + os.environ["PATH"]
+         logger.info(f"PATH updated to include: {ffmpeg_dir}")
+
+         if is_ffmpeg_in_path():
+             logger.info("FFmpeg is now accessible in PATH.")
+         else:
+             logger.warning("FFmpeg is still not found in PATH after attempting to add it.")
+             raise RuntimeError("Failed to make FFmpeg accessible in PATH.")
+     except Exception as e:
+         logger.error(f"Failed to ensure FFmpeg is in PATH: {str(e)}")
+         raise RuntimeError("Failed to ensure FFmpeg is in PATH.") from e
logging_config.py ADDED
@@ -0,0 +1,20 @@
+ import logging
+ import io
+
+ # StringIO buffer captures logs for streaming
+ log_buffer = io.StringIO()
+
+ logger = logging.getLogger("transcription_logger")
+ logger.setLevel(logging.DEBUG)
+
+ # Log handler that writes to the StringIO buffer
+ log_handler = logging.StreamHandler(log_buffer)
+ log_handler.setLevel(logging.DEBUG)
+
+ # Formatter for the logs
+ formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+ log_handler.setFormatter(formatter)
+
+ logger.addHandler(log_handler)
+
+ logger.propagate = False
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ huggingface_hub==0.25.2
+ gradio==5.12.0
+ youtube-transcript-api==0.6.3
+ yt-dlp==2025.1.15
+ transformers==4.48.1
+ torch==2.2.2
+ imageio-ffmpeg==0.6.0
+ numpy==1.24.3
+ smolagents==1.4.1
smolagent_example/example_smolagent.py ADDED
@@ -0,0 +1,43 @@
+ from smolagents import CodeAgent, HfApiModel
+ from transcription_tool import TranscriptTool
+ from huggingface_hub import login
+ from config import HUGGINGFACE_API_KEY
+
+ login(HUGGINGFACE_API_KEY)
+
+ #model = HfApiModel("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
+ #model = HfApiModel("deepseek-ai/DeepSeek-R1-Zero")
+ #model = HfApiModel("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
+ #model = HfApiModel("deepseek-ai/deepseek-llm-7b-chat")
+ model = HfApiModel("HuggingFaceH4/zephyr-7b-alpha")
+
+ # Smolagent transcription tool
+ transcript_tool = TranscriptTool()
+
+ agent = CodeAgent(
+     tools=[transcript_tool],
+     model=model,
+     additional_authorized_imports=["string", "subprocess", "librosa", "os", "unicodedata", "datetime", "math", "time", "collections", "random", "itertools"],
+ )
+
+ task = """You are a Python assistant that can use preloaded tools to complete tasks.
+
+ A tool named `TranscriptTool` is available for transcription tasks.
+
+ To transcribe an audio file:
+ - Call `TranscriptTool`.
+ - Do not attempt to import or initialize any other class or module for transcription.
+
+ Do not add any imports.
+
+ The video file is example_video.mp4.
+
+ The audio file is example_audio.opus.
+
+ Compare the transcript strings and tell me if they are the same.
+
+ Last line of code: final_answer(f"Are transcriptions the same? {video_transcription==audio_transcription}! Video transcription: {video_transcription}, Audio transcription: {audio_transcription}")
+
+ """
+ result = agent.run(task)
+ print("Agent Result:", result)
transcription.py ADDED
@@ -0,0 +1,50 @@
+ import subprocess
+ import torch
+ from transformers import pipeline
+ from logging_config import logger, log_buffer
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+ def convert_audio_to_wav(input_file: str, output_file: str, ffmpeg_path: str) -> str:
+     logger.info(f"Converting {input_file} to WAV format: {output_file}")
+     cmd = [
+         ffmpeg_path,
+         "-y",            # Overwrite output files without asking
+         "-i", input_file,
+         "-ar", "16000",  # Set audio sampling rate to 16kHz
+         "-ac", "1",      # Set number of audio channels to mono
+         output_file
+     ]
+     try:
+         subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+         logger.info("Audio conversion to WAV completed successfully.")
+         return output_file
+     except subprocess.CalledProcessError as e:
+         ffmpeg_error = e.stderr.decode()
+         logger.error(f"ffmpeg error: {ffmpeg_error}")
+         raise RuntimeError("Failed to convert audio to WAV.") from e
+
+
+ def run_whisper_transcription(wav_file_path: str, device: str):
+     try:
+         asr_pipeline = pipeline(
+             "automatic-speech-recognition",
+             model="openai/whisper-small",
+             device=0 if device == "cuda" else -1,
+             return_timestamps=True,
+             generate_kwargs={"task": "transcribe", "language": "en"}
+         )
+         logger.info("Whisper ASR pipeline initialised.")
+         logger.info("Starting transcription...")
+
+         # Perform transcription
+         result = asr_pipeline(wav_file_path)
+         transcription = result.get("text", "")
+         logger.info("Transcription completed successfully.")
+
+         yield transcription, log_buffer.getvalue()
+     except Exception as e:
+         err_msg = f"Error during transcription: {str(e)}"
+         logger.error(err_msg)
+         yield err_msg, log_buffer.getvalue()
transcription_tool.py ADDED
@@ -0,0 +1,91 @@
+ from smolagents import Tool
+ import os
+ import tempfile
+ import shutil
+ import torch
+ import subprocess
+ from transcription import run_whisper_transcription
+ from logging_config import logger, log_buffer
+ from ffmpeg_setup import ensure_ffmpeg_in_path
+
+
+ class TranscriptTool(Tool):
+     name = "TranscriptTool"
+     description = """
+     A smolagent tool for transcribing audio and video files into text. This tool utilises Whisper for transcription
+     and ffmpeg for media conversion, enabling agents to process multimedia inputs into text. The tool supports robust
+     file handling, including format conversion to WAV and dynamic device selection for optimal performance.
+     """
+     inputs = {
+         "file_path": {
+             "type": "string",
+             "description": "Path to the audio or video file for transcription."
+         }
+     }
+     output_type = "string"
+
+     def __init__(self, audio_directory=None):
+         super().__init__()
+         ensure_ffmpeg_in_path()
+         self.audio_directory = audio_directory or os.getcwd()
+
+     def locate_audio_file(self, file_name):
+         for root, _, files in os.walk(self.audio_directory):
+             if file_name in files:
+                 return os.path.join(root, file_name)
+         return None
+
+     def convert_audio_to_wav(self, input_file: str, output_file: str, ffmpeg_path: str) -> str:
+         logger.info(f"Converting {input_file} to WAV format: {output_file}")
+         cmd = [
+             ffmpeg_path,
+             "-y",            # Overwrite output files without asking
+             "-i", input_file,
+             "-ar", "16000",  # Set audio sampling rate to 16kHz
+             "-ac", "1",      # Set number of audio channels to mono
+             output_file
+         ]
+         try:
+             subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+             logger.info("Audio conversion to WAV completed successfully.")
+             return output_file
+         except subprocess.CalledProcessError as e:
+             ffmpeg_error = e.stderr.decode()
+             logger.error(f"ffmpeg error: {ffmpeg_error}")
+             raise RuntimeError("Failed to convert audio to WAV.") from e
+
+     def forward(self, file_path: str) -> str:
+         log_buffer.seek(0)
+         log_buffer.truncate()
+
+         try:
+             # Locate the file if it does not exist
+             if not os.path.isfile(file_path):
+                 file_name = os.path.basename(file_path)
+                 file_path = self.locate_audio_file(file_name)
+                 if not file_path:
+                     return f"Error: File '{file_name}' not found in '{self.audio_directory}'."
+
+             with tempfile.TemporaryDirectory() as tmpdir:
+                 # Copy file to temp dir
+                 filename = os.path.basename(file_path)
+                 input_file_path = os.path.join(tmpdir, filename)
+                 shutil.copy(file_path, input_file_path)
+
+                 # Convert to wav
+                 wav_file_path = os.path.join(tmpdir, "converted_audio.wav")
+                 ffmpeg_path = shutil.which("ffmpeg")
+                 if not ffmpeg_path:
+                     raise RuntimeError("ffmpeg is not accessible in PATH.")
+                 self.convert_audio_to_wav(input_file_path, wav_file_path, ffmpeg_path)
+
+                 device = "cuda" if torch.cuda.is_available() else "cpu"
+
+                 # Transcribe audio
+                 transcription_generator = run_whisper_transcription(wav_file_path, device)
+                 for transcription, _ in transcription_generator:
+                     return transcription
+
+         except Exception as e:
+             logger.error(f"Error in transcription: {str(e)}")
+             return f"An error occurred: {str(e)}"