maguid28 committed on
Commit 13d3de7 · 1 Parent(s): 024a632

Implemented smolagent tool

README.md CHANGED
@@ -1,14 +1,83 @@
  ---
- title: TranscriptTool
- emoji: 🌖
- colorFrom: gray
- colorTo: gray
  sdk: gradio
- sdk_version: 5.13.0
  app_file: app.py
  pinned: false
  license: apache-2.0
- short_description: 'smolagent tool to transcribe audio/video to text '
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: AudioTranscriptSmolagentTool
+ emoji: 💬
+ colorFrom: yellow
+ colorTo: purple
  sdk: gradio
+ sdk_version: 5.12.0
  app_file: app.py
  pinned: false
  license: apache-2.0
+ short_description: smolagent tool to transcribe audio & video files
  ---
 
+ # TranscriptTool: A SmolAgent Tool for Audio/Video Transcription
+
+ ## Overview
+
+ `TranscriptTool` is a SmolAgent tool that transcribes audio and video files into text. Built on OpenAI's Whisper and `ffmpeg`, it lets agents process multimedia inputs efficiently. It supports robust file handling, including format conversion to WAV, dynamic device selection (CPU or GPU), and straightforward use within smolagents via the Hugging Face Hub.
+
+ The repository contains three main components:
+ - **`transcription_tool.py`**: The core smolagent tool for transcription.
+ - **`app.py`**: A Gradio-powered web app to test and use the tool interactively.
+ - **`example_smolagent.py`**: A toy demonstration of how the tool operates within a smolagent framework.
+
+ ---
+
+ ## Installation
+
+ 1. Clone this repository:
+ ```bash
+ git clone https://huggingface.co/spaces/maguid28/TranscriptTool
+ cd TranscriptTool
+ ```
+ 2. Install dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ---
+
+ ## Usage
+
+ ### Testing with Gradio (app.py)
+
+ To quickly test the transcription tool, run the provided Gradio app:
+ ```bash
+ python app.py
+ ```
+
+ This launches a local Gradio interface. Upload an audio or video file to transcribe it directly.
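+
+ The same transcription can also be run directly in Python. The minimal sketch below mirrors what `app.py` does under the hood; the file name is only an illustrative placeholder:
+
+ ```python
+ from transcription_tool import TranscriptTool
+
+ tool = TranscriptTool()
+ # forward() converts the input to 16 kHz mono WAV with ffmpeg, then runs Whisper on it
+ text = tool.forward("example_audio.opus")
+ print(text)
+ ```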
+
+ ### Running the example SmolAgent (example_smolagent.py)
+
+ To see how TranscriptTool operates within a smolagent framework:
+ ```bash
+ python example_smolagent.py
+ ```
+
+ ### Access via the Hugging Face Hub
+
+ `TranscriptTool` is also available as a shared tool on the Hugging Face Hub.
+
+ #### How to Use the Tool from the Hub
+
+ 1. **Install smolagents**
+
+ Ensure the smolagents library is installed:
+ ```bash
+ pip install smolagents
+ ```
+ 2. **Load the Tool from the Hugging Face Hub**
+
+ You can load the tool directly from the Hub:
+
+ ```python
+ from smolagents import load_tool
+ transcription_tool = load_tool("maguid28/TranscriptTool", trust_remote_code=True)
+ ```
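+
+ Once loaded, the tool can be passed to an agent like any other smolagents tool. A minimal sketch is shown below; the model ID and task follow `example_smolagent.py` and are only illustrative:
+
+ ```python
+ from smolagents import CodeAgent, HfApiModel, load_tool
+
+ transcription_tool = load_tool("maguid28/TranscriptTool", trust_remote_code=True)
+ agent = CodeAgent(tools=[transcription_tool], model=HfApiModel("HuggingFaceH4/zephyr-7b-alpha"))
+ print(agent.run("Transcribe example_audio.opus and summarise it in one sentence."))
+ ```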
+ ---
+
+ ## License
+ This project is licensed under the Apache-2.0 License. See the LICENSE file for more details.
+
+ ---
+
+ ## Contributing
+ Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.
app.py ADDED
@@ -0,0 +1,40 @@
+ import gradio as gr
+ from logging_config import log_buffer
+ from transcription_tool import TranscriptTool  # TranscriptTool is defined in transcription_tool.py
+
+ # smolagent transcription tool
+ transcript_tool = TranscriptTool()
+
+
+ def transcribe_and_stream_logs(file):
+     # Use the path to the uploaded file
+     temp_file_path = file.name
+
+     # Perform transcription
+     transcription_result = transcript_tool.forward(temp_file_path)
+
+     # Stream logs
+     log_buffer.seek(0)
+     logs = log_buffer.read()
+
+     return transcription_result, logs
+
+
+ with gr.Blocks() as app:
+     gr.Markdown("# TranscriptTool: Transcribe Audio/Video")
+     gr.Markdown("TranscriptTool is a smolagent tool used to transcribe audio and video files into text. Leveraging OpenAI's Whisper and `ffmpeg`, this tool empowers agents to process multimedia inputs efficiently. It supports robust file handling, dynamic device selection (CPU or GPU), and easy use within smolagents via the Hugging Face API.")
+
+     file_input = gr.File(label="Upload Audio/Video File", file_types=["audio", "video"])
+     transcribe_button = gr.Button("Transcribe")
+
+     transcription_output = gr.Textbox(label="Transcription", lines=10)
+     log_output = gr.Textbox(label="Logs", lines=15)
+
+     transcribe_button.click(
+         fn=transcribe_and_stream_logs,
+         inputs=file_input,
+         outputs=[transcription_output, log_output]
+     )
+
+ if __name__ == "__main__":
+     app.launch()
ffmpeg_setup.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ import stat
+ import shutil
+ import subprocess
+ import imageio_ffmpeg
+ from logging_config import logger
+
+
+ def is_ffmpeg_in_path() -> bool:
+     try:
+         subprocess.run(
+             ["ffmpeg", "-version"],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             check=True
+         )
+         return True
+     except (subprocess.CalledProcessError, FileNotFoundError):
+         return False
+
+
+ def ensure_ffmpeg_in_path():
+
+     if is_ffmpeg_in_path():
+         logger.info("FFmpeg is already available in PATH.")
+         return
+
+     try:
+         ffmpeg_path_original = imageio_ffmpeg.get_ffmpeg_exe()
+         ffmpeg_dir = os.path.dirname(ffmpeg_path_original)
+         binary_name = os.path.basename(ffmpeg_path_original)
+
+         logger.info(f"imageio-ffmpeg reported path: {ffmpeg_path_original}")
+         logger.info(f"Directory contents: {os.listdir(ffmpeg_dir)}")
+         logger.info(f"Binary name: {binary_name}")
+
+         expected_binary_name = "ffmpeg"
+         copied_path = os.path.join(ffmpeg_dir, expected_binary_name)
+
+         if not os.path.exists(copied_path):
+             logger.info(f"Copying {binary_name} to {expected_binary_name} in {ffmpeg_dir}.")
+             shutil.copy2(ffmpeg_path_original, copied_path)
+             st = os.stat(copied_path)
+             os.chmod(copied_path, st.st_mode | stat.S_IEXEC)
+         else:
+             logger.info(f"'{copied_path}' already exists; skipping copy.")
+
+         # Add directory to PATH
+         os.environ["PATH"] = ffmpeg_dir + os.pathsep + os.environ["PATH"]
+         logger.info(f"PATH updated to include: {ffmpeg_dir}")
+
+         if is_ffmpeg_in_path():
+             logger.info("FFmpeg is now accessible in PATH.")
+         else:
+             logger.warning("FFmpeg is still not found in PATH after attempting to add it.")
+             raise RuntimeError("Failed to make FFmpeg accessible in PATH.")
+     except Exception as e:
+         logger.error(f"Failed to ensure FFmpeg is in PATH: {str(e)}")
+         raise RuntimeError("Failed to ensure FFmpeg is in PATH.") from e
logging_config.py ADDED
@@ -0,0 +1,20 @@
+ import logging
+ import io
+
+ # StringIO buffer captures logs for streaming
+ log_buffer = io.StringIO()
+
+ logger = logging.getLogger("transcription_logger")
+ logger.setLevel(logging.DEBUG)
+
+ # Log handler that writes to the StringIO buffer
+ log_handler = logging.StreamHandler(log_buffer)
+ log_handler.setLevel(logging.DEBUG)
+
+ # Formatter for the logs
+ formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+ log_handler.setFormatter(formatter)
+
+ logger.addHandler(log_handler)
+
+ logger.propagate = False
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ huggingface_hub==0.25.2
+ gradio==5.12.0
+ youtube-transcript-api==0.6.3
+ yt-dlp==2025.1.15
+ transformers==4.48.1
+ torch==2.2.2
+ imageio-ffmpeg==0.6.0
+ numpy==1.24.3
+ smolagents==1.4.1
smolagent_example/example_smolagent.py ADDED
@@ -0,0 +1,43 @@
+ from smolagents import CodeAgent, HfApiModel
+ from transcription_tool import TranscriptTool
+ from huggingface_hub import login
+ from config import HUGGINGFACE_API_KEY
+
+ login(HUGGINGFACE_API_KEY)
+
+ #model = HfApiModel("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
+ #model = HfApiModel("deepseek-ai/DeepSeek-R1-Zero")
+ #model = HfApiModel("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
+ #model = HfApiModel("deepseek-ai/deepseek-llm-7b-chat")
+ model = HfApiModel("HuggingFaceH4/zephyr-7b-alpha")
+
+ # Smolagent transcription tool
+ transcript_tool = TranscriptTool()
+
+ agent = CodeAgent(
+     tools=[transcript_tool],
+     model=model,
+     additional_authorized_imports=["string", "subprocess", "librosa", "os", "unicodedata", "datetime", "math", "time", "collections", "random", "itertools"],
+ )
+
+ task = """You are a Python assistant that can use preloaded tools to complete tasks.
+
+ A tool named `TranscriptTool` is available for transcription tasks.
+
+ To transcribe an audio file:
+ - Call `TranscriptTool`.
+ - Do not attempt to import or initialize any other class or module for transcription.
+
+ Do not add any imports.
+
+ The video file is example_video.mp4.
+
+ The audio file is example_audio.opus.
+
+ Compare the transcript strings and tell me if they are the same.
+
+ Last line of code: final_answer(f"Are transcriptions the same? {video_transcription==audio_transcription}! Video transcription: {video_transcription}, Audio transcription: {audio_transcription}")
+
+ """
+ result = agent.run(task)
+ print("Agent Result:", result)
transcription.py ADDED
@@ -0,0 +1,50 @@
+ import subprocess
+ import torch
+ from transformers import pipeline
+ from logging_config import logger, log_buffer
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+ def convert_audio_to_wav(input_file: str, output_file: str, ffmpeg_path: str) -> str:
+     logger.info(f"Converting {input_file} to WAV format: {output_file}")
+     cmd = [
+         ffmpeg_path,
+         "-y",            # Overwrite output files without asking
+         "-i", input_file,
+         "-ar", "16000",  # Set audio sampling rate to 16kHz
+         "-ac", "1",      # Set number of audio channels to mono
+         output_file
+     ]
+     try:
+         subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+         logger.info("Audio conversion to WAV completed successfully.")
+         return output_file
+     except subprocess.CalledProcessError as e:
+         ffmpeg_error = e.stderr.decode()
+         logger.error(f"ffmpeg error: {ffmpeg_error}")
+         raise RuntimeError("Failed to convert audio to WAV.") from e
+
+
+ def run_whisper_transcription(wav_file_path: str, device: str):
+     try:
+         asr_pipeline = pipeline(
+             "automatic-speech-recognition",
+             model="openai/whisper-small",
+             device=0 if device == "cuda" else -1,
+             return_timestamps=True,
+             generate_kwargs={"task": "transcribe", "language": "en"}
+         )
+         logger.info("Whisper ASR pipeline initialised.")
+         logger.info("Starting transcription...")
+
+         # Perform transcription
+         result = asr_pipeline(wav_file_path)
+         transcription = result.get("text", "")
+         logger.info("Transcription completed successfully.")
+
+         yield transcription, log_buffer.getvalue()
+     except Exception as e:
+         err_msg = f"Error during transcription: {str(e)}"
+         logger.error(err_msg)
+         yield err_msg, log_buffer.getvalue()
transcription_tool.py ADDED
@@ -0,0 +1,91 @@
+ from smolagents import Tool
+ import os
+ import tempfile
+ import shutil
+ import torch
+ import subprocess
+ from transcription import run_whisper_transcription
+ from logging_config import logger, log_buffer
+ from ffmpeg_setup import ensure_ffmpeg_in_path
+
+
+ class TranscriptTool(Tool):
+     name = "TranscriptTool"
+     description = """
+     A smolagent tool for transcribing audio and video files into text. This tool utilises Whisper for transcription
+     and ffmpeg for media conversion, enabling agents to process multimedia inputs into text. The tool supports robust
+     file handling, including format conversion to WAV and dynamic device selection for optimal performance.
+     """
+     inputs = {
+         "file_path": {
+             "type": "string",
+             "description": "Path to the audio or video file for transcription."
+         }
+     }
+     output_type = "string"
+
+     def __init__(self, audio_directory=None):
+         super().__init__()
+         ensure_ffmpeg_in_path()
+         self.audio_directory = audio_directory or os.getcwd()
+
+     def locate_audio_file(self, file_name):
+         for root, _, files in os.walk(self.audio_directory):
+             if file_name in files:
+                 return os.path.join(root, file_name)
+         return None
+
+     def convert_audio_to_wav(self, input_file: str, output_file: str, ffmpeg_path: str) -> str:
+         logger.info(f"Converting {input_file} to WAV format: {output_file}")
+         cmd = [
+             ffmpeg_path,
+             "-y",            # Overwrite output files without asking
+             "-i", input_file,
+             "-ar", "16000",  # Set audio sampling rate to 16kHz
+             "-ac", "1",      # Set number of audio channels to mono
+             output_file
+         ]
+         try:
+             subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+             logger.info("Audio conversion to WAV completed successfully.")
+             return output_file
+         except subprocess.CalledProcessError as e:
+             ffmpeg_error = e.stderr.decode()
+             logger.error(f"ffmpeg error: {ffmpeg_error}")
+             raise RuntimeError("Failed to convert audio to WAV.") from e
+
+     def forward(self, file_path: str) -> str:
+         log_buffer.seek(0)
+         log_buffer.truncate()
+
+         try:
+             # Locate the file if it does not exist
+             if not os.path.isfile(file_path):
+                 file_name = os.path.basename(file_path)
+                 file_path = self.locate_audio_file(file_name)
+                 if not file_path:
+                     return f"Error: File '{file_name}' not found in '{self.audio_directory}'."
+
+             with tempfile.TemporaryDirectory() as tmpdir:
+                 # Copy file to temp dir
+                 filename = os.path.basename(file_path)
+                 input_file_path = os.path.join(tmpdir, filename)
+                 shutil.copy(file_path, input_file_path)
+
+                 # Convert to wav
+                 wav_file_path = os.path.join(tmpdir, "converted_audio.wav")
+                 ffmpeg_path = shutil.which("ffmpeg")
+                 if not ffmpeg_path:
+                     raise RuntimeError("ffmpeg is not accessible in PATH.")
+                 self.convert_audio_to_wav(input_file_path, wav_file_path, ffmpeg_path)
+
+                 device = "cuda" if torch.cuda.is_available() else "cpu"
+
+                 # Transcribe audio
+                 transcription_generator = run_whisper_transcription(wav_file_path, device)
+                 for transcription, _ in transcription_generator:
+                     return transcription
+
+         except Exception as e:
+             logger.error(f"Error in transcription: {str(e)}")
+             return f"An error occurred: {str(e)}"