Spaces:

maguid28
/

YoutubeTranscriptTool

Running

File size: 3,248 Bytes

4f48868

import os
import subprocess
import tempfile
import yt_dlp
import torch
from transformers import pipeline
from logging_config import logger, log_buffer

# Inference device name handed to the transformers pipeline below:
# use the GPU when torch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


def convert_audio_to_wav(input_file: str, output_file: str) -> str:
    """Re-encode a media file as 16 kHz mono audio with the ``ffmpeg`` CLI.

    ffmpeg chooses the output container from ``output_file``'s name, so
    callers are expected to pass a ``.wav`` path.

    Args:
        input_file: Path of the media file to read.
        output_file: Path to write; an existing file is overwritten (``-y``).

    Returns:
        ``output_file``, unchanged, for convenient chaining.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    logger.info(f"Converting {input_file} to WAV: {output_file}")

    overwrite_flag = ["-y"]
    source_args = ["-i", input_file]
    sample_rate_args = ["-ar", "16000"]  # resample to 16 kHz
    channel_args = ["-ac", "1"]  # downmix to a single (mono) channel

    subprocess.run(
        [
            "ffmpeg",
            *overwrite_flag,
            *source_args,
            *sample_rate_args,
            *channel_args,
            output_file,
        ],
        check=True,
    )
    return output_file


def fallback_whisper_transcription(youtube_url: str):
    """Transcribe a video's audio with Whisper, streaming progress as it goes.

    This is a generator. Every item it yields is a ``(transcript, logs)``
    pair in which ``logs`` is the current ``log_buffer.getvalue()``.
    ``transcript`` is ``""`` for intermediate progress updates; the final
    item carries either the transcription text or, if any step failed, an
    error message beginning with ``"Error in fallback transcription: "``.

    Steps performed inside a temporary directory (removed on exit):
      1. Download the best available audio stream with yt-dlp.
      2. Convert it with ``convert_audio_to_wav``.
      3. Run the ``openai/whisper-small`` ASR pipeline on the converted file.

    Args:
        youtube_url: URL handed to yt-dlp. If it also references a playlist,
            only the single video is downloaded.

    Yields:
        ``(transcript, logs)`` pairs as described above.

    Exceptions derived from ``Exception`` are not propagated; they are
    logged and reported through the final yielded pair instead.
    """
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Create temp dir
            logger.info("")
            logger.info(f"Created temporary directory: {tmpdir}")
            logger.info("")
            yield "", log_buffer.getvalue()

            # Download best audio
            logger.info("Downloading best audio via yt-dlp...")
            logger.info("")
            yield "", log_buffer.getvalue()

            download_path = os.path.join(tmpdir, "audio.%(ext)s")
            ydl_opts = {
                'format': 'bestaudio/best',
                'outtmpl': download_path,
                'quiet': True,
                'postprocessors': [],
                # A watch URL carrying "&list=..." would otherwise make
                # yt-dlp walk the whole playlist; we want one video only.
                'noplaylist': True,
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([youtube_url])

            logger.info("Audio downloaded. Locating the audio file in the temp folder...")
            logger.info("")
            yield "", log_buffer.getvalue()

            # Confirm the audio file. Only accept names produced by the
            # output template and skip yt-dlp's partial-download artefacts,
            # sorting so the choice does not depend on directory order.
            downloaded_files = sorted(
                name
                for name in os.listdir(tmpdir)
                if name.startswith("audio.")
                and not name.endswith((".part", ".ytdl"))
            )
            if not downloaded_files:
                raise RuntimeError("No audio file was downloaded via yt-dlp.")

            audio_file_path = os.path.join(tmpdir, downloaded_files[0])
            logger.info(f"Found audio file: {audio_file_path}")
            logger.info("Video has downloaded!")
            logger.info("")
            yield "", log_buffer.getvalue()

            # Convert to wav. The target name deliberately differs from the
            # "audio.<ext>" download template: if the source is already
            # "audio.wav", ffmpeg would refuse to read and write one path.
            wav_file_path = os.path.join(tmpdir, "converted.wav")
            convert_audio_to_wav(audio_file_path, wav_file_path)
            logger.info("Audio converted to WAV successfully.")
            logger.info("")
            yield "", log_buffer.getvalue()

            # Run whisper
            logger.info("Running Whisper ASR pipeline on the WAV file...")
            logger.info("")
            yield "", log_buffer.getvalue()

            asr_pipeline = pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-small",
                return_timestamps=True,
                device=device,
                generate_kwargs={"task": "transcribe", "language": "<|en|>"}
            )
            result = asr_pipeline(inputs=wav_file_path)
            transcription = result["text"]

            logger.info("Whisper transcription completed successfully.")
            logger.info("")
            yield transcription, log_buffer.getvalue()

    except Exception as e:
        # Top-level boundary for the streaming UI: report the failure as the
        # final yielded item rather than raising into the consumer.
        err_msg = f"Error in fallback transcription: {str(e)}"
        logger.error(err_msg)
        yield err_msg, log_buffer.getvalue()