pr/fixes_plus_loading #6
opened by nbpe97
whisper-speaker-diarization/package.json
CHANGED
@@ -10,11 +10,13 @@
     "preview": "vite preview"
   },
   "dependencies": {
-    "@
+    "@huggingface/transformers": "^3.3.1",
+    "prop-types": "^15.8.1",
     "react": "^18.3.1",
     "react-dom": "^18.3.1"
   },
   "devDependencies": {
+    "@rollup/plugin-commonjs": "^28.0.1",
     "@types/react": "^18.3.3",
     "@types/react-dom": "^18.3.0",
     "@vitejs/plugin-react": "^4.3.1",
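The PR adds @rollup/plugin-commonjs as a devDependency, but no vite.config.js change appears in this diff. A minimal sketch of how such a Rollup plugin is typically wired into a Vite + React config; the file below is an assumption for illustration only, not part of this PR:

// vite.config.js — hypothetical wiring of the new devDependency
import { defineConfig } from 'vite';
import react from '@vitejs/plugin-react';
import commonjs from '@rollup/plugin-commonjs';

export default defineConfig({
    // Vite accepts Rollup-compatible plugins; commonjs() helps bundle
    // CommonJS-only packages (e.g. prop-types) when the default handling is not enough.
    plugins: [react(), commonjs()],
});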
whisper-speaker-diarization/src/App.jsx
CHANGED
@@ -1,218 +1,257 @@
+import { useEffect, useState, useRef, useCallback } from 'react';
+
+import Progress from './components/Progress';
+import MediaInput from './components/MediaInput';
+import Transcript from './components/Transcript';
+import LanguageSelector from './components/LanguageSelector';
+
+
+async function hasWebGPU() {
+    if (!navigator.gpu) {
+        return false;
+    }
+    try {
+        const adapter = await navigator.gpu.requestAdapter();
+        return !!adapter;
+    } catch (e) {
+        return false;
+    }
+}
+
+function App() {
+
+    // Create a reference to the worker object.
+    const worker = useRef(null);
+
+    // Model loading and progress
+    const [status, setStatus] = useState(null);
+    const [loadingMessage, setLoadingMessage] = useState('');
+    const [progressItems, setProgressItems] = useState([]);
+
+    const mediaInputRef = useRef(null);
+    const [audio, setAudio] = useState(null);
+    const [language, setLanguage] = useState('en');
+
+    const [result, setResult] = useState(null);
+    const [time, setTime] = useState(null);
+    const [audioLength, setAudioLength] = useState(null);
+    const [currentTime, setCurrentTime] = useState(0);
+
+    const [device, setDevice] = useState('webgpu'); // Try to use WebGPU first
+    const [modelSize, setModelSize] = useState('gpu' in navigator ? 196 : 77); // WebGPU=196MB, WebAssembly=77MB
+    useEffect(() => {
+        hasWebGPU().then((b) => {
+            setModelSize(b ? 196 : 77);
+            setDevice(b ? 'webgpu' : 'wasm');
+        });
+    }, []);
+
+    // Create a callback function for messages from the worker thread.
+    const onMessageReceived = (e) => {
+        switch (e.data.status) {
+            case 'loading':
+                // Models have started loading: show the overlay with a status message.
+                setStatus('loading');
+                setLoadingMessage(e.data.data);
+                break;
+
+            case 'initiate':
+                // A model file has started downloading: add a new progress item to the list.
+                setProgressItems(prev => [...prev, e.data]);
+                break;
+
+            case 'progress':
+                // Model file progress: update one of the progress items.
+                setProgressItems(
+                    prev => prev.map(item => {
+                        if (item.file === e.data.file) {
+                            return { ...item, ...e.data }
+                        }
+                        return item;
+                    })
+                );
+                break;
+
+            case 'done':
+                // Model file loaded: remove the progress item from the list.
+                setProgressItems(
+                    prev => prev.filter(item => item.file !== e.data.file)
+                );
+                break;
+
+            case 'loaded':
+                // Pipeline ready: the worker is ready to accept messages.
+                setStatus('ready');
+                break;
+
+            case 'transcribe-progress': {
+                // Update progress for transcription/diarization
+                const { task, progress, total } = e.data.data;
+                setProgressItems(prev => {
+                    const existingIndex = prev.findIndex(item => item.file === task);
+                    if (existingIndex >= 0) {
+                        return prev.map((item, i) =>
+                            i === existingIndex ? { ...item, progress, total } : item
+                        );
+                    }
+                    const newItem = { file: task, progress, total };
+                    return [...prev, newItem];
+                });
+                break;
+            }
+
+            case 'complete':
+                setResult(e.data.result);
+                setTime(e.data.time);
+                setAudioLength(e.data.audio_length);
+                setStatus('ready');
+                break;
+        }
+    };
+
+    // We use the `useEffect` hook to set up the worker as soon as the `App` component is mounted.
+    useEffect(() => {
+        if (!worker.current) {
+            // Create the worker if it does not yet exist.
+            worker.current = new Worker(new URL('./worker.js', import.meta.url), {
+                type: 'module'
+            });
+        }
+
+        // Attach the callback function as an event listener.
+        worker.current.addEventListener('message', onMessageReceived);
+
+        // Define a cleanup function for when the component is unmounted.
+        return () => {
+            worker.current.removeEventListener('message', onMessageReceived);
+        };
+    }, []);
+
+    const handleClick = useCallback(() => {
+        setResult(null);
+        setTime(null);
+        if (status === null) {
+            setStatus('loading');
+            worker.current.postMessage({ type: 'load', data: { device } });
+        } else {
+            setStatus('running');
+            worker.current.postMessage({
+                type: 'run', data: { audio, language }
+            });
+        }
+    }, [status, audio, language, device]);
+
+    return (
+        <div className="flex flex-col h-screen mx-auto text-gray-800 dark:text-gray-200 bg-white dark:bg-gray-900 max-w-[600px]">
+
+            {(status === 'loading' || status === 'running') && (
+                <div className="flex justify-center items-center fixed w-screen h-screen bg-black z-10 bg-opacity-[92%] top-0 left-0">
+                    <div className="w-[500px]">
+                        <p className="text-center mb-1 text-white text-md">{loadingMessage}</p>
+                        {progressItems
+                            .sort((a, b) => {
+                                // Define the order: transcription -> segmentation -> diarization
+                                const order = { 'transcription': 0, 'segmentation': 1, 'diarization': 2 };
+                                return (order[a.file] ?? 3) - (order[b.file] ?? 3);
+                            })
+                            .map(({ file, progress, total }, i) => (
+                                <Progress
+                                    key={i}
+                                    text={file === 'transcription' ? 'Converting speech to text' :
+                                        file === 'segmentation' ? 'Detecting word timestamps' :
+                                            file === 'diarization' ? 'Identifying speakers' :
+                                                file}
+                                    percentage={progress}
+                                    total={total}
+                                />
+                            ))
+                        }
+                    </div>
+                </div>
+            )}
+            <div className="my-auto">
+                <div className="flex flex-col items-center mb-2 text-center">
+                    <h1 className="text-5xl font-bold mb-2">Whisper Diarization</h1>
+                    <h2 className="text-xl font-semibold">In-browser automatic speech recognition w/ <br />word-level timestamps and speaker segmentation</h2>
+                </div>
+
+                <div className="w-full min-h-[220px] flex flex-col justify-center items-center">
+                    {
+                        !audio && (
+                            <p className="mb-2">
+                                You are about to download <a href="https://huggingface.co/onnx-community/whisper-base_timestamped" target="_blank" rel="noreferrer" className="font-medium underline">whisper-base</a> and <a href="https://huggingface.co/onnx-community/pyannote-segmentation-3.0" target="_blank" rel="noreferrer" className="font-medium underline">pyannote-segmentation-3.0</a>,
+                                two powerful speech recognition models for generating word-level timestamps across 100 different languages and speaker segmentation, respectively.
+                                Once loaded, the models ({modelSize}MB + 6MB) will be cached and reused when you revisit the page.<br />
+                                <br />
+                                Everything runs locally in your browser using <a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline">🤗 Transformers.js</a> and ONNX Runtime Web,
+                                meaning no API calls are made to a server for inference. You can even disconnect from the internet after the model has loaded!
+                            </p>
+                        )
+                    }
+
+                    <div className="flex flex-col w-full m-3 max-w-[520px]">
+                        <span className="text-sm mb-0.5">Input audio/video</span>
+                        <MediaInput
+                            ref={mediaInputRef}
+                            className="flex items-center border rounded-md cursor-pointer min-h-[100px] max-h-[500px] overflow-hidden"
+                            onInputChange={(audio) => {
+                                setResult(null);
+                                setAudio(audio);
+                            }}
+                            onTimeUpdate={(time) => setCurrentTime(time)}
+                            onMessage={onMessageReceived}
+                        />
+                    </div>
+
+                    <div className="relative w-full flex justify-center items-center">
+                        <button
+                            className="border px-4 py-2 rounded-lg bg-blue-400 text-white hover:bg-blue-500 disabled:bg-blue-100 disabled:cursor-not-allowed select-none"
+                            onClick={handleClick}
+                            disabled={status === 'running' || (status !== null && audio === null)}
+                        >
+                            {status === null ? 'Load model' :
+                                status === 'running'
+                                    ? 'Running...'
+                                    : 'Run model'
+                            }
+                        </button>
+
+                        {status !== null &&
+                            <div className='absolute right-0 bottom-0'>
+                                <span className="text-xs">Language:</span>
+                                <br />
+                                <LanguageSelector className="border rounded-lg p-1 max-w-[100px]" language={language} setLanguage={setLanguage} />
+                            </div>
+                        }
+                    </div>
+
+                    {
+                        result && time && (
+                            <>
+                                <div className="w-full mt-4 border rounded-md">
+                                    <Transcript
+                                        className="p-2 max-h-[200px] overflow-y-auto scrollbar-thin select-none"
+                                        transcript={result.transcript}
+                                        segments={result.segments}
+                                        currentTime={currentTime}
+                                        setCurrentTime={(time) => {
+                                            setCurrentTime(time);
+                                            mediaInputRef.current.setMediaTime(time);
+                                        }}
+                                    />
+                                </div>
+                                <p className="text-sm text-end p-1">Generation time:
+                                    <span className="font-semibold">{(time / 1000).toLocaleString()} s</span>
+                                </p>
+                                <p className="text-sm text-end p-1">
+                                    <span className="font-semibold">{(audioLength / (time / 1000)).toFixed(2)}x transcription!</span>
+                                </p>
+                            </>
+                        )
+                    }
+                </div>
+            </div>
+        </div>
+    )
+}
+
+export default App
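Note that the reworked worker.js (below) also posts messages with status 'error' on load or processing failures, which the switch in onMessageReceived above does not handle. A minimal sketch of such a case, assuming a hypothetical errorMessage state hook that is not part of this PR:

            // Hypothetical addition to the switch in onMessageReceived — not part of this PR.
            case 'error':
                // Assumes: const [errorMessage, setErrorMessage] = useState(null);
                setErrorMessage(e.data.error); // error string posted by worker.js on failure
                setStatus(null);               // return to the initial state so the user can retry
                break;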
whisper-speaker-diarization/src/components/MediaInput.jsx
CHANGED
@@ -1,8 +1,8 @@
 import { useState, forwardRef, useRef, useImperativeHandle, useEffect, useCallback } from 'react';
-
+import PropTypes from 'prop-types';
 const EXAMPLE_URL = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/hopper.webm';
 
-const MediaInput = forwardRef(({ onInputChange, onTimeUpdate, ...props }, ref) => {
+const MediaInput = forwardRef(({ onInputChange, onTimeUpdate, onMessage, ...props }, ref) => {
     // UI states
     const [dragging, setDragging] = useState(false);
     const fileInputRef = useRef(null);
@@ -89,7 +89,40 @@ const MediaInput = forwardRef(({ onInputChange, onTimeUpdate, ...props }, ref) =
         const audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16_000 });
 
         try {
+            // Start audio decoding: show the loading overlay and add a progress item
+            onMessage({
+                data: {
+                    status: 'loading',
+                    data: 'Decoding audio buffer...'
+                }
+            });
+
+            onMessage({
+                data: {
+                    status: 'initiate',
+                    name: 'audio-decoder',
+                    file: 'audio-buffer'
+                }
+            });
+
             const audioBuffer = await audioContext.decodeAudioData(buffer);
+
+            // Audio decoding complete: remove the progress item
+            onMessage({
+                data: {
+                    status: 'done',
+                    name: 'audio-decoder',
+                    file: 'audio-buffer'
+                }
+            });
+
+            // Dismiss the loading overlay (App sets its status back to 'ready')
+            onMessage({
+                data: {
+                    status: 'loaded'
+                }
+            });
+
             let audio;
             if (audioBuffer.numberOfChannels === 2) {
                 // Merge channels
@@ -145,8 +178,8 @@ const MediaInput = forwardRef(({ onInputChange, onTimeUpdate, ...props }, ref) =
             onClick={handleClick}
             onDragOver={handleDragOver}
             onDrop={handleDrop}
-            onDragEnter={(
-            onDragLeave={(
+            onDragEnter={() => setDragging(true)}
+            onDragLeave={() => setDragging(false)}
         >
             <input
                 type="file"
@@ -189,6 +222,13 @@ const MediaInput = forwardRef(({ onInputChange, onTimeUpdate, ...props }, ref) =
         </div>
     );
 });
+
+MediaInput.propTypes = {
+    onInputChange: PropTypes.func.isRequired,
+    onTimeUpdate: PropTypes.func.isRequired,
+    onMessage: PropTypes.func.isRequired
+};
+
 MediaInput.displayName = 'MediaInput';
 
 export default MediaInput;
whisper-speaker-diarization/src/components/Progress.jsx
CHANGED
@@ -1,3 +1,5 @@
+import PropTypes from 'prop-types';
+
 function formatBytes(size) {
     const i = size == 0 ? 0 : Math.floor(Math.log(size) / Math.log(1024));
     return +((size / Math.pow(1024, i)).toFixed(2)) * 1 + ['B', 'kB', 'MB', 'GB', 'TB'][i];
@@ -13,3 +15,9 @@ export default function Progress({ text, percentage, total }) {
         </div>
     );
 }
+
+Progress.propTypes = {
+    text: PropTypes.string.isRequired,
+    percentage: PropTypes.number,
+    total: PropTypes.number
+};
whisper-speaker-diarization/src/worker.js
CHANGED
@@ -1,124 +1,272 @@
+
+import { pipeline, AutoProcessor, AutoModelForAudioFrameClassification } from '@huggingface/transformers';
+
+const PER_DEVICE_CONFIG = {
+    webgpu: {
+        dtype: {
+            encoder_model: 'fp32',
+            decoder_model_merged: 'q4',
+        },
+        device: 'webgpu',
+    },
+    wasm: {
+        dtype: 'q8',
+        device: 'wasm',
+    },
+};
+
+/**
+ * This class uses the Singleton pattern to ensure that only one instance of the model is loaded.
+ */
+class PipelineSingeton {
+    static asr_model_id = 'onnx-community/whisper-base_timestamped';
+    static asr_instance = null;
+
+    static segmentation_model_id = 'onnx-community/pyannote-segmentation-3.0';
+    static segmentation_instance = null;
+    static segmentation_processor = null;
+
+    static async getInstance(progress_callback = null, device = 'webgpu') {
+        this.asr_instance ??= pipeline('automatic-speech-recognition', this.asr_model_id, {
+            ...PER_DEVICE_CONFIG[device],
+            progress_callback,
+        });
+
+        this.segmentation_processor ??= AutoProcessor.from_pretrained(this.segmentation_model_id, {
+            progress_callback,
+        });
+        this.segmentation_instance ??= AutoModelForAudioFrameClassification.from_pretrained(this.segmentation_model_id, {
+            // NOTE: WebGPU is not currently supported for this model
+            // See https://github.com/microsoft/onnxruntime/issues/21386
+            device: 'wasm',
+            dtype: 'fp32',
+            progress_callback,
+        });
+
+        return Promise.all([this.asr_instance, this.segmentation_processor, this.segmentation_instance]);
+    }
+}
+
+async function load({ device }) {
+    try {
+        const message = {
+            status: 'loading',
+            data: `Loading models (${device})...`
+        };
+        self.postMessage(message);
+
+        const [transcriber, segmentation_processor, segmentation_model] = await PipelineSingeton.getInstance(x => {
+            // We also add a progress callback to the pipeline so that we can
+            // track model loading.
+            self.postMessage(x);
+        }, device);
+
+        if (device === 'webgpu') {
+            const warmupMessage = {
+                status: 'loading',
+                data: 'Compiling shaders and warming up model...'
+            };
+
+            self.postMessage(warmupMessage);
+
+            await transcriber(new Float32Array(16_000), {
+                language: 'en',
+            });
+        }
+
+        self.postMessage({ status: 'loaded' });
+    } catch (error) {
+        console.error('Loading error:', error);
+        const errorMessage = {
+            status: 'error',
+            error: error.message || 'Failed to load models'
+        };
+        self.postMessage(errorMessage);
+    }
+}
+
+async function segment(processor, model, audio) {
+    try {
+        // Report start of segmentation
+        self.postMessage({
+            status: 'transcribe-progress',
+            data: {
+                task: 'segmentation',
+                progress: 0,
+                total: audio.length
+            }
+        });
+
+        // Extract input features for the segmentation model
+        const inputs = await processor(audio);
+
+        // Report segmentation feature extraction progress
+        self.postMessage({
+            status: 'transcribe-progress',
+            data: {
+                task: 'segmentation',
+                progress: 50,
+                total: audio.length
+            }
+        });
+
+        const { logits } = await model(inputs);
+
+        // Report segmentation completion
+        self.postMessage({
+            status: 'transcribe-progress',
+            data: {
+                task: 'segmentation',
+                progress: 100,
+                total: audio.length
+            }
+        });
+
+        // Start diarization
+        self.postMessage({
+            status: 'transcribe-progress',
+            data: {
+                task: 'diarization',
+                progress: 0,
+                total: audio.length
+            }
+        });
+
+        const segments = processor.post_process_speaker_diarization(logits, audio.length)[0];
+
+        // Attach labels and report diarization completion
+        for (const segment of segments) {
+            segment.label = model.config.id2label[segment.id];
+        }
+
+        self.postMessage({
+            status: 'transcribe-progress',
+            data: {
+                task: 'diarization',
+                progress: 100,
+                total: audio.length
+            }
+        });
+
+        return segments;
+    } catch (error) {
+        console.error('Segmentation error:', error);
+        return [{
+            id: 0,
+            start: 0,
+            end: audio.length / 16_000, // fall back to a single segment spanning the full audio (16 kHz samples)
+            label: 'SPEAKER_00',
+            confidence: 1.0
+        }];
+    }
+}
+
+async function run({ audio, language }) {
+    try {
+        const [transcriber, segmentation_processor, segmentation_model] = await PipelineSingeton.getInstance();
+
+        const audioLengthSeconds = (audio.length / 16000);
+
+        // Initialize transcription progress
+        self.postMessage({
+            status: 'transcribe-progress',
+            data: {
+                task: 'transcription',
+                progress: 0,
+                total: audio.length
+            }
+        });
+
+        const start = performance.now();
+        // Process the audio in 90-second chunks so progress can be reported between chunks
+        const CHUNK_SIZE = 3 * 30 * 16000; // 3 × 30 seconds × 16,000 samples/second
+        const numChunks = Math.ceil(audio.length / CHUNK_SIZE);
+        let transcriptResults = [];
+
+        for (let i = 0; i < numChunks; i++) {
+            const start = i * CHUNK_SIZE;
+            const end = Math.min((i + 1) * CHUNK_SIZE, audio.length);
+            const chunk = audio.slice(start, end);
+
+            // Process chunk
+            const chunkResult = await transcriber(chunk, {
+                language,
+                return_timestamps: 'word',
+                chunk_length_s: 30,
+            });
+            const progressMessage = {
+                status: 'transcribe-progress',
+                data: {
+                    task: 'transcription',
+                    progress: Math.round((i + 1) / numChunks * 100),
+                    total: audio.length
+                }
+            };
+            self.postMessage(progressMessage);
+
+            // Adjust timestamps for this chunk
+            if (chunkResult.chunks) {
+                chunkResult.chunks.forEach(chunk => {
+                    if (chunk.timestamp) {
+                        chunk.timestamp[0] += start / 16000; // Convert samples to seconds
+                        chunk.timestamp[1] += start / 16000;
+                    }
+                });
+            }
+
+            transcriptResults.push(chunkResult);
+        }
+
+        // Combine results
+        const transcript = {
+            text: transcriptResults.map(r => r.text).join(''),
+            chunks: transcriptResults.flatMap(r => r.chunks || [])
+        };
+
+        // Run speaker segmentation over the full audio
+        const segments = await segment(segmentation_processor, segmentation_model, audio);
+
+        // Ensure transcription shows as complete
+        self.postMessage({
+            status: 'transcribe-progress',
+            data: {
+                task: 'transcription',
+                progress: 100,
+                total: audio.length
+            }
+        });
+        const end = performance.now();
+
+        const completeMessage = {
+            status: 'complete',
+            result: { transcript, segments },
+            audio_length: audioLengthSeconds,
+            time: end - start
+        };
+        self.postMessage(completeMessage);
+    } catch (error) {
+        console.error('Processing error:', error);
+        const errorMessage = {
+            status: 'error',
+            error: error.message || 'Failed to process audio'
+        };
+        console.log('Worker sending error:', errorMessage);
+        self.postMessage(errorMessage);
+    }
+}
+
+// Listen for messages from the main thread
+self.addEventListener('message', async (e) => {
+    const { type, data } = e.data;
+
+    switch (type) {
+        case 'load':
+            load(data);
+            break;
+
+        case 'run':
+            run(data);
+            break;
+    }
+});
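For reference, a condensed sketch of the main-thread side of the protocol implemented above. It is illustrative only; the real handlers live in App.jsx and MediaInput.jsx:

// Illustration of the worker protocol; see App.jsx for the actual implementation.
const worker = new Worker(new URL('./worker.js', import.meta.url), { type: 'module' });

worker.addEventListener('message', (e) => {
    // Statuses posted back by worker.js: 'loading', 'initiate', 'progress', 'done',
    // 'loaded', 'transcribe-progress', 'complete', 'error'.
    if (e.data.status === 'complete') {
        console.log(e.data.result.transcript, e.data.result.segments, e.data.time);
    }
});

worker.postMessage({ type: 'load', data: { device: 'webgpu' } }); // or 'wasm'

// `audio` is a Float32Array of 16 kHz mono samples (as produced by MediaInput.jsx);
// a one-second buffer of silence is used here purely as a placeholder.
const audio = new Float32Array(16_000);
worker.postMessage({ type: 'run', data: { audio, language: 'en' } });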