Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
machineuser
committed on
Commit
·
405a395
1
Parent(s):
a899be4
Sync widgets demo
Browse files
packages/tasks/src/automatic-speech-recognition/about.md
CHANGED
@@ -25,7 +25,7 @@ import json
|
|
25 |
import requests
|
26 |
|
27 |
headers = {"Authorization": f"Bearer {API_TOKEN}"}
|
28 |
-
API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-
|
29 |
|
30 |
def query(filename):
|
31 |
with open(filename, "rb") as f:
|
@@ -63,7 +63,7 @@ await inference.automaticSpeechRecognition({
|
|
63 |
|
64 |
## Solving ASR for your own data
|
65 |
|
66 |
-
We have some great news! You can fine-tune (transfer learning) a foundational speech model on a specific language without tonnes of data. Pretrained models such as Whisper, Wav2Vec2-MMS and HuBERT exist. [OpenAI's Whisper model](https://huggingface.co/openai/whisper-large-
|
67 |
|
68 |
The following detailed [blog post](https://huggingface.co/blog/fine-tune-whisper) shows how to fine-tune a pre-trained Whisper checkpoint on labeled data for ASR. With the right data and strategy you can fine-tune a high-performant model on a free Google Colab instance too. We suggest to read the blog post for more info!
|
69 |
|
@@ -75,6 +75,7 @@ These events help democratize ASR for all languages, including low-resource lang
|
|
75 |
|
76 |
## Useful Resources
|
77 |
|
|
|
78 |
- [Fine-tuning MetaAI's MMS Adapter Models for Multi-Lingual ASR](https://huggingface.co/blog/mms_adapters)
|
79 |
- [Making automatic speech recognition work on large files with Wav2Vec2 in 🤗 Transformers](https://huggingface.co/blog/asr-chunking)
|
80 |
- [Boosting Wav2Vec2 with n-grams in 🤗 Transformers](https://huggingface.co/blog/wav2vec2-with-ngram)
|
|
|
25 |
import requests
|
26 |
|
27 |
headers = {"Authorization": f"Bearer {API_TOKEN}"}
|
28 |
+
API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
|
29 |
|
30 |
def query(filename):
|
31 |
with open(filename, "rb") as f:
|
|
|
63 |
|
64 |
## Solving ASR for your own data
|
65 |
|
66 |
+
We have some great news! You can fine-tune (transfer learning) a foundational speech model on a specific language without tonnes of data. Pretrained models such as Whisper, Wav2Vec2-MMS and HuBERT exist. [OpenAI's Whisper model](https://huggingface.co/openai/whisper-large-v3) is a large multilingual model trained on 100+ languages and with 4 Million hours of speech.
|
67 |
|
68 |
The following detailed [blog post](https://huggingface.co/blog/fine-tune-whisper) shows how to fine-tune a pre-trained Whisper checkpoint on labeled data for ASR. With the right data and strategy you can fine-tune a high-performant model on a free Google Colab instance too. We suggest to read the blog post for more info!
|
69 |
|
|
|
75 |
|
76 |
## Useful Resources
|
77 |
|
78 |
+
- [Hugging Face Audio Course](https://huggingface.co/learn/audio-course/chapter5/introduction)
|
79 |
- [Fine-tuning MetaAI's MMS Adapter Models for Multi-Lingual ASR](https://huggingface.co/blog/mms_adapters)
|
80 |
- [Making automatic speech recognition work on large files with Wav2Vec2 in 🤗 Transformers](https://huggingface.co/blog/asr-chunking)
|
81 |
- [Boosting Wav2Vec2 with n-grams in 🤗 Transformers](https://huggingface.co/blog/wav2vec2-with-ngram)
|
packages/tasks/src/automatic-speech-recognition/data.ts
CHANGED
@@ -44,7 +44,7 @@ const taskData: TaskDataCustom = {
|
|
44 |
models: [
|
45 |
{
|
46 |
description: "A powerful ASR model by OpenAI.",
|
47 |
-
id: "openai/whisper-large-
|
48 |
},
|
49 |
{
|
50 |
description: "A good generic ASR model by MetaAI.",
|
@@ -58,20 +58,20 @@ const taskData: TaskDataCustom = {
|
|
58 |
spaces: [
|
59 |
{
|
60 |
description: "A powerful general-purpose speech recognition application.",
|
61 |
-
id: "
|
62 |
},
|
63 |
{
|
64 |
description: "Fastest speech recognition application.",
|
65 |
id: "sanchit-gandhi/whisper-jax",
|
66 |
},
|
67 |
{
|
68 |
-
description: "
|
69 |
-
id: "
|
70 |
},
|
71 |
],
|
72 |
summary:
|
73 |
"Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.",
|
74 |
-
widgetModels: ["openai/whisper-large-
|
75 |
youtubeId: "TksaY_FDgnk",
|
76 |
};
|
77 |
|
|
|
44 |
models: [
|
45 |
{
|
46 |
description: "A powerful ASR model by OpenAI.",
|
47 |
+
id: "openai/whisper-large-v3",
|
48 |
},
|
49 |
{
|
50 |
description: "A good generic ASR model by MetaAI.",
|
|
|
58 |
spaces: [
|
59 |
{
|
60 |
description: "A powerful general-purpose speech recognition application.",
|
61 |
+
id: "hf-audio/whisper-large-v3",
|
62 |
},
|
63 |
{
|
64 |
description: "Fastest speech recognition application.",
|
65 |
id: "sanchit-gandhi/whisper-jax",
|
66 |
},
|
67 |
{
|
68 |
+
description: "A high quality speech and text translation model by Meta.",
|
69 |
+
id: "facebook/seamless_m4t",
|
70 |
},
|
71 |
],
|
72 |
summary:
|
73 |
"Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.",
|
74 |
+
widgetModels: ["openai/whisper-large-v3"],
|
75 |
youtubeId: "TksaY_FDgnk",
|
76 |
};
|
77 |
|