Commit 405a395 by machineuser · 1 parent: a899be4

Sync widgets demo

packages/tasks/src/automatic-speech-recognition/about.md CHANGED
@@ -25,7 +25,7 @@ import json
 import requests
 
 headers = {"Authorization": f"Bearer {API_TOKEN}"}
-API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v2"
+API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
 
 def query(filename):
     with open(filename, "rb") as f:
@@ -63,7 +63,7 @@ await inference.automaticSpeechRecognition({
 
 ## Solving ASR for your own data
 
-We have some great news! You can fine-tune (transfer learning) a foundational speech model on a specific language without tonnes of data. Pretrained models such as Whisper, Wav2Vec2-MMS and HuBERT exist. [OpenAI's Whisper model](https://huggingface.co/openai/whisper-large-v2) is a large multilingual model trained on 100+ languages and with 680K hours of speech.
+We have some great news! You can fine-tune (transfer learning) a foundational speech model on a specific language without tonnes of data. Pretrained models such as Whisper, Wav2Vec2-MMS and HuBERT exist. [OpenAI's Whisper model](https://huggingface.co/openai/whisper-large-v3) is a large multilingual model trained on 100+ languages and with 4 Million hours of speech.
 
 The following detailed [blog post](https://huggingface.co/blog/fine-tune-whisper) shows how to fine-tune a pre-trained Whisper checkpoint on labeled data for ASR. With the right data and strategy you can fine-tune a high-performant model on a free Google Colab instance too. We suggest to read the blog post for more info!
 
@@ -75,6 +75,7 @@ These events help democratize ASR for all languages, including low-resource lang
 
 ## Useful Resources
 
+- [Hugging Face Audio Course](https://huggingface.co/learn/audio-course/chapter5/introduction)
 - [Fine-tuning MetaAI's MMS Adapter Models for Multi-Lingual ASR](https://huggingface.co/blog/mms_adapters)
 - [Making automatic speech recognition work on large files with Wav2Vec2 in 🤗 Transformers](https://huggingface.co/blog/asr-chunking)
 - [Boosting Wav2Vec2 with n-grams in 🤗 Transformers](https://huggingface.co/blog/wav2vec2-with-ngram)
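For reference, the context lines in the first hunk come from the Python snippet in about.md. Below is a minimal sketch of what that helper looks like after this commit, assuming the standard serverless Inference API pattern; `API_TOKEN` and the audio filename are placeholders, not part of the diff:

```python
import json

import requests

API_TOKEN = "hf_xxx"  # placeholder: your Hugging Face access token

headers = {"Authorization": f"Bearer {API_TOKEN}"}
# Endpoint updated by this commit: whisper-large-v2 -> whisper-large-v3
API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"


def query(filename):
    # Send the raw audio bytes to the Inference API and return the parsed JSON,
    # e.g. {"text": "..."} for ASR models.
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data)
    return json.loads(response.content.decode("utf-8"))


# Example usage with a local audio file (placeholder path):
# print(query("sample1.flac"))
```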
packages/tasks/src/automatic-speech-recognition/data.ts CHANGED
@@ -44,7 +44,7 @@ const taskData: TaskDataCustom = {
   models: [
     {
       description: "A powerful ASR model by OpenAI.",
-      id: "openai/whisper-large-v2",
+      id: "openai/whisper-large-v3",
     },
     {
       description: "A good generic ASR model by MetaAI.",
@@ -58,20 +58,20 @@ const taskData: TaskDataCustom = {
   spaces: [
     {
       description: "A powerful general-purpose speech recognition application.",
-      id: "openai/whisper",
+      id: "hf-audio/whisper-large-v3",
     },
     {
       description: "Fastest speech recognition application.",
       id: "sanchit-gandhi/whisper-jax",
     },
     {
-      description: "An application that transcribes speeches in YouTube videos.",
-      id: "jeffistyping/Youtube-Whisperer",
+      description: "A high quality speech and text translation model by Meta.",
+      id: "facebook/seamless_m4t",
     },
   ],
   summary:
     "Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.",
-  widgetModels: ["openai/whisper-large-v2"],
+  widgetModels: ["openai/whisper-large-v3"],
   youtubeId: "TksaY_FDgnk",
 };
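With `widgetModels` now pointing at `openai/whisper-large-v3`, the same checkpoint the widget uses can also be called programmatically. A minimal sketch, assuming a recent `huggingface_hub` release with `InferenceClient` and a configured access token; the audio path is a placeholder:

```python
from huggingface_hub import InferenceClient

# Assumes a valid token is configured (e.g. via `huggingface-cli login`).
client = InferenceClient(model="openai/whisper-large-v3")

# "sample1.flac" is a placeholder path to a local audio file.
result = client.automatic_speech_recognition("sample1.flac")

# The transcription: depending on the library version this is a plain string
# or an output object exposing a `.text` field.
print(result)
```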