File size: 1,906 Bytes
b2ecf7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import type { TaskDataCustom } from "../Types";

const taskData: TaskDataCustom = {
	datasets: [
		{
			description: "18,000 hours of multilingual audio-text dataset in 108 languages.",
			id: "mozilla-foundation/common_voice_13_0",
		},
		{
			description: "An English dataset with 1,000 hours of data.",
			id: "librispeech_asr",
		},
		{
			description: "High quality, multi-speaker audio data and their transcriptions in various languages.",
			id: "openslr",
		},
	],
	demo: {
		inputs: [
			{
				filename: "input.flac",
				type: "audio",
			},
		],
		outputs: [
			{
				/// GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP AUDIENCES I
				label: "Transcript",
				content: "Going along slushy country roads and speaking to damp audiences in...",
				type: "text",
			},
		],
	},
	metrics: [
		{
			description: "",
			id: "wer",
		},
		{
			description: "",
			id: "cer",
		},
	],
	models: [
		{
			description: "A powerful ASR model by OpenAI.",
			id: "openai/whisper-large-v2",
		},
		{
			description: "A good generic ASR model by MetaAI.",
			id: "facebook/wav2vec2-base-960h",
		},
		{
			description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.",
			id: "facebook/s2t-small-mustc-en-fr-st",
		},
	],
	spaces: [
		{
			description: "A powerful general-purpose speech recognition application.",
			id: "openai/whisper",
		},
		{
			description: "Fastest speech recognition application.",
			id: "sanchit-gandhi/whisper-jax",
		},
		{
			description: "An application that transcribes speeches in YouTube videos.",
			id: "jeffistyping/Youtube-Whisperer",
		},
	],
	summary:
		"Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.",
	widgetModels: ["openai/whisper-large-v2"],
	youtubeId: "TksaY_FDgnk",
};

export default taskData;