import type { TaskDataCustom } from "../Types";
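// Task data for the text-to-video task page: example datasets, a demo
// input/output pair, evaluation metrics, recommended models, and Spaces.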
const taskData: TaskDataCustom = {
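// Public datasets commonly used to train and evaluate text-to-video models.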
datasets: [
{
description: "Microsoft Research Video to Text is a large-scale dataset for open domain video captioning",
id: "iejMac/CLIP-MSR-VTT",
},
{
description: "UCF101 Human Actions dataset consists of 13,320 video clips from YouTube, with 101 classes.",
id: "quchenyuan/UCF101-ZIP",
},
{
description: "A high-quality dataset for human action recognition in YouTube videos.",
id: "nateraw/kinetics",
},
{
description: "A dataset of video clips of humans performing pre-defined basic actions with everyday objects.",
id: "HuggingFaceM4/something_something_v2",
},
{
description:
"This dataset consists of text-video pairs and contains noisy samples with irrelevant video descriptions",
id: "HuggingFaceM4/webvid",
},
{
description: "A dataset of short Flickr videos for the temporal localization of events with descriptions.",
id: "iejMac/CLIP-DiDeMo",
},
],
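// Demo rendered on the task page: a text prompt in, a generated clip
// (displayed as a GIF) out.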
demo: {
inputs: [
{
label: "Input",
content: "Darth Vader is surfing on the waves.",
type: "text",
},
],
outputs: [
{
filename: "text-to-video-output.gif",
type: "img",
},
],
},
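// Automatic metrics used to evaluate generated video; a commented Fréchet
// sketch follows this list.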
metrics: [
{
description:
"Inception Score uses an image classification model that predicts class labels and evaluates how distinct and diverse the images are. A higher score indicates better video generation.",
id: "is",
},
{
description:
"Frechet Inception Distance uses an image classification model to obtain image embeddings. The metric compares mean and standard deviation of the embeddings of real and generated images. A smaller score indicates better video generation.",
id: "fid",
},
{
description:
"Frechet Video Distance uses a model that captures coherence for changes in frames and the quality of each frame. A smaller score indicates better video generation.",
id: "fvd",
},
{
description:
"CLIPSIM measures similarity between video frames and text using an image-text similarity model. A higher score indicates better video generation.",
id: "clipsim",
},
],
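// The FID/FVD entries above compute a Fréchet distance between embedding
// distributions. A minimal illustration, reduced to the univariate Gaussian
// case (the real metrics use mean vectors and covariance matrices), kept as a
// comment so this module stays data-only:
//
//   const frechet1d = (m1: number, s1: number, m2: number, s2: number): number =>
//     (m1 - m2) ** 2 + (s1 - s2) ** 2;
//
// Identical distributions give frechet1d(0, 1, 0, 1) === 0, a perfect score.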
models: [
{
description: "A strong model for video generation.",
id: "PAIR/text2video-zero-controlnet-canny-arcane",
},
{
description: "A robust model for text-to-video generation.",
id: "damo-vilab/text-to-video-ms-1.7b",
},
{
description: "A text-to-video generation model with high quality and smooth outputs.",
id: "cerspense/zeroscope_v2_576w",
},
],
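// Hosted Spaces demonstrating text-to-video generation.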
spaces: [
{
description: "An application that generates video from text.",
id: "fffiloni/zeroscope",
},
{
description: "An application that generates video from image and text.",
id: "TempoFunk/makeavid-sd-jax",
},
{
description: "An application that generates videos from text and provides multi-model support.",
id: "ArtGAN/Video-Diffusion-WebUI",
},
],
summary:
"Text-to-video models can be used in any application that requires generating a consistent sequence of images from text.",
widgetModels: [],
youtubeId: undefined,
};
export default taskData;
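// A minimal, hypothetical consumer of this module: taskData is plain data, so
// a page could render the model suggestions as markdown links (Hub URLs follow
// https://huggingface.co/<id>). Kept as a comment to leave the module data-only:
//
//   const modelLinks = taskData.models
//     .map((m) => `- [${m.id}](https://huggingface.co/${m.id}): ${m.description}`)
//     .join("\n");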