import type { TaskDataCustom } from "../Types";

/**
 * Static metadata describing the text-to-video task.
 *
 * Groups the example datasets, a demo input/output pair, evaluation metrics,
 * models, spaces and a short summary into a single `TaskDataCustom` object,
 * which is this module's default export.
 *
 * NOTE(review): how each field is rendered is decided by the consumer of
 * `TaskDataCustom`, which is not visible from this file.
 */
const taskData: TaskDataCustom = {
  datasets: [
    {
      description: "Microsoft Research Video to Text is a large-scale dataset for open domain video captioning",
      id: "iejMac/CLIP-MSR-VTT",
    },
    {
      description: "UCF101 Human Actions dataset consists of 13,320 video clips from YouTube, with 101 classes.",
      id: "quchenyuan/UCF101-ZIP",
    },
    {
      description: "A high-quality dataset for human action recognition in YouTube videos.",
      id: "nateraw/kinetics",
    },
    {
      description: "A dataset of video clips of humans performing pre-defined basic actions with everyday objects.",
      id: "HuggingFaceM4/something_something_v2",
    },
    {
      description:
        "This dataset consists of text-video pairs and contains noisy samples with irrelevant video descriptions",
      id: "HuggingFaceM4/webvid",
    },
    {
      description: "A dataset of short Flickr videos for the temporal localization of events with descriptions.",
      id: "iejMac/CLIP-DiDeMo",
    },
  ],
  demo: {
    inputs: [
      {
        label: "Input",
        content: "Darth Vader is surfing on the waves.",
        type: "text",
      },
    ],
    outputs: [
      {
        // The demo asset is a GIF file, declared with the "img" output type.
        filename: "text-to-video-output.gif",
        type: "img",
      },
    ],
  },
  metrics: [
    {
      description:
        "Inception Score uses an image classification model that predicts class labels and evaluates how distinct and diverse the images are. A higher score indicates better video generation.",
      id: "is",
    },
    {
      description:
        "Frechet Inception Distance uses an image classification model to obtain image embeddings. The metric compares mean and standard deviation of the embeddings of real and generated images. A smaller score indicates better video generation.",
      id: "fid",
    },
    {
      description:
        "Frechet Video Distance uses a model that captures coherence for changes in frames and the quality of each frame. A smaller score indicates better video generation.",
      id: "fvd",
    },
    {
      // Kept on a single line: a raw line break inside a double-quoted
      // string literal is a syntax error.
      description:
        "CLIPSIM measures similarity between video frames and text using an image-text similarity model. A higher score indicates better video generation.",
      id: "clipsim",
    },
  ],
  models: [
    {
      description: "A strong model for video generation.",
      id: "PAIR/text2video-zero-controlnet-canny-arcane",
    },
    {
      description: "A robust model for text-to-video generation.",
      id: "damo-vilab/text-to-video-ms-1.7b",
    },
    {
      description: "A text-to-video generation model with high quality and smooth outputs.",
      id: "cerspense/zeroscope_v2_576w",
    },
  ],
  spaces: [
    {
      description: "An application that generates video from text.",
      id: "fffiloni/zeroscope",
    },
    {
      description: "An application that generates video from image and text.",
      id: "TempoFunk/makeavid-sd-jax",
    },
    {
      description: "An application that generates videos from text and provides multi-model support.",
      id: "ArtGAN/Video-Diffusion-WebUI",
    },
  ],
  summary:
    "Text-to-video models can be used in any application that requires generating consistent sequence of images from text. ",
  // No widget models are listed for this task.
  widgetModels: [],
  // No YouTube video id is set for this task.
  youtubeId: undefined,
};

export default taskData;