import type { TaskDataCustom } from "..";

const taskData: TaskDataCustom = {
	datasets: [
		{
			description: "Dataset of 12 million image-text pairs collected from Reddit.",
			id: "red_caps",
		},
		{
			description: "Dataset of 3.3 million image-caption pairs collected by Google.",
			id: "datasets/conceptual_captions",
		},
	],
	demo: {
		inputs: [
			{
				filename: "savanna.jpg",
				type: "img",
			},
		],
		outputs: [
			{
				label: "Detailed description",
				content: "a herd of giraffes and zebras grazing in a field",
				type: "text",
			},
		],
	},
	metrics: [],
	models: [
		{
			description: "A robust image captioning model.",
			id: "Salesforce/blip2-opt-2.7b",
		},
		{
			description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
			id: "microsoft/kosmos-2-patch14-224",
		},
		{
			description: "A strong optical character recognition model.",
			id: "facebook/nougat-base",
		},
		{
			description: "A powerful model that lets you have a conversation about a given image.",
			id: "llava-hf/llava-1.5-7b-hf",
		},
	],
	spaces: [
		{
			description: "An application that compares various image captioning models.",
			id: "nielsr/comparing-captioning-models",
		},
		{
			description: "A robust image captioning application.",
			id: "flax-community/image-captioning",
		},
		{
			description: "An application that transcribes handwriting into text.",
			id: "nielsr/TrOCR-handwritten",
		},
		{
			description: "An application that can caption images and answer questions about a given image.",
			id: "Salesforce/BLIP",
		},
		{
			description: "An application that can caption images and answer questions with a conversational agent.",
			id: "Salesforce/BLIP2",
		},
		{
			description: "An image captioning application that demonstrates the effect of noise on captions.",
			id: "johko/capdec-image-captioning",
		},
	],
	summary:
		"Image-to-text models output text from a given image. Image captioning and optical character recognition are among the most common applications of image-to-text.",
	widgetModels: ["Salesforce/blip-image-captioning-base"],
	youtubeId: "",
};

export default taskData;
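
// A minimal sketch of calling the widget model listed above through the
// Hugging Face Inference API. The `@huggingface/inference` client, the access
// token, and the image URL are assumptions for illustration only; they are not
// part of this data file.
//
//   import { HfInference } from "@huggingface/inference";
//
//   const hf = new HfInference("hf_xxx"); // hypothetical token
//   const result = await hf.imageToText({
//     model: "Salesforce/blip-image-captioning-base",
//     data: await (await fetch("https://example.com/savanna.jpg")).blob(), // hypothetical image
//   });
//   console.log(result.generated_text);
//   // e.g. "a herd of giraffes and zebras grazing in a field"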