File size: 2,293 Bytes
b2ecf7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import type { TaskDataCustom } from "../Types";

/**
 * Static task data for the image-to-text task.
 *
 * Bundles the example datasets, a demo input/output pair, example models and
 * spaces, a short summary, and the model used by the widget. Every `id` is a
 * repository id as written on the Hub (`name` or `owner/name`); it does not
 * include a route prefix such as `datasets/`.
 */
const taskData: TaskDataCustom = {
	// Example datasets for this task.
	datasets: [
		{
			// TODO write proper description
			description: "Dataset from 12M image-text of Reddit",
			id: "red_caps",
		},
		{
			// TODO write proper description
			description: "Dataset from 3.3M images of Google",
			// Bare dataset id, consistent with "red_caps" above: "datasets/" is a
			// URL route segment, not part of the id.
			id: "conceptual_captions",
		},
	],
	// Demo: one image input and the caption text shown as its output.
	demo: {
		inputs: [
			{
				filename: "savanna.jpg",
				type: "img",
			},
		],
		outputs: [
			{
				label: "Detailed description",
				content: "a herd of giraffes and zebras grazing in a field",
				type: "text",
			},
		],
	},
	// No metrics are listed for this task.
	metrics: [],
	// Example models, covering captioning, OCR and diagram/UI understanding.
	models: [
		{
			description: "A robust image captioning model.",
			id: "Salesforce/blip-image-captioning-large",
		},
		{
			description: "A strong image captioning model.",
			id: "nlpconnect/vit-gpt2-image-captioning",
		},
		{
			description: "A strong optical character recognition model.",
			id: "microsoft/trocr-base-printed",
		},
		{
			description: "A strong visual question answering model for scientific diagrams.",
			id: "google/pix2struct-ai2d-base",
		},
		{
			description: "A strong captioning model for UI components.",
			id: "google/pix2struct-widget-captioning-base",
		},
		{
			description: "A captioning model for images that contain text.",
			id: "google/pix2struct-textcaps-base",
		},
	],
	// Example applications (spaces) demonstrating the task.
	spaces: [
		{
			description: "A robust image captioning application.",
			id: "flax-community/image-captioning",
		},
		{
			description: "An application that transcribes handwritings into text.",
			id: "nielsr/TrOCR-handwritten",
		},
		{
			description: "An application that can caption images and answer questions about a given image.",
			id: "Salesforce/BLIP",
		},
		{
			description: "An application that can caption images and answer questions with a conversational agent.",
			id: "Salesforce/BLIP2",
		},
		{
			description: "An image captioning application that demonstrates the effect of noise on captions.",
			id: "johko/capdec-image-captioning",
		},
	],
	summary:
		"Image to text models output a text from a given image. Image captioning or optical character recognition can be considered as the most common applications of image to text.",
	// Model id(s) used by the widget for this task.
	widgetModels: ["Salesforce/blip-image-captioning-base"],
	// Left empty: no video id is provided for this task.
	youtubeId: "",
};

export default taskData;