import { InferenceOutputError } from "../../lib/InferenceOutputError";
import type { BaseArgs, Options, RequestArgs } from "../../types";
import { request } from "../custom/request";
import { base64FromBytes } from "../../../../shared";

export type VisualQuestionAnsweringArgs = BaseArgs & {
	inputs: {
		/**
		 * Raw image
		 *
		 * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...')).blob()`
		 */
		image: Blob | ArrayBuffer;
		/**
		 * The question to answer about the image.
		 */
		question: string;
	};
};

export interface VisualQuestionAnsweringOutput {
	/**
	 * A string that’s the answer to a visual question.
	 */
	answer: string;
	/**
	 * Answer correctness score.
	 */
	score: number;
}

/**
 * Answers a question on an image. Recommended model: dandelin/vilt-b32-finetuned-vqa.
 *
 * The image is serialized to a base64 string before the request is sent, and only
 * the first element of the response array is returned.
 *
 * @param args - Base request arguments plus `inputs.image` (a `Blob` or `ArrayBuffer`) and `inputs.question`.
 * @param options - Optional request options; forwarded to `request` together with the
 *   `"visual-question-answering"` task hint.
 * @returns The first answer of the response, with its score.
 * @throws InferenceOutputError if the first response element is missing, or if its
 *   `answer` is not a string or its `score` is not a number.
 */
export async function visualQuestionAnswering(
	args: VisualQuestionAnsweringArgs,
	options?: Options
): Promise<VisualQuestionAnsweringOutput> {
	const { image, question } = args.inputs;

	// Normalize both accepted input kinds to an ArrayBuffer; a Blob has to be read asynchronously.
	const imageBuffer = image instanceof ArrayBuffer ? image : await image.arrayBuffer();

	const reqArgs: RequestArgs = {
		...args,
		inputs: {
			question,
			// convert Blob or ArrayBuffer to base64
			image: base64FromBytes(new Uint8Array(imageBuffer)),
		},
	} as RequestArgs;

	const response = await request<[VisualQuestionAnsweringOutput]>(reqArgs, {
		...options,
		taskHint: "visual-question-answering",
	});
	// Optional access: the response itself may be nullish at runtime despite the declared tuple type.
	const res = response?.[0];

	const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
	if (!isValidOutput) {
		throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
	}
	return res;
}