/**
 * Inference code generated from the JSON schema spec in ./spec
 *
 * Using src/scripts/inference-codegen
 */

/**
 * Inputs for Automatic Speech Recognition inference
 */
export interface AutomaticSpeechRecognitionInput {
    /**
     * The input audio data
     */
    inputs: unknown;
    /**
     * Additional inference parameters
     */
    parameters?: AutomaticSpeechRecognitionParameters;
    [property: string]: unknown;
}

/**
 * Additional inference parameters
 *
 * Additional inference parameters for Automatic Speech Recognition
 */
export interface AutomaticSpeechRecognitionParameters {
    /**
     * Parametrization of the text generation process
     */
    generate?: GenerationParameters;
    /**
     * Whether to output corresponding timestamps with the generated text
     */
    return_timestamps?: boolean;
    [property: string]: unknown;
}

/**
 * Parametrization of the text generation process
 *
 * Ad-hoc parametrization of the text generation process
 */
export interface GenerationParameters {
    /**
     * Whether to use sampling instead of greedy decoding when generating new tokens.
     */
    do_sample?: boolean;
    /**
     * Controls the stopping condition for beam-based methods.
     */
    early_stopping?: EarlyStoppingUnion;
    /**
     * If set to a float strictly between 0 and 1, only tokens with a conditional probability
     * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
     * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
     * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
     */
    epsilon_cutoff?: number;
    /**
     * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to a
     * float strictly between 0 and 1, a token is only considered if it is greater than either
     * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
     * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
     * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
     * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
     * for more details.
     */
    eta_cutoff?: number;
    /**
     * The maximum length (in tokens) of the generated text, including the input.
     */
    max_length?: number;
    /**
     * The maximum number of tokens to generate. Takes precedence over max_length.
     */
    max_new_tokens?: number;
    /**
     * The minimum length (in tokens) of the generated text, including the input.
     */
    min_length?: number;
    /**
     * The minimum number of tokens to generate. Takes precedence over min_length.
     */
    min_new_tokens?: number;
    /**
     * Number of groups to divide num_beams into in order to ensure diversity among different
     * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
     */
    num_beam_groups?: number;
    /**
     * Number of beams to use for beam search.
     */
    num_beams?: number;
    /**
     * The value balances the model confidence and the degeneration penalty in contrastive
     * search decoding.
     */
    penalty_alpha?: number;
    /**
     * The value used to modulate the next token probabilities.
     */
    temperature?: number;
    /**
     * The number of highest probability vocabulary tokens to keep for top-k filtering.
     */
    top_k?: number;
    /**
     * If set to a float < 1, only the smallest set of most probable tokens with probabilities
     * that add up to top_p or higher are kept for generation.
     */
    top_p?: number;
    /**
     * Local typicality measures how similar the conditional probability of predicting a target
     * token next is to the expected conditional probability of predicting a random token next,
     * given the partial text already generated. If set to a float < 1, the smallest set of the
     * most locally typical tokens with probabilities that add up to typical_p or higher are
     * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
     */
    typical_p?: number;
    /**
     * Whether the model should use the past key/values attentions to speed up decoding.
     */
    use_cache?: boolean;
    [property: string]: unknown;
}

/**
 * Controls the stopping condition for beam-based methods.
 */
export type EarlyStoppingUnion = boolean | "never";

/**
 * Outputs of inference for the Automatic Speech Recognition task
 */
export interface AutomaticSpeechRecognitionOutput {
    /**
     * When return_timestamps is enabled, chunks contains a list of audio chunks identified by
     * the model.
     */
    chunks?: AutomaticSpeechRecognitionOutputChunk[];
    /**
     * The recognized text.
     */
    text: string;
    [property: string]: unknown;
}

export interface AutomaticSpeechRecognitionOutputChunk {
    /**
     * A chunk of text identified by the model
     */
    text: string;
    /**
     * The start and end timestamps corresponding to the text
     */
    timestamps: number[];
    [property: string]: unknown;
}
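
/*
 * Usage sketch (not part of the generated spec): a minimal example of how the
 * request and response types above compose. `transcribe` is a hypothetical
 * client function standing in for whatever actually performs the inference
 * call; only the types themselves come from the schema.
 */
declare function transcribe(
    input: AutomaticSpeechRecognitionInput
): Promise<AutomaticSpeechRecognitionOutput>;

async function example(audio: ArrayBuffer): Promise<void> {
    const input: AutomaticSpeechRecognitionInput = {
        // The schema leaves the audio payload type open (`unknown`);
        // raw bytes are assumed here.
        inputs: audio,
        parameters: {
            return_timestamps: true, // ask for per-chunk start/end times
            generate: {
                num_beams: 4, // beam search over 4 candidates
                early_stopping: true, // stop once num_beams candidates are complete
                max_new_tokens: 256, // caps generated tokens; takes precedence over max_length
            },
        },
    };

    const output = await transcribe(input);
    console.log(output.text); // `text` is always present
    // `chunks` is only populated when return_timestamps was enabled.
    for (const chunk of output.chunks ?? []) {
        // Start and end of the chunk (units depend on the serving model, typically seconds).
        const [start, end] = chunk.timestamps;
        console.log(`[${start} - ${end}] ${chunk.text}`);
    }
}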