Skip to content
OpenAI logo

whisper-large-v3-turbo

Automatic Speech Recognition • OpenAI • Hosted

Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation.

Model Info
Batch: Yes
Unit Pricing: $0.00051 per audio minute

Usage

import { Buffer } from 'node:buffer';
/** Shape of the `env` argument received by the Worker's `fetch` handler. */
export interface Env {
// Binding on which the handler calls `run(...)` to invoke the Whisper model.
AI: Ai;
}
/**
 * Location of the sample MP3 that the handler transcribes.
 * Deliberately not named `URL`, which would shadow the global `URL` constructor.
 */
const AUDIO_URL = "https://pub-dbcf9f0bd3af47ca9d40971179ee62de.r2.dev/02f6edc0-1f7b-4272-bd17-f05335104725/audio.mp3";

export default {
  /**
   * Downloads the sample MP3, base64-encodes it, and runs it through
   * `@cf/openai/whisper-large-v3-turbo` via the `AI` binding.
   *
   * @returns The model result as JSON on success; a JSON `{ error }` body with
   *   status 502 when the audio download fails, or status 500 on any other
   *   failure.
   */
  async fetch(request, env, ctx): Promise<Response> {
    try {
      // Kept inside the try block so a network-level failure is reported as a
      // JSON error instead of escaping as an unhandled exception.
      const audioResponse = await fetch(AUDIO_URL);
      if (!audioResponse.ok) {
        return Response.json(
          { error: `Failed to fetch MP3: ${audioResponse.status}` },
          { status: 502 },
        );
      }

      // Buffer.from(ArrayBuffer) takes an optional byte offset, not an
      // encoding, so no second argument is passed here.
      const base64 = Buffer.from(await audioResponse.arrayBuffer()).toString("base64");

      const res = await env.AI.run("@cf/openai/whisper-large-v3-turbo", {
        audio: base64,
        // Specify the language using an ISO 639-1 code.
        // Examples: "en" (English), "es" (Spanish), "fr" (French)
        // If omitted, the model will auto-detect the language.
        language: "en",
      });
      return Response.json(res);
    } catch (e: unknown) {
      console.error(e);
      return Response.json({ error: "An unexpected error occurred" }, { status: 500 });
    }
  },
} satisfies ExportedHandler<Env>;

Parameters

task
string, default: transcribe — Supported tasks are 'translate' or 'transcribe'.
language
string — The language of the audio being transcribed or translated.
vad_filter
boolean, default: false — Preprocess the audio with a voice activity detection model.
initial_prompt
string — A text prompt to help provide context to the model on the contents of the audio.
prefix
string — A prefix prepended to the transcription output, which can guide the transcription result.
beam_size
integer, default: 5 — The number of beams to use in beam search decoding. Higher values may improve accuracy at the cost of speed.
condition_on_previous_text
boolean, default: true — Whether to condition on previous text during transcription. Setting to false may help prevent hallucination loops.
no_speech_threshold
number, default: 0.6 — Threshold for detecting no-speech segments. Segments with no-speech probability above this value are skipped.
compression_ratio_threshold
number, default: 2.4 — Threshold for filtering out segments with a high compression ratio, which often indicates repetitive or hallucinated text.
log_prob_threshold
number, default: -1 — Threshold for filtering out segments with low average log probability, indicating low confidence.
hallucination_silence_threshold
number — Optional threshold (in seconds) to skip silent periods that may cause hallucinations.

API Schemas

{
"type": "object",
"properties": {
"audio": {
"anyOf": [
{
"type": "string",
"description": "Base64 encoded value of the audio data."
},
{
"type": "object",
"properties": {
"body": {
"type": "object"
},
"contentType": {
"type": "string"
}
}
}
]
},
"task": {
"type": "string",
"default": "transcribe",
"description": "Supported tasks are 'translate' or 'transcribe'."
},
"language": {
"type": "string",
"description": "The language of the audio being transcribed or translated."
},
"vad_filter": {
"type": "boolean",
"default": false,
"description": "Preprocess the audio with a voice activity detection model."
},
"initial_prompt": {
"type": "string",
"description": "A text prompt to help provide context to the model on the contents of the audio."
},
"prefix": {
"type": "string",
"description": "The prefix appended to the beginning of the output of the transcription and can guide the transcription result."
},
"beam_size": {
"type": "integer",
"default": 5,
"description": "The number of beams to use in beam search decoding. Higher values may improve accuracy at the cost of speed."
},
"condition_on_previous_text": {
"type": "boolean",
"default": true,
"description": "Whether to condition on previous text during transcription. Setting to false may help prevent hallucination loops."
},
"no_speech_threshold": {
"type": "number",
"default": 0.6,
"description": "Threshold for detecting no-speech segments. Segments with no-speech probability above this value are skipped."
},
"compression_ratio_threshold": {
"type": "number",
"default": 2.4,
"description": "Threshold for filtering out segments with high compression ratio, which often indicate repetitive or hallucinated text."
},
"log_prob_threshold": {
"type": "number",
"default": -1,
"description": "Threshold for filtering out segments with low average log probability, indicating low confidence."
},
"hallucination_silence_threshold": {
"type": "number",
"description": "Optional threshold (in seconds) to skip silent periods that may cause hallucinations."
}
},
"required": [
"audio"
]
}