Skip to content

uform-gen2-qwen-500m

Beta

Model ID: @cf/unum/uform-gen2-qwen-500m

UForm-Gen is a small generative vision-language model designed primarily for image captioning and visual question answering. The model was pre-trained on an internal image captioning dataset and fine-tuned on public instruction datasets: SVIT, LVIS, and VQA datasets.

Properties

Task Type: Image-to-Text

Code Examples

Workers - TypeScript

/** Environment object received by the Worker's `fetch` handler. */
export interface Env {
  /** AI binding; the handler calls `run` on it to invoke a model. */
  AI: Ai;
}
export default {
  /**
   * Downloads a sample image, asks the `@cf/unum/uform-gen2-qwen-500m` model
   * to caption it, and returns the model's output serialized as JSON.
   *
   * @param request - The incoming request (not inspected by this example).
   * @param env - Environment object providing the `AI` binding.
   * @returns A JSON response containing the model output, or a 502 response
   *   if the sample image could not be downloaded.
   */
  async fetch(request: Request, env: Env): Promise<Response> {
    const res = await fetch("https://cataas.com/cat");
    if (!res.ok) {
      // Do not pass an upstream error body to the model as if it were an image.
      return new Response(
        `Failed to fetch source image (status ${res.status})`,
        { status: 502 }
      );
    }

    const blob = await res.arrayBuffer();
    const input = {
      // The image is sent as an array of byte values.
      image: [...new Uint8Array(blob)],
      prompt: "Generate a caption for this image",
      max_tokens: 512,
    };

    const response = await env.AI.run(
      "@cf/unum/uform-gen2-qwen-500m",
      input
    );

    // Label the body as JSON so clients parse it correctly.
    return new Response(JSON.stringify(response), {
      headers: { "content-type": "application/json" },
    });
  },
} satisfies ExportedHandler<Env>;

Response

{
"description": " This is a photo of a supdog."
}

API Schema

The following schemas are based on JSON Schema.

Input JSON Schema

{
"oneOf": [
{
"type": "string",
"format": "binary"
},
{
"type": "object",
"properties": {
"temperature": {
"type": "number"
},
"prompt": {
"type": "string"
},
"raw": {
"type": "boolean",
"default": false
},
"messages": {
"type": "array",
"items": {
"type": "object",
"properties": {
"role": {
"type": "string"
},
"content": {
"type": "string",
"maxLength": 6144
}
},
"required": [
"role",
"content"
]
}
},
"image": {
"oneOf": [
{
"type": "array",
"items": {
"type": "number"
}
},
{
"type": "string",
"format": "binary"
}
]
},
"max_tokens": {
"type": "integer",
"default": 512
}
},
"required": [
"image"
],
"not": {
"required": [
"prompt",
"messages"
]
},
"errorMessage": {
"not": "\"prompt\" and \"messages\" are mutually exclusive"
}
}
]
}

Output JSON Schema

{
"type": "object",
"contentType": "application/json",
"properties": {
"description": {
"type": "string"
}
}
}