Skip to content
t

llama-2-13b-chat-awq Beta

Text Generation • thebloke
@hf/thebloke/llama-2-13b-chat-awq

Llama 2 13B Chat AWQ is an efficient, accurate and blazing-fast low-bit weight quantized Llama 2 variant.

    Playground

    Try out this model with the Workers AI LLM Playground. It requires no setup or authentication and is an instant way to preview and test a model directly in the browser.

    Launch the LLM Playground

    Usage

    Worker - Streaming

    /** Bindings this Worker expects to find on its `env` argument. */
    export interface Env {
      /** Binding through which the handler invokes the model via `run()`. */
      AI: Ai;
    }

    export default {
      /**
       * Sends a fixed system + user conversation to the Llama 2 13B Chat AWQ
       * model with streaming enabled and relays the resulting stream to the
       * client as server-sent events.
       *
       * @param request - Incoming request; not inspected by this example.
       * @param env - Worker bindings; `env.AI` is used to run the model.
       * @returns A response whose body is the model's stream, labelled
       *   `text/event-stream`.
       */
      async fetch(request, env): Promise<Response> {
        const systemMessage = {
          role: "system",
          content: "You are a friendly assistant",
        };
        const userMessage = {
          role: "user",
          content: "What is the origin of the phrase Hello, World",
        };
        const conversation = [systemMessage, userMessage];

        // `stream: true` asks for the output incrementally instead of as one payload.
        const eventStream = await env.AI.run("@hf/thebloke/llama-2-13b-chat-awq", {
          messages: conversation,
          stream: true,
        });

        // Pass the stream through untouched; the content type tells the client it is SSE.
        const sseHeaders = { "content-type": "text/event-stream" };
        return new Response(eventStream, { headers: sseHeaders });
      },
    } satisfies ExportedHandler<Env>;

    Worker

    /** Bindings this Worker expects to find on its `env` argument. */
    export interface Env {
      /** Binding through which the handler invokes the model via `run()`. */
      AI: Ai;
    }

    export default {
      /**
       * Sends a fixed system + user conversation to the Llama 2 13B Chat AWQ
       * model and returns whatever the model call resolves to, serialized as JSON.
       *
       * @param request - Incoming request; not inspected by this example.
       * @param env - Worker bindings; `env.AI` is used to run the model.
       * @returns A JSON response wrapping the result of the model call.
       */
      async fetch(request, env): Promise<Response> {
        const systemMessage = {
          role: "system",
          content: "You are a friendly assistant",
        };
        const userMessage = {
          role: "user",
          content: "What is the origin of the phrase Hello, World",
        };
        const conversation = [systemMessage, userMessage];

        // No `stream` flag here, so the call resolves once with the full result.
        const modelResult = await env.AI.run("@hf/thebloke/llama-2-13b-chat-awq", {
          messages: conversation,
        });

        return Response.json(modelResult);
      },
    } satisfies ExportedHandler<Env>;

    Python

    import os
    import requests
    # Placeholder account ID interpolated into the API URL below; replace with your own.
    ACCOUNT_ID = "your-account-id"
    # The API token is read from the environment rather than hard-coded.
    AUTH_TOKEN = os.environ.get("CLOUDFLARE_AUTH_TOKEN")
    if not AUTH_TOKEN:
        # Fail fast: otherwise the request would carry the literal header "Bearer None".
        raise SystemExit("CLOUDFLARE_AUTH_TOKEN environment variable is not set")
    prompt = "Tell me all about PEP-8"
    # POST a system + user message pair to the model's run endpoint.
    response = requests.post(
        f"https://api.cloudflare.com/client/v4/accounts/{ACCOUNT_ID}/ai/run/@hf/thebloke/llama-2-13b-chat-awq",
        headers={"Authorization": f"Bearer {AUTH_TOKEN}"},
        json={
            "messages": [
                {"role": "system", "content": "You are a friendly assistant"},
                {"role": "user", "content": prompt}
            ]
        },
        # requests has no default timeout; without one a stalled connection blocks forever.
        timeout=60,
    )
    # Decode the JSON body and print it as-is.
    result = response.json()
    print(result)

    curl

    Terminal window
    # POST a system + user message pair to the model's run endpoint.
    # Reads CLOUDFLARE_ACCOUNT_ID and CLOUDFLARE_AUTH_TOKEN from the environment.
    # The URL is quoted so the expanded account ID is never word-split or globbed,
    # and the body is declared as JSON (curl's -d defaults to form encoding).
    curl "https://api.cloudflare.com/client/v4/accounts/${CLOUDFLARE_ACCOUNT_ID}/ai/run/@hf/thebloke/llama-2-13b-chat-awq" \
      -X POST \
      -H "Authorization: Bearer ${CLOUDFLARE_AUTH_TOKEN}" \
      -H "Content-Type: application/json" \
      -d '{ "messages": [{ "role": "system", "content": "You are a friendly assistant" }, { "role": "user", "content": "Why is pizza so good" }]}'

    Parameters

    Input

    • Prompt object

      • prompt string min 1 max 131072

        The input text prompt for the model to generate a response.

      • image one of

        • 0 array

          An array of integers that represent the image data constrained to 8-bit unsigned integer values

          • items number

            A value between 0 and 255

        • 1 string

          Binary string representing the image contents.

      • raw boolean

        If true, a chat template is not applied and you must adhere to the specific model's expected formatting.

      • stream boolean

        If true, the response will be streamed back incrementally using SSE, Server Sent Events.

      • max_tokens integer default 256

        The maximum number of tokens to generate in the response.

      • temperature number default 0.6 min 0 max 5

        Controls the randomness of the output; higher values produce more random results.

      • top_p number min 0 max 2

        Adjusts the creativity of the AI's responses by controlling how many possible words it considers. Lower values make outputs more predictable; higher values allow for more varied and creative responses.

      • top_k integer min 1 max 50

        Limits the AI to choose from the top 'k' most probable words. Lower values make responses more focused; higher values introduce more variety and potential surprises.

      • seed integer min 1 max 9999999999

        Random seed for reproducibility of the generation.

      • repetition_penalty number min 0 max 2

        Penalty for repeated tokens; higher values discourage repetition.

      • frequency_penalty number min 0 max 2

        Decreases the likelihood of the model repeating the same lines verbatim.

      • presence_penalty number min 0 max 2

        Increases the likelihood of the model introducing new topics.

      • lora string

        Name of the LoRA (Low-Rank Adaptation) model to fine-tune the base model.

    • Messages object

      • messages array

        An array of message objects representing the conversation history.

        • items object

          • role string

            The role of the message sender (e.g., 'user', 'assistant', 'system', 'tool').

          • content string max 131072

            The content of the message as a string.

      • image one of

        • 0 array

          An array of integers that represent the image data constrained to 8-bit unsigned integer values

          • items number

            A value between 0 and 255

        • 1 string

          Binary string representing the image contents.

      • functions array

        • items object

          • name string

          • code string

      • tools array

        A list of tools available for the assistant to use.

        • items one of

          • 0 object

            • name string

              The name of the tool. The more descriptive, the better.

            • description string

              A brief description of what the tool does.

            • parameters object

              Schema defining the parameters accepted by the tool.

              • type string

                The type of the parameters object (usually 'object').

              • required array

                List of required parameter names.

                • items string

              • properties object

                Definitions of each parameter.

                • additionalProperties object

                  • type string

                    The data type of the parameter.

                  • description string

                    A description of the expected parameter.

          • 1 object

            • type string

              Specifies the type of tool (e.g., 'function').

            • function object

              Details of the function tool.

              • name string

                The name of the function.

              • description string

                A brief description of what the function does.

              • parameters object

                Schema defining the parameters accepted by the function.

                • type string

                  The type of the parameters object (usually 'object').

                • required array

                  List of required parameter names.

                  • items string

                • properties object

                  Definitions of each parameter.

                  • additionalProperties object

                    • type string

                      The data type of the parameter.

                    • description string

                      A description of the expected parameter.

      • stream boolean

        If true, the response will be streamed back incrementally.

      • max_tokens integer default 256

        The maximum number of tokens to generate in the response.

      • temperature number default 0.6 min 0 max 5

        Controls the randomness of the output; higher values produce more random results.

      • top_p number min 0 max 2

        Controls the creativity of the AI's responses by adjusting how many possible words it considers. Lower values make outputs more predictable; higher values allow for more varied and creative responses.

      • top_k integer min 1 max 50

        Limits the AI to choose from the top 'k' most probable words. Lower values make responses more focused; higher values introduce more variety and potential surprises.

      • seed integer min 1 max 9999999999

        Random seed for reproducibility of the generation.

      • repetition_penalty number min 0 max 2

        Penalty for repeated tokens; higher values discourage repetition.

      • frequency_penalty number min 0 max 2

        Decreases the likelihood of the model repeating the same lines verbatim.

      • presence_penalty number min 0 max 2

        Increases the likelihood of the model introducing new topics.

    Output

    • 0 object

      • response string

        The generated text response from the model

      • tool_calls array

        An array of tool call requests made during response generation

        • items object

          • arguments object

            The arguments to be passed to the tool call request

          • name string

            The name of the tool to be called

    • 1 string

    API Schemas

    The following schemas are based on JSON Schema

    {
    "type": "object",
    "oneOf": [
    {
    "title": "Prompt",
    "properties": {
    "prompt": {
    "type": "string",
    "minLength": 1,
    "maxLength": 131072,
    "description": "The input text prompt for the model to generate a response."
    },
    "image": {
    "oneOf": [
    {
    "type": "array",
    "description": "An array of integers that represent the image data constrained to 8-bit unsigned integer values",
    "items": {
    "type": "number",
    "description": "A value between 0 and 255"
    }
    },
    {
    "type": "string",
    "format": "binary",
    "description": "Binary string representing the image contents."
    }
    ]
    },
    "raw": {
    "type": "boolean",
    "default": false,
    "description": "If true, a chat template is not applied and you must adhere to the specific model's expected formatting."
    },
    "stream": {
    "type": "boolean",
    "default": false,
    "description": "If true, the response will be streamed back incrementally using SSE, Server Sent Events."
    },
    "max_tokens": {
    "type": "integer",
    "default": 256,
    "description": "The maximum number of tokens to generate in the response."
    },
    "temperature": {
    "type": "number",
    "default": 0.6,
    "minimum": 0,
    "maximum": 5,
    "description": "Controls the randomness of the output; higher values produce more random results."
    },
    "top_p": {
    "type": "number",
    "minimum": 0,
    "maximum": 2,
    "description": "Adjusts the creativity of the AI's responses by controlling how many possible words it considers. Lower values make outputs more predictable; higher values allow for more varied and creative responses."
    },
    "top_k": {
    "type": "integer",
    "minimum": 1,
    "maximum": 50,
    "description": "Limits the AI to choose from the top 'k' most probable words. Lower values make responses more focused; higher values introduce more variety and potential surprises."
    },
    "seed": {
    "type": "integer",
    "minimum": 1,
    "maximum": 9999999999,
    "description": "Random seed for reproducibility of the generation."
    },
    "repetition_penalty": {
    "type": "number",
    "minimum": 0,
    "maximum": 2,
    "description": "Penalty for repeated tokens; higher values discourage repetition."
    },
    "frequency_penalty": {
    "type": "number",
    "minimum": 0,
    "maximum": 2,
    "description": "Decreases the likelihood of the model repeating the same lines verbatim."
    },
    "presence_penalty": {
    "type": "number",
    "minimum": 0,
    "maximum": 2,
    "description": "Increases the likelihood of the model introducing new topics."
    },
    "lora": {
    "type": "string",
    "description": "Name of the LoRA (Low-Rank Adaptation) model to fine-tune the base model."
    }
    },
    "required": [
    "prompt"
    ]
    },
    {
    "title": "Messages",
    "properties": {
    "messages": {
    "type": "array",
    "description": "An array of message objects representing the conversation history.",
    "items": {
    "type": "object",
    "properties": {
    "role": {
    "type": "string",
    "description": "The role of the message sender (e.g., 'user', 'assistant', 'system', 'tool')."
    },
    "content": {
    "type": "string",
    "maxLength": 131072,
    "description": "The content of the message as a string."
    }
    },
    "required": [
    "role",
    "content"
    ]
    }
    },
    "image": {
    "oneOf": [
    {
    "type": "array",
    "description": "An array of integers that represent the image data constrained to 8-bit unsigned integer values",
    "items": {
    "type": "number",
    "description": "A value between 0 and 255"
    }
    },
    {
    "type": "string",
    "format": "binary",
    "description": "Binary string representing the image contents."
    }
    ]
    },
    "functions": {
    "type": "array",
    "items": {
    "type": "object",
    "properties": {
    "name": {
    "type": "string"
    },
    "code": {
    "type": "string"
    }
    },
    "required": [
    "name",
    "code"
    ]
    }
    },
    "tools": {
    "type": "array",
    "description": "A list of tools available for the assistant to use.",
    "items": {
    "type": "object",
    "oneOf": [
    {
    "properties": {
    "name": {
    "type": "string",
    "description": "The name of the tool. More descriptive the better."
    },
    "description": {
    "type": "string",
    "description": "A brief description of what the tool does."
    },
    "parameters": {
    "type": "object",
    "description": "Schema defining the parameters accepted by the tool.",
    "properties": {
    "type": {
    "type": "string",
    "description": "The type of the parameters object (usually 'object')."
    },
    "required": {
    "type": "array",
    "description": "List of required parameter names.",
    "items": {
    "type": "string"
    }
    },
    "properties": {
    "type": "object",
    "description": "Definitions of each parameter.",
    "additionalProperties": {
    "type": "object",
    "properties": {
    "type": {
    "type": "string",
    "description": "The data type of the parameter."
    },
    "description": {
    "type": "string",
    "description": "A description of the expected parameter."
    }
    },
    "required": [
    "type",
    "description"
    ]
    }
    }
    },
    "required": [
    "type",
    "properties"
    ]
    }
    },
    "required": [
    "name",
    "description",
    "parameters"
    ]
    },
    {
    "properties": {
    "type": {
    "type": "string",
    "description": "Specifies the type of tool (e.g., 'function')."
    },
    "function": {
    "type": "object",
    "description": "Details of the function tool.",
    "properties": {
    "name": {
    "type": "string",
    "description": "The name of the function."
    },
    "description": {
    "type": "string",
    "description": "A brief description of what the function does."
    },
    "parameters": {
    "type": "object",
    "description": "Schema defining the parameters accepted by the function.",
    "properties": {
    "type": {
    "type": "string",
    "description": "The type of the parameters object (usually 'object')."
    },
    "required": {
    "type": "array",
    "description": "List of required parameter names.",
    "items": {
    "type": "string"
    }
    },
    "properties": {
    "type": "object",
    "description": "Definitions of each parameter.",
    "additionalProperties": {
    "type": "object",
    "properties": {
    "type": {
    "type": "string",
    "description": "The data type of the parameter."
    },
    "description": {
    "type": "string",
    "description": "A description of the expected parameter."
    }
    },
    "required": [
    "type",
    "description"
    ]
    }
    }
    },
    "required": [
    "type",
    "properties"
    ]
    }
    },
    "required": [
    "name",
    "description",
    "parameters"
    ]
    }
    },
    "required": [
    "type",
    "function"
    ]
    }
    ]
    }
    },
    "stream": {
    "type": "boolean",
    "default": false,
    "description": "If true, the response will be streamed back incrementally."
    },
    "max_tokens": {
    "type": "integer",
    "default": 256,
    "description": "The maximum number of tokens to generate in the response."
    },
    "temperature": {
    "type": "number",
    "default": 0.6,
    "minimum": 0,
    "maximum": 5,
    "description": "Controls the randomness of the output; higher values produce more random results."
    },
    "top_p": {
    "type": "number",
    "minimum": 0,
    "maximum": 2,
    "description": "Controls the creativity of the AI's responses by adjusting how many possible words it considers. Lower values make outputs more predictable; higher values allow for more varied and creative responses."
    },
    "top_k": {
    "type": "integer",
    "minimum": 1,
    "maximum": 50,
    "description": "Limits the AI to choose from the top 'k' most probable words. Lower values make responses more focused; higher values introduce more variety and potential surprises."
    },
    "seed": {
    "type": "integer",
    "minimum": 1,
    "maximum": 9999999999,
    "description": "Random seed for reproducibility of the generation."
    },
    "repetition_penalty": {
    "type": "number",
    "minimum": 0,
    "maximum": 2,
    "description": "Penalty for repeated tokens; higher values discourage repetition."
    },
    "frequency_penalty": {
    "type": "number",
    "minimum": 0,
    "maximum": 2,
    "description": "Decreases the likelihood of the model repeating the same lines verbatim."
    },
    "presence_penalty": {
    "type": "number",
    "minimum": 0,
    "maximum": 2,
    "description": "Increases the likelihood of the model introducing new topics."
    }
    },
    "required": [
    "messages"
    ]
    }
    ]
    }