import { EventRelay } from "lifecycle-utils";
import { ChatWrapper } from "../../ChatWrapper.js";
import { ChatHistoryItem, ChatModelFunctionCall, ChatSessionModelFunctions, Token } from "../../types.js";
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
import { LlamaGrammar } from "../LlamaGrammar.js";
import { LLamaChatContextShiftOptions, LlamaChatResponseChunk, LlamaChatResponseFunctionCallParamsChunk } from "../LlamaChat/LlamaChat.js";
import { EvaluationPriority } from "../LlamaContext/types.js";
import { TokenBias } from "../TokenBias.js";
import { LlamaText } from "../../utils/LlamaText.js";
import { LLamaChatPromptCompletionEngineOptions, LlamaChatSessionPromptCompletionEngine } from "./utils/LlamaChatSessionPromptCompletionEngine.js";
export type LlamaChatSessionOptions = {
    contextSequence: LlamaContextSequence;
    /** `"auto"` is used by default */
    chatWrapper?: "auto" | ChatWrapper;
    systemPrompt?: string;
    /**
     * Add the system prompt even on models that don't support a system prompt.
     *
     * Each chat wrapper has its own workaround for adding a system prompt to a model that doesn't support it,
     * but forcing the system prompt on unsupported models may not always work as expected.
     *
     * Use with caution.
     */
    forceAddSystemPrompt?: boolean;
    /**
     * Automatically dispose the sequence when the session is disposed.
     *
     * Defaults to `false`.
     */
    autoDisposeSequence?: boolean;
    contextShift?: LlamaChatSessionContextShiftOptions;
};
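/*
 * Example (not part of the original declarations): a minimal sketch of creating a `LlamaChatSession`.
 * The model path is a placeholder; point it at a GGUF file available locally.
 *
 * ```ts
 * import {getLlama, LlamaChatSession} from "node-llama-cpp";
 *
 * const llama = await getLlama();
 * const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
 * const context = await model.createContext();
 *
 * const session = new LlamaChatSession({
 *     contextSequence: context.getSequence(),
 *     systemPrompt: "You are a helpful assistant.",
 *     autoDisposeSequence: true
 * });
 *
 * const answer = await session.prompt("Hi there, how are you?");
 * console.log(answer);
 * ```
 */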
export type LlamaChatSessionContextShiftOptions = {
    /**
     * The number of tokens to delete from the context window to make space for new ones.
     * Defaults to 10% of the context size.
     */
    size?: LLamaChatContextShiftOptions["size"];
    /**
     * The strategy to use when deleting tokens from the context window.
     *
     * Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
     */
    strategy?: LLamaChatContextShiftOptions["strategy"];
};
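/*
 * Example (not part of the original declarations): overriding the default context shift behavior.
 * The fixed `size` of 512 tokens is an arbitrary illustration; the `strategy` value shown is the documented default.
 *
 * ```ts
 * const session = new LlamaChatSession({
 *     contextSequence: context.getSequence(),
 *     contextShift: {
 *         size: 512,
 *         strategy: "eraseFirstResponseAndKeepFirstSystem"
 *     }
 * });
 * ```
 */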
export type LLamaChatPromptOptions<Functions extends ChatSessionModelFunctions | undefined = ChatSessionModelFunctions | undefined> = {
    /**
     * Called as the model generates the main response with the generated text chunk.
     *
     * Useful for streaming the generated response as it's being generated.
     *
     * Includes only the main response without any text segments (like thoughts).
     * For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
     */
    onTextChunk?: (text: string) => void;
    /**
     * Called as the model generates the main response with the generated tokens.
     *
     * Preferably, you'd want to use {@link onTextChunk `onTextChunk`} instead of this.
     *
     * Includes only the main response without any segments (like thoughts).
     * For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
     */
    onToken?: (tokens: Token[]) => void;
    /**
     * Called as the model generates a response with the generated text and tokens,
     * including segment information (when the generated output is part of a segment).
     *
     * Useful for streaming the generated response as it's being generated, including the main response and all segments.
     *
     * Only use this function when you need the segmented texts, like thought segments (chain of thought text).
     */
    onResponseChunk?: (chunk: LlamaChatResponseChunk) => void;
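    /*
     * Example (not part of the original declarations): streaming the main response text as it's generated.
     *
     * ```ts
     * const answer = await session.prompt("Explain what a GGUF file is", {
     *     onTextChunk(text) {
     *         // print the main response incrementally, without thought segments
     *         process.stdout.write(text);
     *     }
     * });
     * ```
     */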
    /**
     * An AbortSignal to later abort the generation.
     *
     * When the signal is aborted, the generation will stop and throw `signal.reason` as the error.
     *
     * > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`.
     */
    signal?: AbortSignal;
    /**
     * When a response already started being generated and then the signal is aborted,
     * the generation will stop and the response will be returned as is instead of throwing an error.
     *
     * Defaults to `false`.
     */
    stopOnAbortSignal?: boolean;
    /** Maximum number of tokens to generate */
    maxTokens?: number;
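    /*
     * Example (not part of the original declarations): aborting a generation after a timeout
     * while keeping the partial response instead of throwing.
     *
     * ```ts
     * const abortController = new AbortController();
     * setTimeout(() => abortController.abort(), 10_000);
     *
     * const partialAnswer = await session.prompt("Write a long story", {
     *     signal: abortController.signal,
     *     stopOnAbortSignal: true,
     *     maxTokens: 2048
     * });
     * ```
     */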
    /**
     * Temperature is a hyperparameter that controls the randomness of the generated text.
     * It affects the probability distribution of the model's output tokens.
     *
     * A higher temperature (e.g., 1.5) makes the output more random and creative,
     * while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
     *
     * The suggested temperature is 0.8, which provides a balance between randomness and determinism.
     *
     * At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
     *
     * Set to `0` to disable.
     * Disabled by default (set to `0`).
     */
    temperature?: number;
    /**
     * From the next token candidates, discard the percentage of tokens with the lowest probability.
     * For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
     * This is useful for generating more high-quality results when using a high temperature.
     * Set to a value between `0` and `1` to enable.
     *
     * Only relevant when `temperature` is set to a value greater than `0`.
     * Disabled by default.
     */
    minP?: number;
    /**
     * Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
     * An integer number between `1` and the size of the vocabulary.
     * Set to `0` to disable (which uses the full vocabulary).
     *
     * Only relevant when `temperature` is set to a value greater than 0.
     */
    topK?: number;
    /**
     * Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
     * and samples the next token only from this set.
     * A float number between `0` and `1`.
     * Set to `1` to disable.
     *
     * Only relevant when `temperature` is set to a value greater than `0`.
     */
    topP?: number;
    /**
     * Used to control the randomness of the generated text.
     *
     * Change the seed to get different results.
     *
     * Only relevant when using `temperature`.
     */
    seed?: number;
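    /*
     * Example (not part of the original declarations): enabling sampling with the suggested
     * temperature and a reproducible seed. The specific values are illustrative.
     *
     * ```ts
     * const answer = await session.prompt("Suggest a name for a coffee shop", {
     *     temperature: 0.8,
     *     minP: 0.05,
     *     topK: 40,
     *     topP: 0.9,
     *     seed: 1234,
     *     maxTokens: 128
     * });
     * ```
     */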
    /**
     * Trim whitespace from the end of the generated text.
     * Disabled by default.
     */
    trimWhitespaceSuffix?: boolean;
    /**
     * Force a given text prefix to be the start of the model response, to make the model follow a certain direction.
     *
     * May cause some models to not use the given functions in some scenarios where they would have been used otherwise,
     * so avoid using it together with function calling if you notice unexpected behavior.
     */
    responsePrefix?: string;
    /**
     * See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
     */
    evaluationPriority?: EvaluationPriority;
    repeatPenalty?: false | LlamaChatSessionRepeatPenalty;
    /**
     * Adjust the probability of tokens being generated.
     * Can be used to bias the model to generate tokens that you want it to lean towards,
     * or to avoid generating tokens that you want it to avoid.
     */
    tokenBias?: TokenBias | (() => TokenBias);
    /**
     * Custom stop triggers to stop the generation of the response when any of the provided triggers are found.
     */
    customStopTriggers?: (LlamaText | string | (string | Token)[])[];
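    /*
     * Example (not part of the original declarations): steering the response with a prefix and
     * custom stop triggers. The trigger strings are arbitrary illustrations.
     *
     * ```ts
     * const answer = await session.prompt("List three facts about TypeScript", {
     *     responsePrefix: "Sure! Here are three facts:\n",
     *     customStopTriggers: ["\n\n\n", "<END>"],
     *     trimWhitespaceSuffix: true
     * });
     * ```
     */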
    /**
     * Called as the model generates function calls with the generated parameters chunk for each function call.
     *
     * Useful for streaming the generated function call parameters as they're being generated.
     * Only useful in specific use cases,
     * such as showing the generated textual file content as it's being generated (note that doing this requires parsing incomplete JSON).
     *
     * The constructed text from all the params chunks of a given function call can be parsed as a JSON object,
     * according to the function parameters schema.
     *
     * Each function call has its own `callIndex` you can use to distinguish between them.
     *
     * Only relevant when using function calling (via passing the `functions` option).
     */
    onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
    /**
     * Set the maximum number of tokens that the model is allowed to spend on various segmented responses.
     */
    budgets?: {
        /**
         * Budget for thought tokens.
         *
         * Defaults to `Infinity`.
         */
        thoughtTokens?: number;
        /**
         * Budget for comment tokens.
         *
         * Defaults to `Infinity`.
         */
        commentTokens?: number;
    };
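    /*
     * Example (not part of the original declarations): capping thought tokens for a reasoning model
     * and observing segment chunks. The whole chunk is logged here, since its exact fields are
     * defined by `LlamaChatResponseChunk`.
     *
     * ```ts
     * const answer = await session.prompt("Plan a 3-day trip to Rome", {
     *     budgets: {
     *         thoughtTokens: 1024
     *     },
     *     onResponseChunk(chunk) {
     *         console.log(chunk);
     *     }
     * });
     * ```
     */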
} & ({
    grammar?: LlamaGrammar;
    functions?: never;
    documentFunctionParams?: never;
    maxParallelFunctionCalls?: never;
    onFunctionCallParamsChunk?: never;
} | {
    grammar?: never;
    functions?: Functions | ChatSessionModelFunctions;
    documentFunctionParams?: boolean;
    maxParallelFunctionCalls?: number;
    onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
});
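/*
 * Example (not part of the original declarations): function calling, which the union above makes
 * mutually exclusive with `grammar`. The function definition is an illustration.
 *
 * ```ts
 * import {defineChatSessionFunction} from "node-llama-cpp";
 *
 * const functions = {
 *     getCurrentTime: defineChatSessionFunction({
 *         description: "Get the current time as an ISO string",
 *         handler() {
 *             return new Date().toISOString();
 *         }
 *     })
 * };
 *
 * const answer = await session.prompt("What time is it right now?", {
 *     functions,
 *     maxParallelFunctionCalls: 2,
 *     onFunctionCallParamsChunk(chunk) {
 *         // `callIndex` distinguishes between parallel function calls
 *         console.log(chunk.callIndex, chunk);
 *     }
 * });
 * ```
 */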
export type LLamaChatCompletePromptOptions = {
    /**
     * Generate a completion for the given user prompt up to the given number of tokens.
     *
     * Defaults to `256` or half the context size, whichever is smaller.
     */
    maxTokens?: LLamaChatPromptOptions["maxTokens"];
    /**
     * When a completion already started being generated and then the given `signal` is aborted,
     * the generation will stop and the completion will be returned as-is instead of throwing an error.
     *
     * Defaults to `false`.
     */
    stopOnAbortSignal?: LLamaChatPromptOptions["stopOnAbortSignal"];
    /**
     * Called as the model generates a completion with the generated text chunk.
     *
     * Useful for streaming the generated completion as it's being generated.
     */
    onTextChunk?: LLamaChatPromptOptions["onTextChunk"];
    /**
     * Called as the model generates a completion with the generated tokens.
     *
     * Preferably, you'd want to use `onTextChunk` instead of this.
     */
    onToken?: LLamaChatPromptOptions["onToken"];
    signal?: LLamaChatPromptOptions["signal"];
    temperature?: LLamaChatPromptOptions["temperature"];
    minP?: LLamaChatPromptOptions["minP"];
    topK?: LLamaChatPromptOptions["topK"];
    topP?: LLamaChatPromptOptions["topP"];
    seed?: LLamaChatPromptOptions["seed"];
    trimWhitespaceSuffix?: LLamaChatPromptOptions["trimWhitespaceSuffix"];
    evaluationPriority?: LLamaChatPromptOptions["evaluationPriority"];
    repeatPenalty?: LLamaChatPromptOptions["repeatPenalty"];
    tokenBias?: LLamaChatPromptOptions["tokenBias"];
    customStopTriggers?: LLamaChatPromptOptions["customStopTriggers"];
    grammar?: LlamaGrammar;
    /**
     * Functions are not used by the model here,
     * but are used for keeping the instructions given to the model about the functions in the current context state,
     * to avoid context shifts.
     *
     * It's best to provide the same functions that were used for the previous prompt here.
     */
    functions?: ChatSessionModelFunctions;
    /**
     * Functions are not used by the model here,
     * but are used for keeping the instructions given to the model about the functions in the current context state,
     * to avoid context shifts.
     *
     * It's best to provide the same value that was used for the previous prompt here.
     */
    documentFunctionParams?: boolean;
    /**
     * Whether to complete the prompt as a model response.
     *
     * - **`"auto"`**: Automatically determine whether to complete as a model response based on the model used.
     * This is a good option to work around some models that don't support user prompt completions.
     * - **`true`**: Always complete as a model response
     * - **`false`**: Never complete as a model response
     *
     * Defaults to `"auto"`.
     */
    completeAsModel?: "auto" | boolean | {
        /**
         * Whether to complete the prompt as a model response.
         *
         * - **`"auto"`**: Automatically determine whether to complete as a model response based on the model used.
         * This is a good option to work around some models that don't support user prompt completions.
         * - **`true`**: Always complete as a model response
         * - **`false`**: Never complete as a model response
         *
         * Defaults to `"auto"`.
         */
        enabled?: "auto" | boolean;
        /**
         * The messages to append to the chat history to generate a completion as a model response.
         *
         * If the last message is a model message, the prompt will be pushed to it for the completion,
         * otherwise a new model message will be added with the prompt.
         *
         * It must contain a user message or a system message before the model message.
         *
         * Defaults to:
         * ```ts
         * [
         *     {
         *         type: "system",
         *         text: "For your next response predict what the user may send next. " +
         *             "No yapping, no whitespace. Match the user's language and tone."
         *     },
         *     {type: "user", text: ""},
         *     {type: "model", response: [""]}
         * ]
         * ```
         */
        appendedMessages?: ChatHistoryItem[];
    };
};
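/*
 * Example (not part of the original declarations): completing the text a user is currently typing,
 * based on the current chat state. `currentUserInput` is a placeholder for your input source.
 *
 * ```ts
 * const completion = await session.completePrompt(currentUserInput, {
 *     maxTokens: 64,
 *     temperature: 0.8,
 *     completeAsModel: "auto"
 * });
 * ```
 */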
export type LLamaChatPreloadPromptOptions = {
    signal?: LLamaChatCompletePromptOptions["signal"];
    evaluationPriority?: LLamaChatCompletePromptOptions["evaluationPriority"];
    functions?: LLamaChatCompletePromptOptions["functions"];
    documentFunctionParams?: LLamaChatCompletePromptOptions["documentFunctionParams"];
};
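/*
 * Example (not part of the original declarations): preloading a user prompt so the eventual
 * response starts generating sooner. The evaluation priority value is an illustration.
 *
 * ```ts
 * await session.preloadPrompt("Summarize the previous answer", {
 *     evaluationPriority: 5
 * });
 * ```
 */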
export type LlamaChatSessionRepeatPenalty = {
    /**
     * The number of recent tokens generated by the model to apply repetition penalties to.
     * Defaults to `64`.
     */
    lastTokens?: number;
    punishTokensFilter?: (tokens: Token[]) => Token[];
    /**
     * Penalize new line tokens.
     * Enabled by default.
     */
    penalizeNewLine?: boolean;
    /**
     * The relative amount to lower the probability of the tokens in `punishTokens` by.
     * Defaults to `1.1`.
     * Set to `1` to disable.
     */
    penalty?: number;
    /**
     * For a token that appears `n` times in the `punishTokens` array, lower its probability by `n * frequencyPenalty`.
     * Disabled by default (`0`).
     * Set to a value between `0` and `1` to enable.
     */
    frequencyPenalty?: number;
    /**
     * Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`.
     * Disabled by default (`0`).
     * Set to a value between `0` and `1` to enable.
     */
    presencePenalty?: number;
};
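/*
 * Example (not part of the original declarations): configuring repetition penalties for a prompt.
 * The specific values are illustrative.
 *
 * ```ts
 * const answer = await session.prompt("Write a short poem about the sea", {
 *     repeatPenalty: {
 *         lastTokens: 64,
 *         penalty: 1.1,
 *         frequencyPenalty: 0.2,
 *         presencePenalty: 0.2,
 *         penalizeNewLine: false
 *     }
 * });
 * ```
 */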
/**
 * @see [Using `LlamaChatSession`](https://node-llama-cpp.withcat.ai/guide/chat-session) tutorial
 */
export declare class LlamaChatSession {
    readonly onDispose: EventRelay<void>;
    constructor(options: LlamaChatSessionOptions);
    dispose({ disposeSequence }?: {
        disposeSequence?: boolean;
    }): void;
    /** @hidden */
    [Symbol.dispose](): void;
    get disposed(): boolean;
    get chatWrapper(): ChatWrapper;
    get sequence(): LlamaContextSequence;
    get context(): import("../LlamaContext/LlamaContext.js").LlamaContext;
    get model(): import("../LlamaModel/LlamaModel.js").LlamaModel;
    prompt<const Functions extends ChatSessionModelFunctions | undefined = undefined>(prompt: string, options?: LLamaChatPromptOptions<Functions>): Promise<string>;
    /**
     * @param prompt
     * @param [options]
     */
    promptWithMeta<const Functions extends ChatSessionModelFunctions | undefined = undefined>(prompt: string, { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority }?: LLamaChatPromptOptions<Functions>): Promise<{
        response: (string | ChatModelFunctionCall | import("../../types.js").ChatModelSegment)[];
        responseText: string;
        stopReason: "customStopTrigger";
        customStopTrigger: (string | Token)[];
        remainingGenerationAfterStop: string | Token[] | undefined;
    } | {
        response: (string | ChatModelFunctionCall | import("../../types.js").ChatModelSegment)[];
        responseText: string;
        stopReason: "abort" | "maxTokens" | "eogToken" | "stopGenerationTrigger" | "functionCalls";
        remainingGenerationAfterStop: string | Token[] | undefined;
        customStopTrigger?: undefined;
    }>;
    /**
     * Preload a user prompt into the current context sequence state to make later inference of the model response begin sooner
     * and feel faster.
     *
     * > **Note:** Preloading a long user prompt can incur context shifts, so consider limiting the length of prompts you preload
     * @param prompt - the prompt to preload
     * @param [options]
     */
    preloadPrompt(prompt: string, options?: LLamaChatPreloadPromptOptions): Promise<void>;
    /**
     * Preload a user prompt into the current context sequence state and generate a completion for it.
     *
     * > **Note:** Preloading a long user prompt and completing a user prompt with a high number of `maxTokens` can incur context shifts,
     * > so consider limiting the length of prompts you preload.
     * >
     * > Also, it's recommended to limit the number of tokens generated to a reasonable amount by configuring `maxTokens`.
     * @param prompt - the prompt to preload
     * @param [options]
     */
    completePrompt(prompt: string, options?: LLamaChatCompletePromptOptions): Promise<string>;
    /**
     * Create a smart completion engine that caches the prompt completions
     * and reuses them when the user prompt matches the beginning of the cached prompt or completion.
     *
     * Completions are generated for, and cached against, the current chat session state only.
     * You can create a single completion engine for an entire chat session.
     */
    createPromptCompletionEngine(options?: LLamaChatPromptCompletionEngineOptions): LlamaChatSessionPromptCompletionEngine;
    /**
     * See `completePrompt` for more information.
     * @param prompt
     * @param [options]
     */
    completePromptWithMeta(prompt: string, { maxTokens, stopOnAbortSignal, functions, documentFunctionParams, onTextChunk, onToken, signal, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority, completeAsModel }?: LLamaChatCompletePromptOptions): Promise<{
        completion: string;
        stopReason: "customStopTrigger";
        customStopTrigger: (string | Token)[];
        remainingGenerationAfterStop: string | Token[] | undefined;
    } | {
        completion: string;
        stopReason: "abort" | "maxTokens" | "eogToken" | "stopGenerationTrigger" | "functionCalls";
        remainingGenerationAfterStop: string | Token[] | undefined;
        customStopTrigger?: undefined;
    }>;
    getChatHistory(): ChatHistoryItem[];
    getLastEvaluationContextWindow(): ChatHistoryItem[] | null;
    setChatHistory(chatHistory: ChatHistoryItem[]): void;
    /** Clear the chat history and reset it to the initial state. */
    resetChatHistory(): void;
}
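/*
 * Example (not part of the original declarations): saving and restoring the chat history,
 * e.g. to persist a conversation between runs.
 *
 * ```ts
 * const firstAnswer = await session.prompt("Hi there, how are you?");
 *
 * // save the chat history so the conversation can be restored later
 * const chatHistory = session.getChatHistory();
 *
 * // ...later, on a session backed by the same model:
 * session.setChatHistory(chatHistory);
 * const secondAnswer = await session.prompt("Repeat what I asked you first");
 * ```
 */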