First upload version 0.0.1

Neyra
2026-02-05 15:27:49 +08:00
commit 8e9b7201ed
4182 changed files with 593136 additions and 0 deletions


@@ -0,0 +1,459 @@
import { EventRelay } from "lifecycle-utils";
import { ChatWrapper } from "../../ChatWrapper.js";
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
import { ChatHistoryItem, ChatModelFunctions, ChatModelSegmentType, LLamaContextualRepeatPenalty, Token, Tokenizer } from "../../types.js";
import { GbnfJsonSchemaToType } from "../../utils/gbnfJson/types.js";
import { LlamaGrammar } from "../LlamaGrammar.js";
import { LlamaText, LlamaTextJSON } from "../../utils/LlamaText.js";
import { EvaluationPriority } from "../LlamaContext/types.js";
import { TokenBias } from "../TokenBias.js";
import { LlamaModel } from "../LlamaModel/LlamaModel.js";
export type LlamaChatOptions = {
contextSequence: LlamaContextSequence;
/** `"auto"` is used by default */
chatWrapper?: "auto" | ChatWrapper;
/**
* Automatically dispose the sequence when the session is disposed
*
* Defaults to `false`.
*/
autoDisposeSequence?: boolean;
};
export type LlamaChatResponseChunk = LlamaChatResponseTextChunk | LlamaChatResponseSegmentChunk;
export type LlamaChatResponseTextChunk = {
/** When `type` is `undefined`, the chunk is part of the main response and is not a segment */
type: undefined;
/**
* `segmentType` has no purpose when `type` is `undefined` (meaning that this chunk is part of the main response and is not a segment).
*/
segmentType: undefined;
/**
* The generated text chunk.
*
* Detokenized from the `tokens` property,
* but with the context of the previous generation (for better spacing of the text with some models).
*
* Prefer using this property over `tokens` when streaming the generated response as text.
*/
text: string;
/** The generated tokens */
tokens: Token[];
};
export type LlamaChatResponseSegmentChunk = {
type: "segment";
/** Segment type */
segmentType: ChatModelSegmentType;
/**
* The generated text chunk.
*
* Detokenized from the `tokens` property,
* but with the context of the previous generation (for better spacing of the text with some models).
*
* Prefer using this property over `tokens` when streaming the generated response as text.
*/
text: string;
/** The generated tokens */
tokens: Token[];
/**
* When the current chunk is the start of a segment, this field will be set.
*
* It's possible that a chunk with no tokens and empty text will be emitted just to set this field
* to signify that the segment has started.
*/
segmentStartTime?: Date;
/**
* When the current chunk is the last one of a segment (meaning the current segment has ended), this field will be set.
*
* It's possible that a chunk with no tokens and empty text will be emitted just to set this field
* to signify that the segment has ended.
*/
segmentEndTime?: Date;
};
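A response stream interleaves these two chunk shapes, so a consumer typically branches on `type`. Below is a minimal sketch of such a handler, assuming only the chunk types declared above (and that they are re-exported from the package root); the buffers and labels are purely illustrative.

```ts
import type {LlamaChatResponseChunk} from "node-llama-cpp";

// Collect the main response separately from thought segments
let mainResponse = "";
let thoughtText = "";

function handleResponseChunk(chunk: LlamaChatResponseChunk) {
    if (chunk.type === undefined) {
        // Part of the main response, not a segment
        mainResponse += chunk.text;
    } else if (chunk.type === "segment" && chunk.segmentType === "thought") {
        // A chunk may carry no text and only mark the segment boundaries
        if (chunk.segmentStartTime != null)
            thoughtText += "[thought started]\n";

        thoughtText += chunk.text;

        if (chunk.segmentEndTime != null)
            thoughtText += "\n[thought ended]";
    }
}
```

Such a handler can be passed as the `onResponseChunk` option described below.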
export type LlamaChatResponseFunctionCallParamsChunk = {
/**
* Each different function call has a different `callIndex`.
*
* When the previous function call has finished being generated, the `callIndex` of the next one will increment.
*
* Use this value to distinguish between different function calls.
*/
callIndex: number;
/**
* The name of the function being called
*/
functionName: string;
/**
* A chunk of the generated text used for the function call parameters.
*
* Collect all the chunks together to construct the full function call parameters.
*
* After the function call is finished, the entire constructed params text can be parsed as a JSON object,
* according to the function parameters schema.
*/
paramsChunk: string;
/**
* When this is `true`, the current chunk is the last chunk in the generation of the current function call parameters.
*/
done: boolean;
};
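Since `paramsChunk` arrives incrementally and multiple calls can follow each other, a consumer usually buffers the text per `callIndex` and parses it once `done` is set. A hedged sketch (the `Map` buffer and logging are illustrative, and the type import assumes it is re-exported from the package root):

```ts
import type {LlamaChatResponseFunctionCallParamsChunk} from "node-llama-cpp";

// Buffer the generated params text of each function call by its callIndex
const paramsBuffers = new Map<number, string>();

function handleFunctionCallParamsChunk(chunk: LlamaChatResponseFunctionCallParamsChunk) {
    const paramsText = (paramsBuffers.get(chunk.callIndex) ?? "") + chunk.paramsChunk;
    paramsBuffers.set(chunk.callIndex, paramsText);

    if (chunk.done) {
        // The full text is JSON that conforms to the function's params schema.
        // Guard against functions that take no params at all.
        const params: unknown = paramsText.trim() === ""
            ? undefined
            : JSON.parse(paramsText);

        console.log(`call #${chunk.callIndex} -> ${chunk.functionName}`, params);
        paramsBuffers.delete(chunk.callIndex);
    }
}
```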
export type LLamaChatGenerateResponseOptions<Functions extends ChatModelFunctions | undefined = undefined> = {
/**
* Called as the model generates the main response with the generated text chunk.
*
* Useful for streaming the generated response as it's being generated.
*
* Includes only the main response without any text segments (like thoughts).
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
*/
onTextChunk?: (text: string) => void;
/**
* Called as the model generates the main response with the generated tokens.
*
* Preferably, you'd want to use {@link onTextChunk `onTextChunk`} instead of this.
*
* Includes only the main response without any segments (like thoughts).
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
*/
onToken?: (tokens: Token[]) => void;
/**
* Called as the model generates a response with the generated text and tokens,
* including segment information (when the generated output is part of a segment).
*
* Useful for streaming the generated response as it's being generated, including the main response and all segments.
*
* Only use this function when you need the segmented texts, like thought segments (chain of thought text).
*/
onResponseChunk?: (chunk: LlamaChatResponseChunk) => void;
/**
* An AbortSignal to later abort the generation.
*
* When the signal is aborted, the generation will stop and throw `signal.reason` as the error.
*
* > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`.
*/
signal?: AbortSignal;
/**
* When a response already started being generated and then the signal is aborted,
* the generation will stop and the response will be returned as is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: boolean;
/** Maximum number of tokens to generate */
maxTokens?: number;
/**
* Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
*
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
*
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
*
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
* Disabled by default (set to `0`).
*/
temperature?: number;
/**
* From the next token candidates, discard the percentage of tokens with the lowest probability.
* For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
* This is useful for generating more high-quality results when using a high temperature.
* Set to a value between `0` and `1` to enable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* Disabled by default.
*/
minP?: number;
/**
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
*/
topK?: number;
/**
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
*/
topP?: number;
/**
* Used to control the randomness of the generated text.
*
* Change the seed to get different results.
*
* Only relevant when using `temperature`.
*/
seed?: number;
/**
* Trim whitespace from the end of the generated text
*
* Defaults to `false`.
*/
trimWhitespaceSuffix?: boolean;
repeatPenalty?: false | LLamaContextualRepeatPenalty;
/**
* Adjust the probability of tokens being generated.
* Can be used to bias the model to generate tokens that you want it to lean towards,
* or to avoid generating tokens that you want it to avoid.
*/
tokenBias?: TokenBias | (() => TokenBias);
/**
* See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
*/
evaluationPriority?: EvaluationPriority;
contextShift?: LLamaChatContextShiftOptions;
/**
* Custom stop triggers to stop the generation of the response when any of the provided triggers are found.
*/
customStopTriggers?: readonly (LlamaText | string | readonly (string | Token)[])[];
/**
* The evaluation context window returned from the last evaluation.
* This is an optimization to utilize existing context sequence state better when possible.
*/
lastEvaluationContextWindow?: {
/** The history of the last evaluation. */
history?: ChatHistoryItem[];
/**
* Minimum overlap percentage with existing context sequence state to use the last evaluation context window.
* If the last evaluation context window is not used, a new context will be generated based on the full history,
* which will decrease the likelihood of another context shift happening so soon.
*
* A number between `0` (exclusive) and `1` (inclusive).
*/
minimumOverlapPercentageToPreventContextShift?: number;
};
/**
* Called as the model generates function calls with the generated parameters chunk for each function call.
*
* Useful for streaming the generated function call parameters as they're being generated.
* Only useful in specific use cases,
* such as showing the generated textual file content as it's being generated (note that doing this requires parsing incomplete JSON).
*
* The constructed text from all the params chunks of a given function call can be parsed as a JSON object,
* according to the function parameters schema.
*
* Each function call has its own `callIndex` you can use to distinguish between them.
*
* Only relevant when using function calling (via passing the `functions` option).
*/
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
/**
* Set the maximum number of tokens the model is allowed to spend on various segmented responses.
*/
budgets?: {
/**
* Whether to include the tokens already consumed by the current model response being completed in the budget.
*
* Defaults to `true`.
*/
includeCurrentResponse?: boolean;
/**
* Budget for thought tokens.
*
* Defaults to `Infinity`.
*/
thoughtTokens?: number;
/**
* Budget for comment tokens.
*
* Defaults to `Infinity`.
*/
commentTokens?: number;
};
/**
* Stop the generation when the model tries to generate a non-textual segment or call a function.
*
* Useful for generating completions in the form of a model response.
*
* Defaults to `false`.
*/
abortOnNonText?: boolean;
} & ({
grammar?: LlamaGrammar;
functions?: never;
documentFunctionParams?: never;
maxParallelFunctionCalls?: never;
onFunctionCall?: never;
onFunctionCallParamsChunk?: never;
} | {
grammar?: never;
functions?: Functions | ChatModelFunctions;
documentFunctionParams?: boolean;
maxParallelFunctionCalls?: number;
onFunctionCall?: (functionCall: LlamaChatResponseFunctionCall<Functions extends ChatModelFunctions ? Functions : ChatModelFunctions>) => void;
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
});
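As the `signal` and `stopOnAbortSignal` docs above describe, aborting normally throws `signal.reason`, while setting `stopOnAbortSignal: true` returns the partial response instead. A hedged sketch of the latter, assuming an already constructed `llamaChat: LlamaChat` (declared further down) and a prepared `history: ChatHistoryItem[]`:

```ts
const abortController = new AbortController();

// Stop generating after 5 seconds, but keep whatever was generated so far
setTimeout(() => abortController.abort(), 5_000);

const {response, metadata} = await llamaChat.generateResponse(history, {
    signal: abortController.signal,
    stopOnAbortSignal: true, // return the partial response instead of throwing signal.reason
    maxTokens: 512,
    temperature: 0.8,
    onTextChunk(text) {
        process.stdout.write(text);
    }
});

console.log("\nstop reason:", metadata.stopReason); // "abort" when the signal fired mid-generation
console.log("partial response:", response);
```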
export type LLamaChatLoadAndCompleteUserMessageOptions<Functions extends ChatModelFunctions | undefined = undefined> = {
/**
* Complete the given user prompt without adding it or the completion to the returned context window.
*/
initialUserPrompt?: string;
/**
* When a completion already started being generated and then the signal is aborted,
* the generation will stop and the completion will be returned as is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: boolean;
/**
* Called as the model generates a completion with the generated text chunk.
*
* Useful for streaming the generated completion as it's being generated.
*/
onTextChunk?: LLamaChatGenerateResponseOptions<Functions>["onTextChunk"];
/**
* Called as the model generates a completion with the generated tokens.
*
* Preferably, you'd want to use `onTextChunk` instead of this.
*/
onToken?: LLamaChatGenerateResponseOptions<Functions>["onToken"];
signal?: LLamaChatGenerateResponseOptions<Functions>["signal"];
maxTokens?: LLamaChatGenerateResponseOptions<Functions>["maxTokens"];
temperature?: LLamaChatGenerateResponseOptions<Functions>["temperature"];
minP?: LLamaChatGenerateResponseOptions<Functions>["minP"];
topK?: LLamaChatGenerateResponseOptions<Functions>["topK"];
topP?: LLamaChatGenerateResponseOptions<Functions>["topP"];
seed?: LLamaChatGenerateResponseOptions<Functions>["seed"];
trimWhitespaceSuffix?: LLamaChatGenerateResponseOptions<Functions>["trimWhitespaceSuffix"];
repeatPenalty?: LLamaChatGenerateResponseOptions<Functions>["repeatPenalty"];
tokenBias?: LLamaChatGenerateResponseOptions<Functions>["tokenBias"];
evaluationPriority?: LLamaChatGenerateResponseOptions<Functions>["evaluationPriority"];
contextShift?: LLamaChatGenerateResponseOptions<Functions>["contextShift"];
customStopTriggers?: LLamaChatGenerateResponseOptions<Functions>["customStopTriggers"];
lastEvaluationContextWindow?: LLamaChatGenerateResponseOptions<Functions>["lastEvaluationContextWindow"];
grammar?: LlamaGrammar;
/**
* Functions are not used by the model here,
* but are used for keeping the instructions given to the model about the functions in the current context state,
* to avoid context shifts.
*
* It's best to provide the same functions that were used for the previous prompt here.
*/
functions?: Functions | ChatModelFunctions;
/**
* Functions are not used by the model here,
* but are used for keeping the instructions given to the model about the functions in the current context state,
* to avoid context shifts.
*
* It's best to provide the same value that was used for the previous prompt here.
*/
documentFunctionParams?: boolean;
};
export type LLamaChatContextShiftOptions = {
/**
* The number of tokens to delete from the context window to make space for new ones.
* Defaults to 10% of the context size.
*/
size?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
/**
* The strategy to use when deleting tokens from the context window.
*
* Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
*/
strategy?: "eraseFirstResponseAndKeepFirstSystem" | ((options: {
/** Full chat history */
chatHistory: readonly ChatHistoryItem[];
/** Maximum number of tokens that the new chat history should fit under when tokenized */
maxTokensCount: number;
/** Tokenizer used to tokenize the chat history */
tokenizer: Tokenizer;
/** Chat wrapper used to generate the context state */
chatWrapper: ChatWrapper;
/**
* The metadata returned from the last context shift strategy call.
* Will be `null` on the first call.
*/
lastShiftMetadata?: object | null;
}) => {
chatHistory: ChatHistoryItem[];
metadata?: object | null;
} | Promise<{
chatHistory: ChatHistoryItem[];
metadata?: object | null;
}>);
/**
* The `contextShiftMetadata` returned from the last evaluation.
* This is an optimization to utilize the existing context state better when possible.
*/
lastEvaluationMetadata?: object | undefined | null;
};
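The `strategy` signature above also allows plugging in a custom context shift strategy. Below is a hedged sketch of a naive one that keeps the first system message and drops the oldest items until the history fits; it assumes that `chatWrapper.generateContextState({chatHistory})` returns a `contextText` LlamaText that can be tokenized to measure a candidate history, which may differ from the actual internals.

```ts
import type {ChatHistoryItem, LLamaChatContextShiftOptions} from "node-llama-cpp";

const dropOldestStrategy: LLamaChatContextShiftOptions["strategy"] = ({
    chatHistory, maxTokensCount, tokenizer, chatWrapper, lastShiftMetadata
}) => {
    const newHistory: ChatHistoryItem[] = chatHistory.map((item) => structuredClone(item));
    const previousMetadata = (lastShiftMetadata ?? {}) as {removedItems?: number};
    let removedItems = previousMetadata.removedItems ?? 0;

    // Assumption: measure a candidate history by tokenizing the chat wrapper's context text
    const measureTokens = (history: ChatHistoryItem[]) => chatWrapper
        .generateContextState({chatHistory: history})
        .contextText
        .tokenize(tokenizer)
        .length;

    // Keep the first system message; drop the item right after it while over budget
    while (newHistory.length > 2 && measureTokens(newHistory) > maxTokensCount) {
        const removeIndex = newHistory[0]?.type === "system" ? 1 : 0;
        newHistory.splice(removeIndex, 1);
        removedItems++;
    }

    return {
        chatHistory: newHistory,
        metadata: {removedItems}
    };
};
```

Such a strategy could then be passed as `contextShift: {strategy: dropOldestStrategy}` in the generation options above; in most cases the default `"eraseFirstResponseAndKeepFirstSystem"` strategy is preferable.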
export declare class LlamaChat {
readonly onDispose: EventRelay<void>;
constructor({ contextSequence, chatWrapper, autoDisposeSequence }: LlamaChatOptions);
dispose({ disposeSequence }?: {
disposeSequence?: boolean;
}): void;
/** @hidden */
[Symbol.dispose](): void;
get disposed(): boolean;
get chatWrapper(): ChatWrapper;
get sequence(): LlamaContextSequence;
get context(): import("../LlamaContext/LlamaContext.js").LlamaContext;
get model(): LlamaModel;
generateResponse<const Functions extends ChatModelFunctions | undefined = undefined>(history: ChatHistoryItem[], options?: LLamaChatGenerateResponseOptions<Functions>): Promise<LlamaChatResponse<Functions>>;
loadChatAndCompleteUserMessage<const Functions extends ChatModelFunctions | undefined = undefined>(history: ChatHistoryItem[], options?: LLamaChatLoadAndCompleteUserMessageOptions<Functions>): Promise<LlamaChatLoadAndCompleteUserResponse>;
}
export type LlamaChatResponse<Functions extends ChatModelFunctions | undefined = undefined> = {
/**
* The response text only, _without_ any text segments (like thoughts).
*/
response: string;
/**
* The full response, including all text and text segments (like thoughts).
*/
fullResponse: Array<string | LlamaChatResponseSegment>;
functionCalls?: Functions extends ChatModelFunctions ? LlamaChatResponseFunctionCall<Functions>[] : never;
lastEvaluation: {
cleanHistory: ChatHistoryItem[];
contextWindow: ChatHistoryItem[];
contextShiftMetadata: any;
};
metadata: {
remainingGenerationAfterStop?: string | Token[];
stopReason: "eogToken" | "stopGenerationTrigger" | "functionCalls" | "maxTokens" | "abort";
} | {
remainingGenerationAfterStop?: string | Token[];
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
};
};
export type LlamaChatResponseFunctionCall<Functions extends ChatModelFunctions, FunctionCallName extends keyof Functions & string = string & keyof Functions, Params = Functions[FunctionCallName]["params"] extends undefined | null | void ? undefined : GbnfJsonSchemaToType<Functions[FunctionCallName]["params"]>> = {
functionName: FunctionCallName;
params: Params;
raw: LlamaTextJSON;
};
export type LlamaChatResponseSegment = {
type: "segment";
segmentType: ChatModelSegmentType;
text: string;
ended: boolean;
raw: LlamaTextJSON;
startTime?: string;
endTime?: string;
};
export type LlamaChatLoadAndCompleteUserResponse = {
completion: string;
lastEvaluation: {
/**
* The completion and initial user prompt are not added to this context window result,
* but are loaded into the current context sequence state as tokens
*/
contextWindow: ChatHistoryItem[];
contextShiftMetadata: any;
};
metadata: {
remainingGenerationAfterStop?: string | Token[];
stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" | "abort";
} | {
remainingGenerationAfterStop?: string | Token[];
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
};
};
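Putting the declarations above together, a typical flow is to obtain a context sequence, wrap it in `LlamaChat`, and stream a response. A minimal sketch, assuming the package root exports `getLlama`/`LlamaChat` and the usual `loadModel()`/`createContext()` entry points; the model path is hypothetical, and the trailing empty model item mirrors what `LlamaChatSession` does internally before generating.

```ts
import {getLlama, LlamaChat} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // hypothetical path
const context = await model.createContext();

const llamaChat = new LlamaChat({
    contextSequence: context.getSequence()
});

// Build an initial history with the chat wrapper, then append the user turn
const history = llamaChat.chatWrapper.generateInitialChatHistory({
    systemPrompt: "You are a helpful assistant."
});
history.push({type: "user", text: "Hi there!"});
history.push({type: "model", response: []}); // empty model item for the response being generated

const {response, lastEvaluation} = await llamaChat.generateResponse(history, {
    maxTokens: 256,
    onTextChunk: (text) => process.stdout.write(text)
});

console.log("\nfull response:", response);
// `lastEvaluation.cleanHistory` can seed the next turn, and `lastEvaluation.contextWindow`
// can be passed as `lastEvaluationContextWindow.history` to better reuse the context state
```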

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long


@@ -0,0 +1,11 @@
import { LlamaGrammar } from "../../LlamaGrammar.js";
import { ChatModelFunctions } from "../../../types.js";
import { ChatWrapper } from "../../../ChatWrapper.js";
import { Llama } from "../../../bindings/Llama.js";
export declare class FunctionCallNameGrammar<const Functions extends ChatModelFunctions> extends LlamaGrammar {
private readonly _functions;
private readonly _chatWrapper;
constructor(llama: Llama, functions: Functions, chatWrapper: ChatWrapper);
parseFunctionName(generatedFunctionName: string): keyof Functions & string;
private _validateFunctions;
}


@@ -0,0 +1,55 @@
import { LlamaGrammar } from "../../LlamaGrammar.js";
import { LlamaText } from "../../../utils/LlamaText.js";
import { GbnfGrammarGenerator } from "../../../utils/gbnfJson/GbnfGrammarGenerator.js";
import { GbnfGrammar } from "../../../utils/gbnfJson/terminals/GbnfGrammar.js";
import { GbnfOr } from "../../../utils/gbnfJson/terminals/GbnfOr.js";
import { GbnfVerbatimText } from "../../../utils/gbnfJson/terminals/GbnfVerbatimText.js";
import { LlamaFunctionCallValidationError } from "./LlamaFunctionCallValidationError.js";
export class FunctionCallNameGrammar extends LlamaGrammar {
_functions;
_chatWrapper;
constructor(llama, functions, chatWrapper) {
const grammar = getGbnfGrammarForFunctionName(functions, chatWrapper);
super(llama, {
grammar,
stopGenerationTriggers: [LlamaText("\n")],
trimWhitespaceSuffix: true
});
this._functions = functions;
this._chatWrapper = chatWrapper;
this._validateFunctions();
}
parseFunctionName(generatedFunctionName) {
if (this._chatWrapper.settings.functions.call.optionalPrefixSpace && generatedFunctionName[0] === " ")
generatedFunctionName = generatedFunctionName.slice(1);
const newlineIndex = generatedFunctionName.indexOf("\n");
const functionName = generatedFunctionName.slice(0, newlineIndex < 0
? generatedFunctionName.length
: newlineIndex);
if (!Object.hasOwn(this._functions, functionName))
throw new LlamaFunctionCallValidationError(`Function name "${functionName}" is not in the supplied functions object`, this._functions, this._chatWrapper, generatedFunctionName);
return functionName;
}
_validateFunctions() {
for (const functionsName of Object.keys(this._functions)) {
if (functionsName.includes(" ") || functionsName.includes("\n") || functionsName.includes("\t"))
throw new Error(`Function name "${functionsName}" contains spaces, new lines or tabs`);
else if (functionsName === "")
throw new Error("Function name cannot be an empty string");
}
}
}
function getGbnfGrammarForFunctionName(functions, chatWrapper) {
const grammarGenerator = new GbnfGrammarGenerator();
const functionNameGrammars = [];
for (const functionName of Object.keys(functions))
functionNameGrammars.push(new GbnfVerbatimText(functionName));
const callGrammar = new GbnfOr(functionNameGrammars);
const rootTerminal = new GbnfGrammar([
...(chatWrapper.settings.functions.call.optionalPrefixSpace ? ["[ ]?"] : []),
callGrammar.resolve(grammarGenerator)
]);
const rootGrammar = rootTerminal.getGrammar();
return grammarGenerator.generateGbnfFile(rootGrammar + " [\\n]");
}
//# sourceMappingURL=FunctionCallNameGrammar.js.map


@@ -0,0 +1 @@
{"version":3,"file":"FunctionCallNameGrammar.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaChat/utils/FunctionCallNameGrammar.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,YAAY,EAAC,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAC,SAAS,EAAC,MAAM,6BAA6B,CAAC;AAEtD,OAAO,EAAC,oBAAoB,EAAC,MAAM,iDAAiD,CAAC;AAErF,OAAO,EAAC,WAAW,EAAC,MAAM,kDAAkD,CAAC;AAE7E,OAAO,EAAC,MAAM,EAAC,MAAM,6CAA6C,CAAC;AACnE,OAAO,EAAC,gBAAgB,EAAC,MAAM,uDAAuD,CAAC;AAEvF,OAAO,EAAC,gCAAgC,EAAC,MAAM,uCAAuC,CAAC;AAGvF,MAAM,OAAO,uBAAoE,SAAQ,YAAY;IAChF,UAAU,CAAY;IACtB,YAAY,CAAc;IAE3C,YAAmB,KAAY,EAAE,SAAoB,EAAE,WAAwB;QAC3E,MAAM,OAAO,GAAG,6BAA6B,CAAC,SAAS,EAAE,WAAW,CAAC,CAAC;QAEtE,KAAK,CAAC,KAAK,EAAE;YACT,OAAO;YACP,sBAAsB,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YACzC,oBAAoB,EAAE,IAAI;SAC7B,CAAC,CAAC;QAEH,IAAI,CAAC,UAAU,GAAG,SAAS,CAAC;QAC5B,IAAI,CAAC,YAAY,GAAG,WAAW,CAAC;QAEhC,IAAI,CAAC,kBAAkB,EAAE,CAAC;IAC9B,CAAC;IAEM,iBAAiB,CAAC,qBAA6B;QAClD,IAAI,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,IAAI,qBAAqB,CAAC,CAAC,CAAC,KAAK,GAAG;YACjG,qBAAqB,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAE3D,MAAM,YAAY,GAAG,qBAAqB,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAEzD,MAAM,YAAY,GAAG,qBAAqB,CAAC,KAAK,CAC5C,CAAC,EACD,YAAY,GAAG,CAAC;YACZ,CAAC,CAAC,qBAAqB,CAAC,MAAM;YAC9B,CAAC,CAAC,YAAY,CACO,CAAC;QAE9B,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,UAAU,EAAE,YAAY,CAAC;YAC7C,MAAM,IAAI,gCAAgC,CACtC,kBAAkB,YAAY,2CAA2C,EACzE,IAAI,CAAC,UAAU,EACf,IAAI,CAAC,YAAY,EACjB,qBAAqB,CACxB,CAAC;QAEN,OAAO,YAAY,CAAC;IACxB,CAAC;IAEO,kBAAkB;QACtB,KAAK,MAAM,aAAa,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,EAAE,CAAC;YACvD,IAAI,aAAa,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,aAAa,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,aAAa,CAAC,QAAQ,CAAC,IAAI,CAAC;gBAC3F,MAAM,IAAI,KAAK,CAAC,kBAAkB,aAAa,sCAAsC,CAAC,CAAC;iBACtF,IAAI,aAAa,KAAK,EAAE;gBACzB,MAAM,IAAI,KAAK,CAAC,yCAAyC,CAAC,CAAC;QACnE,CAAC;IACL,CAAC;CACJ;AAED,SAAS,6BAA6B,CAClC,SAAoB,EAAE,WAAwB;IAE9C,MAAM,gBAAgB,GAAG,IAAI,oBAAoB,EAAE,CAAC;IAEpD,MAAM,oBAAoB,GAAmB,EAAE,CAAC;IAEhD,KAAK,MAAM,YAAY,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC;QAC7C,oBAAoB,CAAC,IAAI,CAAC,IAAI,gBAAgB,CAAC,YAAY,CAAC,CAAC,CAAC;IAElE,MAAM,WAAW,GAAG,IAAI,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAErD,MAAM,YAAY,GAAG,IAAI,WAAW,CAAC;QACjC,GAAG,CAAC,WAAW,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC5E,WAAW,CAAC,OAAO,CAAC,gBAAgB,CAAC;KACxC,CAAC,CAAC;IAEH,MAAM,WAAW,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC;IAE9C,OAAO,gBAAgB,CAAC,gBAAgB,CAAC,WAAW,GAAG,QAAQ,CAAC,CAAC;AACrE,CAAC"}


@@ -0,0 +1,16 @@
import { LlamaGrammar } from "../../LlamaGrammar.js";
import { ChatModelFunctions } from "../../../types.js";
import { ChatWrapper } from "../../../ChatWrapper.js";
import { Llama } from "../../../bindings/Llama.js";
import { GbnfJsonSchema } from "../../../utils/gbnfJson/types.js";
export declare class FunctionCallParamsGrammar<const Functions extends ChatModelFunctions> extends LlamaGrammar {
private readonly _functions;
private readonly _chatWrapper;
private readonly _functionName;
private readonly _paramsSchema;
constructor(llama: Llama, functions: Functions, chatWrapper: ChatWrapper, functionName: string, paramsSchema: GbnfJsonSchema);
parseParams(callText: string): {
params: any;
raw: string;
};
}


@@ -0,0 +1,45 @@
import { LlamaGrammar } from "../../LlamaGrammar.js";
import { LlamaText } from "../../../utils/LlamaText.js";
import { validateObjectAgainstGbnfSchema } from "../../../utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js";
import { GbnfGrammarGenerator } from "../../../utils/gbnfJson/GbnfGrammarGenerator.js";
import { getGbnfJsonTerminalForGbnfJsonSchema } from "../../../utils/gbnfJson/utils/getGbnfJsonTerminalForGbnfJsonSchema.js";
import { LlamaFunctionCallValidationError } from "./LlamaFunctionCallValidationError.js";
export class FunctionCallParamsGrammar extends LlamaGrammar {
_functions;
_chatWrapper;
_functionName;
_paramsSchema;
constructor(llama, functions, chatWrapper, functionName, paramsSchema) {
const grammar = getGbnfGrammarForFunctionParams(paramsSchema);
super(llama, {
grammar,
stopGenerationTriggers: [LlamaText("\n".repeat(4))],
trimWhitespaceSuffix: true
});
this._functions = functions;
this._chatWrapper = chatWrapper;
this._functionName = functionName;
this._paramsSchema = paramsSchema;
}
parseParams(callText) {
const endIndex = callText.lastIndexOf("\n".repeat(4));
if (endIndex < 0)
throw new LlamaFunctionCallValidationError(`Expected function call params for function "${this._functionName}" to end with stop generation trigger`, this._functions, this._chatWrapper, callText);
const paramsString = callText.slice(0, endIndex);
if (paramsString.trim().length === 0)
throw new LlamaFunctionCallValidationError(`Expected function call params for function "${this._functionName}" to not be empty`, this._functions, this._chatWrapper, callText);
const params = JSON.parse(paramsString);
validateObjectAgainstGbnfSchema(params, this._paramsSchema);
return {
params: params, // prevent infinite TS type instantiation
raw: paramsString
};
}
}
function getGbnfGrammarForFunctionParams(paramsSchema) {
const grammarGenerator = new GbnfGrammarGenerator();
const rootTerminal = getGbnfJsonTerminalForGbnfJsonSchema(paramsSchema, grammarGenerator);
const rootGrammar = rootTerminal.resolve(grammarGenerator, true);
return grammarGenerator.generateGbnfFile(rootGrammar + ` "${"\\n".repeat(4)}"`);
}
//# sourceMappingURL=FunctionCallParamsGrammar.js.map


@@ -0,0 +1 @@
{"version":3,"file":"FunctionCallParamsGrammar.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaChat/utils/FunctionCallParamsGrammar.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,YAAY,EAAC,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAC,SAAS,EAAC,MAAM,6BAA6B,CAAC;AACtD,OAAO,EAAC,+BAA+B,EAAC,MAAM,kEAAkE,CAAC;AAEjH,OAAO,EAAC,oBAAoB,EAAC,MAAM,iDAAiD,CAAC;AACrF,OAAO,EAAC,oCAAoC,EAAC,MAAM,uEAAuE,CAAC;AAI3H,OAAO,EAAC,gCAAgC,EAAC,MAAM,uCAAuC,CAAC;AAGvF,MAAM,OAAO,yBAAsE,SAAQ,YAAY;IAClF,UAAU,CAAY;IACtB,YAAY,CAAc;IAC1B,aAAa,CAAS;IACtB,aAAa,CAAiB;IAE/C,YAAmB,KAAY,EAAE,SAAoB,EAAE,WAAwB,EAAE,YAAoB,EAAE,YAA4B;QAC/H,MAAM,OAAO,GAAG,+BAA+B,CAAC,YAAY,CAAC,CAAC;QAE9D,KAAK,CAAC,KAAK,EAAE;YACT,OAAO;YACP,sBAAsB,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;YACnD,oBAAoB,EAAE,IAAI;SAC7B,CAAC,CAAC;QAEH,IAAI,CAAC,UAAU,GAAG,SAAS,CAAC;QAC5B,IAAI,CAAC,YAAY,GAAG,WAAW,CAAC;QAChC,IAAI,CAAC,aAAa,GAAG,YAAY,CAAC;QAClC,IAAI,CAAC,aAAa,GAAG,YAAY,CAAC;IACtC,CAAC;IAEM,WAAW,CAAC,QAAgB;QAC/B,MAAM,QAAQ,GAAG,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QAEtD,IAAI,QAAQ,GAAG,CAAC;YACZ,MAAM,IAAI,gCAAgC,CACtC,+CAA+C,IAAI,CAAC,aAAa,uCAAuC,EACxG,IAAI,CAAC,UAAU,EACf,IAAI,CAAC,YAAY,EACjB,QAAQ,CACX,CAAC;QAEN,MAAM,YAAY,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;QAEjD,IAAI,YAAY,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAChC,MAAM,IAAI,gCAAgC,CACtC,+CAA+C,IAAI,CAAC,aAAa,mBAAmB,EACpF,IAAI,CAAC,UAAU,EACf,IAAI,CAAC,YAAY,EACjB,QAAQ,CACX,CAAC;QAEN,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QAExC,+BAA+B,CAAC,MAAM,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC;QAE5D,OAAO;YACH,MAAM,EAAE,MAAa,EAAE,yCAAyC;YAChE,GAAG,EAAE,YAAY;SACpB,CAAC;IACN,CAAC;CACJ;AAED,SAAS,+BAA+B,CAAC,YAA4B;IACjE,MAAM,gBAAgB,GAAG,IAAI,oBAAoB,EAAE,CAAC;IACpD,MAAM,YAAY,GAAG,oCAAoC,CAAC,YAAY,EAAE,gBAAgB,CAAC,CAAC;IAC1F,MAAM,WAAW,GAAG,YAAY,CAAC,OAAO,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC;IAEjE,OAAO,gBAAgB,CAAC,gBAAgB,CAAC,WAAW,GAAG,KAAK,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;AACpF,CAAC"}


@@ -0,0 +1,8 @@
import { ChatModelFunctions } from "../../../types.js";
import { ChatWrapper } from "../../../ChatWrapper.js";
export declare class LlamaFunctionCallValidationError<const Functions extends ChatModelFunctions> extends Error {
readonly functions: Functions;
readonly chatWrapper: ChatWrapper;
readonly callText: string;
constructor(message: string, functions: Functions, chatWrapper: ChatWrapper, callText: string);
}


@@ -0,0 +1,12 @@
export class LlamaFunctionCallValidationError extends Error {
functions;
chatWrapper;
callText;
constructor(message, functions, chatWrapper, callText) {
super(message);
this.functions = functions;
this.chatWrapper = chatWrapper;
this.callText = callText;
}
}
//# sourceMappingURL=LlamaFunctionCallValidationError.js.map


@@ -0,0 +1 @@
{"version":3,"file":"LlamaFunctionCallValidationError.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaChat/utils/LlamaFunctionCallValidationError.ts"],"names":[],"mappings":"AAIA,MAAM,OAAO,gCAA6E,SAAQ,KAAK;IACnF,SAAS,CAAY;IACrB,WAAW,CAAc;IACzB,QAAQ,CAAS;IAEjC,YAAmB,OAAe,EAAE,SAAoB,EAAE,WAAwB,EAAE,QAAgB;QAChG,KAAK,CAAC,OAAO,CAAC,CAAC;QAEf,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;QAC3B,IAAI,CAAC,WAAW,GAAG,WAAW,CAAC;QAC/B,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC7B,CAAC;CACJ"}


@@ -0,0 +1,16 @@
import { ChatHistoryItem, Tokenizer } from "../../../../types.js";
import { ChatWrapper } from "../../../../ChatWrapper.js";
export declare function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy({ chatHistory, maxTokensCount, tokenizer, chatWrapper, lastShiftMetadata }: {
chatHistory: ChatHistoryItem[];
maxTokensCount: number;
tokenizer: Tokenizer;
chatWrapper: ChatWrapper;
lastShiftMetadata?: object | null;
}): Promise<{
chatHistory: ChatHistoryItem[];
metadata: CalculationMetadata;
}>;
type CalculationMetadata = {
removedCharactersNumber: number;
};
export {};


@@ -0,0 +1,254 @@
import { isChatModelResponseFunctionCall, isChatModelResponseSegment } from "../../../../types.js";
import { findCharacterRemovalCountToFitChatHistoryInContext } from "../../../../utils/findCharacterRemovalCountToFitChatHistoryInContext.js";
import { truncateLlamaTextAndRoundToWords, truncateTextAndRoundToWords } from "../../../../utils/truncateTextAndRoundToWords.js";
import { LlamaText } from "../../../../utils/LlamaText.js";
export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy({ chatHistory, maxTokensCount, tokenizer, chatWrapper, lastShiftMetadata }) {
let initialCharactersRemovalCount = 0;
if (isCalculationMetadata(lastShiftMetadata))
initialCharactersRemovalCount = lastShiftMetadata.removedCharactersNumber;
const { removedCharactersCount, compressedChatHistory } = await findCharacterRemovalCountToFitChatHistoryInContext({
chatHistory,
tokensCountToFit: maxTokensCount,
initialCharactersRemovalCount,
tokenizer,
chatWrapper,
failedCompressionErrorMessage: "Failed to compress chat history for context shift due to a too long prompt or system message that cannot be compressed without affecting the generation quality. " +
"Consider increasing the context size or shortening the long prompt or system message.",
compressChatHistory({ chatHistory, charactersToRemove, estimatedCharactersPerToken }) {
const res = chatHistory.map((item) => structuredClone(item));
let charactersLeftToRemove = charactersToRemove;
function compressFunctionCalls() {
for (let i = res.length - 1; i >= 0 && charactersLeftToRemove > 0; i--) {
const historyItem = res[i];
if (historyItem.type !== "model")
continue;
for (let t = historyItem.response.length - 1; t >= 0 && charactersLeftToRemove > 0; t--) {
const item = historyItem.response[t];
if (typeof item === "string" || item.type !== "functionCall")
continue;
if (item.rawCall == null)
continue;
const originalRawCallTokensLength = LlamaText.fromJSON(item.rawCall).tokenize(tokenizer, "trimLeadingSpace").length;
const newRawCallText = chatWrapper.generateFunctionCall(item.name, item.params);
const newRawCallTextTokensLength = newRawCallText.tokenize(tokenizer, "trimLeadingSpace").length;
if (newRawCallTextTokensLength < originalRawCallTokensLength) {
item.rawCall = newRawCallText.toJSON();
charactersLeftToRemove -= ((originalRawCallTokensLength - newRawCallTextTokensLength) * estimatedCharactersPerToken);
}
}
}
}
function removeHistoryThatLedToModelResponseAtIndex(index) {
let removedItems = 0;
for (let i = index - 1; i >= 0; i--) {
const historyItem = res[i];
if (historyItem == null)
continue;
if (historyItem.type === "model")
break; // stop removing history items if we reach another model response
if (i === 0 && historyItem.type === "system")
break; // keep the first system message
if (historyItem.type === "user" || historyItem.type === "system") {
const newText = truncateLlamaTextAndRoundToWords(LlamaText.fromJSON(historyItem.text), charactersLeftToRemove, undefined, false);
const newTextString = newText.toString();
const historyItemString = LlamaText.fromJSON(historyItem.text).toString();
if (newText.values.length === 0) {
res.splice(i, 1);
i++;
removedItems++;
charactersLeftToRemove -= historyItemString.length;
}
else if (newTextString.length < historyItemString.length) {
charactersLeftToRemove -= historyItemString.length - newTextString.length;
if (historyItem.type === "user")
historyItem.text = newText.toString();
else
historyItem.text = newText.toJSON();
}
}
else {
void historyItem;
}
}
return removedItems;
}
function compressHistoryThatLedToModelResponseAtIndex(index, keepTokensCount = 0) {
let removedItems = 0;
let promptStartIndex = undefined;
for (let i = index - 1; i >= 0; i--) {
const historyItem = res[i];
if (historyItem == null)
continue;
if (historyItem.type === "model") {
promptStartIndex = i + 1;
break;
}
if (i === 0 && historyItem.type === "system") {
promptStartIndex = i + 1;
break; // keep the first system message
}
}
if (promptStartIndex == null || promptStartIndex >= index)
return 0;
for (let i = promptStartIndex; i < index && charactersLeftToRemove > 0; i++) {
const historyItem = res[i];
if (historyItem == null || historyItem.type !== "user")
continue;
let removeChars = Math.min(charactersLeftToRemove, historyItem.text.length);
if (keepTokensCount > 0) {
removeChars -= Math.floor(keepTokensCount * estimatedCharactersPerToken);
if (removeChars < 0)
removeChars = 0;
keepTokensCount -= Math.min(keepTokensCount, Math.max(0, historyItem.text.length - removeChars) / estimatedCharactersPerToken);
}
const newText = truncateTextAndRoundToWords(historyItem.text, removeChars, undefined, false);
if (newText.length === 0) {
res.splice(i, 1);
i--;
index--;
removedItems++;
charactersLeftToRemove -= historyItem.text.length;
}
else {
charactersLeftToRemove -= historyItem.text.length - newText.length;
historyItem.text = newText;
}
}
return removedItems;
}
function removeEmptySegmentsFromModelResponse(modelResponse) {
const stack = [];
for (let t = 0; t < modelResponse.length && charactersLeftToRemove > 0; t++) {
const item = modelResponse[t];
const isLastItem = t === modelResponse.length - 1;
if (!isChatModelResponseSegment(item))
continue;
const type = item.segmentType;
const topStack = stack.at(-1);
if (topStack?.type === type) {
if (item.ended && item.text === "" && topStack.canRemove) {
modelResponse.splice(t, 1);
t--;
modelResponse.splice(topStack.startIndex, 1);
t--;
stack.pop();
}
else if (!item.ended && item.text === "" && !isLastItem) {
modelResponse.splice(t, 1);
t--;
}
else if (!item.ended && item.text !== "")
topStack.canRemove = false;
else if (item.ended)
stack.pop();
}
else if (!item.ended)
stack.push({
type,
startIndex: t,
canRemove: item.text === ""
});
}
}
function compressFirstModelResponse() {
for (let i = 0; i < res.length && charactersLeftToRemove > 0; i++) {
const historyItem = res[i];
const isLastHistoryItem = i === res.length - 1;
if (historyItem.type !== "model")
continue;
for (let t = 0; t < historyItem.response.length && charactersLeftToRemove > 0; t++) {
const item = historyItem.response[t];
const isLastText = t === historyItem.response.length - 1;
if (isLastHistoryItem && isLastText)
continue;
if (typeof item === "string") {
const newText = truncateTextAndRoundToWords(item, charactersLeftToRemove, undefined, true);
if (newText === "") {
historyItem.response.splice(t, 1);
t--;
charactersLeftToRemove -= item.length;
}
else if (newText.length < item.length) {
historyItem.response[t] = newText;
charactersLeftToRemove -= item.length - newText.length;
}
}
else if (isChatModelResponseFunctionCall(item)) {
historyItem.response.splice(t, 1);
t--;
const functionCallAndResultTokenUsage = chatWrapper.generateFunctionCallsAndResults([item], true)
.tokenize(tokenizer, "trimLeadingSpace").length;
charactersLeftToRemove -= functionCallAndResultTokenUsage * estimatedCharactersPerToken;
}
else if (isChatModelResponseSegment(item)) {
if (item.text !== "") {
const newText = truncateTextAndRoundToWords(item.text, charactersLeftToRemove, undefined, true);
if (newText === "" && item.ended) {
const emptySegmentTokenUsage = chatWrapper.generateModelResponseText([{ ...item, text: "" }], true)
.tokenize(tokenizer, "trimLeadingSpace").length;
historyItem.response.splice(t, 1);
t--;
charactersLeftToRemove -= item.text.length + emptySegmentTokenUsage * estimatedCharactersPerToken;
}
else {
charactersLeftToRemove -= item.text.length - newText.length;
item.text = newText;
}
}
}
else
void item;
}
removeEmptySegmentsFromModelResponse(historyItem.response);
if (historyItem.response.length === 0) {
// if the model response is removed from the history,
// the things that led to it are not important anymore
i -= removeHistoryThatLedToModelResponseAtIndex(i);
res.splice(i, 1);
i--;
}
}
}
function compressLastModelResponse(minCharactersToKeep = 60) {
const lastHistoryItem = res[res.length - 1];
if (lastHistoryItem == null || lastHistoryItem.type !== "model")
return;
const lastResponseItem = lastHistoryItem.response[lastHistoryItem.response.length - 1];
if (lastResponseItem == null || typeof lastResponseItem !== "string")
return;
compressHistoryThatLedToModelResponseAtIndex(res.length - 1, maxTokensCount / 4);
if (charactersLeftToRemove <= 0)
return;
const nextTextLength = Math.max(Math.min(lastResponseItem.length, minCharactersToKeep), lastResponseItem.length - charactersLeftToRemove);
const charactersToRemoveFromText = lastResponseItem.length - nextTextLength;
const newText = truncateTextAndRoundToWords(lastResponseItem, charactersToRemoveFromText, undefined, true);
if (newText.length < lastResponseItem.length) {
lastHistoryItem.response[lastHistoryItem.response.length - 1] = newText;
charactersLeftToRemove -= lastResponseItem.length - newText.length;
}
if (charactersLeftToRemove <= 0)
return;
compressHistoryThatLedToModelResponseAtIndex(res.length - 1);
}
compressFunctionCalls();
if (charactersLeftToRemove <= 0)
return res;
compressFirstModelResponse();
if (charactersLeftToRemove <= 0)
return res;
compressLastModelResponse();
return res;
}
});
const newMetadata = {
removedCharactersNumber: removedCharactersCount
};
return {
chatHistory: compressedChatHistory,
metadata: newMetadata
};
}
function isCalculationMetadata(metadata) {
return metadata != null && typeof metadata === "object" && typeof metadata.removedCharactersNumber === "number";
}
//# sourceMappingURL=eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js.map


@@ -0,0 +1,433 @@
import { EventRelay } from "lifecycle-utils";
import { ChatWrapper } from "../../ChatWrapper.js";
import { ChatHistoryItem, ChatModelFunctionCall, ChatSessionModelFunctions, Token } from "../../types.js";
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
import { LlamaGrammar } from "../LlamaGrammar.js";
import { LLamaChatContextShiftOptions, LlamaChatResponseChunk, LlamaChatResponseFunctionCallParamsChunk } from "../LlamaChat/LlamaChat.js";
import { EvaluationPriority } from "../LlamaContext/types.js";
import { TokenBias } from "../TokenBias.js";
import { LlamaText } from "../../utils/LlamaText.js";
import { LLamaChatPromptCompletionEngineOptions, LlamaChatSessionPromptCompletionEngine } from "./utils/LlamaChatSessionPromptCompletionEngine.js";
export type LlamaChatSessionOptions = {
contextSequence: LlamaContextSequence;
/** `"auto"` is used by default */
chatWrapper?: "auto" | ChatWrapper;
systemPrompt?: string;
/**
* Add the system prompt even on models that don't support a system prompt.
*
* Each chat wrapper has its own workaround for adding a system prompt to a model that doesn't support it,
* but forcing the system prompt on unsupported models may not always work as expected.
*
* Use with caution.
*/
forceAddSystemPrompt?: boolean;
/**
* Automatically dispose the sequence when the session is disposed.
*
* Defaults to `false`.
*/
autoDisposeSequence?: boolean;
contextShift?: LlamaChatSessionContextShiftOptions;
};
export type LlamaChatSessionContextShiftOptions = {
/**
* The number of tokens to delete from the context window to make space for new ones.
* Defaults to 10% of the context size.
*/
size?: LLamaChatContextShiftOptions["size"];
/**
* The strategy to use when deleting tokens from the context window.
*
* Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
*/
strategy?: LLamaChatContextShiftOptions["strategy"];
};
export type LLamaChatPromptOptions<Functions extends ChatSessionModelFunctions | undefined = ChatSessionModelFunctions | undefined> = {
/**
* Called as the model generates the main response with the generated text chunk.
*
* Useful for streaming the generated response as it's being generated.
*
* Includes only the main response without any text segments (like thoughts).
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
*/
onTextChunk?: (text: string) => void;
/**
* Called as the model generates the main response with the generated tokens.
*
* Preferably, you'd want to use {@link onTextChunk `onTextChunk`} instead of this.
*
* Includes only the main response without any segments (like thoughts).
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
*/
onToken?: (tokens: Token[]) => void;
/**
* Called as the model generates a response with the generated text and tokens,
* including segment information (when the generated output is part of a segment).
*
* Useful for streaming the generated response as it's being generated, including the main response and all segments.
*
* Only use this function when you need the segmented texts, like thought segments (chain of thought text).
*/
onResponseChunk?: (chunk: LlamaChatResponseChunk) => void;
/**
* An AbortSignal to later abort the generation.
*
* When the signal is aborted, the generation will stop and throw `signal.reason` as the error.
*
* > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`.
*/
signal?: AbortSignal;
/**
* When a response already started being generated and then the signal is aborted,
* the generation will stop and the response will be returned as is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: boolean;
/** Maximum number of tokens to generate */
maxTokens?: number;
/**
* Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
*
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
*
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
*
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
* Disabled by default (set to `0`).
*/
temperature?: number;
/**
* From the next token candidates, discard the percentage of tokens with the lowest probability.
* For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
* This is useful for generating more high-quality results when using a high temperature.
* Set to a value between `0` and `1` to enable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* Disabled by default.
*/
minP?: number;
/**
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
*/
topK?: number;
/**
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
*/
topP?: number;
/**
* Used to control the randomness of the generated text.
*
* Change the seed to get different results.
*
* Only relevant when using `temperature`.
*/
seed?: number;
/**
* Trim whitespace from the end of the generated text
* Disabled by default.
*/
trimWhitespaceSuffix?: boolean;
/**
* Force a given text prefix to be the start of the model response, to make the model follow a certain direction.
*
* May cause some models to not use the given functions in some scenarios where they would have been used otherwise,
* so avoid using it together with function calling if you notice unexpected behavior.
*/
responsePrefix?: string;
/**
* See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
*/
evaluationPriority?: EvaluationPriority;
repeatPenalty?: false | LlamaChatSessionRepeatPenalty;
/**
* Adjust the probability of tokens being generated.
* Can be used to bias the model to generate tokens that you want it to lean towards,
* or to avoid generating tokens that you want it to avoid.
*/
tokenBias?: TokenBias | (() => TokenBias);
/**
* Custom stop triggers to stop the generation of the response when any of the provided triggers are found.
*/
customStopTriggers?: (LlamaText | string | (string | Token)[])[];
/**
* Called as the model generates function calls with the generated parameters chunk for each function call.
*
* Useful for streaming the generated function call parameters as they're being generated.
* Only useful in specific use cases,
* such as showing the generated textual file content as it's being generated (note that doing this requires parsing incomplete JSON).
*
* The constructed text from all the params chunks of a given function call can be parsed as a JSON object,
* according to the function parameters schema.
*
* Each function call has its own `callIndex` you can use to distinguish between them.
*
* Only relevant when using function calling (via passing the `functions` option).
*/
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
/**
* Set the maximum number of tokens that the model is allowed to spend on various segmented responses.
*/
budgets?: {
/**
* Budget for thought tokens.
*
* Defaults to `Infinity`.
*/
thoughtTokens?: number;
/**
* Budget for comment tokens.
*
* Defaults to `Infinity`.
*/
commentTokens?: number;
};
} & ({
grammar?: LlamaGrammar;
functions?: never;
documentFunctionParams?: never;
maxParallelFunctionCalls?: never;
onFunctionCallParamsChunk?: never;
} | {
grammar?: never;
functions?: Functions | ChatSessionModelFunctions;
documentFunctionParams?: boolean;
maxParallelFunctionCalls?: number;
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
});
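These per-prompt options are what `LlamaChatSession.prompt()` (declared further down) accepts. A hedged sketch of a session prompt that streams the main response and caps thought tokens, assuming a `contextSequence` obtained the same way as in the `LlamaChat` example earlier:

```ts
import {LlamaChatSession} from "node-llama-cpp";

const session = new LlamaChatSession({
    contextSequence, // a LlamaContextSequence from context.getSequence()
    systemPrompt: "You are a helpful assistant."
});

const answer = await session.prompt("Summarize the plot of Hamlet in two sentences.", {
    temperature: 0.7,
    maxTokens: 300,
    budgets: {
        thoughtTokens: 128 // cap chain-of-thought tokens on models that emit thought segments
    },
    onTextChunk(text) {
        process.stdout.write(text); // main response only, without thought segments
    }
});

console.log("\nanswer:", answer);
```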
export type LLamaChatCompletePromptOptions = {
/**
* Generate a completion for the given user prompt up to the given number of tokens.
*
* Defaults to `256` or half the context size, whichever is smaller.
*/
maxTokens?: LLamaChatPromptOptions["maxTokens"];
/**
* When a completion already started being generated and then the given `signal` is aborted,
* the generation will stop and the completion will be returned as-is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: LLamaChatPromptOptions["stopOnAbortSignal"];
/**
* Called as the model generates a completion with the generated text chunk.
*
* Useful for streaming the generated completion as it's being generated.
*/
onTextChunk?: LLamaChatPromptOptions["onTextChunk"];
/**
* Called as the model generates a completion with the generated tokens.
*
* Preferably, you'd want to use `onTextChunk` instead of this.
*/
onToken?: LLamaChatPromptOptions["onToken"];
signal?: LLamaChatPromptOptions["signal"];
temperature?: LLamaChatPromptOptions["temperature"];
minP?: LLamaChatPromptOptions["minP"];
topK?: LLamaChatPromptOptions["topK"];
topP?: LLamaChatPromptOptions["topP"];
seed?: LLamaChatPromptOptions["seed"];
trimWhitespaceSuffix?: LLamaChatPromptOptions["trimWhitespaceSuffix"];
evaluationPriority?: LLamaChatPromptOptions["evaluationPriority"];
repeatPenalty?: LLamaChatPromptOptions["repeatPenalty"];
tokenBias?: LLamaChatPromptOptions["tokenBias"];
customStopTriggers?: LLamaChatPromptOptions["customStopTriggers"];
grammar?: LlamaGrammar;
/**
* Functions are not used by the model here,
* but are used for keeping the instructions given to the model about the functions in the current context state,
* to avoid context shifts.
*
* It's best to provide the same functions that were used for the previous prompt here.
*/
functions?: ChatSessionModelFunctions;
/**
* Functions are not used by the model here,
* but are used for keeping the instructions given to the model about the functions in the current context state,
* to avoid context shifts.
*
* It's best to provide the same value that was used for the previous prompt here.
*/
documentFunctionParams?: boolean;
/**
* Whether to complete the prompt as a model response.
*
* - **`"auto"`**: Automatically determine whether to complete as a model response based on the model used.
* This is a good option to work around some models that don't support user prompt completions.
* - **`true`**: Always complete as a model response
* - **`false`**: Never complete as a model response
*
* Defaults to `"auto"`.
*/
completeAsModel?: "auto" | boolean | {
/**
* Whether to complete the prompt as a model response.
*
* - **`"auto"`**: Automatically determine whether to complete as a model response based on the model used.
* This is a good option to work around some models that don't support user prompt completions.
* - **`true`**: Always complete as a model response
* - **`false`**: Never complete as a model response
*
* Defaults to `"auto"`.
*/
enabled?: "auto" | boolean;
/**
* The messages to append to the chat history to generate a completion as a model response.
*
* If the last message is a model message, the prompt will be pushed to it for the completion,
* otherwise a new model message will be added with the prompt.
*
* It must contain a user message or a system message before the model message.
*
* Defaults to:
* ```ts
* [
* {
* type: "system",
* text: "For your next response predict what the user may send next. " +
* "No yapping, no whitespace. Match the user's language and tone."
* },
* {type: "user", text: ""},
* {type: "model", response: [""]}
* ]
* ```
*/
appendedMessages?: ChatHistoryItem[];
};
};
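These options pair with `LlamaChatSession.completePrompt()` (declared below), which predicts how the user might continue their current message rather than responding to it. A hedged sketch, assuming an existing `session: LlamaChatSession`:

```ts
// Suggest a continuation of what the user has typed so far
const completion = await session.completePrompt("How do I cook a ", {
    maxTokens: 24,
    temperature: 0.6,
    completeAsModel: "auto" // fall back to completing as a model response on models that need it
});

console.log("suggested continuation:", completion);
```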
export type LLamaChatPreloadPromptOptions = {
signal?: LLamaChatCompletePromptOptions["signal"];
evaluationPriority?: LLamaChatCompletePromptOptions["evaluationPriority"];
functions?: LLamaChatCompletePromptOptions["functions"];
documentFunctionParams?: LLamaChatCompletePromptOptions["documentFunctionParams"];
};
export type LlamaChatSessionRepeatPenalty = {
/**
* The number of most recent tokens generated by the model to which repetition penalties are applied.
* Defaults to `64`.
*/
lastTokens?: number;
punishTokensFilter?: (tokens: Token[]) => Token[];
/**
* Penalize new line tokens.
* Enabled by default.
*/
penalizeNewLine?: boolean;
/**
* The relative amount to lower the probability of the tokens in `punishTokens` by
* Defaults to `1.1`.
* Set to `1` to disable.
*/
penalty?: number;
/**
* If a token appears n times in the `punishTokens` array, lower its probability by `n * frequencyPenalty`.
* Disabled by default (`0`).
* Set to a value between `0` and `1` to enable.
*/
frequencyPenalty?: number;
/**
* Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`
* Disabled by default (`0`).
* Set to a value between `0` and `1` to enable.
*/
presencePenalty?: number;
};
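A hedged sketch of plugging these repeat-penalty knobs into a session prompt; the values are illustrative and `session` is assumed to be an existing `LlamaChatSession`:

```ts
const poem = await session.prompt("Write a short poem about the sea.", {
    repeatPenalty: {
        lastTokens: 64,        // window of recent tokens to penalize
        penalty: 1.15,         // relative probability reduction for repeated tokens
        frequencyPenalty: 0.1, // scales with how many times a token already appeared
        presencePenalty: 0.1,  // flat reduction for any token that already appeared
        penalizeNewLine: false // keep newline tokens unpenalized (useful for verse)
    }
});

console.log(poem);
```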
/**
* @see [Using `LlamaChatSession`](https://node-llama-cpp.withcat.ai/guide/chat-session) tutorial
*/
export declare class LlamaChatSession {
readonly onDispose: EventRelay<void>;
constructor(options: LlamaChatSessionOptions);
dispose({ disposeSequence }?: {
disposeSequence?: boolean;
}): void;
/** @hidden */
[Symbol.dispose](): void;
get disposed(): boolean;
get chatWrapper(): ChatWrapper;
get sequence(): LlamaContextSequence;
get context(): import("../LlamaContext/LlamaContext.js").LlamaContext;
get model(): import("../LlamaModel/LlamaModel.js").LlamaModel;
prompt<const Functions extends ChatSessionModelFunctions | undefined = undefined>(prompt: string, options?: LLamaChatPromptOptions<Functions>): Promise<string>;
/**
* @param prompt
* @param [options]
*/
promptWithMeta<const Functions extends ChatSessionModelFunctions | undefined = undefined>(prompt: string, { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority }?: LLamaChatPromptOptions<Functions>): Promise<{
response: (string | ChatModelFunctionCall | import("../../types.js").ChatModelSegment)[];
responseText: string;
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
remainingGenerationAfterStop: string | Token[] | undefined;
} | {
response: (string | ChatModelFunctionCall | import("../../types.js").ChatModelSegment)[];
responseText: string;
stopReason: "abort" | "maxTokens" | "eogToken" | "stopGenerationTrigger" | "functionCalls";
remainingGenerationAfterStop: string | Token[] | undefined;
customStopTrigger?: undefined;
}>;
/**
* Preload a user prompt into the current context sequence state to make later inference of the model response begin sooner
* and feel faster.
*
* > **Note:** Preloading a long user prompt can incur context shifts, so consider limiting the length of prompts you preload
* @param prompt - the prompt to preload
* @param [options]
*/
preloadPrompt(prompt: string, options?: LLamaChatPreloadPromptOptions): Promise<void>;
/**
* Preload a user prompt into the current context sequence state and generate a completion for it.
*
* > **Note:** Preloading a long user prompt and completing a user prompt with a high number of `maxTokens` can incur context shifts,
* > so consider limiting the length of prompts you preload.
* >
* > Also, it's recommended to limit the number of tokens generated to a reasonable amount by configuring `maxTokens`.
* @param prompt - the prompt to preload
* @param [options]
*/
completePrompt(prompt: string, options?: LLamaChatCompletePromptOptions): Promise<string>;
/**
* Create a smart completion engine that caches the prompt completions
* and reuses them when the user prompt matches the beginning of the cached prompt or completion.
*
* All completions are made and the cache is used only for the current chat session state.
* You can create a single completion engine for an entire chat session.
*/
createPromptCompletionEngine(options?: LLamaChatPromptCompletionEngineOptions): LlamaChatSessionPromptCompletionEngine;
/**
* See `completePrompt` for more information.
* @param prompt
* @param [options]
*/
completePromptWithMeta(prompt: string, { maxTokens, stopOnAbortSignal, functions, documentFunctionParams, onTextChunk, onToken, signal, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority, completeAsModel }?: LLamaChatCompletePromptOptions): Promise<{
completion: string;
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
remainingGenerationAfterStop: string | Token[] | undefined;
} | {
completion: string;
stopReason: "abort" | "maxTokens" | "eogToken" | "stopGenerationTrigger" | "functionCalls";
remainingGenerationAfterStop: string | Token[] | undefined;
customStopTrigger?: undefined;
}>;
getChatHistory(): ChatHistoryItem[];
getLastEvaluationContextWindow(): ChatHistoryItem[] | null;
setChatHistory(chatHistory: ChatHistoryItem[]): void;
/** Clear the chat history and reset it to the initial state. */
resetChatHistory(): void;
}
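
A minimal usage sketch of the session API declared above, assuming the package is consumed as `node-llama-cpp` and that a local GGUF model is available (the model path below is a placeholder):

import {getLlama, LlamaChatSession} from "node-llama-cpp";

// load a model and open a chat session on a fresh context sequence
const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // placeholder path
const context = await model.createContext();
const session = new LlamaChatSession({contextSequence: context.getSequence()});

// `prompt` resolves to the response text
const answer = await session.prompt("Summarize what a context shift is in one sentence.");
console.log(answer);

// `promptWithMeta` also exposes the structured response and the stop reason
const {responseText, stopReason} = await session.promptWithMeta("And in two sentences?", {maxTokens: 128});
console.log(stopReason, responseText);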

View File

@@ -0,0 +1,622 @@
import { DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { appendUserMessageToChatHistory } from "../../utils/appendUserMessageToChatHistory.js";
import { LlamaChat } from "../LlamaChat/LlamaChat.js";
import { wrapAbortSignal } from "../../utils/wrapAbortSignal.js";
import { safeEventCallback } from "../../utils/safeEventCallback.js";
import { GgufArchitectureType } from "../../gguf/types/GgufMetadataTypes.js";
import { LlamaChatSessionPromptCompletionEngine } from "./utils/LlamaChatSessionPromptCompletionEngine.js";
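// Default `completeAsModel` behavior: append a hidden exchange that asks the model to predict
// what the user may send next, and then complete that prediction as the model's response.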
const defaultCompleteAsModel = {
enabled: "auto",
appendedMessages: [
{
type: "system",
text: "For your next response predict what the user may send next. No yapping, no whitespace. Match the user's language and tone."
},
{ type: "user", text: "" },
{ type: "model", response: [""] }
]
};
/**
* @see [Using `LlamaChatSession`](https://node-llama-cpp.withcat.ai/guide/chat-session) tutorial
*/
export class LlamaChatSession {
/** @internal */ _disposeAggregator = new DisposeAggregator();
/** @internal */ _autoDisposeSequence;
/** @internal */ _contextShift;
/** @internal */ _forceAddSystemPrompt;
/** @internal */ _systemPrompt;
/** @internal */ _chatLock = {};
/** @internal */ _chatHistory;
/** @internal */ _lastEvaluation;
/** @internal */ _canUseContextWindowForCompletion = true;
/** @internal */ _chat;
/** @internal */ _chatHistoryStateRef = {};
/** @internal */ _preloadAndCompleteAbortControllers = new Set();
onDispose = new EventRelay();
constructor(options) {
const { contextSequence, chatWrapper = "auto", systemPrompt, forceAddSystemPrompt = false, autoDisposeSequence = false, contextShift } = options;
if (contextSequence == null)
throw new Error("contextSequence cannot be null");
if (contextSequence.disposed)
throw new DisposedError();
this._contextShift = contextShift;
this._forceAddSystemPrompt = forceAddSystemPrompt;
this._systemPrompt = systemPrompt;
this._chat = new LlamaChat({
autoDisposeSequence,
chatWrapper,
contextSequence
});
const chatWrapperSupportsSystemMessages = this._chat.chatWrapper.settings.supportsSystemMessages;
if (chatWrapperSupportsSystemMessages == null || chatWrapperSupportsSystemMessages || this._forceAddSystemPrompt)
this._chatHistory = this._chat.chatWrapper.generateInitialChatHistory({ systemPrompt: this._systemPrompt });
else
this._chatHistory = [];
this._autoDisposeSequence = autoDisposeSequence;
this._disposeAggregator.add(this._chat.onDispose.createListener(() => {
this.dispose();
}));
this._disposeAggregator.add(this.onDispose.dispatchEvent);
}
dispose({ disposeSequence = this._autoDisposeSequence } = {}) {
if (this._chat == null)
return;
this._chat.dispose({ disposeSequence });
this._chat = null;
this._disposeAggregator.dispose();
}
/** @hidden */
[Symbol.dispose]() {
return this.dispose();
}
get disposed() {
return this._chat == null || this._chat.disposed;
}
get chatWrapper() {
if (this._chat == null)
throw new DisposedError();
return this._chat.chatWrapper;
}
get sequence() {
if (this._chat == null)
throw new DisposedError();
return this._chat.sequence;
}
get context() {
return this.sequence.context;
}
get model() {
return this.sequence.model;
}
async prompt(prompt, options = {}) {
const { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = false, responsePrefix, repeatPenalty, tokenBias, customStopTriggers } = options;
const { responseText } = await this.promptWithMeta(prompt, {
// this is a workaround to allow passing both `functions` and `grammar`
functions: functions,
grammar: grammar,
documentFunctionParams: documentFunctionParams,
maxParallelFunctionCalls: maxParallelFunctionCalls,
onFunctionCallParamsChunk: onFunctionCallParamsChunk,
onTextChunk, onToken, onResponseChunk, budgets, signal, stopOnAbortSignal, maxTokens,
temperature, minP, topK, topP, seed,
trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers
});
return responseText;
}
/**
* @param prompt
* @param [options]
*/
async promptWithMeta(prompt, { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = false, responsePrefix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority } = {}) {
this._ensureNotDisposed();
if (grammar != null && grammar._llama !== this.model._llama)
throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
this._stopAllPreloadAndPromptCompletions();
return await withLock([this._chatLock, "evaluation"], signal, async () => {
this._ensureNotDisposed();
this._stopAllPreloadAndPromptCompletions();
if (this._chat == null)
throw new DisposedError();
const supportsParallelFunctionCalling = this._chat.chatWrapper.settings.functions.parallelism != null;
const [abortController, disposeAbortController] = wrapAbortSignal(signal);
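            // reuse the last evaluation's context window only while it is still in sync with the current chat history state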
let lastEvaluation = this._canUseContextWindowForCompletion
? this._lastEvaluation
: undefined;
let newChatHistory = appendUserMessageToChatHistory(this._chatHistory, prompt);
let newContextWindowChatHistory = lastEvaluation?.contextWindow == null
? undefined
: appendUserMessageToChatHistory(lastEvaluation?.contextWindow, prompt);
let previousFunctionCalls = 0;
const resolvedResponsePrefix = (responsePrefix != null && responsePrefix !== "")
? responsePrefix
: undefined;
newChatHistory.push({
type: "model",
response: resolvedResponsePrefix != null
? [resolvedResponsePrefix]
: []
});
if (newContextWindowChatHistory != null)
newContextWindowChatHistory.push({
type: "model",
response: resolvedResponsePrefix != null
? [resolvedResponsePrefix]
: []
});
if (resolvedResponsePrefix != null) {
safeEventCallback(onToken)?.(this.model.tokenize(resolvedResponsePrefix));
safeEventCallback(onTextChunk)?.(resolvedResponsePrefix);
safeEventCallback(onResponseChunk)?.({
type: undefined,
segmentType: undefined,
text: resolvedResponsePrefix,
tokens: this.model.tokenize(resolvedResponsePrefix)
});
}
try {
while (true) {
const functionCallsAndResults = [];
let canThrowFunctionCallingErrors = false;
let abortedOnFunctionCallError = false;
const initialOutputTokens = this._chat.sequence.tokenMeter.usedOutputTokens;
const { lastEvaluation: currentLastEvaluation, metadata } = await this._chat.generateResponse(newChatHistory, {
functions,
documentFunctionParams,
maxParallelFunctionCalls,
grammar: grammar, // this is a workaround to allow passing both `functions` and `grammar`
onTextChunk: safeEventCallback(onTextChunk),
onToken: safeEventCallback(onToken),
onResponseChunk: safeEventCallback(onResponseChunk),
onFunctionCallParamsChunk: onFunctionCallParamsChunk == null
? undefined
: safeEventCallback((chunk) => onFunctionCallParamsChunk?.({
callIndex: previousFunctionCalls + chunk.callIndex,
functionName: chunk.functionName,
paramsChunk: chunk.paramsChunk,
done: chunk.done
})),
budgets: {
includeCurrentResponse: true,
thoughtTokens: budgets?.thoughtTokens,
commentTokens: budgets?.commentTokens
},
signal: abortController.signal,
stopOnAbortSignal,
repeatPenalty,
minP,
topK,
topP,
seed,
tokenBias,
customStopTriggers,
maxTokens,
temperature,
trimWhitespaceSuffix,
contextShift: {
...this._contextShift,
lastEvaluationMetadata: lastEvaluation?.contextShiftMetadata
},
evaluationPriority,
lastEvaluationContextWindow: {
history: newContextWindowChatHistory,
minimumOverlapPercentageToPreventContextShift: 0.5
},
onFunctionCall: async (functionCall) => {
functionCallsAndResults.push((async () => {
try {
const functionDefinition = functions?.[functionCall.functionName];
if (functionDefinition == null)
throw new Error(`The model tried to call function "${functionCall.functionName}" which is not defined`);
const functionCallResult = await functionDefinition.handler(functionCall.params);
return {
functionCall,
functionDefinition,
functionCallResult
};
}
catch (err) {
if (!abortController.signal.aborted) {
abortedOnFunctionCallError = true;
abortController.abort(err);
}
if (canThrowFunctionCallingErrors)
throw err;
return null;
}
})());
}
});
this._ensureNotDisposed();
if (abortController.signal.aborted && (abortedOnFunctionCallError || !stopOnAbortSignal))
throw abortController.signal.reason;
if (maxTokens != null)
maxTokens = Math.max(0, maxTokens - (this._chat.sequence.tokenMeter.usedOutputTokens - initialOutputTokens));
lastEvaluation = currentLastEvaluation;
newChatHistory = lastEvaluation.cleanHistory;
if (functionCallsAndResults.length > 0) {
canThrowFunctionCallingErrors = true;
const functionCallResultsPromise = Promise.all(functionCallsAndResults);
const raceEventAbortController = new AbortController();
await Promise.race([
functionCallResultsPromise,
new Promise((accept, reject) => {
abortController.signal.addEventListener("abort", () => {
if (abortedOnFunctionCallError || !stopOnAbortSignal)
reject(abortController.signal.reason);
else
accept();
}, { signal: raceEventAbortController.signal });
if (abortController.signal.aborted) {
if (abortedOnFunctionCallError || !stopOnAbortSignal)
reject(abortController.signal.reason);
else
accept();
}
})
]);
raceEventAbortController.abort();
this._ensureNotDisposed();
if (!abortController.signal.aborted) {
const functionCallResults = (await functionCallResultsPromise)
.filter((result) => result != null);
this._ensureNotDisposed();
if (abortController.signal.aborted && (abortedOnFunctionCallError || !stopOnAbortSignal))
throw abortController.signal.reason;
newContextWindowChatHistory = lastEvaluation.contextWindow;
let startNewChunk = supportsParallelFunctionCalling;
for (const { functionCall, functionDefinition, functionCallResult } of functionCallResults) {
newChatHistory = addFunctionCallToChatHistory({
chatHistory: newChatHistory,
functionName: functionCall.functionName,
functionDescription: functionDefinition.description,
callParams: functionCall.params,
callResult: functionCallResult,
rawCall: functionCall.raw,
startsNewChunk: startNewChunk
});
newContextWindowChatHistory = addFunctionCallToChatHistory({
chatHistory: newContextWindowChatHistory,
functionName: functionCall.functionName,
functionDescription: functionDefinition.description,
callParams: functionCall.params,
callResult: functionCallResult,
rawCall: functionCall.raw,
startsNewChunk: startNewChunk
});
startNewChunk = false;
previousFunctionCalls++;
}
lastEvaluation.cleanHistory = newChatHistory;
lastEvaluation.contextWindow = newContextWindowChatHistory;
if (abortController.signal.aborted && !abortedOnFunctionCallError && stopOnAbortSignal) {
metadata.stopReason = "abort";
metadata.remainingGenerationAfterStop = undefined;
}
else
continue;
}
}
this._lastEvaluation = lastEvaluation;
this._canUseContextWindowForCompletion = true;
this._chatHistory = newChatHistory;
this._chatHistoryStateRef = {};
const lastModelResponseItem = getLastModelResponseItem(newChatHistory);
const responseText = lastModelResponseItem.response
.filter((item) => typeof item === "string")
.join("");
if (metadata.stopReason === "customStopTrigger")
return {
response: lastModelResponseItem.response,
responseText,
stopReason: metadata.stopReason,
customStopTrigger: metadata.customStopTrigger,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
return {
response: lastModelResponseItem.response,
responseText,
stopReason: metadata.stopReason,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
}
}
finally {
disposeAbortController();
}
});
}
/**
* Preload a user prompt into the current context sequence state to make later inference of the model response begin sooner
* and feel faster.
*
     * > **Note:** Preloading a long user prompt can incur context shifts, so consider limiting the length of prompts you preload.
* @param prompt - the prompt to preload
* @param [options]
*/
async preloadPrompt(prompt, options = {}) {
await this.completePromptWithMeta(prompt, {
...options,
completeAsModel: false,
maxTokens: 0
});
}
/**
* Preload a user prompt into the current context sequence state and generate a completion for it.
*
* > **Note:** Preloading a long user prompt and completing a user prompt with a high number of `maxTokens` can incur context shifts,
* > so consider limiting the length of prompts you preload.
* >
* > Also, it's recommended to limit the number of tokens generated to a reasonable amount by configuring `maxTokens`.
* @param prompt - the prompt to preload
* @param [options]
*/
async completePrompt(prompt, options = {}) {
const { completion } = await this.completePromptWithMeta(prompt, options);
return completion;
}
/**
* Create a smart completion engine that caches the prompt completions
* and reuses them when the user prompt matches the beginning of the cached prompt or completion.
*
     * Completions are generated for, and cached against, the current chat session state only.
* You can create a single completion engine for an entire chat session.
*/
createPromptCompletionEngine(options) {
return LlamaChatSessionPromptCompletionEngine._create(this, options);
}
/**
* See `completePrompt` for more information.
* @param prompt
* @param [options]
*/
async completePromptWithMeta(prompt, { maxTokens, stopOnAbortSignal = false, functions, documentFunctionParams, onTextChunk, onToken, signal, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = false, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority, completeAsModel } = {}) {
this._ensureNotDisposed();
if (grammar != null) {
if (grammar._llama == null)
throw new Error("The grammar passed to this function is not a LlamaGrammar instance.");
else if (grammar._llama !== this.model._llama)
throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
}
const [abortController, disposeAbortController] = wrapAbortSignal(signal);
this._preloadAndCompleteAbortControllers.add(abortController);
const completeAsModelEnabled = typeof completeAsModel == "boolean"
? completeAsModel
: completeAsModel === "auto"
? "auto"
: completeAsModel?.enabled ?? defaultCompleteAsModel.enabled;
const modelArchitecture = this.model.fileInfo.metadata?.general?.architecture;
const shouldCompleteAsModel = completeAsModelEnabled === "auto"
? modelArchitecture === GgufArchitectureType.gptOss
: completeAsModelEnabled;
try {
return await withLock([this._chatLock, "evaluation"], abortController.signal, async () => {
this._ensureNotDisposed();
if (this._chat == null)
throw new DisposedError();
if (shouldCompleteAsModel) {
const messagesToAppendOption = (typeof completeAsModel == "boolean" || completeAsModel === "auto")
? defaultCompleteAsModel.appendedMessages
: completeAsModel?.appendedMessages ?? defaultCompleteAsModel.appendedMessages;
const messagesToAppend = messagesToAppendOption.length === 0
? defaultCompleteAsModel.appendedMessages
: messagesToAppendOption;
const addMessageToChatHistory = (chatHistory) => {
const newHistory = chatHistory.slice();
if (messagesToAppend.at(0)?.type === "model")
newHistory.push({ type: "user", text: "" });
for (let i = 0; i < messagesToAppend.length; i++) {
const item = messagesToAppend[i];
const isLastItem = i === messagesToAppend.length - 1;
if (item == null)
continue;
if (isLastItem && item.type === "model") {
const newResponse = item.response.slice();
if (typeof newResponse.at(-1) === "string")
newResponse.push(newResponse.pop() + prompt);
else
newResponse.push(prompt);
newHistory.push({
type: "model",
response: newResponse
});
}
else
newHistory.push(item);
}
if (messagesToAppend.at(-1)?.type !== "model")
newHistory.push({ type: "model", response: [prompt] });
return {
history: newHistory,
addedCount: newHistory.length - chatHistory.length
};
};
const { history: messagesWithPrompt, addedCount } = addMessageToChatHistory(this._chatHistory);
const { response, lastEvaluation, metadata } = await this._chat.generateResponse(messagesWithPrompt, {
abortOnNonText: true,
functions,
documentFunctionParams,
grammar: grammar, // this is allowed only because `abortOnNonText` is enabled
onTextChunk,
onToken,
signal: abortController.signal,
stopOnAbortSignal: true,
repeatPenalty,
minP,
topK,
topP,
seed,
tokenBias,
customStopTriggers,
maxTokens: maxTokens == null
? undefined
                        : Math.max(1, maxTokens), // ensure at least 1 token is allowed, since regular prompting ignores `maxTokens: 0`
temperature,
trimWhitespaceSuffix,
contextShift: {
...this._contextShift,
lastEvaluationMetadata: this._lastEvaluation?.contextShiftMetadata
},
evaluationPriority,
lastEvaluationContextWindow: {
history: this._lastEvaluation?.contextWindow == null
? undefined
: addMessageToChatHistory(this._lastEvaluation?.contextWindow).history,
minimumOverlapPercentageToPreventContextShift: 0.8
}
});
this._ensureNotDisposed();
this._lastEvaluation = {
cleanHistory: this._chatHistory,
contextWindow: lastEvaluation.contextWindow.slice(0, -addedCount),
contextShiftMetadata: lastEvaluation.contextShiftMetadata
};
this._canUseContextWindowForCompletion = this._chatHistory.at(-1)?.type === "user";
if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted)
throw abortController.signal.reason;
if (metadata.stopReason === "customStopTrigger")
return {
completion: response,
stopReason: metadata.stopReason,
customStopTrigger: metadata.customStopTrigger,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
return {
completion: response,
stopReason: metadata.stopReason,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
}
else {
const { completion, lastEvaluation, metadata } = await this._chat.loadChatAndCompleteUserMessage(asWithLastUserMessageRemoved(this._chatHistory), {
initialUserPrompt: prompt,
functions,
documentFunctionParams,
grammar,
onTextChunk,
onToken,
signal: abortController.signal,
stopOnAbortSignal: true,
repeatPenalty,
minP,
topK,
topP,
seed,
tokenBias,
customStopTriggers,
maxTokens,
temperature,
trimWhitespaceSuffix,
contextShift: {
...this._contextShift,
lastEvaluationMetadata: this._lastEvaluation?.contextShiftMetadata
},
evaluationPriority,
lastEvaluationContextWindow: {
history: asWithLastUserMessageRemoved(this._lastEvaluation?.contextWindow),
minimumOverlapPercentageToPreventContextShift: 0.8
}
});
this._ensureNotDisposed();
this._lastEvaluation = {
cleanHistory: this._chatHistory,
contextWindow: asWithLastUserMessageRemoved(lastEvaluation.contextWindow),
contextShiftMetadata: lastEvaluation.contextShiftMetadata
};
this._canUseContextWindowForCompletion = this._chatHistory.at(-1)?.type === "user";
if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted)
throw abortController.signal.reason;
if (metadata.stopReason === "customStopTrigger")
return {
completion: completion,
stopReason: metadata.stopReason,
customStopTrigger: metadata.customStopTrigger,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
return {
completion: completion,
stopReason: metadata.stopReason,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
}
});
}
finally {
this._preloadAndCompleteAbortControllers.delete(abortController);
disposeAbortController();
}
}
getChatHistory() {
return structuredClone(this._chatHistory);
}
getLastEvaluationContextWindow() {
if (this._lastEvaluation == null)
return null;
return structuredClone(this._lastEvaluation?.contextWindow);
}
setChatHistory(chatHistory) {
this._chatHistory = structuredClone(chatHistory);
this._chatHistoryStateRef = {};
this._lastEvaluation = undefined;
this._canUseContextWindowForCompletion = false;
}
/** Clear the chat history and reset it to the initial state. */
resetChatHistory() {
if (this._chat == null || this.disposed)
throw new DisposedError();
const chatWrapperSupportsSystemMessages = this._chat.chatWrapper.settings.supportsSystemMessages;
if (chatWrapperSupportsSystemMessages == null || chatWrapperSupportsSystemMessages || this._forceAddSystemPrompt)
this.setChatHistory(this._chat.chatWrapper.generateInitialChatHistory({ systemPrompt: this._systemPrompt }));
else
this.setChatHistory([]);
}
/** @internal */
_stopAllPreloadAndPromptCompletions() {
for (const abortController of this._preloadAndCompleteAbortControllers)
abortController.abort();
this._preloadAndCompleteAbortControllers.clear();
}
/** @internal */
_ensureNotDisposed() {
if (this.disposed)
throw new DisposedError();
}
}
function addFunctionCallToChatHistory({ chatHistory, functionName, functionDescription, callParams, callResult, rawCall, startsNewChunk }) {
const newChatHistory = chatHistory.slice();
if (newChatHistory.length === 0 || newChatHistory[newChatHistory.length - 1].type !== "model")
newChatHistory.push({
type: "model",
response: []
});
const lastModelResponseItem = newChatHistory[newChatHistory.length - 1];
const newLastModelResponseItem = { ...lastModelResponseItem };
newChatHistory[newChatHistory.length - 1] = newLastModelResponseItem;
const modelResponse = newLastModelResponseItem.response.slice();
newLastModelResponseItem.response = modelResponse;
const functionCall = {
type: "functionCall",
name: functionName,
description: functionDescription,
params: callParams,
result: callResult,
rawCall
};
if (startsNewChunk)
functionCall.startsNewChunk = true;
modelResponse.push(functionCall);
return newChatHistory;
}
function getLastModelResponseItem(chatHistory) {
if (chatHistory.length === 0 || chatHistory[chatHistory.length - 1].type !== "model")
throw new Error("Expected chat history to end with a model response");
return chatHistory[chatHistory.length - 1];
}
function asWithLastUserMessageRemoved(chatHistory) {
if (chatHistory == null)
return chatHistory;
const newChatHistory = chatHistory.slice();
while (newChatHistory.at(-1)?.type === "user")
newChatHistory.pop();
return newChatHistory;
}
//# sourceMappingURL=LlamaChatSession.js.map
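
A hedged sketch of combining the preloading, prompt-completion, and chat-history helpers implemented above; `session` is assumed to be a `LlamaChatSession` created as in the earlier example:

// preload a user prompt so the eventual response starts sooner; no tokens are generated yet
await session.preloadPrompt("Here is the bug report I want you to triage:");

// suggest a continuation of a partially typed user message without committing it to the chat history
const suggestion = await session.completePrompt("The app crashes when I", {maxTokens: 40});
console.log(suggestion);

// snapshot and restore the conversation state
const history = session.getChatHistory();
await session.prompt("Thanks, that's all for now.");
session.setChatHistory(history); // roll back to the snapshot
session.resetChatHistory();      // or reset to the initial state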

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,43 @@
import type { LlamaContextSequence } from "../../LlamaContext/LlamaContext.js";
import type { LLamaChatCompletePromptOptions } from "../LlamaChatSession.js";
export type LLamaChatPromptCompletionEngineOptions = {
/**
* Max tokens to allow for preloading a prompt and generating a completion for it.
*
* Defaults to `256` or half of the context size, whichever is smaller.
*/
maxPreloadTokens?: number;
onGeneration?(prompt: string, completion: string): void;
/**
* Max number of completions to cache.
*
* Defaults to `100`.
*/
maxCachedCompletions?: number;
temperature?: LLamaChatCompletePromptOptions["temperature"];
minP?: LLamaChatCompletePromptOptions["minP"];
topK?: LLamaChatCompletePromptOptions["topK"];
topP?: LLamaChatCompletePromptOptions["topP"];
seed?: LLamaChatCompletePromptOptions["seed"];
trimWhitespaceSuffix?: LLamaChatCompletePromptOptions["trimWhitespaceSuffix"];
evaluationPriority?: LLamaChatCompletePromptOptions["evaluationPriority"];
repeatPenalty?: LLamaChatCompletePromptOptions["repeatPenalty"];
tokenBias?: LLamaChatCompletePromptOptions["tokenBias"];
customStopTriggers?: LLamaChatCompletePromptOptions["customStopTriggers"];
grammar?: LLamaChatCompletePromptOptions["grammar"];
functions?: LLamaChatCompletePromptOptions["functions"];
documentFunctionParams?: LLamaChatCompletePromptOptions["documentFunctionParams"];
completeAsModel?: LLamaChatCompletePromptOptions["completeAsModel"];
};
export declare const defaultMaxPreloadTokens: (sequence: LlamaContextSequence) => number;
export declare class LlamaChatSessionPromptCompletionEngine {
private constructor();
dispose(): void;
/**
* Get completion for the prompt from the cache,
* and begin preloading this prompt into the context sequence and completing it.
*
* On completion progress, `onGeneration` (configured for this engine instance) will be called.
*/
complete(prompt: string): string;
}
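
A sketch of wiring the completion engine declared above to a text-input handler; `session` is assumed to be an existing `LlamaChatSession`, the option values are illustrative, and `renderGhostText` is a hypothetical UI helper:

const renderGhostText = (prompt: string, completion: string) => console.log(prompt + "|" + completion);

const completionEngine = session.createPromptCompletionEngine({
    maxPreloadTokens: 128,    // illustrative value
    maxCachedCompletions: 50, // illustrative value
    onGeneration(prompt, completion) {
        // called as a completion is generated in the background for `prompt`
        renderGhostText(prompt, completion);
    }
});

function onUserTyped(currentInput: string) {
    // returns whatever is already cached for this input (possibly an empty string)
    // and keeps preloading/completing it in the background
    const cached = completionEngine.complete(currentInput);
    renderGhostText(currentInput, cached);
}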

View File

@@ -0,0 +1,191 @@
import { DisposeAggregator, DisposedError } from "lifecycle-utils";
import { getConsoleLogPrefix } from "../../../utils/getConsoleLogPrefix.js";
import { LruCache } from "../../../utils/LruCache.js";
import { safeEventCallback } from "../../../utils/safeEventCallback.js";
export const defaultMaxPreloadTokens = (sequence) => {
const defaultValue = 256;
return sequence.model.fileInsights.swaSize != null
? Math.min(Math.ceil(sequence.model.fileInsights.swaSize / 2), defaultValue, Math.ceil(sequence.contextSize / 2))
: Math.min(defaultValue, Math.ceil(sequence.contextSize / 2));
};
const defaultMaxCachedCompletions = 100;
export class LlamaChatSessionPromptCompletionEngine {
/** @internal */ _chatSession;
/** @internal */ _maxPreloadTokens;
/** @internal */ _maxCachedCompletions;
/** @internal */ _onGeneration;
/** @internal */ _completionOptions;
/** @internal */ _completionCaches = new WeakMap();
/** @internal */ _disposeAggregator = new DisposeAggregator();
/** @internal */ _currentCompletionAbortController = new AbortController();
/** @internal */ _lastPrompt;
/** @internal */ _disposed = false;
constructor(chatSession, { maxPreloadTokens = defaultMaxPreloadTokens(chatSession.sequence), onGeneration, maxCachedCompletions = defaultMaxCachedCompletions, ...options }) {
this._chatSession = chatSession;
this._maxPreloadTokens = Math.max(1, maxPreloadTokens);
this._maxCachedCompletions = Math.max(1, maxCachedCompletions);
this._onGeneration = safeEventCallback(onGeneration);
this._completionOptions = options;
this.dispose = this.dispose.bind(this);
this._disposeAggregator.add(this._chatSession.onDispose.createListener(this.dispose));
this._disposeAggregator.add(() => {
this._disposed = true;
this._currentCompletionAbortController.abort();
});
}
dispose() {
if (this._disposed)
return;
this._disposeAggregator.dispose();
}
/**
* Get completion for the prompt from the cache,
* and begin preloading this prompt into the context sequence and completing it.
*
* On completion progress, `onGeneration` (configured for this engine instance) will be called.
*/
complete(prompt) {
if (this._disposed)
throw new DisposedError();
const completionCache = this._getCurrentCompletionCache();
const completion = completionCache.getCompletion(prompt);
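        // restart the background completion when the new prompt no longer continues what is already being completed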
if (this._lastPrompt == null || !(this._lastPrompt + (completion ?? "")).startsWith(prompt)) {
this._lastPrompt = prompt;
this._restartCompletion(completionCache);
}
this._lastPrompt = prompt;
return completion ?? "";
}
/** @internal */
_getCurrentCompletionCache() {
const completionCache = this._completionCaches.get(this._chatSession._chatHistoryStateRef);
if (completionCache != null)
return completionCache;
const newCompletionCache = new CompletionCache(this._maxCachedCompletions);
this._completionCaches.set(this._chatSession._chatHistoryStateRef, newCompletionCache);
return newCompletionCache;
}
/** @internal */
_restartCompletion(completionCache) {
if (this._disposed)
return;
this._currentCompletionAbortController.abort();
this._currentCompletionAbortController = new AbortController();
const prompt = this._lastPrompt;
if (prompt == null)
return;
const existingCompletion = completionCache.getCompletion(prompt);
const promptToComplete = prompt + (existingCompletion ?? "");
const currentPromptTokens = this._chatSession.model.tokenize(promptToComplete, false, "trimLeadingSpace").length;
const leftTokens = Math.max(0, this._maxPreloadTokens - currentPromptTokens);
if (leftTokens === 0)
return;
const currentAbortController = this._currentCompletionAbortController;
const currentAbortSignal = this._currentCompletionAbortController.signal;
let currentCompletion = "";
void this._chatSession.completePrompt(promptToComplete, {
...this._completionOptions,
stopOnAbortSignal: false,
maxTokens: leftTokens,
signal: currentAbortSignal,
onTextChunk: (chunk) => {
currentCompletion += chunk;
const completion = (existingCompletion ?? "") + currentCompletion;
completionCache.putCompletion(prompt, completion);
if (this._getCurrentCompletionCache() !== completionCache) {
currentAbortController.abort();
return;
}
if (this._lastPrompt === prompt)
this._onGeneration?.(prompt, completion);
}
})
.then(() => {
if (this._lastPrompt !== prompt && this._getCurrentCompletionCache() === completionCache)
return this._restartCompletion(completionCache);
})
.catch((err) => {
if ((currentAbortSignal.aborted && err === currentAbortSignal.reason) || err instanceof DOMException)
return;
console.error(getConsoleLogPrefix(false, false), err);
});
}
/** @internal */
static _create(chatSession, options = {}) {
return new LlamaChatSessionPromptCompletionEngine(chatSession, options);
}
}
class CompletionCache {
/** @internal */ _cache;
/** @internal */ _rootNode = [new Map()];
constructor(maxInputs) {
this._cache = new LruCache(maxInputs, {
onDelete: (key) => {
this._deleteInput(key);
}
});
}
get maxInputs() {
return this._cache.maxSize;
}
getCompletion(input) {
let node = this._rootNode;
for (let i = 0; i < input.length; i++) {
if (node == null)
return null;
const [next, completion] = node;
const char = input[i];
if (!next.has(char)) {
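                // dead end in the trie; if the completion stored at this node covers the rest of the input, return its remainder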
if (completion != null && completion.startsWith(input.slice(i))) {
this._cache.get(input.slice(0, i));
return completion.slice(input.length - i);
}
}
node = next.get(char);
}
if (node == null)
return null;
const [, possibleCompletion] = node;
if (possibleCompletion != null) {
this._cache.get(input);
return possibleCompletion;
}
return null;
}
putCompletion(input, completion) {
this._cache.set(input, null);
let node = this._rootNode;
for (let i = 0; i < input.length; i++) {
const [next] = node;
const char = input[i];
if (!next.has(char))
next.set(char, [new Map()]);
node = next.get(char);
}
const currentCompletion = node[1];
if (currentCompletion != null && currentCompletion.startsWith(completion))
return currentCompletion;
node[1] = completion;
return completion;
}
/** @internal */
_deleteInput(input) {
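        // walk the trie along the input while remembering the deepest node that branches to other inputs,
        // so that only the suffix unique to this input gets pruned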
let lastNodeWithMultipleChildren = this._rootNode;
let lastNodeWithMultipleChildrenDeleteChar = input[0];
let node = this._rootNode;
for (let i = 0; i < input.length; i++) {
const [next] = node;
const char = input[i];
if (next.size > 1) {
lastNodeWithMultipleChildren = node;
lastNodeWithMultipleChildrenDeleteChar = char;
}
if (!next.has(char))
return;
node = next.get(char);
}
if (lastNodeWithMultipleChildrenDeleteChar !== "")
lastNodeWithMultipleChildren[0].delete(lastNodeWithMultipleChildrenDeleteChar);
}
}
//# sourceMappingURL=LlamaChatSessionPromptCompletionEngine.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,15 @@
import { GbnfJsonDefList, GbnfJsonSchema, GbnfJsonSchemaToType } from "../../../utils/gbnfJson/types.js";
import { ChatSessionModelFunction } from "../../../types.js";
/**
* Define a function that can be used by the model in a chat session, and return it.
*
* This is a helper function to facilitate defining functions with full TypeScript type information.
*
* The handler function can return a Promise, and the return value will be awaited before being returned to the model.
* @param functionDefinition
*/
export declare function defineChatSessionFunction<const Params extends GbnfJsonSchema<Defs>, const Defs extends GbnfJsonDefList<Defs>>({ description, params, handler }: {
description?: string;
params?: Readonly<Params> & GbnfJsonSchema<Defs>;
handler: (params: GbnfJsonSchemaToType<NoInfer<Params>>) => Promise<any> | any;
}): ChatSessionModelFunction<NoInfer<Params>>;
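
A sketch of defining a function with the helper declared above and exposing it to a chat session; `session` is assumed to be an existing `LlamaChatSession`, and the returned weather data is hard-coded for illustration:

import {defineChatSessionFunction} from "node-llama-cpp";

const functions = {
    getCityTemperature: defineChatSessionFunction({
        description: "Get the current temperature in a given city, in Celsius",
        params: {
            type: "object",
            properties: {
                city: {type: "string"}
            }
        },
        handler({city}) {
            // a real implementation would query a weather API here
            return {city, temperature: 21};
        }
    })
};

const answer = await session.prompt("How warm is it in Paris right now?", {functions});
console.log(answer);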

View File

@@ -0,0 +1,16 @@
/**
* Define a function that can be used by the model in a chat session, and return it.
*
* This is a helper function to facilitate defining functions with full TypeScript type information.
*
* The handler function can return a Promise, and the return value will be awaited before being returned to the model.
* @param functionDefinition
*/
export function defineChatSessionFunction({ description, params, handler }) {
return {
description,
params,
handler
};
}
//# sourceMappingURL=defineChatSessionFunction.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"defineChatSessionFunction.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaChatSession/utils/defineChatSessionFunction.ts"],"names":[],"mappings":"AAGA;;;;;;;GAOG;AACH,MAAM,UAAU,yBAAyB,CAGvC,EACE,WAAW,EACX,MAAM,EACN,OAAO,EAKV;IACG,OAAO;QACH,WAAW;QACX,MAAM;QACN,OAAO;KACV,CAAC;AACN,CAAC"}

View File

@@ -0,0 +1,186 @@
import { EventRelay } from "lifecycle-utils";
import { LLamaContextualRepeatPenalty, Token } from "../types.js";
import { LlamaText } from "../utils/LlamaText.js";
import { LlamaGrammar } from "./LlamaGrammar.js";
import { EvaluationPriority } from "./LlamaContext/types.js";
import { LlamaContextSequence } from "./LlamaContext/LlamaContext.js";
import { TokenBias } from "./TokenBias.js";
export type LlamaCompletionOptions = {
contextSequence: LlamaContextSequence;
/**
* Automatically dispose the sequence when the object is disposed.
*
* Defaults to `false`.
*/
autoDisposeSequence?: boolean;
};
export type LlamaCompletionGenerationOptions = {
/**
     * Called with the generated text chunk as the model generates the completion.
*
* Useful for streaming the generated completion as it's being generated.
*/
onTextChunk?: (text: string) => void;
/**
     * Called with the generated tokens as the model generates the completion.
*
* Preferably, you'd want to use `onTextChunk` instead of this.
*/
onToken?: (tokens: Token[]) => void;
/**
* An AbortSignal to later abort the generation.
*
* When the signal is aborted, the generation will stop and throw `signal.reason` as the error.
*
* > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`.
*/
signal?: AbortSignal;
/**
* When a completion already started being generated and then the signal is aborted,
* the generation will stop and the completion will be returned as is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: boolean;
/** Maximum number of tokens to generate */
maxTokens?: number;
/**
* Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
*
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
*
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
*
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
* Disabled by default (set to `0`).
*/
temperature?: number;
/**
* From the next token candidates, discard the percentage of tokens with the lowest probability.
* For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
* This is useful for generating more high-quality results when using a high temperature.
* Set to a value between `0` and `1` to enable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* Disabled by default.
*/
minP?: number;
/**
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
*/
topK?: number;
/**
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
*/
topP?: number;
/**
* Used to control the randomness of the generated text.
*
* Change the seed to get different results.
*
* Only relevant when using `temperature`.
*/
seed?: number;
/**
     * Trim whitespace from the end of the generated text.
* Disabled by default.
*/
trimWhitespaceSuffix?: boolean;
repeatPenalty?: false | LLamaContextualRepeatPenalty;
/**
* Adjust the probability of tokens being generated.
* Can be used to bias the model to generate tokens that you want it to lean towards,
* or to avoid generating tokens that you want it to avoid.
*/
tokenBias?: TokenBias | (() => TokenBias);
/**
* See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
*/
evaluationPriority?: EvaluationPriority;
grammar?: LlamaGrammar;
/**
* Custom stop triggers to stop the completion when any of the provided triggers are found.
*/
customStopTriggers?: readonly (LlamaText | string | readonly (string | Token)[])[];
/**
* The number of tokens to delete from the context window to make space for new ones.
* Defaults to 10% of the context size.
*/
contextShiftSize?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
/**
     * Context shift reconstructs the context with partially relevant data to continue generation when the context fills up.
     * This flag disables that behavior.
     * Instead, generation will stop when the context fills up, by setting an appropriate `maxTokens` value
     * or lowering the given `maxTokens` value when needed,
     * and it will fail if there's no space at all for generating new tokens with the given inputs.
*
* Disabled by default. Not recommended unless you know what you're doing.
*/
disableContextShift?: boolean;
};
export type LlamaInfillGenerationOptions = LlamaCompletionGenerationOptions & {
/**
* The minimum number of tokens to keep from the prefix input when making a context shift.
* Defaults to 10% of the context size.
*/
minPrefixKeepTokens?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
};
export type LlamaCompletionResponse = {
response: string;
metadata: {
remainingGenerationAfterStop?: string | Token[];
stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" | "abort";
} | {
remainingGenerationAfterStop?: string | Token[];
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
};
};
/**
* @see [Text Completion](https://node-llama-cpp.withcat.ai/guide/text-completion) tutorial
*/
export declare class LlamaCompletion {
readonly onDispose: EventRelay<void>;
constructor({ contextSequence, autoDisposeSequence }: LlamaCompletionOptions);
dispose({ disposeSequence }?: {
disposeSequence?: boolean;
}): void;
/** @hidden */
[Symbol.dispose](): void;
get disposed(): boolean;
get infillSupported(): boolean;
/**
* Generate a completion for an input.
*/
generateCompletion(input: Token[] | string | LlamaText, options?: LlamaCompletionGenerationOptions): Promise<string>;
/**
* Same as `generateCompletion`, but returns additional metadata about the generation.
* See `generateCompletion` for more information.
*/
generateCompletionWithMeta(input: Token[] | string | LlamaText, { onTextChunk, onToken, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix, repeatPenalty, tokenBias, evaluationPriority, grammar, customStopTriggers, contextShiftSize, disableContextShift }?: LlamaCompletionGenerationOptions): Promise<LlamaCompletionResponse>;
/**
* Infill (also known as Fill-In-Middle), generates a completion for an input (`prefixInput`) that
* should connect to a given continuation (`suffixInput`).
* For example, for `prefixInput: "123"` and `suffixInput: "789"`, the model is expected to generate `456`
* to make the final text be `123456789`.
*/
generateInfillCompletion(prefixInput: Token[] | string | LlamaText, suffixInput: Token[] | string | LlamaText, options?: LlamaInfillGenerationOptions): Promise<string>;
/**
* Same as `generateInfillCompletion`, but returns additional metadata about the generation.
* See `generateInfillCompletion` for more information.
*/
generateInfillCompletionWithMeta(prefixInput: Token[] | string | LlamaText, suffixInput: Token[] | string | LlamaText, { onTextChunk, onToken, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix, repeatPenalty, tokenBias, evaluationPriority, grammar, contextShiftSize, customStopTriggers, minPrefixKeepTokens, disableContextShift }?: LlamaInfillGenerationOptions): Promise<LlamaCompletionResponse>;
}
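
A sketch of using the completion API declared above for plain text completion and infill; it assumes a `context` with a free sequence, created as in the earlier chat example, and uses placeholder prompts:

import {LlamaCompletion} from "node-llama-cpp";

const completion = new LlamaCompletion({contextSequence: context.getSequence()});

// plain text completion
const text = await completion.generateCompletion("Here is a list of sweet fruits:\n* ", {
    maxTokens: 64,
    temperature: 0.8
});
console.log(text);

// fill-in-middle completion, only when the model ships the required infill tokens
if (completion.infillSupported) {
    const middle = await completion.generateInfillCompletion(
        "function add(a, b) {\n    return ", // prefix
        ";\n}",                              // suffix
        {maxTokens: 16}
    );
    console.log(middle);
}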

View File

@@ -0,0 +1,495 @@
import { DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { tokenizeInput } from "../utils/tokenizeInput.js";
import { UnsupportedError } from "../utils/UnsupportedError.js";
import { removeNullFields } from "../utils/removeNullFields.js";
import { TokenStreamRegulator } from "../utils/TokenStreamRegulator.js";
import { StopGenerationDetector } from "../utils/StopGenerationDetector.js";
import { UNKNOWN_UNICODE_CHAR } from "../consts.js";
import { getQueuedTokensBeforeStopTrigger } from "../utils/getQueuedTokensBeforeStopTrigger.js";
import { safeEventCallback } from "../utils/safeEventCallback.js";
import { pushAll } from "../utils/pushAll.js";
import { GgufArchitectureType } from "../gguf/types/GgufMetadataTypes.js";
import { resolveBeginningTokenToPrepend } from "../utils/tokenizerUtils.js";
import { LlamaGrammarEvaluationState } from "./LlamaGrammarEvaluationState.js";
const defaultContextShiftSize = ((sequence) => Math.max(1, Math.floor(sequence.context.contextSize / 10)));
const defaultMinPrefixKeepTokens = ((sequence) => Math.max(1, Math.floor(sequence.context.contextSize / 10)));
/**
* @see [Text Completion](https://node-llama-cpp.withcat.ai/guide/text-completion) tutorial
*/
export class LlamaCompletion {
/** @internal */ _disposeAggregator = new DisposeAggregator();
/** @internal */ _autoDisposeSequence;
/** @internal */ _sequence;
onDispose = new EventRelay();
constructor({ contextSequence, autoDisposeSequence = false }) {
this._sequence = contextSequence;
this._autoDisposeSequence = autoDisposeSequence;
this._disposeAggregator.add(this._sequence.onDispose.createListener(() => {
this.dispose();
}));
this._disposeAggregator.add(this.onDispose.dispatchEvent);
}
dispose({ disposeSequence = this._autoDisposeSequence } = {}) {
if (this._sequence == null || this.disposed)
return;
if (disposeSequence)
this._sequence.dispose();
this._sequence = null;
this._disposeAggregator.dispose();
}
/** @hidden */
[Symbol.dispose]() {
return this.dispose();
}
get disposed() {
return this._sequence == null || this._sequence.disposed;
}
get infillSupported() {
if (this._sequence == null)
throw new DisposedError();
return this._sequence.model.tokens.infill.prefix != null &&
this._sequence.model.tokens.infill.suffix != null;
}
/**
* Generate a completion for an input.
*/
async generateCompletion(input, options = {}) {
const { response } = await this.generateCompletionWithMeta(input, options);
return response;
}
/**
* Same as `generateCompletion`, but returns additional metadata about the generation.
* See `generateCompletion` for more information.
*/
async generateCompletionWithMeta(input, { onTextChunk, onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, grammar, customStopTriggers, contextShiftSize = defaultContextShiftSize, disableContextShift } = {}) {
if (this._sequence == null || this.disposed)
throw new DisposedError();
const beginningTokenToPrepend = resolveBeginningTokenToPrepend(this._sequence.model.vocabularyType, this._sequence.model.tokens);
const extraEosTokens = getExtraCompletionEosTokens(this._sequence.model);
async function fitInputIntoContext({ maxTokens, tokens }) {
const res = [];
if (beginningTokenToPrepend != null)
res.push(beginningTokenToPrepend);
const inputTokensSize = Math.max(0, Math.min(maxTokens - res.length, tokens.length));
if (inputTokensSize === 0 && tokens.length > 0)
throw new Error("The context size is too small to generate a response for the given input");
const slicedTokens = tokens.slice(-inputTokensSize);
pushAll(res, slicedTokens);
return res;
}
const ensureNotAborted = () => {
if (signal?.aborted && !stopOnAbortSignal)
throw signal.reason;
if (this.disposed)
throw new DisposedError();
};
return await withLock([this, "generateCompletion"], signal, async () => {
ensureNotAborted();
if (this._sequence == null || this.disposed)
throw new DisposedError();
const resolvedInput = tokenizeInput(input, this._sequence.model.tokenizer, beginningTokenToPrepend != null
? "trimLeadingSpace"
: undefined);
const resolvedContextShiftSize = await resolveContextShiftSize(contextShiftSize, this._sequence);
ensureNotAborted();
const inputTokens = await fitInputIntoContext({
maxTokens: this._sequence.context.contextSize - resolvedContextShiftSize,
tokens: resolvedInput
});
ensureNotAborted();
const resolvedMaxTokens = !disableContextShift
? maxTokens
: (maxTokens != null && maxTokens > 0)
? Math.min(maxTokens, this._sequence.context.contextSize - inputTokens.length)
: this._sequence.context.contextSize - inputTokens.length;
this._sequence.tokenPredictor?.updateInputTokens?.(inputTokens.slice());
return await this._generateResponse(inputTokens, {
onTextChunk: safeEventCallback(onTextChunk),
onToken: safeEventCallback(onToken),
signal,
stopOnAbortSignal,
maxTokens: resolvedMaxTokens,
temperature,
minP,
topK,
topP,
seed,
trimWhitespaceSuffix,
repeatPenalty,
tokenBias,
evaluationPriority,
grammar,
contextShiftSize,
customStopTriggers
}, {
async contextShift({ shiftSize, res, pendingTokens, sequence }) {
return {
newContextState: await fitInputIntoContext({
maxTokens: sequence.context.contextSize - shiftSize,
tokens: [...resolvedInput, ...res, ...pendingTokens]
})
};
},
extraEosTokens
});
});
}
/**
* Infill (also known as Fill-In-Middle), generates a completion for an input (`prefixInput`) that
* should connect to a given continuation (`suffixInput`).
* For example, for `prefixInput: "123"` and `suffixInput: "789"`, the model is expected to generate `456`
* to make the final text be `123456789`.
*/
async generateInfillCompletion(prefixInput, suffixInput, options = {}) {
const { response } = await this.generateInfillCompletionWithMeta(prefixInput, suffixInput, options);
return response;
}
/**
* Same as `generateInfillCompletion`, but returns additional metadata about the generation.
* See `generateInfillCompletion` for more information.
*/
async generateInfillCompletionWithMeta(prefixInput, suffixInput, { onTextChunk, onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, grammar, contextShiftSize = defaultContextShiftSize, customStopTriggers, minPrefixKeepTokens = defaultMinPrefixKeepTokens, disableContextShift = false } = {}) {
if (this._sequence == null || this.disposed)
throw new DisposedError();
const prefixToken = this._sequence.model.tokens.infill.prefix;
const suffixToken = this._sequence.model.tokens.infill.suffix;
const middleToken = this._sequence.model.tokens.infill.middle;
const beginningTokenToPrepend = resolveBeginningTokenToPrepend(this._sequence.model.vocabularyType, this._sequence.model.tokens);
if (prefixToken == null || suffixToken == null)
throw new UnsupportedError("Infill completions are not supported by this model");
const extraEosTokens = getExtraInfillEosTokens(this._sequence.model);
async function fitInputIntoContext({ maxTokens, prefixTokens, suffixTokens, sequence }) {
if (prefixToken == null || suffixToken == null)
throw new UnsupportedError("Infill completions are not supported by this model");
// 2 - InfillPrefix token, InfillSuffix token
const specialTokensInContext = 2 +
(middleToken != null ? 1 : 0) +
(beginningTokenToPrepend != null ? 1 : 0);
const resolvedMaxTokens = maxTokens - specialTokensInContext;
let sizeLeftToFill = resolvedMaxTokens;
let suffixTokensSize = Math.min(sizeLeftToFill, suffixTokens.length);
sizeLeftToFill -= suffixTokensSize;
let prefixTokensSize = Math.min(sizeLeftToFill, prefixTokens.length);
sizeLeftToFill -= prefixTokensSize;
if (sizeLeftToFill <= 0 && disableContextShift)
throw new Error("The context size is too small to generate a response for the given input, and context shift is disabled. " +
"Consider removing `disableContextShift` or reducing the input size.");
const resolvedMinPrefixKeepTokens = Math.min(Math.min(resolvedMaxTokens, prefixTokens.length), Math.max(1, Math.floor(minPrefixKeepTokens instanceof Function
? await minPrefixKeepTokens(sequence)
: minPrefixKeepTokens)));
if (prefixTokensSize < resolvedMinPrefixKeepTokens) {
const diffToFill = Math.min(suffixTokensSize, resolvedMinPrefixKeepTokens - prefixTokensSize);
prefixTokensSize += diffToFill;
suffixTokensSize -= diffToFill;
}
const resolvedPrefixTokens = prefixTokens.slice(-prefixTokensSize);
const resolvedSuffixTokens = suffixTokens.slice(0, suffixTokensSize);
const newContextState = [];
if (beginningTokenToPrepend != null)
newContextState.push(beginningTokenToPrepend);
if (middleToken != null) {
newContextState.push(prefixToken);
pushAll(newContextState, resolvedPrefixTokens);
newContextState.push(suffixToken);
pushAll(newContextState, resolvedSuffixTokens);
newContextState.push(middleToken);
}
else {
newContextState.push(suffixToken);
pushAll(newContextState, resolvedSuffixTokens);
newContextState.push(prefixToken);
pushAll(newContextState, resolvedPrefixTokens);
}
return newContextState;
}
const ensureNotAborted = () => {
if (signal?.aborted && !stopOnAbortSignal)
throw signal.reason;
if (this.disposed)
throw new DisposedError();
};
return await withLock([this, "generateCompletion"], signal, async () => {
ensureNotAborted();
if (this._sequence == null || this.disposed)
throw new DisposedError();
const resolvedPrefixInputTokens = tokenizeInput(prefixInput, this._sequence.model.tokenizer, "trimLeadingSpace");
const resolvedSuffixInputTokens = tokenizeInput(suffixInput, this._sequence.model.tokenizer, "trimLeadingSpace");
const resolvedContextShiftSize = await resolveContextShiftSize(contextShiftSize, this._sequence);
ensureNotAborted();
const inputTokens = await fitInputIntoContext({
maxTokens: this._sequence.context.contextSize - resolvedContextShiftSize,
prefixTokens: resolvedPrefixInputTokens,
suffixTokens: resolvedSuffixInputTokens,
sequence: this._sequence
});
ensureNotAborted();
const resolvedMaxTokens = !disableContextShift
? maxTokens
: (maxTokens != null && maxTokens > 0)
? Math.min(maxTokens, this._sequence.context.contextSize - inputTokens.length)
: this._sequence.context.contextSize - inputTokens.length;
this._sequence.tokenPredictor?.updateInputTokens?.(inputTokens.slice());
return await this._generateResponse(inputTokens, {
onTextChunk: safeEventCallback(onTextChunk),
onToken: safeEventCallback(onToken),
signal,
stopOnAbortSignal,
maxTokens: resolvedMaxTokens,
temperature,
minP,
topK,
topP,
seed,
trimWhitespaceSuffix,
repeatPenalty,
tokenBias,
evaluationPriority,
grammar,
contextShiftSize,
customStopTriggers
}, {
async contextShift({ shiftSize, res, pendingTokens, sequence }) {
return {
newContextState: await fitInputIntoContext({
maxTokens: sequence.context.contextSize - shiftSize,
prefixTokens: [...resolvedPrefixInputTokens, ...res, ...pendingTokens],
suffixTokens: resolvedSuffixInputTokens,
sequence
})
};
},
extraEosTokens
});
});
}
/** @internal */
async _generateResponse(tokens, { onTextChunk, onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, grammar, contextShiftSize = defaultContextShiftSize, customStopTriggers }, { contextShift, extraEosTokens = new Set() }) {
if (this._sequence == null)
throw new DisposedError();
const sequence = this._sequence;
const model = sequence.model;
const context = sequence.context;
const res = [];
const pendingTokens = [];
const grammarEvaluationState = grammar != null
? new LlamaGrammarEvaluationState({ model, grammar })
: undefined;
const { lastTokens: repeatPenaltyLastTokens = 64, punishTokensFilter, penalizeNewLine, penalty, frequencyPenalty, presencePenalty } = repeatPenalty === false
? { lastTokens: 0 }
: repeatPenalty;
const streamRegulator = new TokenStreamRegulator();
const stopGenerationDetector = new StopGenerationDetector();
const customStopGenerationTriggersDetector = new StopGenerationDetector();
const locksToReleaseOnValidGeneration = [];
const repeatPenaltyEnabled = repeatPenaltyLastTokens > 0;
let inputTokens = tokens;
let generatedTokens = 0;
if (grammar != null)
StopGenerationDetector.resolveStopTriggers(grammar.stopGenerationTriggers, model.tokenizer)
.map((stopTrigger) => stopGenerationDetector.addStopTrigger(stopTrigger));
if (customStopTriggers != null)
StopGenerationDetector.resolveStopTriggers(customStopTriggers, model.tokenizer)
.map((stopTrigger) => customStopGenerationTriggersDetector.addStopTrigger(stopTrigger));
const ensureNotAborted = () => {
if (signal?.aborted && !stopOnAbortSignal)
throw signal.reason;
if (this.disposed)
throw new DisposedError();
};
const getPenaltyTokens = () => {
if (this._sequence == null)
throw new DisposedError();
let punishTokens = res.slice(-repeatPenaltyLastTokens);
if (punishTokensFilter != null)
punishTokens = punishTokensFilter(punishTokens);
if (penalizeNewLine == null || !penalizeNewLine) {
const nlToken = model.tokens.nl;
if (nlToken != null)
punishTokens = punishTokens.filter((token) => token !== nlToken);
}
return punishTokens;
};
while (true) {
ensureNotAborted();
let shouldContextShift = false;
if (inputTokens.length === 1 && sequence.nextTokenIndex !== 0)
await sequence.eraseContextTokenRanges([{
start: 0,
end: sequence.nextTokenIndex
}]);
else {
const lastToken = inputTokens[inputTokens.length - 1];
// we need to decode at least one token to generate a response
inputTokens.pop();
await sequence.adaptStateToTokens(inputTokens, false);
inputTokens.push(lastToken);
ensureNotAborted();
const firstDifferentIndex = sequence.nextTokenIndex;
inputTokens.splice(0, firstDifferentIndex);
}
const evaluationIterator = sequence.evaluate(inputTokens, removeNullFields({
temperature, minP, topK, topP, seed,
grammarEvaluationState,
repeatPenalty: !repeatPenaltyEnabled ? undefined : {
punishTokens: getPenaltyTokens,
maxPunishTokens: repeatPenaltyLastTokens,
penalty,
frequencyPenalty,
presencePenalty
},
tokenBias,
evaluationPriority,
yieldEogToken: true
}));
const pendingPartialTokens = [];
for await (const token of evaluationIterator) {
ensureNotAborted();
generatedTokens++;
const tokens = pendingPartialTokens.length === 0
? [token]
: [...pendingPartialTokens, token];
const text = model.detokenize([token]);
if (pendingPartialTokens.length === 0 &&
text.endsWith(UNKNOWN_UNICODE_CHAR) &&
!model.isSpecialToken(token) &&
!model.isEogToken(token)) {
pendingPartialTokens.push(token);
continue;
}
else {
pendingPartialTokens.length = 0;
const queuedTokenRelease = streamRegulator.addChunk({ tokens, text });
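                    // hold back chunks that may need trimming (unknown-unicode placeholders or whitespace-only text)
                    // until later generated text confirms they should be kept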
if (text.endsWith(UNKNOWN_UNICODE_CHAR) || ((grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && text.trim() === "") || (text === "" && locksToReleaseOnValidGeneration.length > 0 && !model.isSpecialToken(token))) {
locksToReleaseOnValidGeneration.push(queuedTokenRelease.createTextIndexLock(0));
}
else {
while (locksToReleaseOnValidGeneration.length > 0)
locksToReleaseOnValidGeneration.shift().dispose();
}
stopGenerationDetector.recordGeneration({ text, tokens, queuedTokenRelease });
customStopGenerationTriggersDetector.recordGeneration({ text, tokens, queuedTokenRelease });
if (model.isEogToken(token) || extraEosTokens.has(token))
queuedTokenRelease.createTokenIndexLock(0);
pushAll(pendingTokens, streamRegulator.popFreeChunkTokens());
if (stopGenerationDetector.hasTriggeredStops || customStopGenerationTriggersDetector.hasTriggeredStops ||
model.isEogToken(token) || extraEosTokens.has(token)) {
const triggeredStops = stopGenerationDetector.hasTriggeredStops
? stopGenerationDetector.getTriggeredStops()
: customStopGenerationTriggersDetector.getTriggeredStops();
const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk(model.tokenizer);
const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
pushAll(pendingTokens, queuedTokensBeforeStopTrigger);
const { firstRemainingGenerationAfterStop } = StopGenerationDetector.getFirstRemainingGenerationAfterStop(triggeredStops);
if (pendingTokens.length > 0) {
onToken?.(pendingTokens.slice());
onTextChunk?.(model.detokenize(pendingTokens, false, res));
}
pushAll(res, pendingTokens);
pendingTokens.length = 0;
let modelResponse = model.detokenize(res);
if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix)
modelResponse = modelResponse.trimEnd();
const isEogToken = model.isEogToken(token) || extraEosTokens.has(token);
if (isEogToken || stopGenerationDetector.hasTriggeredStops)
return {
response: modelResponse,
metadata: {
remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
stopReason: isEogToken
? "eogToken"
: "stopGenerationTrigger"
}
};
return {
response: modelResponse,
metadata: {
remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
stopReason: "customStopTrigger",
customStopTrigger: triggeredStops[0].stopTrigger
}
};
}
if (pendingTokens.length > 0) {
onToken?.(pendingTokens.slice());
onTextChunk?.(model.detokenize(pendingTokens, false, res));
pushAll(res, pendingTokens);
pendingTokens.length = 0;
}
}
const aborted = (signal?.aborted ?? false) && stopOnAbortSignal;
const maxTokensReached = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens;
if (aborted || maxTokensReached) {
let modelResponse = model.detokenize(res);
if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix)
modelResponse = modelResponse.trimEnd();
return {
response: modelResponse,
metadata: {
stopReason: aborted
? "abort"
: "maxTokens"
}
};
}
if (sequence.nextTokenIndex >= context.contextSize - 1) {
shouldContextShift = true;
break;
}
}
if (shouldContextShift) {
const resolvedContextShiftSize = await resolveContextShiftSize(contextShiftSize, sequence);
ensureNotAborted();
const { newContextState } = await contextShift({
shiftSize: resolvedContextShiftSize,
res,
pendingTokens,
sequence
});
ensureNotAborted();
inputTokens = newContextState;
continue;
}
break;
}
throw new Error("The context size is too small to generate a response");
}
}
async function resolveContextShiftSize(contextShiftSize, sequence) {
    if (typeof contextShiftSize === "number")
        return contextShiftSize;
    else if (contextShiftSize instanceof Function)
        return Math.min(sequence.context.contextSize, Math.max(1, Math.floor(await contextShiftSize(sequence))));
    return defaultContextShiftSize(sequence);
}
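// For Gemma models, tokens such as <|file_separator|> and <|fim_prefix|> are collected as extra EOS tokens,
// so generating one of them ends the completion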
function getExtraCompletionEosTokens(model) {
const extraEosTokens = new Set();
if (model.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma ||
model.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2) {
for (const token of model.iterateAllTokens()) {
const tokenText = model.detokenize([token], true);
if (tokenText === "<|file_separator|>" || tokenText === "<|fim_prefix|>") {
extraEosTokens.add(token);
if (extraEosTokens.size === 2)
break;
}
}
}
return extraEosTokens;
}
function getExtraInfillEosTokens(model) {
const extraEosTokens = new Set();
if (model.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma ||
model.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2) {
for (const token of model.iterateAllTokens()) {
const tokenText = model.detokenize([token], true);
if (tokenText === "<|file_separator|>") {
extraEosTokens.add(token);
break;
}
}
}
return extraEosTokens;
}
//# sourceMappingURL=LlamaCompletion.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,245 @@
import { EventRelay } from "lifecycle-utils";
import { Token } from "../../types.js";
import { TokenMeter } from "../TokenMeter.js";
import { LlamaModel } from "../LlamaModel/LlamaModel.js";
import { ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem, EvaluationPriority, SequenceEvaluateMetadataOptions, SequenceEvaluateOptions, SequenceEvaluateOutput } from "./types.js";
import { TokenPredictor } from "./TokenPredictor.js";
export declare class LlamaContext {
readonly onDispose: EventRelay<void>;
private constructor();
dispose(): Promise<void>;
/** @hidden */
[Symbol.asyncDispose](): Promise<void>;
get disposed(): boolean;
get model(): LlamaModel;
get contextSize(): number;
get batchSize(): number;
get flashAttention(): boolean;
/**
* The actual size of the state in the memory in bytes.
* This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
*/
get stateSize(): number;
/** The number of threads currently used to evaluate tokens */
get currentThreads(): number;
/**
* The number of threads that are preferred to be used to evaluate tokens.
*
* The actual number of threads used may be lower when other evaluations are running in parallel.
*/
get idealThreads(): number;
getAllocatedContextSize(): number;
get totalSequences(): number;
get sequencesLeft(): number;
/**
* Before calling this method, check `sequencesLeft` to make sure there are sequences left.
* When there are no sequences left, this method will throw an error.
*/
getSequence(options?: {
contextShift?: ContextShiftOptions;
/**
* Token predictor to use for the sequence.
* Don't share the same token predictor between multiple sequences.
*
* Using a token predictor doesn't affect the generation output itself -
* it only allows for greater parallelization of the token evaluation to speed up the generation.
*
* > **Note:** if a token predictor is too resource intensive,
* > it can slow down the generation process due to the overhead of running the predictor.
* >
* > Testing the effectiveness of a token predictor on the target machine is recommended before using it in production.
*
* Automatically disposed when disposing the sequence.
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction)
*/
tokenPredictor?: TokenPredictor;
}): LlamaContextSequence;
dispatchPendingBatch(): void;
/**
* Print the timings of token evaluation since that last print for this context.
*
* Requires the `performanceTracking` option to be enabled.
*
* > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
* > it won't print anything.
*/
printTimings(): Promise<void>;
}
export declare class LlamaContextSequence {
readonly onDispose: EventRelay<void>;
private constructor();
dispose(): void;
/** @hidden */
[Symbol.dispose](): void;
get disposed(): boolean;
get context(): LlamaContext;
get model(): LlamaModel;
/** The maximum number of tokens that the sequence state can hold */
get contextSize(): number;
/** The index where the next evaluated token will be placed in the context */
get nextTokenIndex(): number;
/** The current context state tokens */
get contextTokens(): Token[];
get tokenMeter(): TokenMeter;
/**
* The token predictor used when creating this sequence.
*/
get tokenPredictor(): TokenPredictor | undefined;
/**
* Get the index of the first token in the KV cache.
*
* If you remove any tokens from the state that come before this index,
* no cached prefix tokens evaluation state will be used for the next evaluation.
*
* For example, if `stateCellsStartIndex` is `10` and you remove the range `{start: 11, end: 16}`
* then the cached state for range `0-10` will be used in the next evaluation,
* but if you remove the range `{start: 10, end: 16}` (or `{start: 9, end: 16}`) then the cached state will not be used at all
* and will be re-evaluated in the next evaluation.
*
* This index can be greater than `0` only when SWA (Sliding Window Attention) is used (only on supported models).
*
* When SWA is used, this index will usually be `Math.max(-1, .nextTokenIndex - .model.fileInsights.swaSize)` or larger.
*
* When the KV cache is empty, this index will be `-1`.
*
* You can disable SWA by setting the `swaFullCache` option to `true` when creating a context.
*/
get stateCellsStartIndex(): number;
/**
* Statistics of token predictions using the sequence's `tokenPredictor`.
*
* The statistics change only when token prediction is used in this sequence.
*
* `validated` + `refuted` = total number of evaluated predictions.
*
* Prefer using `validated` and `refuted` to evaluate the effectiveness of token prediction.
*/
get tokenPredictions(): {
/** Number of token predictions that were actually used (tokens that were validated and then consumed) */
used: number;
/** Number of token predictions that were not used (tokens that were validated and were not consumed) */
unused: number;
/** Number of token predictions that were validated successfully */
validated: number;
/** Number of token predictions that were refuted */
refuted: number;
};
get isLoadedToMemory(): boolean;
compareContextTokens(tokens: Token[]): {
firstDifferentIndex: number;
};
/**
* Erase parts of the context state to align it with the given tokens.
*
* If the given tokens do not align with the current context state, the context state will be erased to align with the given tokens.
*
* To find the first different token index between the context state and the given tokens, access the `nextTokenIndex` property.
*
* If `allowShift` is `true` (the default), shifting tokens may happen to align the context state with the given tokens,
* which incurs token evaluation of the shifted tokens.
*/
adaptStateToTokens(tokens: Token[], allowShift?: boolean): Promise<void>;
/**
* Clear the history of the sequence.
*/
clearHistory(): Promise<void>;
/**
* Erase context tokens in the provided ranges to free up space for new tokens to be generated.
* The start of each range is inclusive, and the end of each range is exclusive.
* For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
*/
eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]): Promise<void>;
/**
* Evaluate the provided tokens into the context sequence, and continue generating new tokens on iterator iterations.
*
* This method uses the token predictor (when provided) to generate new tokens faster.
*/
evaluate(tokens: Token[], options?: SequenceEvaluateOptions): AsyncGenerator<Token, void, void | Token | Token[]>;
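/*
 * A minimal generation-loop sketch (assuming `model` is the already-loaded `LlamaModel` this sequence
 * belongs to, and `sequence` is a `LlamaContextSequence` obtained from a context created on it):
 *
 *     const tokens = model.tokenize("The best way to");
 *     const res: Token[] = [];
 *     for await (const token of sequence.evaluate(tokens, {temperature: 0.8})) {
 *         res.push(token);
 *         if (res.length >= 16)
 *             break;
 *     }
 *     console.log(model.detokenize(res));
 */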
/**
* Like {@link evaluate `.evaluate(...)`}, but with additional metadata for each generated token.
*
* Configure the additional metadata options to choose which metadata to include.
*/
evaluateWithMetadata<const Metadata extends SequenceEvaluateMetadataOptions>(tokens: Token[], metadata: Metadata, options?: SequenceEvaluateOptions): AsyncGenerator<SequenceEvaluateOutput<Metadata>, void, void | Token | Token[]>;
/**
* Evaluate the provided tokens into the context sequence without generating new tokens.
*/
evaluateWithoutGeneratingNewTokens(tokens: Token[], options?: {
/**
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
* evaluated based on the strategy chosen for the context.
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
* highest evaluation priority.
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
*/
evaluationPriority?: EvaluationPriority;
/** Override the sequence context shift options for this evaluation */
contextShift?: ContextShiftOptions;
}): Promise<void>;
/**
* Evaluate the provided tokens into the context sequence with custom options for each token.
*
* This method allows for more precise control of the generation process.
*
* A next token will be generated for a given token only if any of the `generateNext` options for it are used.
*
* To generate more tokens after this method finishes,
* use it again with token(s) you selected to add to the context from the previous evaluation.
*
* This method doesn't use the token predictor (when provided) since it cannot predict which tokens are actually needed.
* Use the `evaluate` method when you need to use token prediction.
* @returns An array where for each token in the input array, there can be an output item at the same index in the output array.
* For indexes that have no output, there won't be any value at the corresponding index in the output array.
*
* It's recommended to iterate from `0` up to the length of the input array to check the results in the output array.
*/
controlledEvaluate(input: ControlledEvaluateInputItem[], options?: {
/**
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
* evaluated based on the strategy chosen for the context.
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
* highest evaluation priority.
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
*/
evaluationPriority?: EvaluationPriority;
/** Override the sequence context shift options for this evaluation */
contextShift?: ContextShiftOptions;
/** Called on each token result after it's generated */
onTokenResult?(inputTokenIndex: number, result: ControlledEvaluateIndexOutput): void;
}): Promise<Array<undefined | ControlledEvaluateIndexOutput>>;
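/*
 * A rough `controlledEvaluate` sketch (assuming `sequence` is a `LlamaContextSequence` and `tokens`
 * is a non-empty `Token[]` produced by the model's tokenizer); only the last input token asks for
 * a generated follow-up token:
 *
 *     const input: ControlledEvaluateInputItem[] = tokens.slice(0, -1);
 *     input.push([tokens[tokens.length - 1], {generateNext: {token: true, confidence: true}}]);
 *     const results = await sequence.controlledEvaluate(input);
 *     const next = results[input.length - 1]?.next; // {token, confidence} for the last input token
 */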
/**
* Save the current context sequence evaluation state to a file.
* @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state)
*/
saveStateToFile(filePath: string): Promise<{
fileSize: number;
}>;
/**
* Load a context sequence evaluation state from a file.
*
* Trying to load a state file with a longer context size than the current sequence's context size will fail and throw an error.
*
* You must ensure that the file was created from the exact same model, otherwise, using this function may crash the process.
* @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state)
*/
loadStateFromFile(filePath: string, acceptRisk: {
/**
* Loading a state file created using a different model may crash the process.
*
* You must accept this risk to use this feature.
*/
acceptRisk: true;
}): Promise<void>;
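/*
 * A minimal save/restore sketch (assuming `model` is an already-loaded `LlamaModel`; a state file
 * must only be loaded with the exact same model it was created from):
 *
 *     const context = await model.createContext({sequences: 2});
 *     const sequence = context.getSequence();
 *     await sequence.evaluateWithoutGeneratingNewTokens(model.tokenize("A prompt worth caching"));
 *     const {fileSize} = await sequence.saveStateToFile("state.bin");
 *
 *     const otherSequence = context.getSequence();
 *     await otherSequence.loadStateFromFile("state.bin", {acceptRisk: true});
 */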
}
export declare function getDefaultContextBatchSize({ contextSize, sequences }: {
contextSize: number;
sequences: number;
}): number;
export declare function getDefaultContextSequences(): number;
export declare function getDefaultModelContextSize({ trainContextSize }: {
trainContextSize?: number;
}): number;

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1 @@
export {};

View File

@@ -0,0 +1,31 @@
/** @internal */
export class LlamaSampler {
/** @internal */ _llama;
/** @internal */ _sampler;
/** @internal */ disposed = false;
constructor(model) {
this._llama = model._llama;
this._sampler = new this._llama._bindings.AddonSampler(model._model);
this.asyncDispose = this.asyncDispose.bind(this);
}
dispose() {
this.disposed = true;
this._sampler.dispose();
}
async asyncDispose() {
this.disposed = true;
this._sampler.dispose();
}
applyConfig(config) {
return this._sampler.applyConfig(config);
}
/** @internal */
static _canBeNextTokenForGrammarEvaluationState(llama, grammarEvaluationState, token) {
return llama._bindings.AddonSampler.canBeNextTokenForGrammarEvaluationState(grammarEvaluationState._state, token);
}
/** @internal */
static _acceptTokenOnGrammarEvaluationState(llama, grammarEvaluationState, token) {
llama._bindings.AddonSampler.acceptGrammarEvaluationStateToken(grammarEvaluationState._state, token);
}
}
//# sourceMappingURL=LlamaSampler.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LlamaSampler.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/LlamaSampler.ts"],"names":[],"mappings":"AAMA,gBAAgB;AAChB,MAAM,OAAO,YAAY;IACrB,gBAAgB,CAAiB,MAAM,CAAQ;IAC/C,gBAAgB,CAAiB,QAAQ,CAAe;IACxD,gBAAgB,CAAQ,QAAQ,GAAY,KAAK,CAAC;IAElD,YAAmB,KAAiB;QAChC,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;QAC3B,IAAI,CAAC,QAAQ,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAErE,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACrD,CAAC;IAEM,OAAO;QACV,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;QACrB,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;IAC5B,CAAC;IAEM,KAAK,CAAC,YAAY;QACrB,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;QACrB,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;IAC5B,CAAC;IAEM,WAAW,CAAC,MAAkD;QACjE,OAAO,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;IAC7C,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,wCAAwC,CAClD,KAAY,EACZ,sBAAmD,EACnD,KAAY;QAEZ,OAAO,KAAK,CAAC,SAAS,CAAC,YAAY,CAAC,uCAAuC,CACvE,sBAAsB,CAAC,MAAM,EAC7B,KAAK,CACR,CAAC;IACN,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,oCAAoC,CAC9C,KAAY,EACZ,sBAAmD,EACnD,KAAY;QAEZ,KAAK,CAAC,SAAS,CAAC,YAAY,CAAC,iCAAiC,CAAC,sBAAsB,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;IACzG,CAAC;CACJ"}

View File

@@ -0,0 +1,55 @@
import { Token } from "../../types.js";
import { SequenceEvaluateOptions } from "./types.js";
import { LlamaContextSequence } from "./LlamaContext.js";
/**
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction#custom)
*/
export declare abstract class TokenPredictor {
/**
* Resets the state of the predictor.
*
* Called before the generation starts.
*/
abstract reset(params: {
/** The target sequence that this token predictor is generating tokens for */
targetSequence: LlamaContextSequence;
/**
* The tokens that are or will be loaded into the state.
*
* The initial predictions should be based on these tokens.
*
* When additional tokens are pushed into the state, the `pushTokens` method will be called with those tokens.
*/
stateTokens: Token[];
/**
* Options used for the evaluation on the target sequence.
*
* The `grammarEvaluationState` is cloned before being passed to the token predictor,
* so it can be modified without affecting the original state.
*/
evaluateOptions: Readonly<SequenceEvaluateOptions>;
}): Promise<void> | void;
abstract pushTokens(tokens: Token[]): void;
/**
* Predicts the next tokens based on the current state.
*
* If the generation should wait until the minimum predictions are ready,
* this method should return a promise that resolves when the minimum predictions are ready.
*
* A background prediction process can be started when this function is called,
* so that the next predictions will be ready when this function is called again.
*/
abstract predictTokens(): Promise<Token[]> | Token[];
/**
* Stops the prediction process when it runs in the background.
* @param untilPredictionsExhausted - If true, the prediction process should not resume until the current predictions are exhausted.
*/
stop(untilPredictionsExhausted?: boolean): Promise<void> | void;
/**
* Called with the input tokens before the generation starts when using `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
*/
updateInputTokens(tokens: Token[]): void;
dispose(): Promise<void> | void;
/** @hidden */
[Symbol.dispose](): void | Promise<void>;
}
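/*
 * A minimal custom predictor sketch (an illustrative, hypothetical predictor that always offers the
 * same fixed tokens as predictions; not part of the library):
 *
 *     class StaticTokenPredictor extends TokenPredictor {
 *         public constructor(private readonly staticTokens: Token[]) {
 *             super();
 *         }
 *         public reset() {}
 *         public pushTokens(tokens: Token[]) {}
 *         public predictTokens() {
 *             return this.staticTokens;
 *         }
 *     }
 */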

View File

@@ -0,0 +1,20 @@
/**
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction#custom)
*/
export class TokenPredictor {
/**
* Stops the prediction process when it runs in the background.
* @param untilPredictionsExhausted - If true, the prediction process should not resume until the current predictions are exhausted.
*/
stop(untilPredictionsExhausted) { }
/**
* Called with the input tokens before the generation starts when using `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
*/
updateInputTokens(tokens) { }
dispose() { }
/** @hidden */
[Symbol.dispose]() {
return this.dispose();
}
}
//# sourceMappingURL=TokenPredictor.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TokenPredictor.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/TokenPredictor.ts"],"names":[],"mappings":"AAIA;;GAEG;AACH,MAAM,OAAgB,cAAc;IAwChC;;;OAGG;IACI,IAAI,CAAC,yBAAmC,IAAyB,CAAC;IAEzE;;OAEG;IACI,iBAAiB,CAAC,MAAe,IAAS,CAAC;IAE3C,OAAO,KAA0B,CAAC;IAEzC,cAAc;IACP,CAAC,MAAM,CAAC,OAAO,CAAC;QACnB,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;IAC1B,CAAC;CACJ"}

View File

@@ -0,0 +1,56 @@
import { Token } from "../../../types.js";
import { SequenceEvaluateOptions } from "../types.js";
import { LlamaContextSequence } from "../LlamaContext.js";
import { TokenPredictor } from "../TokenPredictor.js";
/**
* Predicts the next tokens by evaluating the current state of the target sequence
* on a draft sequence from a smaller and faster draft model.
* @see [Using Token Predictors: Draft Model Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#draft-model)
*/
export declare class DraftSequenceTokenPredictor extends TokenPredictor {
constructor(draftSequence: LlamaContextSequence, options?: {
/**
* The minimum number of tokens to draft.
*
* Defaults to `0`.
*/
minTokens?: number;
/**
* Maximum number of tokens to draft.
*
* Defaults to `16`.
*/
maxTokens?: number;
/**
* Evaluate options default to the values of the target sequence.
*
* You can override any of the options for the prediction here.
*/
evaluateOptions?: Pick<SequenceEvaluateOptions, "temperature" | "minP" | "topK" | "topP" | "seed" | "repeatPenalty" | "tokenBias" | "evaluationPriority" | "contextShift">;
/**
* Minimum token confidence (probability of the token to be generated, assigned by the model) to consider the token as a prediction.
* When the generated token confidence is lower than this value, the prediction process will stop until all the predicted tokens
* are exhausted (either because a token that was not predicted is pushed, or because all the generated predictions are consumed).
*
* A number between `0` and `1` representing the minimum probability of the token to be generated.
*
* Set to `0` to disable.
*
* Defaults to `0.6`.
*/
minConfidence?: number;
});
get draftSequence(): LlamaContextSequence;
get minTokens(): number;
get maxTokens(): number;
get minConfidence(): number | undefined;
reset({ targetSequence, stateTokens, evaluateOptions }: {
targetSequence: LlamaContextSequence;
stateTokens: Token[];
evaluateOptions: Readonly<SequenceEvaluateOptions>;
}): Promise<void>;
pushTokens(tokens: Token[]): void;
predictTokens(): Token[] | Promise<Token[]>;
stop(untilPredictionsExhausted?: boolean): void;
dispose(): void;
}
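/*
 * A rough usage sketch (assuming `model` is the main `LlamaModel` and `draftModel` is a smaller,
 * faster `LlamaModel` that shares the same tokenizer):
 *
 *     const draftContext = await draftModel.createContext();
 *     const context = await model.createContext();
 *     const sequence = context.getSequence({
 *         tokenPredictor: new DraftSequenceTokenPredictor(draftContext.getSequence(), {maxTokens: 8})
 *     });
 */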

View File

@@ -0,0 +1,266 @@
import { withLock } from "lifecycle-utils";
import { pushAll } from "../../../utils/pushAll.js";
import { getConsoleLogPrefix } from "../../../utils/getConsoleLogPrefix.js";
import { LlamaSampler } from "../LlamaSampler.js";
import { TokenPredictor } from "../TokenPredictor.js";
const defaultPredictionMinTokens = 0;
const defaultPredictionMaxTokens = 16;
const defaultPredictionMinConfidence = 0.6;
/**
* Predicts the next tokens by evaluating the current state of the target sequence
* on a draft sequence from a smaller and faster draft model.
* @see [Using Token Predictors: Draft Model Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#draft-model)
*/
export class DraftSequenceTokenPredictor extends TokenPredictor {
/** @internal */ _draftSequence;
/** @internal */ _minTokens;
/** @internal */ _maxTokens;
/** @internal */ _minConfidence;
/** @internal */ _stateTokens = [];
/** @internal */ _pendingEvalTokens = [];
/** @internal */ _predictedTokens = [];
/** @internal */ _evaluateOptions = {};
/** @internal */ _overrideEvaluateOptions = {};
/** @internal */ _grammarEvaluationStateOption;
/** @internal */ _currentEvaluationAbortController = new AbortController();
/** @internal */ _resetAbortController = new AbortController();
/** @internal */ _stopped = true;
/** @internal */ _waitForPredictionExhaustion = false;
/** @internal */ _minTokensCallbacks = [];
/** @internal */ _resetPredictions = false;
/** @internal */ _iterator;
/** @internal */ _active = false;
/** @internal */ _disposed = false;
constructor(draftSequence, options = {}) {
super();
this._draftSequence = draftSequence;
this._minTokens = Math.floor(Math.max(0, options?.minTokens ?? defaultPredictionMinTokens));
this._maxTokens = Math.floor(Math.max(this._minTokens, options?.maxTokens ?? defaultPredictionMaxTokens));
this._overrideEvaluateOptions = options.evaluateOptions ?? {};
this._minConfidence = Math.min(1, Math.max(0, options?.minConfidence ?? defaultPredictionMinConfidence));
if (draftSequence.disposed)
throw new Error("The draft sequence is disposed");
}
get draftSequence() {
return this._draftSequence;
}
get minTokens() {
return this._minTokens;
}
get maxTokens() {
return this._maxTokens;
}
get minConfidence() {
return this._minConfidence;
}
async reset({ targetSequence, stateTokens, evaluateOptions }) {
this._currentEvaluationAbortController.abort();
this._resetAbortController.abort();
this._currentEvaluationAbortController = new AbortController();
this._resetAbortController = new AbortController();
this._stopped = true;
this._waitForPredictionExhaustion = false;
this._iterator?.return();
this._iterator = undefined;
const currentAbortSignal = this._resetAbortController.signal;
targetSequence.context._ctx.ensureDraftContextIsCompatibleForSpeculative(this._draftSequence.context._ctx);
try {
await withLock([this, "evaluate"], currentAbortSignal, async () => {
this._stateTokens = stateTokens.slice();
this._pendingEvalTokens = [];
this._predictedTokens = [];
this._resetPredictions = false;
while (this._minTokensCallbacks.length > 0)
this._minTokensCallbacks.shift()?.();
const lastToken = this._stateTokens.pop();
if (lastToken != null)
this._pendingEvalTokens.push(lastToken);
this._evaluateOptions = evaluateOptions;
this._grammarEvaluationStateOption = this._evaluateOptions.grammarEvaluationState instanceof Function
? this._evaluateOptions.grammarEvaluationState()?.clone()
: this._evaluateOptions.grammarEvaluationState?.clone();
const newStateTokens = this._stateTokens.slice(-this._draftSequence.context.contextSize + 1);
await this._draftSequence.adaptStateToTokens(newStateTokens, true);
newStateTokens.splice(0, this._draftSequence.nextTokenIndex);
await this._draftSequence.evaluateWithoutGeneratingNewTokens(newStateTokens, {
contextShift: this._evaluateOptions.contextShift,
evaluationPriority: this._evaluateOptions.evaluationPriority
});
});
}
catch (err) {
if (err !== currentAbortSignal.reason)
throw err;
}
}
pushTokens(tokens) {
const grammarEvaluationStateOption = this._evaluateOptions.grammarEvaluationState instanceof Function
? this._evaluateOptions.grammarEvaluationState()?.clone()
: this._evaluateOptions.grammarEvaluationState?.clone();
void withLock([this, "pushTokens"], async () => {
this._grammarEvaluationStateOption = grammarEvaluationStateOption;
const tokensToPush = tokens.slice();
while (!this._resetPredictions && tokensToPush.length > 0) {
const token = tokensToPush.shift();
if (this._predictedTokens.length > 0 && this._predictedTokens[0] === token) {
this._predictedTokens.shift();
}
else {
tokensToPush.unshift(token);
break;
}
}
if (tokensToPush.length === 0) {
if (!this._waitForPredictionExhaustion || this._predictedTokens.length === 0)
this._resume();
return;
}
this._currentEvaluationAbortController.abort();
this._currentEvaluationAbortController = new AbortController();
pushAll(this._pendingEvalTokens, tokensToPush);
this._resetPredictions = true;
this._resume();
});
}
predictTokens() {
if (this._stopped && this._pendingEvalTokens.length === 0 && !this._resetPredictions)
return this._predictedTokens;
this._stopped = false;
if (!this._waitForPredictionExhaustion || this._predictedTokens.length === 0) {
this._waitForPredictionExhaustion = false;
this._resume();
}
if (this._predictedTokens.length >= this._minTokens && !this._resetPredictions)
return this._predictedTokens;
if (!this._active || (this._waitForPredictionExhaustion && this._predictedTokens.length > 0)) {
if (this._resetPredictions)
return [];
return this._predictedTokens;
}
return new Promise((accept) => void this._minTokensCallbacks.push(accept))
.then(() => {
if (this._resetPredictions)
return [];
return this._predictedTokens;
});
}
stop(untilPredictionsExhausted = false) {
this._stopped = true;
this._currentEvaluationAbortController.abort();
this._currentEvaluationAbortController = new AbortController();
if (untilPredictionsExhausted)
this._waitForPredictionExhaustion = true;
void withLock([this, "evaluate"], async () => {
this._iterator?.return();
this._iterator = undefined;
});
}
dispose() {
this._disposed = true;
this._stopped = true;
this._resetAbortController.abort();
this._currentEvaluationAbortController.abort();
void withLock([this, "evaluate"], async () => {
this._iterator?.return();
this._iterator = undefined;
});
}
/** @internal */
_canIterate() {
return !this._disposed && !this._stopped && (this._predictedTokens.length < this._maxTokens || this._resetPredictions);
}
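// Runs the draft-sequence generation in the background, filling `_predictedTokens` until `_maxTokens` is reached,
// a token's confidence drops below `_minConfidence`, or the predictions are stopped or reset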
/** @internal */
_resume() {
if (this._active || !this._canIterate())
return;
this._active = true;
void withLock([this, "evaluate"], async () => {
try {
const abortSignal = this._currentEvaluationAbortController.signal;
if (!this._canIterate() || abortSignal.aborted)
return;
const resetPredications = async () => {
this._iterator?.return();
this._iterator = undefined;
this._waitForPredictionExhaustion = false;
this._resetPredictions = false;
const tokenToDelete = Math.max(0, Math.min(this._predictedTokens.length - 1, this._draftSequence.context.contextSize));
this._predictedTokens = [];
await this._draftSequence.eraseContextTokenRanges([{
start: this._draftSequence.nextTokenIndex - tokenToDelete,
end: this._draftSequence.nextTokenIndex
}]);
};
const createIterator = () => {
const tokens = this._pendingEvalTokens;
this._pendingEvalTokens = [];
return this.draftSequence.evaluateWithMetadata(tokens, { confidence: true }, {
...this._evaluateOptions,
...this._overrideEvaluateOptions,
grammarEvaluationState: this._getGrammarEvaluationStateWithTokens(tokens)
});
};
if (this._resetPredictions)
await resetPredications();
if (!this._canIterate() || abortSignal.aborted)
return;
let iterator = createIterator();
this._iterator = iterator;
while (this._canIterate() && !abortSignal.aborted) {
const { value, done } = await iterator.next();
let shouldBreak = done;
if (value != null) {
const { token, confidence } = value;
if (this._minConfidence != null && this._minConfidence !== 0 && this._minConfidence !== 1 &&
confidence < this._minConfidence) {
this._iterator = undefined;
await iterator.return();
this._waitForPredictionExhaustion = true;
shouldBreak = true;
}
else
this._predictedTokens.push(token);
}
if (this._resetPredictions && !abortSignal.aborted) {
await resetPredications();
iterator = createIterator();
this._iterator = iterator;
continue;
}
if (this._predictedTokens.length >= this._minTokens) {
while (this._minTokensCallbacks.length > 0)
this._minTokensCallbacks.shift()?.();
}
if (shouldBreak) {
this._iterator = undefined;
await iterator.return();
this._waitForPredictionExhaustion = true;
while (this._minTokensCallbacks.length > 0)
this._minTokensCallbacks.shift()?.();
break;
}
}
}
finally {
this._active = false;
}
});
}
/** @internal */
_getGrammarEvaluationStateWithTokens(tokens) {
if (this._grammarEvaluationStateOption == null)
return undefined;
const clone = this._grammarEvaluationStateOption.clone();
for (const token of tokens) {
const canAddToken = LlamaSampler._canBeNextTokenForGrammarEvaluationState(this._draftSequence.model._llama, clone, token);
if (!canAddToken) {
console.warn(getConsoleLogPrefix(false, false), "The pushed tokens are incompatible with the grammar evaluation state. The grammar will be ignored.");
this._grammarEvaluationStateOption = undefined;
return undefined;
}
LlamaSampler._acceptTokenOnGrammarEvaluationState(this._draftSequence.model._llama, clone, token);
}
return clone;
}
}
//# sourceMappingURL=DraftSequenceTokenPredictor.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,58 @@
import { Token } from "../../../types.js";
import { TokenPredictor } from "../TokenPredictor.js";
/**
* Attempts to find the last few generated tokens in the input (prompt) tokens to predict the next tokens.
*
* This is useful in input-grounded tasks (when the model frequently repeats some of the input tokens in the output,
* such as in text summarization or modifying code).
*
* This works in all completion classes, including `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
*
* Based on https://github.com/apoorvumang/prompt-lookup-decoding.
* @see [Using Token Predictors: Input Lookup Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#input-lookup)
*/
export declare class InputLookupTokenPredictor extends TokenPredictor {
constructor(options?: {
patternLength?: {
/**
* Min pattern length to look for in the input tokens.
*
* Defaults to `1`.
*/
min?: number;
/**
* Max pattern length to look for in the input tokens.
*
* Set to `0` to disable the max pattern size.
*
* Defaults to `0`.
*/
max?: number;
};
predictionLength?: {
/**
* Minimum number of tokens to predict.
*
* Defaults to `1`.
*/
min?: number;
/**
* Maximum number of tokens to predict.
*
* Defaults to `3`.
*/
max?: number;
};
});
get patternMinLength(): number;
get patternMaxLength(): number;
get predictionMinLength(): number;
get predictionMaxLength(): number;
reset({ stateTokens }: {
stateTokens: Token[];
}): void;
updateInputTokens(tokens: Token[]): void;
pushTokens(tokens: Token[]): void;
predictTokens(): Token[];
dispose(): void;
}
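/*
 * A rough usage sketch (assuming `model` is an already-loaded `LlamaModel`; most useful for
 * input-grounded tasks such as summarization or code modification):
 *
 *     const context = await model.createContext();
 *     const sequence = context.getSequence({
 *         tokenPredictor: new InputLookupTokenPredictor({
 *             patternLength: {min: 2},
 *             predictionLength: {max: 3}
 *         })
 *     });
 */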

View File

@@ -0,0 +1,138 @@
import { DisposedError } from "lifecycle-utils";
import { pushAll } from "../../../utils/pushAll.js";
import { TokenPredictor } from "../TokenPredictor.js";
const defaultPatternMinLength = 1;
const defaultPatternMaxLength = 0;
const defaultPredictionMinLength = 1;
const defaultPredictionMaxLength = 3;
/**
* Attempts to find the last few generated tokens in the input (prompt) tokens to predict the next tokens.
*
* This is useful in input-grounded tasks (when the model frequently repeats some of the input tokens in the output,
* such as in text summarization or modifying code).
*
* This works in all completion classes, including `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
*
* Based on https://github.com/apoorvumang/prompt-lookup-decoding.
* @see [Using Token Predictors: Input Lookup Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#input-lookup)
*/
export class InputLookupTokenPredictor extends TokenPredictor {
/** @internal */ _patternMinLength;
/** @internal */ _patternMaxLength;
/** @internal */ _predictionMinLength;
/** @internal */ _predictionMaxLength;
/** @internal */ _lastPredictionMatchStartIndex = undefined;
/** @internal */ _lastPredictionMatchLength = undefined;
/** @internal */ _stateTokens = [];
/** @internal */ _inputTokens = [];
/** @internal */ _disposed = false;
constructor(options = {}) {
super();
this._patternMinLength = Math.floor(Math.max(1, options?.patternLength?.min ?? defaultPatternMinLength));
this._patternMaxLength = Math.floor(Math.max(0, Math.max(this._patternMinLength, options?.patternLength?.max ?? defaultPatternMaxLength)));
this._predictionMinLength = Math.floor(Math.max(1, options.predictionLength?.min ?? defaultPredictionMinLength));
this._predictionMaxLength = Math.floor(Math.max(this._patternMinLength, options.predictionLength?.max ?? defaultPredictionMaxLength));
}
get patternMinLength() {
return this._patternMinLength;
}
get patternMaxLength() {
return this._patternMaxLength;
}
get predictionMinLength() {
return this._predictionMinLength;
}
get predictionMaxLength() {
return this._predictionMaxLength;
}
reset({ stateTokens }) {
this._stateTokens = stateTokens.slice();
delete this._lastPredictionMatchStartIndex;
delete this._lastPredictionMatchLength;
}
updateInputTokens(tokens) {
this._inputTokens = tokens.slice();
delete this._lastPredictionMatchStartIndex;
delete this._lastPredictionMatchLength;
}
pushTokens(tokens) {
pushAll(this._stateTokens, tokens);
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
this._lastPredictionMatchLength += tokens.length;
}
}
predictTokens() {
if (this._disposed)
throw new DisposedError();
if (this._inputTokens.length === 0 || this._stateTokens.length === 0)
return [];
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
for (let p = this._lastPredictionMatchStartIndex + this._lastPredictionMatchLength - 1, s = this._stateTokens.length - 1; p >= this._lastPredictionMatchStartIndex && s >= 0; p--, s--) {
if (this._inputTokens[p] !== this._stateTokens[s]) {
delete this._lastPredictionMatchStartIndex;
delete this._lastPredictionMatchLength;
break;
}
}
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
const predictionEndIndex = this._lastPredictionMatchStartIndex + this._lastPredictionMatchLength;
if (predictionEndIndex < this._inputTokens.length) {
return this._inputTokens.slice(predictionEndIndex, predictionEndIndex + this._predictionMaxLength);
}
}
}
const [matchStartIndex, matchLength] = this._findLongestPatternIndex(this._inputTokens, this._stateTokens);
if (matchStartIndex == null || matchLength == null)
return [];
const predictionEndIndex = matchStartIndex + matchLength;
const res = this._inputTokens.slice(predictionEndIndex, predictionEndIndex + this._predictionMaxLength);
if (res.length >= this._predictionMinLength) {
this._lastPredictionMatchStartIndex = matchStartIndex;
this._lastPredictionMatchLength = matchLength;
return res;
}
return [];
}
dispose() {
this._disposed = true;
this._stateTokens = [];
this._inputTokens = [];
delete this._lastPredictionMatchStartIndex;
delete this._lastPredictionMatchLength;
}
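// Scans the input tokens backwards, tracking candidate positions whose tokens keep matching the tail of the state tokens,
// and returns the start index and length of the longest such match; the tokens that follow the match in the input are the prediction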
/** @internal */
_findLongestPatternIndex(findIn, lookupPattern) {
const checkIndexes = [];
let bestIndex = -1;
let bestIndexDiff = -1;
for (let i = findIn.length - this._predictionMinLength; i >= 0; i--) {
const token = findIn[i];
for (let j = checkIndexes.length - 1; j >= 0; j--) {
const startIndex = checkIndexes[j];
const indexDiff = startIndex - i;
if (lookupPattern[lookupPattern.length - 1 - indexDiff] !== token || (this._patternMaxLength > 0 && indexDiff >= this._patternMaxLength)) {
checkIndexes.splice(j, 1);
if (indexDiff >= this._patternMinLength && indexDiff >= bestIndexDiff) {
bestIndex = startIndex;
bestIndexDiff = indexDiff;
}
}
}
if (token === lookupPattern[lookupPattern.length - 1])
checkIndexes.unshift(i);
}
for (let j = checkIndexes.length - 1; j >= 0; j--) {
const startIndex = checkIndexes[j];
const indexDiff = startIndex + 1;
checkIndexes.splice(j, 1);
if (indexDiff >= this._patternMinLength && indexDiff >= bestIndexDiff) {
bestIndex = startIndex;
bestIndexDiff = indexDiff;
}
}
if (bestIndex >= 0)
return [bestIndex - (bestIndexDiff - 1), bestIndexDiff];
return [];
}
}
//# sourceMappingURL=InputLookupTokenPredictor.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,458 @@
import { PickOptions } from "../../utils/utilTypes.js";
import type { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
import type { TokenBias } from "../TokenBias.js";
import type { Token } from "../../types.js";
import type { LlamaContextSequence } from "./LlamaContext.js";
export type LlamaContextOptions = {
/**
* number of sequences for the context.
* Each sequence is a different "text generation process" that can run in parallel to other sequences in the same context.
* Although a single context has multiple sequences, the sequences are separate from each other and do not share data with each other.
* This is beneficial for performance, as multiple sequences can be evaluated in parallel (on the same batch).
*
* Each sequence increases the memory usage of the context.
*
* Defaults to `1`.
*/
sequences?: number;
/**
* The number of tokens the model can see at once.
* - **`"auto"`** - adapt to the current VRAM state and attempt to set the context size as high as possible up to the size
* the model was trained on.
* - **`number`** - set the context size to a specific number of tokens.
* If there's not enough VRAM, an error will be thrown.
* Use with caution.
* - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attempt to set the context size as high as possible
* up to the size the model was trained on, but at least `min` and at most `max`.
*
* The actual context size may be slightly larger than your request (by up to 256) due to the implementation in `llama.cpp` that
* aligns the context size to multiples of 256 for performance reasons.
* To check the actual context size that gets created, use the `.contextSize` property
* of the created context instance or any of its sequences.
*
* Defaults to `"auto"`.
*/
contextSize?: "auto" | number | {
min?: number;
max?: number;
};
/**
* The number of tokens that can be processed at once by the GPU.
*
* Defaults to `512` or `contextSize` if `contextSize` is less than `512`.
*/
batchSize?: number;
/**
* Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory.
*
* The support for flash attention is currently experimental and may not always work as expected.
* Use with caution.
*
* This option will be ignored if flash attention is not supported by the model.
*
* Defaults to `false` (inherited from the model option `defaultContextFlashAttention`).
*
* Upon flash attention exiting the experimental status, the default value will become `true`
* (the inherited value from the model option `defaultContextFlashAttention` will become `true`).
*/
flashAttention?: boolean;
/**
* number of threads to use to evaluate tokens.
* set to 0 to use the maximum threads supported by the current machine hardware.
*
* This value is considered as a hint, and the actual number of threads used may be lower when other evaluations are running.
* To ensure the minimum number of threads you want to use are always used,
* set this to an object with a `min` property (see the `min` property description for more details).
*
* If `maxThreads` from the Llama instance is set to `0`, this value will always be the actual number of threads used.
*
* If `maxThreads` from the Llama instance is set to `0`, defaults to the `.cpuMathCores` value from the Llama instance,
* otherwise defaults to `maxThreads` from the Llama instance (see the `maxThreads` option of `getLlama` method for more details).
*/
threads?: number | {
/**
* The ideal number of threads to use for evaluations.
*
* If other evaluations are running, the actual number of threads may be lower than this value.
*
* If `maxThreads` from the Llama instance is set to `0`, this value will always be the actual number of threads used.
*
* If `maxThreads` from the Llama instance is set to `0`, defaults to the `.cpuMathCores` value from the Llama instance,
* otherwise defaults to `maxThreads` from the Llama instance (see the `maxThreads` option of `getLlama` method for more details).
*/
ideal?: number;
/**
* Ensure evaluations always use at least this number of threads.
*
* Use with caution, since setting this value too high can lead to the context waiting too much time
* to reserve this number of threads before the evaluation can start.
*/
min?: number;
};
/**
* Control the parallel sequences processing behavior.
*
* See {@link BatchingOptions} for more information.
*/
batching?: BatchingOptions;
/**
* When using SWA (Sliding Window Attention) on a supported model,
* extend the sliding window size to the current context size (meaning practically disabling SWA).
*
* Enabling this option will consume more memory on models that support SWA (Sliding Window Attention),
* but will allow reusing the evaluation cache of any prefix length of the context sequence state
* (instead of just the size of the sliding window when SWA is used).
*
* This option has no effect on models that do not support SWA (Sliding Window Attention).
*
* > **Note:** you can check the SWA size using `model.fileInsights.swaSize`.
*
* Defaults to `false` (inherited from the model option `defaultContextSwaFullCache`).
*/
swaFullCache?: boolean;
/**
* Load the provided LoRA adapters onto the context.
* LoRA adapters are used to modify the weights of a pretrained model to adapt to new tasks or domains
* without the need for extensive retraining from scratch.
*
* If a string is provided, it will be treated as a path to a single LoRA adapter file.
*
* The adapters will be released from memory once the model (not just the context) is disposed.
*/
lora?: string | {
adapters: Array<{
filePath: string;
/**
* Defaults to `1`
*/
scale?: number;
}>;
/**
* Called with the LoRA adapters load percentage when the LoRA adapters are being loaded.
* @param loadProgress - a number between 0 (exclusive) and 1 (inclusive).
*/
onLoadProgress?(loadProgress: number): void;
};
/** An abort signal to abort the context creation */
createSignal?: AbortSignal;
/**
* Ignore insufficient memory errors and continue with the context creation.
* Can cause the process to crash if there's not enough VRAM for the new context.
*
* Defaults to `false`.
*/
ignoreMemorySafetyChecks?: boolean;
/**
* On failed context creation, retry the creation with a smaller context size.
*
* Only works if `contextSize` is set to `"auto"`, left as default or set to an object with `min` and/or `max` properties.
*
* Set `retries` to `false` to disable.
*/
failedCreationRemedy?: false | {
/**
* Retries to attempt to create the context.
*
* Defaults to `6`.
*/
retries?: number;
/**
* The percentage to decrease the context size by on each retry.
* Should be a number between `0` and `1`.
*
* If a function is provided, it will be called with the current context size and should return the new context size.
*
* Defaults to `0.16`.
*/
autoContextSizeShrink?: number | ((contextSize: number) => number);
};
/**
* Track the inference performance of the context, so using `.printTimings()` will work.
*
* Defaults to `false`.
*/
performanceTracking?: boolean;
};
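/*
 * A minimal sketch of passing these options when creating a context
 * (assuming `model` is an already-loaded `LlamaModel`):
 *
 *     const context = await model.createContext({
 *         contextSize: {max: 8192},
 *         sequences: 2,
 *         flashAttention: false
 *     });
 */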
export type LlamaContextSequenceRepeatPenalty = {
/** Tokens to lower the probability of being predicted as the next token */
punishTokens: Token[] | (() => Token[]);
/**
* The maximum number of tokens that will be provided in the `punishTokens` array.
*
* This is used as a hint for a performance optimization for avoiding frequent memory deallocation and reallocation.
*
* Don't set this value too high, as it can allocate too much memory.
*
* Defaults to `64`.
*/
maxPunishTokens?: number;
/**
* The relative amount to lower the probability of the tokens in `punishTokens` by.
*
* Defaults to `1.1`.
* Set to `1` to disable.
*/
penalty?: number;
/**
* For n time a token is in the `punishTokens` array, lower its probability by `n * frequencyPenalty`.
*
* Disabled by default (`0`).
* Set to a value between `0` and `1` to enable.
*/
frequencyPenalty?: number;
/**
* Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`.
*
* Disabled by default (`0`).
* Set to a value between `0` and `1` to enable.
*/
presencePenalty?: number;
};
export type BatchingOptions = {
/**
* The strategy used to dispatch items to be processed when there are items pending to be processed.
* - **`"nextCycle"`** - dispatch the items on the next event loop cycle.
* You can provide a custom function to define a custom dispatch schedule.
*
* Defaults to `"nextCycle"`.
*/
dispatchSchedule?: "nextCycle" | CustomBatchingDispatchSchedule;
/**
* The strategy used to prioritize pending items to be processed.
* - **`"maximumParallelism"`** - process as many different sequences in parallel as possible.
* - **`"firstInFirstOut"`** - process items in the order they were added.
* - **Custom prioritization function** - a custom function that prioritizes the items to be processed.
* See the {@link CustomBatchingPrioritizationStrategy} type for more information.
*
* Defaults to `"maximumParallelism"`.
*/
itemPrioritizationStrategy?: "maximumParallelism" | "firstInFirstOut" | CustomBatchingPrioritizationStrategy;
};
/**
* A function that schedules the dispatch of the batch items.
* Call the `dispatch` function to dispatch the items.
*/
export type CustomBatchingDispatchSchedule = (dispatch: () => void) => void;
/**
* A function that prioritizes the batch items to be processed.
* The function receives an array of `items` and the `size` of how many tokens can be processed in this batch.
*
* The function should return an array of prioritized items,
* where the sum of `processAmount` of all the items is less than or equal to the given `size` that the function received,
* and where the `item` of each prioritized item is the same reference to an original item in the `items` array.
*/
export type CustomBatchingPrioritizationStrategy = (options: {
items: readonly BatchItem[];
size: number;
}) => PrioritizedBatchItem[];
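/*
 * A minimal custom prioritization strategy sketch; it greedily fills the batch in descending
 * evaluation-priority order, mirroring the built-in `firstInFirstOutStrategy` implementation
 * included further below in this commit:
 *
 *     const byPriority: CustomBatchingPrioritizationStrategy = ({items, size}) => {
 *         const res: PrioritizedBatchItem[] = [];
 *         let leftFreeTokens = size;
 *         for (const item of items.slice().sort((a, b) => b.evaluationPriority - a.evaluationPriority)) {
 *             const processAmount = Math.min(item.tokens.length, leftFreeTokens);
 *             if (processAmount === 0)
 *                 break;
 *             res.push({item, processAmount});
 *             leftFreeTokens -= processAmount;
 *         }
 *         return res;
 *     };
 */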
export type ContextShiftOptions = {
size?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
strategy?: "eraseBeginning" | ((options: {
sequence: LlamaContextSequence;
size: number;
}) => ContextTokensDeleteRange[] | Promise<ContextTokensDeleteRange[]>);
};
export type ContextTokensDeleteRange = {
start: number;
end: number;
};
export type SequenceEvaluateOptions = {
temperature?: number;
minP?: number;
topK?: number;
topP?: number;
/**
* Used to control the randomness of the generated text.
*
* Change the seed to get different results.
*
* Defaults to the current epoch time.
*
* Only relevant when using `temperature`.
*/
seed?: number;
grammarEvaluationState?: LlamaGrammarEvaluationState | (() => LlamaGrammarEvaluationState | undefined);
repeatPenalty?: LlamaContextSequenceRepeatPenalty;
/**
* Adjust the probability of tokens being generated.
* Can be used to bias the model to generate tokens that you want it to lean towards,
* or to avoid generating tokens that you want it to avoid.
*/
tokenBias?: TokenBias | (() => TokenBias);
/**
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
* evaluated based on the strategy chosen for the context.
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
* highest evaluation priority.
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
*/
evaluationPriority?: EvaluationPriority;
/**
* Override the sequence context shift options for this evaluation
*
* See {@link ContextShiftOptions} for more information.
*/
contextShift?: ContextShiftOptions;
/**
* Yield an EOG (End Of Generation) token (like EOS and EOT) when it's generated.
* When `false` the generation will stop when an EOG token is generated and the token won't be yielded.
* Defaults to `false`.
*/
yieldEogToken?: boolean;
};
export type SequenceEvaluateMetadataOptions = {
/**
* Get the confidence (probability) of the selected token.
*
* Same as `probabilities.get(token)` from the output.
*
* If you need only this value, you can skip getting the full probabilities list to improve performance.
*
* This value might be slightly different when evaluated on different GPUs and configurations.
*/
readonly confidence?: boolean;
/**
* Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
*
* Only enable when needed, as it impacts the performance.
*
* Defaults to `false`.
*/
readonly probabilities?: boolean;
};
export type SequenceEvaluateOutput<Options extends {
readonly confidence?: boolean;
readonly probabilities?: boolean;
} = {
readonly confidence: true;
readonly probabilities: true;
}> = PickOptions<{
/**
* The next token generated by the model and selected using the given options (such as temperature).
*/
token: Token;
/**
* The confidence (probability) of the selected token.
*
* Same as `probabilities.get(token)`.
*
* If you need only this value, you can skip getting the full probabilities list to improve performance.
*
* This value might be slightly different when evaluated on different GPUs and configurations.
*/
confidence: number;
/**
* The probabilities of the tokens from the vocabulary to be the next token.
*
* A probability is a number from `0` to `1`.
*
* The probabilities might be slightly different when evaluated on different GPUs and configurations.
*
* The map is sorted by the probability of the tokens from the highest to the lowest,
* and is reflected in the order of the entries when iterating over the map.
* Use `.entries().next().value` to get the top probability pair
* ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
*/
probabilities: Map<Token, number>;
}, Options & {
token: true;
}>;
export type ControlledEvaluateInputItem = Token | [
token: Token,
options: {
generateNext?: {
/**
* Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
*
* Only enable when needed, as it impacts the performance.
*
* Defaults to `false`.
*/
probabilities?: boolean;
/**
* Get the confidence (probability) of the selected token.
*
* Same as `next.probabilities.get(next.token)` from the output.
*
* If you need only this value, you can skip getting the full probabilities list to improve performance.
*
* This value might be slightly different when evaluated on different GPUs and configurations.
*/
confidence?: boolean;
/**
* Generate the next token with the provided options using sampling.
*
* Setting this to `true` will generate probabilities for the next token and sample it.
*/
token?: boolean;
options?: {
temperature?: number;
minP?: number;
topK?: number;
topP?: number;
/**
* Used to control the randomness of the generated text.
*
* Change the seed to get different results.
*
* Defaults to the current epoch time.
*
* Only relevant when using `temperature`.
*/
seed?: number;
repeatPenalty?: LlamaContextSequenceRepeatPenalty;
/**
* Adjust the probability of tokens being generated.
* Can be used to bias the model to generate tokens that you want it to lean towards,
* or to avoid generating tokens that you want it to avoid.
*/
tokenBias?: TokenBias | (() => TokenBias);
};
};
}
];
export type ControlledEvaluateIndexOutput = {
next: {
token?: Token | null;
/**
* The confidence (probability) of the selected token (the `token` field in this object).
*
* Same as `next.probabilities.get(next.token)`.
*
* If you need only this value, you can skip getting the full probabilities list to improve performance.
*
* This value might be slightly different when evaluated on different GPUs and configurations.
*/
confidence?: number;
/**
* The probabilities of the tokens from the vocabulary to be the next token.
*
* A probability is a number from `0` to `1`.
*
* The probabilities might be slightly different when evaluated on different GPUs and configurations.
*
* The map is sorted by the probability of the tokens from the highest to the lowest,
* and is reflected in the order of the entries when iterating over the map.
* Use `.entries().next().value` to get the top probability pair
* ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
*/
probabilities?: Map<Token, number>;
};
};
/**
* 1 - low
*
* 5 - high
*/
export type EvaluationPriority = 1 | 2 | 3 | 4 | 5;
export type BatchItem = {
readonly tokens: readonly Token[];
readonly logits: readonly (true | undefined)[];
readonly evaluationPriority: EvaluationPriority;
};
export type PrioritizedBatchItem = {
item: BatchItem;
processAmount: number;
};

View File

@@ -0,0 +1,2 @@
export {};
//# sourceMappingURL=types.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/types.ts"],"names":[],"mappings":""}

View File

@@ -0,0 +1,5 @@
import { BatchItem, PrioritizedBatchItem } from "../../types.js";
export declare function firstInFirstOutStrategy({ items, size }: {
items: readonly BatchItem[];
size: number;
}): PrioritizedBatchItem[];

View File

@@ -0,0 +1,16 @@
export function firstInFirstOutStrategy({ items, size }) {
const res = [];
const sortedItems = items
.slice()
.sort((a, b) => b.evaluationPriority - a.evaluationPriority);
let leftFreeTokens = size;
for (const item of sortedItems) {
const processAmount = Math.min(item.tokens.length, leftFreeTokens);
res.push({ item, processAmount });
leftFreeTokens -= processAmount;
if (leftFreeTokens === 0)
break;
}
return res;
}
//# sourceMappingURL=firstInFirstOutStrategy.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"firstInFirstOutStrategy.js","sourceRoot":"","sources":["../../../../../src/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,uBAAuB,CAAC,EAAC,KAAK,EAAE,IAAI,EAA8C;IAC9F,MAAM,GAAG,GAA2B,EAAE,CAAC;IAEvC,MAAM,WAAW,GAAG,KAAK;SACpB,KAAK,EAAE;SACP,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,kBAAkB,GAAG,CAAC,CAAC,kBAAkB,CAAC,CAAC;IAEjE,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;QAC7B,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;QACnE,GAAG,CAAC,IAAI,CAAC,EAAC,IAAI,EAAE,aAAa,EAAC,CAAC,CAAC;QAChC,cAAc,IAAI,aAAa,CAAC;QAEhC,IAAI,cAAc,KAAK,CAAC;YACpB,MAAM;IACd,CAAC;IAED,OAAO,GAAG,CAAC;AACf,CAAC"}

View File

@@ -0,0 +1,5 @@
import { BatchItem, PrioritizedBatchItem } from "../../types.js";
export declare function maximumParallelismStrategy({ items, size }: {
items: readonly BatchItem[];
size: number;
}): PrioritizedBatchItem[];

View File

@@ -0,0 +1,42 @@
export function maximumParallelismStrategy({ items, size }) {
// phase 1: give every item an equal share of the batch, clipped to its own token count and to the remaining budget
let leftFreeTokens = size;
const minTokensForEachItem = Math.floor(leftFreeTokens / items.length);
const res = [];
const clippedItems = [];
for (const item of items) {
const processAmount = Math.min(item.tokens.length, leftFreeTokens, minTokensForEachItem);
const prioritizeItem = { item, processAmount };
res.push(prioritizeItem);
leftFreeTokens -= processAmount;
if (processAmount < item.tokens.length)
clippedItems.push(prioritizeItem);
if (leftFreeTokens === 0)
break;
}
// phase 2: distribute the leftover budget among the clipped items over up to three passes
for (let passesLeft = 3; leftFreeTokens > 0 && clippedItems.length > 0 && passesLeft > 0; passesLeft--) {
const minIncreaseAmount = Math.ceil(leftFreeTokens / clippedItems.length);
for (let i = 0; i < clippedItems.length && leftFreeTokens > 0; i++) {
const prioritizeItem = clippedItems[i];
const unprocessedAmount = prioritizeItem.item.tokens.length - prioritizeItem.processAmount;
const increaseAmount = Math.min(unprocessedAmount, leftFreeTokens, minIncreaseAmount);
prioritizeItem.processAmount += increaseAmount;
if (increaseAmount === unprocessedAmount) {
clippedItems.splice(i, 1);
i--;
}
}
}
// phase 3: give whatever budget still remains to the clipped items with the highest evaluation priority first
clippedItems.sort((a, b) => b.item.evaluationPriority - a.item.evaluationPriority);
for (let i = 0; i < clippedItems.length && leftFreeTokens > 0; i++) {
const prioritizeItem = clippedItems[i];
const unprocessedAmount = prioritizeItem.item.tokens.length - prioritizeItem.processAmount;
const increaseAmount = Math.min(unprocessedAmount, leftFreeTokens);
prioritizeItem.processAmount += increaseAmount;
if (increaseAmount === unprocessedAmount) {
clippedItems.splice(i, 1);
i--;
}
}
return res;
}
//# sourceMappingURL=maximumParallelismStrategy.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"maximumParallelismStrategy.js","sourceRoot":"","sources":["../../../../../src/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,0BAA0B,CAAC,EAAC,KAAK,EAAE,IAAI,EAA8C;IACjG,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,MAAM,oBAAoB,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;IAEvE,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,MAAM,YAAY,GAA2B,EAAE,CAAC;IAEhD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACvB,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,cAAc,EAAE,oBAAoB,CAAC,CAAC;QACzF,MAAM,cAAc,GAAG,EAAC,IAAI,EAAE,aAAa,EAAC,CAAC;QAE7C,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QACzB,cAAc,IAAI,aAAa,CAAC;QAEhC,IAAI,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM;YAClC,YAAY,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAEtC,IAAI,cAAc,KAAK,CAAC;YACpB,MAAM;IACd,CAAC;IAED,KAAK,IAAI,UAAU,GAAG,CAAC,EAAE,cAAc,GAAG,CAAC,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,UAAU,GAAG,CAAC,EAAE,UAAU,EAAE,EAAE,CAAC;QACrG,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC;QAE1E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACjE,MAAM,cAAc,GAAG,YAAY,CAAC,CAAC,CAAE,CAAC;YACxC,MAAM,iBAAiB,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,cAAc,CAAC,aAAa,CAAC;YAC3F,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,iBAAiB,EAAE,cAAc,EAAE,iBAAiB,CAAC,CAAC;YACtF,cAAc,CAAC,aAAa,IAAI,cAAc,CAAC;YAE/C,IAAI,cAAc,KAAK,iBAAiB,EAAE,CAAC;gBACvC,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;gBAC1B,CAAC,EAAE,CAAC;YACR,CAAC;QACL,CAAC;IACL,CAAC;IAED,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,kBAAkB,GAAG,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;IAEnF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACjE,MAAM,cAAc,GAAG,YAAY,CAAC,CAAC,CAAE,CAAC;QACxC,MAAM,iBAAiB,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,cAAc,CAAC,aAAa,CAAC;QAC3F,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,iBAAiB,EAAE,cAAc,CAAC,CAAC;QACnE,cAAc,CAAC,aAAa,IAAI,cAAc,CAAC;QAE/C,IAAI,cAAc,KAAK,iBAAiB,EAAE,CAAC;YACvC,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAC1B,CAAC,EAAE,CAAC;QACR,CAAC;IACL,CAAC;IAED,OAAO,GAAG,CAAC;AACf,CAAC"}

View File

@@ -0,0 +1 @@
export declare function padSafeContextSize(value: number, padDirection: "up" | "down", padding?: number): number;

View File

@@ -0,0 +1,18 @@
import { contextSizePad } from "../../../config.js";
export function padSafeContextSize(value, padDirection, padding = contextSizePad) {
const paddedSize = ggmlPad(value, padding);
if (paddedSize === value)
return value;
else if (padDirection === "up")
return paddedSize;
else if (padDirection === "down") {
const smallerPaddedSize = ggmlPad(value - padding, padding);
if (smallerPaddedSize >= padding)
return smallerPaddedSize;
}
return paddedSize;
}
// rounds `value` up to the nearest multiple of `padding` (`padding` is assumed to be a power of two)
function ggmlPad(value, padding) {
return ((value + padding - 1) & ~(padding - 1));
}
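// Illustration only, using an explicit padding of 32 (the real default comes from
// `contextSizePad` in the config):
//   padSafeContextSize(1024, "up", 32)   === 1024  (already a multiple of 32)
//   padSafeContextSize(1000, "up", 32)   === 1024  (rounded up)
//   padSafeContextSize(1000, "down", 32) === 992   (rounded down, but never below the padding)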
//# sourceMappingURL=padSafeContextSize.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"padSafeContextSize.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaContext/utils/padSafeContextSize.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,cAAc,EAAC,MAAM,oBAAoB,CAAC;AAElD,MAAM,UAAU,kBAAkB,CAAC,KAAa,EAAE,YAA2B,EAAE,UAAkB,cAAc;IAC3G,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAE3C,IAAI,UAAU,KAAK,KAAK;QACpB,OAAO,KAAK,CAAC;SACZ,IAAI,YAAY,KAAK,IAAI;QAC1B,OAAO,UAAU,CAAC;SACjB,IAAI,YAAY,KAAK,MAAM,EAAE,CAAC;QAC/B,MAAM,iBAAiB,GAAG,OAAO,CAAC,KAAK,GAAG,OAAO,EAAE,OAAO,CAAC,CAAC;QAC5D,IAAI,iBAAiB,IAAI,OAAO;YAC5B,OAAO,iBAAiB,CAAC;IACjC,CAAC;IAED,OAAO,UAAU,CAAC;AACtB,CAAC;AACD,SAAS,OAAO,CAAC,KAAa,EAAE,OAAe;IAC3C,OAAO,CAAC,CAAC,KAAK,GAAG,OAAO,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC;AACpD,CAAC"}

View File

@@ -0,0 +1,2 @@
import { BatchingOptions } from "../types.js";
export declare function resolveBatchItemsPrioritizationStrategy(strategy: Required<BatchingOptions>["itemPrioritizationStrategy"]): import("../types.js").CustomBatchingPrioritizationStrategy;

View File

@@ -0,0 +1,13 @@
import { maximumParallelismStrategy } from "./batchItemsPrioritizationStrategies/maximumParallelismStrategy.js";
import { firstInFirstOutStrategy } from "./batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js";
export function resolveBatchItemsPrioritizationStrategy(strategy) {
if (strategy instanceof Function)
return strategy;
else if (strategy === "maximumParallelism")
return maximumParallelismStrategy;
else if (strategy === "firstInFirstOut")
return firstInFirstOutStrategy;
void strategy;
throw new Error(`Unknown batch items prioritization strategy: ${strategy}`);
}
//# sourceMappingURL=resolveBatchItemsPrioritizationStrategy.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"resolveBatchItemsPrioritizationStrategy.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,0BAA0B,EAAC,MAAM,oEAAoE,CAAC;AAC9G,OAAO,EAAC,uBAAuB,EAAC,MAAM,iEAAiE,CAAC;AAExG,MAAM,UAAU,uCAAuC,CAAC,QAAiE;IACrH,IAAI,QAAQ,YAAY,QAAQ;QAC5B,OAAO,QAAQ,CAAC;SACf,IAAI,QAAQ,KAAK,oBAAoB;QACtC,OAAO,0BAA0B,CAAC;SACjC,IAAI,QAAQ,KAAK,iBAAiB;QACnC,OAAO,uBAAuB,CAAC;IAEnC,KAAM,QAAyB,CAAC;IAEhC,MAAM,IAAI,KAAK,CAAC,4CAA4C,QAAQ,EAAE,CAAC,CAAC;AAC5E,CAAC"}

View File

@@ -0,0 +1,21 @@
export type LlamaEmbeddingOptions = {
vector: readonly number[];
};
export type LlamaEmbeddingJSON = {
type: "embedding";
vector: readonly number[];
};
export declare class LlamaEmbedding {
readonly vector: readonly number[];
constructor(options: LlamaEmbeddingOptions);
toJSON(): LlamaEmbeddingJSON;
/**
* Calculates the cosine similarity between this embedding and another embedding.
*
* Note that you should only compare embeddings created by the exact same model file.
* @returns A value between 0 and 1 representing the similarity between the embedding vectors,
* where 1 means the embeddings are identical.
*/
calculateCosineSimilarity(other: LlamaEmbedding | LlamaEmbeddingJSON | readonly number[]): number;
static fromJSON(json: LlamaEmbeddingJSON): LlamaEmbedding;
}
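// Usage sketch (illustration only): comparing a query embedding against stored embeddings that
// were serialized with `toJSON()`. All embeddings are assumed to come from the same model file,
// as required by `calculateCosineSimilarity`.
function findMostSimilar(query: LlamaEmbedding, documents: Array<{text: string, embedding: LlamaEmbeddingJSON}>) {
    let best: {text: string, similarity: number} | undefined;

    for (const {text, embedding} of documents) {
        // `calculateCosineSimilarity` also accepts the JSON form (or a raw vector) directly
        const similarity = query.calculateCosineSimilarity(embedding);
        if (best == null || similarity > best.similarity)
            best = {text, similarity};
    }

    return best;
}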

View File

@@ -0,0 +1,53 @@
export class LlamaEmbedding {
vector;
constructor(options) {
this.vector = Object.freeze(options.vector.slice());
}
toJSON() {
return {
type: "embedding",
vector: this.vector
};
}
/**
* Calculates the cosine similarity between this embedding and another embedding.
*
* Note that you should only compare embeddings created by the exact same model file.
* @returns A value between 0 and 1 representing the similarity between the embedding vectors,
* where 1 means the embeddings are identical.
*/
calculateCosineSimilarity(other) {
const otherVector = other instanceof Array
? other
: other.vector;
if (otherVector == null)
throw new Error("Other vector is null");
else if (otherVector.length !== this.vector.length) {
if (otherVector.length === 0 || this.vector.length === 0)
return 0;
else
throw new Error("Vectors have different lengths");
}
let dotProduct = 0;
let thisMagnitude = 0;
let otherMagnitude = 0;
for (let i = 0; i < this.vector.length; i++) {
dotProduct += this.vector[i] * otherVector[i];
thisMagnitude += Math.pow(this.vector[i], 2);
otherMagnitude += Math.pow(otherVector[i], 2);
}
if (thisMagnitude === 0 && otherMagnitude === 0)
return 1;
else if (thisMagnitude === 0 || otherMagnitude === 0)
return 0;
const thisNorm = Math.sqrt(thisMagnitude);
const otherNorm = Math.sqrt(otherMagnitude);
return dotProduct / (thisNorm * otherNorm);
}
static fromJSON(json) {
return new LlamaEmbedding({
vector: json.vector
});
}
}
//# sourceMappingURL=LlamaEmbedding.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LlamaEmbedding.js","sourceRoot":"","sources":["../../src/evaluator/LlamaEmbedding.ts"],"names":[],"mappings":"AASA,MAAM,OAAO,cAAc;IACP,MAAM,CAAoB;IAE1C,YAAmB,OAA8B;QAC7C,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;IACxD,CAAC;IAEM,MAAM;QACT,OAAO;YACH,IAAI,EAAE,WAAW;YACjB,MAAM,EAAE,IAAI,CAAC,MAAM;SACtB,CAAC;IACN,CAAC;IAED;;;;;;OAMG;IACI,yBAAyB,CAAC,KAA8D;QAC3F,MAAM,WAAW,GAAG,KAAK,YAAY,KAAK;YACtC,CAAC,CAAC,KAAK;YACP,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC;QAEnB,IAAI,WAAW,IAAI,IAAI;YACnB,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;aACvC,IAAI,WAAW,CAAC,MAAM,KAAK,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YACjD,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,MAAM,KAAK,CAAC;gBACpD,OAAO,CAAC,CAAC;;gBAET,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;QAC1D,CAAC;QAED,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,cAAc,GAAG,CAAC,CAAC;QACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,UAAU,IAAI,IAAI,CAAC,MAAM,CAAC,CAAC,CAAE,GAAG,WAAW,CAAC,CAAC,CAAE,CAAC;YAChD,aAAa,IAAI,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC,CAAC;YAC9C,cAAc,IAAI,IAAI,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC,CAAC;QACnD,CAAC;QAED,IAAI,aAAa,KAAK,CAAC,IAAI,cAAc,KAAK,CAAC;YAC3C,OAAO,CAAC,CAAC;aACR,IAAI,aAAa,KAAK,CAAC,IAAI,cAAc,KAAK,CAAC;YAChD,OAAO,CAAC,CAAC;QAEb,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAE5C,OAAO,UAAU,GAAG,CAAC,QAAQ,GAAG,SAAS,CAAC,CAAC;IAC/C,CAAC;IAEM,MAAM,CAAC,QAAQ,CAAC,IAAwB;QAC3C,OAAO,IAAI,cAAc,CAAC;YACtB,MAAM,EAAE,IAAI,CAAC,MAAM;SACtB,CAAC,CAAC;IACP,CAAC;CACJ"}

View File

@@ -0,0 +1,52 @@
import { EventRelay } from "lifecycle-utils";
import { Token } from "../types.js";
import { LlamaText } from "../utils/LlamaText.js";
import { LlamaEmbedding } from "./LlamaEmbedding.js";
import type { LlamaModel } from "./LlamaModel/LlamaModel.js";
export type LlamaEmbeddingContextOptions = {
/**
* The number of tokens the model can see at once.
* - **`"auto"`** - adapt to the current VRAM state and attemp to set the context size as high as possible up to the size
* the model was trained on.
* - **`number`** - set the context size to a specific number of tokens.
* If there's not enough VRAM, an error will be thrown.
* Use with caution.
* - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attempt to set the context size as high as possible
* up to the size the model was trained on, but at least `min` and at most `max`.
*
* Defaults to `"auto"`.
*/
contextSize?: "auto" | number | {
min?: number;
max?: number;
};
/** Prompt processing batch size */
batchSize?: number;
/**
* The number of threads to use to evaluate tokens.
* Set to `0` to use the maximum number of threads supported by the current machine hardware.
*/
threads?: number;
/** An abort signal to abort the context creation */
createSignal?: AbortSignal;
/**
* Ignore insufficient memory errors and continue with the context creation.
* Can cause the process to crash if there's not enough VRAM for the new context.
*
* Defaults to `false`.
*/
ignoreMemorySafetyChecks?: boolean;
};
/**
* @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
*/
export declare class LlamaEmbeddingContext {
readonly onDispose: EventRelay<void>;
private constructor();
getEmbeddingFor(input: Token[] | string | LlamaText): Promise<LlamaEmbedding>;
dispose(): Promise<void>;
/** @hidden */
[Symbol.asyncDispose](): Promise<void>;
get disposed(): boolean;
get model(): LlamaModel;
}
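// Usage sketch (illustration only): creating an embedding context and comparing two inputs.
// `getLlama()` and `llama.loadModel(...)` are assumed from the package's public API, and the
// model path is a placeholder - see the Using Embedding tutorial linked above.
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
const embeddingContext = await model.createEmbeddingContext({contextSize: "auto"});

const helloEmbedding = await embeddingContext.getEmbeddingFor("Hello world");
const hiEmbedding = await embeddingContext.getEmbeddingFor("Hi there");
console.log("similarity:", helloEmbedding.calculateCosineSimilarity(hiEmbedding));

await embeddingContext.dispose();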

View File

@@ -0,0 +1,86 @@
import { AsyncDisposeAggregator, EventRelay, withLock } from "lifecycle-utils";
import { tokenizeInput } from "../utils/tokenizeInput.js";
import { resolveBeginningTokenToPrepend, resolveEndTokenToAppend } from "../utils/tokenizerUtils.js";
import { LlamaEmbedding } from "./LlamaEmbedding.js";
/**
* @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
*/
export class LlamaEmbeddingContext {
/** @internal */ _llamaContext;
/** @internal */ _sequence;
/** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
onDispose = new EventRelay();
constructor({ _llamaContext }) {
this._llamaContext = _llamaContext;
this._sequence = this._llamaContext.getSequence();
this._disposeAggregator.add(this._llamaContext.onDispose.createListener(() => {
void this._disposeAggregator.dispose();
}));
this._disposeAggregator.add(this.onDispose.dispatchEvent);
this._disposeAggregator.add(async () => {
await this._llamaContext.dispose();
});
}
async getEmbeddingFor(input) {
const resolvedInput = tokenizeInput(input, this._llamaContext.model.tokenizer, undefined, true);
if (resolvedInput.length > this._llamaContext.contextSize)
throw new Error("Input is longer than the context size. " +
"Try to increase the context size or use another model that supports longer contexts.");
else if (resolvedInput.length === 0)
return new LlamaEmbedding({
vector: []
});
const beginningToken = resolveBeginningTokenToPrepend(this.model.vocabularyType, this.model.tokens);
if (beginningToken != null && resolvedInput[0] !== beginningToken)
resolvedInput.unshift(beginningToken);
const endToken = resolveEndTokenToAppend(this.model.vocabularyType, this.model.tokens);
if (endToken != null && resolvedInput.at(-1) !== endToken)
resolvedInput.push(endToken);
return await withLock([this, "evaluate"], async () => {
await this._sequence.eraseContextTokenRanges([{
start: 0,
end: this._sequence.nextTokenIndex
}]);
const iterator = this._sequence.evaluate(resolvedInput, { _noSampling: true });
// eslint-disable-next-line @typescript-eslint/no-unused-vars
for await (const token of iterator) {
break; // only generate one token to get embeddings
}
const embedding = this._llamaContext._ctx.getEmbedding(resolvedInput.length);
const embeddingVector = Array.from(embedding);
return new LlamaEmbedding({
vector: embeddingVector
});
});
}
async dispose() {
await this._disposeAggregator.dispose();
}
/** @hidden */
[Symbol.asyncDispose]() {
return this.dispose();
}
get disposed() {
return this._llamaContext.disposed;
}
get model() {
return this._llamaContext.model;
}
/** @internal */
static async _create({ _model }, { contextSize, batchSize, threads = 6, createSignal, ignoreMemorySafetyChecks }) {
if (_model.fileInsights.hasEncoder && _model.fileInsights.hasDecoder)
throw new Error("Computing embeddings is not supported for encoder-decoder models.");
const llamaContext = await _model.createContext({
contextSize,
batchSize,
threads,
createSignal,
ignoreMemorySafetyChecks,
_embeddings: true
});
return new LlamaEmbeddingContext({
_llamaContext: llamaContext
});
}
}
//# sourceMappingURL=LlamaEmbeddingContext.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LlamaEmbeddingContext.js","sourceRoot":"","sources":["../../src/evaluator/LlamaEmbeddingContext.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,sBAAsB,EAAE,UAAU,EAAE,QAAQ,EAAC,MAAM,iBAAiB,CAAC;AAG7E,OAAO,EAAC,aAAa,EAAC,MAAM,2BAA2B,CAAC;AACxD,OAAO,EAAC,8BAA8B,EAAE,uBAAuB,EAAC,MAAM,4BAA4B,CAAC;AACnG,OAAO,EAAC,cAAc,EAAC,MAAM,qBAAqB,CAAC;AA2CnD;;GAEG;AACH,MAAM,OAAO,qBAAqB;IAC9B,gBAAgB,CAAkB,aAAa,CAAe;IAC9D,gBAAgB,CAAkB,SAAS,CAAuB;IAClE,gBAAgB,CAAkB,kBAAkB,GAAG,IAAI,sBAAsB,EAAE,CAAC;IAEpE,SAAS,GAAG,IAAI,UAAU,EAAQ,CAAC;IAEnD,YAAoB,EAChB,aAAa,EAGhB;QACG,IAAI,CAAC,aAAa,GAAG,aAAa,CAAC;QACnC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,WAAW,EAAE,CAAC;QAElD,IAAI,CAAC,kBAAkB,CAAC,GAAG,CACvB,IAAI,CAAC,aAAa,CAAC,SAAS,CAAC,cAAc,CAAC,GAAG,EAAE;YAC7C,KAAK,IAAI,CAAC,kBAAkB,CAAC,OAAO,EAAE,CAAC;QAC3C,CAAC,CAAC,CACL,CAAC;QACF,IAAI,CAAC,kBAAkB,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC,CAAC;QAC1D,IAAI,CAAC,kBAAkB,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE;YACnC,MAAM,IAAI,CAAC,aAAa,CAAC,OAAO,EAAE,CAAC;QACvC,CAAC,CAAC,CAAC;IACP,CAAC;IAEM,KAAK,CAAC,eAAe,CAAC,KAAmC;QAC5D,MAAM,aAAa,GAAG,aAAa,CAAC,KAAK,EAAE,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,SAAS,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;QAEhG,IAAI,aAAa,CAAC,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,WAAW;YACrD,MAAM,IAAI,KAAK,CACX,yCAAyC;gBACzC,sFAAsF,CACzF,CAAC;aACD,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC;YAC/B,OAAO,IAAI,cAAc,CAAC;gBACtB,MAAM,EAAE,EAAE;aACb,CAAC,CAAC;QAEP,MAAM,cAAc,GAAG,8BAA8B,CAAC,IAAI,CAAC,KAAK,CAAC,cAAc,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACpG,IAAI,cAAc,IAAI,IAAI,IAAI,aAAa,CAAC,CAAC,CAAC,KAAK,cAAc;YAC7D,aAAa,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;QAE1C,MAAM,QAAQ,GAAG,uBAAuB,CAAC,IAAI,CAAC,KAAK,CAAC,cAAc,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACvF,IAAI,QAAQ,IAAI,IAAI,IAAI,aAAa,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,QAAQ;YACrD,aAAa,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAEjC,OAAO,MAAM,QAAQ,CAAC,CAAC,IAA6B,EAAE,UAAU,CAAC,EAAE,KAAK,IAAI,EAAE;YAC1E,MAAM,IAAI,CAAC,SAAS,CAAC,uBAAuB,CAAC,CAAC;oBAC1C,KAAK,EAAE,CAAC;oBACR,GAAG,EAAE,IAAI,CAAC,SAAS,CAAC,cAAc;iBACrC,CAAC,CAAC,CAAC;YAEJ,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,aAAa,EAAE,EAAC,WAAW,EAAE,IAAI,EAAC,CAAC,CAAC;YAC7E,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,QAAQ,EAAE,CAAC;gBACjC,MAAM,CAAC,4CAA4C;YACvD,CAAC;YAED,MAAM,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAC7E,MAAM,eAAe,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAE9C,OAAO,IAAI,cAAc,CAAC;gBACtB,MAAM,EAAE,eAAe;aAC1B,CAAC,CAAC;QACP,CAAC,CAAC,CAAC;IACP,CAAC;IAEM,KAAK,CAAC,OAAO;QAChB,MAAM,IAAI,CAAC,kBAAkB,CAAC,OAAO,EAAE,CAAC;IAC5C,CAAC;IAED,cAAc;IACP,CAAC,MAAM,CAAC,YAAY,CAAC;QACxB,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;IAC1B,CAAC;IAED,IAAW,QAAQ;QACf,OAAO,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC;IACvC,CAAC;IAED,IAAW,KAAK;QACZ,OAAO,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC;IACpC,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,EACxB,MAAM,EAGT,EAAE,EACC,WAAW,EACX,SAAS,EACT,OAAO,GAAG,CAAC,EACX,YAAY,EACZ,wBAAwB,EACG;QAC3B,IAAI,MAAM,CAAC,YAAY,CAAC,UAAU,IAAI,MAAM,CAAC,YAAY,CAAC,UAAU;YAChE,MAAM,IAAI,KAAK,CAAC,mEAAmE,CAAC,CAAC;QAEzF,MAAM,YAAY,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC;YAC5C,WAAW;YACX,SAAS;YACT,OAAO;YACP,YAAY;YACZ,wBAAwB;YACxB,WAAW,EAAE,IAAI;SACpB,CAAC,CAAC;QAEH,OAAO,IAAI,qBAAqB,CAAC;YAC7B,aAAa,EAAE,YAAY;SAC9B,CAAC,CAAC;IACP,CAAC;CACJ"}

View File

@@ -0,0 +1,39 @@
import { LlamaText } from "../utils/LlamaText.js";
import { Llama } from "../bindings/Llama.js";
import { Token } from "../types.js";
export type LlamaGrammarOptions = {
/** GBNF grammar */
grammar: string;
/** Consider any of these as EOS for the generated text. Only supported by `LlamaChat` and `LlamaChatSession` */
stopGenerationTriggers?: readonly (LlamaText | string | readonly (string | Token)[])[];
/** Trim whitespace from the end of the generated text. Only supported by `LlamaChat` and `LlamaChatSession` */
trimWhitespaceSuffix?: boolean;
/**
* Root rule name.
*
* Defaults to `"root"`.
*/
rootRuleName?: string;
};
/**
* @see [Using Grammar](https://node-llama-cpp.withcat.ai/guide/grammar) tutorial
*/
export declare class LlamaGrammar {
/**
* > GBNF files are supported.
* > More info here: [
* github:ggml-org/llama.cpp:grammars/README.md
* ](https://github.com/ggml-org/llama.cpp/blob/f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26/grammars/README.md)
*
* Prefer to create a new instance of this class by using `llama.createGrammar(...)`.
* @deprecated Use `llama.createGrammar(...)` instead.
* @param llama
* @param options
*/
constructor(llama: Llama, { grammar, stopGenerationTriggers, trimWhitespaceSuffix, rootRuleName }: LlamaGrammarOptions);
get grammar(): string;
get rootRuleName(): string;
get stopGenerationTriggers(): readonly (string | import("../utils/LlamaText.js")._LlamaText | readonly (string | Token)[])[];
get trimWhitespaceSuffix(): boolean;
static getFor(llama: Llama, type: "json" | "json_arr" | "english" | "list" | "c" | "arithmetic" | "japanese" | "chess"): Promise<LlamaGrammar>;
}
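// Usage sketch (illustration only): loading one of the bundled grammars and constraining a chat
// response with it. `getLlama()`, `llama.loadModel(...)`, `LlamaChatSession` and the `grammar`
// prompt option are assumed from the package's public API (see the Using Grammar tutorial linked
// above); the model path is a placeholder.
import {getLlama, LlamaChatSession} from "node-llama-cpp";

const llama = await getLlama();
const grammar = await LlamaGrammar.getFor(llama, "json");

const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
const context = await model.createContext();
const session = new LlamaChatSession({contextSequence: context.getSequence()});

const res = await session.prompt("Describe the weather in JSON", {grammar});
console.log(JSON.parse(res));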

View File

@@ -0,0 +1,72 @@
import path from "path";
import fs from "fs-extra";
import { getGrammarsFolder } from "../utils/getGrammarsFolder.js";
import { LlamaText } from "../utils/LlamaText.js";
/**
* @see [Using Grammar](https://node-llama-cpp.withcat.ai/guide/grammar) tutorial
*/
export class LlamaGrammar {
/** @internal */ _llama;
/** @internal */ _grammar;
/** @internal */ _stopGenerationTriggers;
/** @internal */ _trimWhitespaceSuffix;
/** @internal */ _grammarText;
/** @internal */ _rootRuleName;
/**
* > GBNF files are supported.
* > More info here: [
* github:ggml-org/llama.cpp:grammars/README.md
* ](https://github.com/ggml-org/llama.cpp/blob/f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26/grammars/README.md)
*
* Prefer to create a new instance of this class by using `llama.createGrammar(...)`.
* @deprecated Use `llama.createGrammar(...)` instead.
* @param llama
* @param options
*/
constructor(llama, { grammar, stopGenerationTriggers = [], trimWhitespaceSuffix = false, rootRuleName = "root" }) {
this._llama = llama;
this._grammar = new this._llama._bindings.AddonGrammar(grammar, {
addonExports: this._llama._bindings,
rootRuleName
});
this._stopGenerationTriggers = stopGenerationTriggers ?? [];
this._trimWhitespaceSuffix = trimWhitespaceSuffix;
this._grammarText = grammar;
this._rootRuleName = rootRuleName;
}
get grammar() {
return this._grammarText;
}
get rootRuleName() {
return this._rootRuleName;
}
get stopGenerationTriggers() {
return this._stopGenerationTriggers;
}
get trimWhitespaceSuffix() {
return this._trimWhitespaceSuffix;
}
/**
* Test if the given text is compatible with the grammar.
* @internal
*/
_testText(text) {
return this._grammar.isTextCompatible(String(text));
}
static async getFor(llama, type) {
const grammarsFolder = await getGrammarsFolder(llama.buildType);
const grammarFile = path.join(grammarsFolder, type + ".gbnf");
if (await fs.pathExists(grammarFile)) {
const grammar = await fs.readFile(grammarFile, "utf8");
return new LlamaGrammar(llama, {
grammar,
stopGenerationTriggers: [LlamaText(["\n".repeat((type === "json" || type === "json_arr")
? 4
: 10)])], // this is a workaround for models that don't stop generating text
trimWhitespaceSuffix: true
});
}
throw new Error(`Grammar file for type "${type}" was not found in "${grammarsFolder}"`);
}
}
//# sourceMappingURL=LlamaGrammar.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LlamaGrammar.js","sourceRoot":"","sources":["../../src/evaluator/LlamaGrammar.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,MAAM,UAAU,CAAC;AAC1B,OAAO,EAAC,iBAAiB,EAAC,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAC,SAAS,EAAC,MAAM,uBAAuB,CAAC;AAwBhD;;GAEG;AACH,MAAM,OAAO,YAAY;IACrB,gBAAgB,CAAiB,MAAM,CAAQ;IAC/C,gBAAgB,CAAiB,QAAQ,CAAe;IACxD,gBAAgB,CAAkB,uBAAuB,CAAgE;IACzH,gBAAgB,CAAkB,qBAAqB,CAAU;IACjE,gBAAgB,CAAkB,YAAY,CAAS;IACvD,gBAAgB,CAAkB,aAAa,CAAS;IAExD;;;;;;;;;;OAUG;IACH,YAAmB,KAAY,EAAE,EAC7B,OAAO,EAAE,sBAAsB,GAAG,EAAE,EAAE,oBAAoB,GAAG,KAAK,EAAE,YAAY,GAAG,MAAM,EACvE;QAClB,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC;QACpB,IAAI,CAAC,QAAQ,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,OAAO,EAAE;YAC5D,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;YACnC,YAAY;SACf,CAAC,CAAC;QACH,IAAI,CAAC,uBAAuB,GAAG,sBAAsB,IAAI,EAAE,CAAC;QAC5D,IAAI,CAAC,qBAAqB,GAAG,oBAAoB,CAAC;QAClD,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC;QAC5B,IAAI,CAAC,aAAa,GAAG,YAAY,CAAC;IACtC,CAAC;IAED,IAAW,OAAO;QACd,OAAO,IAAI,CAAC,YAAY,CAAC;IAC7B,CAAC;IAED,IAAW,YAAY;QACnB,OAAO,IAAI,CAAC,aAAa,CAAC;IAC9B,CAAC;IAED,IAAW,sBAAsB;QAC7B,OAAO,IAAI,CAAC,uBAAuB,CAAC;IACxC,CAAC;IAED,IAAW,oBAAoB;QAC3B,OAAO,IAAI,CAAC,qBAAqB,CAAC;IACtC,CAAC;IAED;;;OAGG;IACI,SAAS,CAAC,IAAY;QACzB,OAAO,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;IACxD,CAAC;IAEM,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,KAAY,EAAE,IAA0F;QAC/H,MAAM,cAAc,GAAG,MAAM,iBAAiB,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;QAEhE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,EAAE,IAAI,GAAG,OAAO,CAAC,CAAC;QAE9D,IAAI,MAAM,EAAE,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;YACnC,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;YACvD,OAAO,IAAI,YAAY,CAAC,KAAK,EAAE;gBAC3B,OAAO;gBACP,sBAAsB,EAAE,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,MAAM,CAC3C,CAAC,IAAI,KAAK,MAAM,IAAI,IAAI,KAAK,UAAU,CAAC;4BACpC,CAAC,CAAC,CAAC;4BACH,CAAC,CAAC,EAAE,CACX,CAAC,CAAC,CAAC,EAAE,oEAAoE;gBAC1E,oBAAoB,EAAE,IAAI;aAC7B,CAAC,CAAC;QACP,CAAC;QAED,MAAM,IAAI,KAAK,CAAC,0BAA0B,IAAI,uBAAuB,cAAc,GAAG,CAAC,CAAC;IAC5F,CAAC;CACJ"}

View File

@@ -0,0 +1,19 @@
import type { LlamaGrammar } from "./LlamaGrammar.js";
import type { LlamaModel } from "./LlamaModel/LlamaModel.js";
export type LlamaGrammarEvaluationStateOptions = {
model: LlamaModel;
grammar: LlamaGrammar;
};
/**
* Grammar evaluation state is used to track the model response to determine the next allowed characters for the model to generate.
*
* Create a new grammar evaluation state for every response you generate with the model.
*
* This is only needed when using the `LlamaContext` class directly, since `LlamaChatSession` already handles this for you.
*/
export declare class LlamaGrammarEvaluationState {
constructor(options: LlamaGrammarEvaluationStateOptions);
constructor(existingState: LlamaGrammarEvaluationState);
/** Clone the grammar evaluation state */
clone(): LlamaGrammarEvaluationState;
}
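// Usage sketch (illustration only): when driving a context sequence directly, create a fresh
// state for every generated response, and clone it when branching a generation. Passing the
// state to the low-level evaluation call is not shown here - `LlamaChatSession` normally
// manages this for you.
function createStatesForResponse(model: LlamaModel, grammar: LlamaGrammar) {
    const state = new LlamaGrammarEvaluationState({model, grammar});
    const branchedState = state.clone(); // independent copy for a branched generation
    return {state, branchedState};
}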

View File

@@ -0,0 +1,29 @@
/**
* Grammar evaluation state is used to track the model response to determine the next allowed characters for the model to generate.
*
* Create a new grammar evaluation state for every response you generate with the model.
*
* This is only needed when using the `LlamaContext` class directly, since `LlamaChatSession` already handles this for you.
*/
export class LlamaGrammarEvaluationState {
/** @internal */ _llama;
/** @internal */ _state;
constructor(existingStateOrOptions) {
if (existingStateOrOptions instanceof LlamaGrammarEvaluationState) {
this._llama = existingStateOrOptions._llama;
this._state = new this._llama._bindings.AddonGrammarEvaluationState(existingStateOrOptions._state);
}
else {
const { model, grammar } = existingStateOrOptions;
this._llama = model._llama;
if (model._llama !== grammar._llama)
throw new Error("The given LlamaModel and LlamaGrammar must be from the same Llama instance");
this._state = new model._llama._bindings.AddonGrammarEvaluationState(model._model, grammar._grammar);
}
}
/** Clone the grammar evaluation state */
clone() {
return new LlamaGrammarEvaluationState(this);
}
}
//# sourceMappingURL=LlamaGrammarEvaluationState.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LlamaGrammarEvaluationState.js","sourceRoot":"","sources":["../../src/evaluator/LlamaGrammarEvaluationState.ts"],"names":[],"mappings":"AAWA;;;;;;GAMG;AACH,MAAM,OAAO,2BAA2B;IACpC,gBAAgB,CAAiB,MAAM,CAAQ;IAC/C,gBAAgB,CAAiB,MAAM,CAA8B;IAIrE,YAAmB,sBAAwF;QACvG,IAAI,sBAAsB,YAAY,2BAA2B,EAAE,CAAC;YAChE,IAAI,CAAC,MAAM,GAAG,sBAAsB,CAAC,MAAM,CAAC;YAC5C,IAAI,CAAC,MAAM,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,2BAA2B,CAAC,sBAAsB,CAAC,MAAM,CAAC,CAAC;QACvG,CAAC;aAAM,CAAC;YACJ,MAAM,EAAC,KAAK,EAAE,OAAO,EAAC,GAAG,sBAAsB,CAAC;YAChD,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;YAE3B,IAAI,KAAK,CAAC,MAAM,KAAK,OAAO,CAAC,MAAM;gBAC/B,MAAM,IAAI,KAAK,CAAC,4EAA4E,CAAC,CAAC;YAElG,IAAI,CAAC,MAAM,GAAG,IAAI,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,2BAA2B,CAAC,KAAK,CAAC,MAAM,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzG,CAAC;IACL,CAAC;IAED,yCAAyC;IAClC,KAAK;QACR,OAAO,IAAI,2BAA2B,CAAC,IAAI,CAAC,CAAC;IACjD,CAAC;CACJ"}

View File

@@ -0,0 +1,17 @@
import { GbnfJsonDefList, GbnfJsonSchema, GbnfJsonSchemaToType } from "../utils/gbnfJson/types.js";
import { Llama } from "../bindings/Llama.js";
import { LlamaGrammar } from "./LlamaGrammar.js";
/**
* @see [Using a JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#json-schema) tutorial
* @see [Reducing Hallucinations When Using JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#reducing-json-schema-hallucinations) tutorial
*/
export declare class LlamaJsonSchemaGrammar<const T extends GbnfJsonSchema<Defs>, const Defs extends GbnfJsonDefList<Defs> = Record<any, any>> extends LlamaGrammar {
private readonly _schema;
/**
* Prefer to create a new instance of this class by using `llama.createGrammarForJsonSchema(...)`.
* @deprecated Use `llama.createGrammarForJsonSchema(...)` instead.
*/
constructor(llama: Llama, schema: Readonly<T> & GbnfJsonSchema<Defs>);
get schema(): Readonly<T>;
parse(json: string): GbnfJsonSchemaToType<T>;
}
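// Usage sketch (illustration only): the non-deprecated path goes through
// `llama.createGrammarForJsonSchema(...)` as noted above. The exact schema shape and the
// `grammar` prompt option are assumptions for illustration - see the JSON Schema Grammar
// tutorial linked above for the supported schema format.
import {getLlama, LlamaChatSession} from "node-llama-cpp";

const llama = await getLlama();
const grammar = await llama.createGrammarForJsonSchema({
    type: "object",
    properties: {
        title: {type: "string"},
        positiveReview: {type: "boolean"}
    }
});

const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
const context = await model.createContext();
const session = new LlamaChatSession({contextSequence: context.getSequence()});

const res = await session.prompt("Summarize this product review as JSON: ...", {grammar});
const parsed = grammar.parse(res); // validated against the schema and typed accordingly
console.log(parsed.title, parsed.positiveReview);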

View File

@@ -0,0 +1,35 @@
import { getGbnfGrammarForGbnfJsonSchema } from "../utils/gbnfJson/getGbnfGrammarForGbnfJsonSchema.js";
import { validateObjectAgainstGbnfSchema } from "../utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js";
import { LlamaText } from "../utils/LlamaText.js";
import { LlamaGrammar } from "./LlamaGrammar.js";
/* eslint-disable @stylistic/max-len */
/**
* @see [Using a JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#json-schema) tutorial
* @see [Reducing Hallucinations When Using JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#reducing-json-schema-hallucinations) tutorial
*/
export class LlamaJsonSchemaGrammar extends LlamaGrammar {
_schema;
/**
* Prefer to create a new instance of this class by using `llama.createGrammarForJsonSchema(...)`.
* @deprecated Use `llama.createGrammarForJsonSchema(...)` instead.
*/
constructor(llama, schema) {
const grammar = getGbnfGrammarForGbnfJsonSchema(schema);
super(llama, {
grammar,
stopGenerationTriggers: [LlamaText(["\n".repeat(4)])],
trimWhitespaceSuffix: true
});
this._schema = schema;
}
get schema() {
return this._schema;
}
parse(json) {
const parsedJson = JSON.parse(json);
validateObjectAgainstGbnfSchema(parsedJson, this._schema);
return parsedJson;
}
}
/* eslint-enable @stylistic/max-len */
//# sourceMappingURL=LlamaJsonSchemaGrammar.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LlamaJsonSchemaGrammar.js","sourceRoot":"","sources":["../../src/evaluator/LlamaJsonSchemaGrammar.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,+BAA+B,EAAC,MAAM,sDAAsD,CAAC;AACrG,OAAO,EAAC,+BAA+B,EAAC,MAAM,4DAA4D,CAAC;AAC3G,OAAO,EAAC,SAAS,EAAC,MAAM,uBAAuB,CAAC;AAEhD,OAAO,EAAC,YAAY,EAAC,MAAM,mBAAmB,CAAC;AAE/C,uCAAuC;AACvC;;;GAGG;AACH,MAAM,OAAO,sBAGX,SAAQ,YAAY;IACD,OAAO,CAAI;IAE5B;;;OAGG;IACH,YAAmB,KAAY,EAAE,MAA0C;QACvE,MAAM,OAAO,GAAG,+BAA+B,CAAC,MAAM,CAAC,CAAC;QAExD,KAAK,CAAC,KAAK,EAAE;YACT,OAAO;YACP,sBAAsB,EAAE,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACrD,oBAAoB,EAAE,IAAI;SAC7B,CAAC,CAAC;QAEH,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC;IAC1B,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,OAAO,CAAC;IACxB,CAAC;IAEM,KAAK,CAAC,IAAY;QACrB,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAEpC,+BAA+B,CAAC,UAAU,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;QAE1D,OAAO,UAAU,CAAC;IACtB,CAAC;CACJ;AACD,sCAAsC"}

View File

@@ -0,0 +1,311 @@
import { EventRelay } from "lifecycle-utils";
import { Token, Tokenizer } from "../../types.js";
import { ModelTypeDescription } from "../../bindings/AddonTypes.js";
import { LlamaVocabularyType } from "../../bindings/types.js";
import { GgufFileInfo } from "../../gguf/types/GgufFileInfoTypes.js";
import { GgufInsights } from "../../gguf/insights/GgufInsights.js";
import { LlamaContextOptions } from "../LlamaContext/types.js";
import { LlamaContext } from "../LlamaContext/LlamaContext.js";
import { LlamaEmbeddingContext, LlamaEmbeddingContextOptions } from "../LlamaEmbeddingContext.js";
import { GgufMetadata } from "../../gguf/types/GgufMetadataTypes.js";
import { OverridesObject } from "../../utils/OverridesObject.js";
import { LlamaRankingContext, LlamaRankingContextOptions } from "../LlamaRankingContext.js";
import { TokenAttributes } from "./utils/TokenAttributes.js";
import type { Llama } from "../../bindings/Llama.js";
import type { BuiltinSpecialTokenValue } from "../../utils/LlamaText.js";
export type LlamaModelOptions = {
/** path to the model on the filesystem */
modelPath: string;
/**
* Number of layers to store in VRAM.
* - **`"auto"`** - adapt to the current VRAM state and try to fit as many layers as possible in it.
* Takes into account the VRAM required to create a context with a `contextSize` set to `"auto"`.
* - **`"max"`** - store all layers in VRAM. If there's not enough VRAM, an error will be thrown. Use with caution.
* - **`number`** - store the specified number of layers in VRAM. If there's not enough VRAM, an error will be thrown. Use with caution.
* - **`{min?: number, max?: number, fitContext?: {contextSize: number}}`** - adapt to the current VRAM state and try to fit as
* many layers as possible in it, but at least `min` and at most `max` layers. Set `fitContext` to the parameters of a context you
* intend to create with the model, so it'll take it into account in the calculations and leave enough memory for such a context.
*
* If GPU support is disabled, will be set to `0` automatically.
*
* Defaults to `"auto"`.
*/
gpuLayers?: "auto" | "max" | number | {
min?: number;
max?: number;
fitContext?: {
contextSize?: number;
/**
* Defaults to `false`.
*/
embeddingContext?: boolean;
};
};
/**
* Only load the vocabulary, not weight tensors.
*
* Useful when you only want to use the model's tokenizer but not use it for evaluation.
*
* Defaults to `false`.
*/
vocabOnly?: boolean;
/**
* Use mmap (memory-mapped file) to load the model.
*
* Using mmap allows the OS to load the model tensors directly from the file on the filesystem,
* and makes it easier for the system to manage memory.
*
* When using mmap, you might notice a delay the first time you actually use the model,
* which is caused by the OS itself loading the model into memory.
*
* Defaults to `true` if the current system supports it.
*/
useMmap?: boolean;
/**
* Direct I/O is a method of reading and writing data to and from the storage device directly to the application memory,
* bypassing OS in-memory caches.
*
* It leads to improved model loading times and reduced RAM usage,
* at the expense of higher loading times when the model is unloaded and loaded again repeatedly in a short period of time.
*
* When this option is enabled, if Direct I/O is supported by the system (and for the given file)
* it will be used and mmap will be disabled.
*
* Unsupported on macOS.
*
* Defaults to `true`.
*/
useDirectIo?: boolean;
/**
* Force the system to keep the model in the RAM/VRAM.
* Use with caution as this can crash your system if the available resources are insufficient.
*/
useMlock?: boolean;
/**
* Check for tensor validity before actually loading the model.
* Using it increases the time it takes to load the model.
*
* Defaults to `false`.
*/
checkTensors?: boolean;
/**
* Enable flash attention by default for contexts created with this model.
* Only works with models that support flash attention.
*
* Flash attention is an optimization in the attention mechanism that makes inference faster and more efficient, and uses less memory.
*
* The support for flash attention is currently experimental and may not always work as expected.
* Use with caution.
*
* This option will be ignored if flash attention is not supported by the model.
*
* Enabling this affects the calculations of default values for the model and contexts created with it
* as flash attention reduces the amount of memory required,
* which allows for more layers to be offloaded to the GPU and for context sizes to be bigger.
*
* Defaults to `false`.
*
* Once flash attention exits its experimental status, the default value will become `true`.
*/
defaultContextFlashAttention?: boolean;
/**
* When using SWA (Sliding Window Attention) on a supported model,
* extend the sliding window size to the current context size (meaning practically disabling SWA)
* by default for contexts created with this model.
*
* See the `swaFullCache` option of the `.createContext()` method for more information.
*
* Defaults to `false`.
*/
defaultContextSwaFullCache?: boolean;
/**
* Called with the load percentage when the model is being loaded.
* @param loadProgress - a number between 0 (exclusive) and 1 (inclusive).
*/
onLoadProgress?(loadProgress: number): void;
/** An abort signal to abort the model load */
loadSignal?: AbortSignal;
/**
* Ignore insufficient memory errors and continue with the model load.
* Can cause the process to crash if there's not enough VRAM to fit the model.
*
* Defaults to `false`.
*/
ignoreMemorySafetyChecks?: boolean;
/**
* Metadata overrides to load the model with.
*
* > **Note:** Most metadata value overrides aren't supported and overriding them will have no effect on `llama.cpp`.
* > Only use this for metadata values that are explicitly documented to be supported by `llama.cpp` to be overridden,
* > and only in cases when this is crucial, as this is not guaranteed to always work as expected.
*/
metadataOverrides?: OverridesObject<GgufMetadata, number | bigint | boolean | string>;
};
export declare class LlamaModel {
readonly tokenizer: Tokenizer;
readonly onDispose: EventRelay<void>;
private constructor();
dispose(): Promise<void>;
/** @hidden */
[Symbol.asyncDispose](): Promise<void>;
get disposed(): boolean;
get llama(): Llama;
get tokens(): LlamaModelTokens;
get filename(): string | undefined;
get fileInfo(): GgufFileInfo;
get fileInsights(): GgufInsights;
/**
* Number of layers offloaded to the GPU.
* If GPU support is disabled, this will always be `0`.
*/
get gpuLayers(): number;
/**
* Total model size in memory in bytes.
*
* When using mmap, actual memory usage may be higher than this value due to `llama.cpp`'s performance optimizations.
*/
get size(): number;
get flashAttentionSupported(): boolean;
get defaultContextFlashAttention(): boolean;
get defaultContextSwaFullCache(): boolean;
/**
* Transform text into tokens that can be fed to the model
* @param text - the text to tokenize
* @param [specialTokens] - if set to `true`, text that corresponds to special tokens will be tokenized to those tokens.
* For example, `<s>` will be tokenized to the BOS token if `specialTokens` is set to `true`,
* otherwise it will be tokenized to tokens that correspond to the plaintext `<s>` string.
* @param [options] - additional options for tokenization.
* If set to `"trimLeadingSpace"`, a leading space will be trimmed from the tokenized output if the output has an
* additional space at the beginning.
*/
tokenize(text: string, specialTokens?: boolean, options?: "trimLeadingSpace"): Token[];
tokenize(text: BuiltinSpecialTokenValue, specialTokens: "builtin"): Token[];
/**
* Transform tokens into text
* @param tokens - the tokens to detokenize.
* @param [specialTokens] - if set to `true`, special tokens will be detokenized to their corresponding token text representation.
*
* Recommended for debugging purposes only.
*
* > **Note:** there may be additional spaces around special tokens that were not present in the original text - this is not a bug,
* this is [how the tokenizer is supposed to work](https://github.com/ggml-org/llama.cpp/pull/7697#issuecomment-2144003246).
*
* Defaults to `false`.
* @param [lastTokens] - the last few tokens that preceded the tokens to detokenize.
* If provided, the last few tokens will be used to determine whether a space has to be added before the current tokens or not,
* and apply other detokenizer-specific heuristics to provide the correct text continuation to the existing tokens.
*
* Using it may have no effect with some models, but it is still recommended.
*/
detokenize(tokens: readonly Token[], specialTokens?: boolean, lastTokens?: readonly Token[]): string;
getTokenAttributes(token: Token): TokenAttributes;
/** Check whether the given token is a special token (a control-type token or a token with no normal text representation) */
isSpecialToken(token: Token | undefined): boolean;
iterateAllTokens(): Generator<Token, void, unknown>;
/** Check whether the given token is an EOG (End Of Generation) token, like EOS or EOT. */
isEogToken(token: Token | undefined): boolean;
createContext(options?: LlamaContextOptions): Promise<LlamaContext>;
/**
* @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
*/
createEmbeddingContext(options?: LlamaEmbeddingContextOptions): Promise<LlamaEmbeddingContext>;
/**
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
*/
createRankingContext(options?: LlamaRankingContextOptions): Promise<LlamaRankingContext>;
/**
* Get warnings about the model file that would affect its usage.
*
* These warnings include all the warnings generated by `GgufInsights`, but are more comprehensive.
*/
getWarnings(): string[];
/** @hidden `ModelTypeDescription` type alias is too long in the documentation */
get typeDescription(): ModelTypeDescription;
/** The context size the model was trained on */
get trainContextSize(): number;
/** The size of an embedding vector the model can produce */
get embeddingVectorSize(): number;
get vocabularyType(): LlamaVocabularyType;
}
export declare class LlamaModelTokens {
private constructor();
/**
* @returns infill tokens
*/
get infill(): LlamaModelInfillTokens;
/**
* @returns The BOS (Beginning Of Sequence) token.
*/
get bos(): Token | null;
/**
* @returns The EOS (End Of Sequence) token.
*/
get eos(): Token | null;
/**
* @returns The EOT (End Of Turn) token.
*/
get eot(): Token | null;
/**
* @returns The SEP (Sentence Separator) token.
*/
get sep(): Token | null;
/**
* @returns The NL (New Line) token.
*/
get nl(): Token | null;
/**
* @returns The BOS (Beginning Of Sequence) token text representation.
*/
get bosString(): string | null;
/**
* @returns The EOS (End Of Sequence) token text representation.
*/
get eosString(): string | null;
/**
* @returns The EOT (End Of Turn) token text representation.
*/
get eotString(): string | null;
/**
* @returns The SEP (Sentence Separator) token text representation.
*/
get sepString(): string | null;
/**
* @returns The NL (New Line) token text representation.
*/
get nlString(): string | null;
/**
* @returns Whether we should prepend a BOS (Beginning Of Sequence) token for evaluations with this model.
*/
get shouldPrependBosToken(): boolean;
/**
* @returns Whether we should append an EOS (End Of Sequence) token for evaluations with this model.
*/
get shouldAppendEosToken(): boolean;
}
export declare class LlamaModelInfillTokens {
private constructor();
/**
* @returns The beginning of infill prefix token.
*/
get prefix(): Token | null;
/**
* @returns The beginning of infill middle token.
*/
get middle(): Token | null;
/**
* @returns The beginning of infill suffix token.
*/
get suffix(): Token | null;
/**
* @returns The beginning of infill prefix token as a string.
*/
get prefixString(): string | null;
/**
* @returns The beginning of infill middle token as a string.
*/
get middleString(): string | null;
/**
* @returns The beginning of infill suffix token as a string.
*/
get suffixString(): string | null;
}
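// Usage sketch (illustration only): loading a model with explicit memory-related options and
// round-tripping text through the tokenizer. `getLlama()` and `llama.loadModel(...)` are assumed
// from the package's public API; the model path and option values are placeholders.
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({
    modelPath: "path/to/model.gguf",
    gpuLayers: "auto", // offload as many layers as the current VRAM state allows
    useMmap: true,
    onLoadProgress(loadProgress) {
        console.log(`loading: ${Math.round(loadProgress * 100)}%`);
    }
});

console.log(model.getWarnings());
console.log("train context size:", model.trainContextSize);

const tokens = model.tokenize("Hello world");
console.log(tokens.length, "tokens");
console.log(model.detokenize(tokens)); // "Hello world"

// when detokenizing a continuation, pass the preceding tokens so spacing stays correct
const firstPart = tokens.slice(0, 1);
const rest = tokens.slice(1);
console.log(model.detokenize(firstPart) + model.detokenize(rest, false, firstPart));

await model.dispose();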

View File

@@ -0,0 +1,832 @@
import process from "process";
import path from "path";
import { AsyncDisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { removeNullFields } from "../../utils/removeNullFields.js";
import { DisposeGuard } from "../../utils/DisposeGuard.js";
import { LlamaLocks, LlamaLogLevel, LlamaVocabularyType, LlamaVocabularyTypeValues } from "../../bindings/types.js";
import { readGgufFileInfo } from "../../gguf/readGgufFileInfo.js";
import { GgufInsights } from "../../gguf/insights/GgufInsights.js";
import { getConsoleLogPrefix } from "../../utils/getConsoleLogPrefix.js";
import { getReadablePath } from "../../cli/utils/getReadablePath.js";
import { LlamaContext } from "../LlamaContext/LlamaContext.js";
import { LlamaEmbeddingContext } from "../LlamaEmbeddingContext.js";
import { GgufArchitectureType } from "../../gguf/types/GgufMetadataTypes.js";
import { maxRecentDetokenizerTokens } from "../../consts.js";
import { LlamaRankingContext } from "../LlamaRankingContext.js";
import { TokenAttribute, TokenAttributes } from "./utils/TokenAttributes.js";
const defaultUseMmap = true;
const defaultUseDirectIo = true;
const defaultContextFlashAttentionEnabled = false;
const defaultContextSwaFullCache = false;
export class LlamaModel {
/** @internal */ _llama;
/** @internal */ _model;
/** @internal */ _backendModelDisposeGuard;
/** @internal */ _tokens;
/** @internal */ _modelPath;
/** @internal */ _fileInfo;
/** @internal */ _fileInsights;
/** @internal */ _gpuLayers;
/** @internal */ _vocabOnly;
/** @internal */ _filename;
/** @internal */ _disposedState = { disposed: false };
/** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
/** @internal */ _llamaPreventDisposalHandle;
/** @internal */ _defaultContextFlashAttentionOptionEnabled;
/** @internal */ _defaultContextFlashAttention;
/** @internal */ _defaultContextSwaFullCache;
/** @internal */ _flashAttentionSupported;
/** @internal */ _loraAdapters = new Map();
/** @internal */ _typeDescription;
/** @internal */ _trainContextSize;
/** @internal */ _embeddingVectorSize;
/** @internal */ _vocabularyType;
tokenizer;
onDispose = new EventRelay();
constructor({ modelPath, gpuLayers, vocabOnly = false, useMmap, useDirectIo, useMlock, checkTensors, onLoadProgress, loadSignal, metadataOverrides }, { _llama, _fileInfo, _fileInsights, _defaultContextFlashAttentionOptionEnabled, _defaultContextFlashAttention, _defaultContextSwaFullCache, _flashAttentionSupported }) {
this._llama = _llama;
this._fileInfo = _fileInfo;
this._modelPath = path.resolve(process.cwd(), modelPath);
this._fileInsights = _fileInsights;
this._gpuLayers = gpuLayers;
this._vocabOnly = vocabOnly ?? false;
this._backendModelDisposeGuard = new DisposeGuard([this._llama._backendDisposeGuard]);
this._llamaPreventDisposalHandle = this._llama._backendDisposeGuard.createPreventDisposalHandle();
this._defaultContextFlashAttentionOptionEnabled = _defaultContextFlashAttentionOptionEnabled;
this._defaultContextFlashAttention = _defaultContextFlashAttention;
this._defaultContextSwaFullCache = _defaultContextSwaFullCache;
this._flashAttentionSupported = _flashAttentionSupported;
const overridesList = ggufMetadataOverridesToList(metadataOverrides);
this._model = new this._llama._bindings.AddonModel(this._modelPath, removeNullFields({
addonExports: this._llama._bindings,
gpuLayers,
vocabOnly: this._vocabOnly,
useMmap,
useDirectIo,
useMlock: _llama.supportsMlock
? useMlock
: undefined,
checkTensors: checkTensors ?? false,
onLoadProgress: onLoadProgress == null
? undefined
: (loadPercentage) => {
try {
onLoadProgress(loadPercentage);
}
catch (err) {
// the native addon code calls this function, so there's no use to throw an error here
console.error(err);
}
},
hasLoadAbortSignal: loadSignal != null,
overridesList: overridesList.length > 0
? overridesList
: undefined
}));
this._tokens = LlamaModelTokens._create(this._model, this._disposedState);
this._filename = path.basename(modelPath);
this._disposeAggregator.add(() => {
this._disposedState.disposed = true;
});
this._disposeAggregator.add(this.onDispose.dispatchEvent);
this._disposeAggregator.add(this._llama.onDispose.createListener(disposeModelIfReferenced.bind(null, new WeakRef(this))));
this._disposeAggregator.add(async () => {
await this._backendModelDisposeGuard.acquireDisposeLock();
await this._model.dispose();
this._llamaPreventDisposalHandle.dispose();
});
this.tokenize = this.tokenize.bind(this);
this.detokenize = this.detokenize.bind(this);
this.isSpecialToken = this.isSpecialToken.bind(this);
this.isEogToken = this.isEogToken.bind(this);
this.tokenize.detokenize = this.detokenize;
this.tokenize.isSpecialToken = this.isSpecialToken;
this.tokenize.isEogToken = this.isEogToken;
Object.freeze(this.tokenize);
this.tokenizer = this.tokenize;
}
async dispose() {
if (this._disposedState.disposed)
return;
this._disposedState.disposed = true;
await this._disposeAggregator.dispose();
}
/** @hidden */
async [Symbol.asyncDispose]() {
await this.dispose();
}
get disposed() {
return this._disposedState.disposed;
}
get llama() {
return this._llama;
}
get tokens() {
return this._tokens;
}
get filename() {
return this._filename;
}
get fileInfo() {
return this._fileInfo;
}
get fileInsights() {
return this._fileInsights;
}
/**
* Number of layers offloaded to the GPU.
* If GPU support is disabled, this will always be `0`.
*/
get gpuLayers() {
return this._gpuLayers;
}
/**
* Total model size in memory in bytes.
*
* When using mmap, actual memory usage may be higher than this value due to `llama.cpp`'s performance optimizations.
*/
get size() {
this._ensureNotDisposed();
return this._model.getModelSize();
}
get flashAttentionSupported() {
return this._flashAttentionSupported;
}
get defaultContextFlashAttention() {
return this._defaultContextFlashAttention;
}
get defaultContextSwaFullCache() {
return this._defaultContextSwaFullCache;
}
tokenize(text, specialTokens = false, options) {
this._ensureNotDisposed();
if (text === "")
return [];
if (specialTokens === "builtin") {
const builtinToken = text;
switch (builtinToken) {
case "BOS": return this.tokens.bos == null ? [] : [this.tokens.bos];
case "EOS": return this.tokens.eos == null ? [] : [this.tokens.eos];
case "NL": return this.tokens.nl == null ? [] : [this.tokens.nl];
case "EOT": return this.tokens.eot == null ? [] : [this.tokens.eot];
case "SEP": return this.tokens.sep == null ? [] : [this.tokens.sep];
}
void builtinToken;
throw new Error(`Unknown builtin special token: ${builtinToken}`);
}
if (options === "trimLeadingSpace") {
if (specialTokens) {
const countLeadingSpaces = (text) => {
let count = 0;
for (; count < text.length; count++) {
if (text[count] !== " ")
break;
}
return count;
};
const textLeadingSpaces = countLeadingSpaces(text);
const [workaroundToken, workaroundTokenString] = (this.tokens.bos != null && this.tokens.bosString != null)
? [this.tokens.bos, this.tokens.bosString]
: (this.tokens.eos != null && this.tokens.eosString != null)
? [this.tokens.eos, this.tokens.eosString]
: (this.tokens.nl != null && this.tokens.nlString != null)
? [this.tokens.nl, this.tokens.nlString]
: (this.tokens.eot != null && this.tokens.eotString != null)
? [this.tokens.eot, this.tokens.eotString]
: [null, null];
if (workaroundToken != null && workaroundTokenString != null) {
const tokens = Array.from(this._model.tokenize(workaroundTokenString + text, true));
const workaroundTokenIndex = tokens.indexOf(workaroundToken);
// only use the tokenized output if it can be corrected, otherwise fallback to the default tokenization
if (workaroundTokenIndex >= 0 && workaroundTokenIndex <= 1) {
tokens.splice(0, workaroundTokenIndex + 1);
if (countLeadingSpaces(this.detokenize(tokens, true)) === textLeadingSpaces)
return tokens;
}
}
const workaroundTokensString = "\n";
const workaroundTokens = Array.from(this._model.tokenize(workaroundTokensString, true));
if (text.startsWith(workaroundTokensString)) {
const tokens = Array.from(this._model.tokenize(text, true));
if (this.detokenize(tokens, true).startsWith(workaroundTokensString))
return tokens;
}
const tokens = Array.from(this._model.tokenize(workaroundTokensString + text, true));
// only use the tokenized output if it can be corrected, otherwise fallback to the default tokenization
if (workaroundTokens.length > 0 && workaroundTokens.every((token, index) => tokens[index] === token)) {
tokens.splice(0, workaroundTokens.length);
if (countLeadingSpaces(this.detokenize(tokens, true)) === textLeadingSpaces)
return tokens;
}
}
else {
const workaroundTokensString = "\n";
const workaroundTokens = Array.from(this._model.tokenize(workaroundTokensString, false));
if (text.startsWith(workaroundTokensString)) {
const tokens = Array.from(this._model.tokenize(text, false));
if (this.detokenize(tokens, false).startsWith(workaroundTokensString))
return tokens;
}
const tokens = Array.from(this._model.tokenize(workaroundTokensString + text, false));
// only use the tokenized output if it can be corrected, otherwise fallback to the default tokenization
if (workaroundTokens.length > 0 && workaroundTokens.every((token, index) => tokens[index] === token)) {
tokens.splice(0, workaroundTokens.length);
return tokens;
}
}
}
return Array.from(this._model.tokenize(text, specialTokens));
}
/**
* Transform tokens into text
* @param tokens - the tokens to detokenize.
* @param [specialTokens] - if set to `true`, special tokens will be detokenized to their corresponding token text representation.
*
* Recommended for debugging purposes only.
*
* > **Note:** there may be additional spaces around special tokens that were not present in the original text - this is not a bug,
* this is [how the tokenizer is supposed to work](https://github.com/ggml-org/llama.cpp/pull/7697#issuecomment-2144003246).
*
* Defaults to `false`.
* @param [lastTokens] - the last few tokens that preceded the tokens to detokenize.
* If provided, the last few tokens will be used to determine whether a space has to be added before the current tokens or not,
* and apply other detokenizer-specific heuristics to provide the correct text continuation to the existing tokens.
*
* Using it may have no effect with some models, but it is still recommended.
*/
detokenize(tokens, specialTokens = false, lastTokens) {
this._ensureNotDisposed();
if (tokens.length === 0)
return "";
if (lastTokens == null || lastTokens.length === 0)
return this._model.detokenize(Uint32Array.from(tokens), Boolean(specialTokens));
const addedTokens = lastTokens.slice(-maxRecentDetokenizerTokens);
const addedTokensText = this._model.detokenize(Uint32Array.from(addedTokens), Boolean(specialTokens));
if (addedTokensText === "")
return this._model.detokenize(Uint32Array.from(tokens), Boolean(specialTokens));
const text = this._model.detokenize(Uint32Array.from([...addedTokens, ...tokens]), Boolean(specialTokens));
if (text.startsWith(addedTokensText))
return text.slice(addedTokensText.length);
return this._model.detokenize(Uint32Array.from(tokens), Boolean(specialTokens));
}
getTokenAttributes(token) {
if (token == null)
throw new Error("Token cannot be null");
if (this.vocabularyType === LlamaVocabularyType.none)
return TokenAttributes._create(token, TokenAttribute.undefined);
return TokenAttributes._create(token, this._model.getTokenAttributes(token));
}
/** Check whether the given token is a special token (a control-type token or a token with no normal text representation) */
isSpecialToken(token) {
if (token == null)
return false;
if (this.getTokenAttributes(token).control)
return true;
const normalText = this.detokenize([token], false);
if (normalText === "")
return this.detokenize([token], true) !== "";
return false;
}
*iterateAllTokens() {
if (this.vocabularyType === LlamaVocabularyType.none)
return;
const totalTokens = this.fileInfo.metadata?.tokenizer?.ggml?.tokens?.length;
if (typeof totalTokens !== "number")
return;
for (let i = 0; i < totalTokens; i++)
yield i;
}
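    // Usage sketch (illustrative only) combining iterateAllTokens(), isSpecialToken() and
    // getTokenAttributes() to list the model's special tokens; `model` is a placeholder.
    //
    //     const specialTokens = [];
    //     for (const token of model.iterateAllTokens()) {
    //         if (model.isSpecialToken(token))
    //             specialTokens.push({token, control: model.getTokenAttributes(token).control});
    //     }
    //     console.log(specialTokens.length, "special tokens found");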
/** Check whether the given token is an EOG (End Of Generation) token, like EOS or EOT. */
isEogToken(token) {
if (token == null)
return false;
return token === this.tokens.eos || token === this.tokens.eot || this._model.isEogToken(token);
}
async createContext(options = {}) {
if (this._vocabOnly)
throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
return await withLock([this._llama._memoryLock, LlamaLocks.loadToMemory], options.createSignal, async () => {
const preventDisposalHandle = this._backendModelDisposeGuard.createPreventDisposalHandle();
try {
return await LlamaContext._create(options, { _model: this });
}
finally {
preventDisposalHandle.dispose();
}
});
}
/**
* @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
*/
async createEmbeddingContext(options = {}) {
if (this._vocabOnly)
throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
return await LlamaEmbeddingContext._create({ _model: this }, options);
}
/**
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
*/
async createRankingContext(options = {}) {
if (this._vocabOnly)
throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
return await LlamaRankingContext._create({ _model: this }, options);
}
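    // Usage sketch (illustrative only) of the three context factory methods above, assuming a
    // loaded `model` instance; the option values are illustrative, and the embedding/ranking
    // contexts require a model that supports those modes.
    //
    //     const context = await model.createContext({contextSize: {max: 4096}});
    //     const sequence = context.getSequence();
    //
    //     const embeddingContext = await model.createEmbeddingContext();
    //     const rankingContext = await model.createRankingContext();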
/**
* Get warnings about the model file that would affect its usage.
*
* These warnings include all the warnings generated by `GgufInsights`, but are more comprehensive.
*/
getWarnings() {
this._ensureNotDisposed();
const warnings = this._fileInsights.getWarnings(this._modelPath);
const modelFilePathText = `("${getReadablePath(this._modelPath)}")`;
try {
const beforeTextNoSpecialTokens = "some test text here";
const afterTextNoSpecialTokens = this.detokenize(this.tokenize(beforeTextNoSpecialTokens, false, "trimLeadingSpace"), false);
if (beforeTextNoSpecialTokens !== afterTextNoSpecialTokens)
warnings.push(`Using this model ${modelFilePathText} to tokenize text and then detokenize it resulted in a different text. ` +
"There might be an issue with the model or the tokenizer implementation. " +
"Using this model may not work as intended");
}
catch (err) {
// do nothing
}
try {
if (this._defaultContextFlashAttentionOptionEnabled && !this._flashAttentionSupported) {
if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.grok)
warnings.push("Flash attention is incompatible with Grok and thus was turned off");
else if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2)
warnings.push("Flash attention is incompatible with Gemma2 and thus was turned off");
else {
const nHead = this.fileInfo.architectureMetadata?.attention?.head_count ?? 0;
const nEmbd = this.fileInfo.architectureMetadata?.embedding_length ?? 0;
const nEmbdHeadK = this.fileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
const nEmbdHeadV = this.fileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
if (nEmbdHeadK !== nEmbdHeadV)
warnings.push("Flash attention is incompatible with this model and thus was turned off");
}
}
}
catch (err) {
// do nothing
}
return warnings;
}
/** @hidden `ModelTypeDescription` type alias is too long in the documentation */
get typeDescription() {
this._ensureNotDisposed();
if (this._typeDescription == null)
this._typeDescription = this._model.getModelDescription();
return this._typeDescription;
}
/** The context size the model was trained on */
get trainContextSize() {
this._ensureNotDisposed();
if (this._trainContextSize == null)
this._trainContextSize = this._model.getTrainContextSize();
return this._trainContextSize;
}
/** The size of an embedding vector the model can produce */
get embeddingVectorSize() {
this._ensureNotDisposed();
if (this._embeddingVectorSize == null)
this._embeddingVectorSize = this._model.getEmbeddingVectorSize();
return this._embeddingVectorSize;
}
get vocabularyType() {
this._ensureNotDisposed();
if (this._vocabularyType == null) {
const vocabType = this._model.getVocabularyType();
this._vocabularyType = LlamaVocabularyTypeValues[vocabType];
if (this._vocabularyType == null) {
console.warn(getConsoleLogPrefix() + "Unknown vocabulary type:", vocabType);
this._vocabularyType = LlamaVocabularyType.none;
}
}
return this._vocabularyType;
}
/** @internal */
_ensureNotDisposed() {
if (this._disposedState.disposed)
throw new DisposedError();
}
/** @internal */
async _getOrLoadLora(filePath) {
const resolvedPath = path.resolve(process.cwd(), filePath);
if (this._loraAdapters.has(resolvedPath))
return this._loraAdapters.get(resolvedPath);
return await withLock([this._loraAdapters, "modify"], async () => {
if (this._loraAdapters.has(resolvedPath))
return this._loraAdapters.get(resolvedPath);
const lora = new this._llama._bindings.AddonModelLora(this._model, resolvedPath);
await this._model.loadLora(lora);
this._loraAdapters.set(resolvedPath, lora);
return lora;
});
}
/** @internal */
static async _create(modelOptions, { _llama }) {
const { loadSignal, defaultContextFlashAttention } = modelOptions;
const useMmap = _llama.supportsMmap && (modelOptions.useMmap ?? defaultUseMmap);
const useDirectIo = modelOptions.useDirectIo ?? defaultUseDirectIo;
const fileInfo = await readGgufFileInfo(modelOptions.modelPath, {
sourceType: "filesystem",
signal: loadSignal
});
applyGgufMetadataOverrides(fileInfo, modelOptions.metadataOverrides);
const ggufInsights = await GgufInsights.from(fileInfo, _llama);
const flashAttentionSupported = ggufInsights.flashAttentionSupported;
const resolvedDefaultContextFlashAttention = flashAttentionSupported
? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled)
: false;
const resolvedDefaultContextSwaFullCache = modelOptions.defaultContextSwaFullCache ?? defaultContextSwaFullCache;
const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, {
ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks,
defaultContextFlashAttention: resolvedDefaultContextFlashAttention,
defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache,
useMmap
});
const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({
gpuLayers: gpuLayers,
useMmap
});
const model = new LlamaModel({ ...modelOptions, gpuLayers, useMmap, useDirectIo }, {
_fileInfo: fileInfo,
_fileInsights: ggufInsights,
_llama,
_defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? false,
_flashAttentionSupported: flashAttentionSupported,
_defaultContextFlashAttention: resolvedDefaultContextFlashAttention,
_defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache
});
const modelCreationVramReservation = modelOptions.ignoreMemorySafetyChecks
? null
: _llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram);
const modelCreationRamReservation = modelOptions.ignoreMemorySafetyChecks
? null
: _llama._ramOrchestrator.reserveMemory(resourceRequirementsEstimation.cpuRam);
const loggedWarnings = new Set();
function onAbort() {
model._model.abortActiveModelLoad();
loadSignal?.removeEventListener("abort", onAbort);
}
function logWarnings(warnings) {
for (const warning of warnings) {
if (loggedWarnings.has(warning))
continue;
_llama._log(LlamaLogLevel.warn, warning);
loggedWarnings.add(warning);
}
}
if (loadSignal != null) {
if (loadSignal.aborted)
throw loadSignal.reason;
loadSignal.addEventListener("abort", onAbort);
}
logWarnings(ggufInsights.getWarnings(modelOptions.modelPath));
try {
const modelLoaded = await model._model.init();
if (loadSignal?.aborted) {
if (modelLoaded)
await model._model.dispose();
throw loadSignal.reason;
}
else if (!modelLoaded)
throw new Error("Failed to load model");
loadSignal?.removeEventListener("abort", onAbort);
logWarnings(model.getWarnings());
return model;
}
finally {
loadSignal?.removeEventListener("abort", onAbort);
modelCreationVramReservation?.dispose?.();
modelCreationRamReservation?.dispose?.();
}
}
}
export class LlamaModelTokens {
/** @internal */ _model;
/** @internal */ _disposedState;
/** @internal */ _infillTokens;
/** @internal */ _bosToken;
/** @internal */ _eosToken;
/** @internal */ _eotToken;
/** @internal */ _sepToken;
/** @internal */ _nlToken;
/** @internal */ _bosString;
/** @internal */ _eosString;
/** @internal */ _eotString;
/** @internal */ _sepString;
/** @internal */ _nlString;
/** @internal */ _shouldPrependBosToken;
/** @internal */ _shouldAppendEosToken;
constructor(model, disposedState) {
this._model = model;
this._disposedState = disposedState;
}
/**
* @returns The infill tokens.
*/
get infill() {
this._ensureNotDisposed();
if (this._infillTokens == null)
this._infillTokens = LlamaModelInfillTokens._create(this._model, this._disposedState);
return this._infillTokens;
}
/**
* @returns The BOS (Beginning Of Sequence) token.
*/
get bos() {
this._ensureNotDisposed();
if (this._bosToken == null)
this._bosToken = this._model.tokenBos();
if (this._bosToken === -1)
return null;
return this._bosToken;
}
/**
* @returns The EOS (End Of Sequence) token.
*/
get eos() {
this._ensureNotDisposed();
if (this._eosToken == null)
this._eosToken = this._model.tokenEos();
if (this._eosToken === -1)
return null;
return this._eosToken;
}
/**
* @returns The EOT (End Of Turn) token.
*/
get eot() {
this._ensureNotDisposed();
if (this._eotToken == null)
this._eotToken = this._model.eotToken();
if (this._eotToken === -1)
return null;
return this._eotToken;
}
/**
* @returns The SEP (Sentence Separator) token.
*/
get sep() {
this._ensureNotDisposed();
if (this._sepToken == null)
this._sepToken = this._model.sepToken();
if (this._sepToken === -1)
return null;
return this._sepToken;
}
/**
* @returns The NL (New Line) token.
*/
get nl() {
this._ensureNotDisposed();
if (this._nlToken == null)
this._nlToken = this._model.tokenNl();
if (this._nlToken === -1)
return null;
return this._nlToken;
}
/**
* @returns The BOS (Beginning Of Sequence) token text representation.
*/
get bosString() {
this._ensureNotDisposed();
const bosToken = this.bos;
if (bosToken == null)
return null;
if (this._bosString == null)
this._bosString = this._model.getTokenString(bosToken);
return this._bosString;
}
/**
* @returns The EOS (End Of Sequence) token text representation.
*/
get eosString() {
this._ensureNotDisposed();
const eosToken = this.eos;
if (eosToken == null)
return null;
if (this._eosString == null)
this._eosString = this._model.getTokenString(eosToken);
return this._eosString;
}
/**
* @returns The EOT (End Of Turn) token text representation.
*/
get eotString() {
this._ensureNotDisposed();
const eotToken = this.eot;
if (eotToken == null)
return null;
if (this._eotString == null)
this._eotString = this._model.getTokenString(eotToken);
return this._eotString;
}
/**
* @returns The SEP (Sentence Separator) token text representation.
*/
get sepString() {
this._ensureNotDisposed();
const sepToken = this.sep;
if (sepToken == null)
return null;
if (this._sepString == null)
this._sepString = this._model.getTokenString(sepToken);
return this._sepString;
}
/**
* @returns The NL (New Line) token text representation.
*/
get nlString() {
this._ensureNotDisposed();
const nlToken = this.nl;
if (nlToken == null)
return null;
if (this._nlString == null)
this._nlString = this._model.getTokenString(nlToken);
return this._nlString;
}
/**
* @returns Whether we should prepend a BOS (Beginning Of Sequence) token for evaluations with this model.
*/
get shouldPrependBosToken() {
this._ensureNotDisposed();
if (this._shouldPrependBosToken == null)
this._shouldPrependBosToken = this.bos != null && this._model.shouldPrependBosToken();
return this._shouldPrependBosToken;
}
/**
* @returns Whether we should append an EOS (End Of Sequence) token for evaluations with this model.
*/
get shouldAppendEosToken() {
this._ensureNotDisposed();
if (this._shouldAppendEosToken == null)
this._shouldAppendEosToken = this.eos != null && this._model.shouldAppendEosToken();
return this._shouldAppendEosToken;
}
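    // Usage sketch (illustrative only): how the two flags above are typically consulted when
    // building a raw evaluation input by hand; `model` and the text are placeholders.
    //
    //     const textTokens = model.tokenize("Some text");
    //     const input = [
    //         ...(model.tokens.shouldPrependBosToken && model.tokens.bos != null ? [model.tokens.bos] : []),
    //         ...textTokens,
    //         ...(model.tokens.shouldAppendEosToken && model.tokens.eos != null ? [model.tokens.eos] : [])
    //     ];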
/** @internal */
_ensureNotDisposed() {
if (this._disposedState.disposed)
throw new DisposedError();
}
/** @internal */
static _create(model, disposedState) {
return new LlamaModelTokens(model, disposedState);
}
}
export class LlamaModelInfillTokens {
/** @internal */ _model;
/** @internal */ _disposedState;
/** @internal */ _prefixToken;
/** @internal */ _middleToken;
/** @internal */ _suffixToken;
/** @internal */ _prefixString;
/** @internal */ _middleString;
/** @internal */ _suffixString;
constructor(model, disposedState) {
this._model = model;
this._disposedState = disposedState;
}
/**
* @returns The beginning of infill prefix token.
*/
get prefix() {
this._ensureNotDisposed();
if (this._prefixToken == null)
this._prefixToken = this._resolveSpecialToken(this._model.prefixToken(), ["<fim_prefix>"]);
if (this._prefixToken === -1)
return null;
return this._prefixToken;
}
/**
* @returns The beginning of infill middle token.
*/
get middle() {
this._ensureNotDisposed();
if (this._middleToken == null)
this._middleToken = this._resolveSpecialToken(this._model.middleToken(), ["<fim_middle>"]);
if (this._middleToken === -1)
return null;
return this._middleToken;
}
/**
* @returns The beginning of infill suffix token.
*/
get suffix() {
this._ensureNotDisposed();
if (this._suffixToken == null)
this._suffixToken = this._resolveSpecialToken(this._model.suffixToken(), ["<fim_suffix>"]);
if (this._suffixToken === -1)
return null;
return this._suffixToken;
}
/**
* @returns The beginning of infill prefix token as a string.
*/
get prefixString() {
this._ensureNotDisposed();
const prefixToken = this.prefix;
if (prefixToken == null)
return null;
if (this._prefixString == null)
this._prefixString = this._model.getTokenString(prefixToken);
return this._prefixString;
}
/**
* @returns The beginning of infill middle token as a string.
*/
get middleString() {
this._ensureNotDisposed();
const middleToken = this.middle;
if (middleToken == null)
return null;
if (this._middleString == null)
this._middleString = this._model.getTokenString(middleToken);
return this._middleString;
}
/**
* @returns The beginning of infill suffix token as a string.
*/
get suffixString() {
this._ensureNotDisposed();
const suffixToken = this.suffix;
if (suffixToken == null)
return null;
if (this._suffixString == null)
this._suffixString = this._model.getTokenString(suffixToken);
return this._suffixString;
}
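    // Usage sketch (illustrative only): the prefix/suffix/middle tokens above are the building
    // blocks of a fill-in-middle (infill) prompt. The "prefix, suffix, middle" ordering shown
    // here is an assumption - the exact layout a model expects is model-specific.
    //
    //     const {prefix, suffix, middle} = model.tokens.infill;
    //     if (prefix != null && suffix != null && middle != null) {
    //         const input = [
    //             prefix, ...model.tokenize("function add(a, b) {"),
    //             suffix, ...model.tokenize("}"),
    //             middle
    //         ];
    //         // evaluate `input` on a context sequence to generate the missing middle part
    //     }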
/** @internal */
_ensureNotDisposed() {
if (this._disposedState.disposed)
throw new DisposedError();
}
/** @internal */
_resolveSpecialToken(token, fallbackTexts) {
if (token != null && token !== -1)
return token;
for (const text of fallbackTexts) {
const tokens = this._model.tokenize(text, true);
if (tokens.length !== 1)
continue;
return tokens[0];
}
return -1;
}
/** @internal */
static _create(model, disposedState) {
return new LlamaModelInfillTokens(model, disposedState);
}
}
function applyGgufMetadataOverrides(ggufFileInfo, overrides) {
function applyOverride(object, override) {
if (override == null || object == null)
return;
if (object instanceof Array || typeof object !== "object" || typeof override !== "object")
return;
for (const [key, value] of Object.entries(override)) {
if (value instanceof Array || typeof value !== "object" || (typeof value === "object" && typeof object[key] !== "object"))
object[key] = value;
else
applyOverride(object[key], value);
}
}
applyOverride(ggufFileInfo.metadata, overrides);
}
function ggufMetadataOverridesToList(overrides) {
const maxStringLength = 127;
const maxKeyLength = 127;
const res = [];
function addItem(object, path) {
if (object == null || object instanceof Array)
return;
if (typeof object !== "object") {
if (typeof object === "string" && object.length > maxStringLength)
throw new Error(`Metadata key "${path.join(".")}" override string value (${JSON.stringify(object)}) is longer than ${maxStringLength} characters`);
const key = path.join(".");
if (key.length > maxKeyLength)
throw new Error(`Metadata key "${key}" override path is longer than ${maxKeyLength} characters`);
let type = undefined;
if (typeof object === "number") {
if (typeof object === "bigint" || Number.isInteger(object))
type = 0;
else
type = 1;
}
res.push([key, object, type]);
return;
}
for (const [key, value] of Object.entries(object))
addItem(value, [...path, key]);
}
addItem(overrides ?? {}, []);
return res;
}
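// Illustrative sketch of the metadata overrides shape handled by the two helpers above:
// applyGgufMetadataOverrides() deep-merges the object into the parsed file metadata, while
// ggufMetadataOverridesToList() flattens it into [key, value, type] entries. The override
// values below are illustrative only.
//
//     const overrides = {
//         general: {name: "My local build"},
//         tokenizer: {ggml: {add_bos_token: false}}
//     };
//     // ggufMetadataOverridesToList(overrides) ~>
//     //     [["general.name", "My local build", undefined],
//     //      ["tokenizer.ggml.add_bos_token", false, undefined]]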
function disposeModelIfReferenced(modelRef) {
const model = modelRef.deref();
if (model != null)
void model.dispose();
}
//# sourceMappingURL=LlamaModel.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,29 @@
import { Token } from "../../../types.js";
export declare const enum TokenAttribute {
undefined = 0,
unknown = 1,
unused = 2,
normal = 4,
control = 8, // SPECIAL
userDefined = 16,
byte = 32,
normalized = 64,
lstrip = 128,
rstrip = 256,
singleWord = 512
}
export declare class TokenAttributes {
readonly token: Token;
private constructor();
get undefined(): boolean;
get unknown(): boolean;
get unused(): boolean;
get normal(): boolean;
get control(): boolean;
get userDefined(): boolean;
get byte(): boolean;
get normalized(): boolean;
get lstrip(): boolean;
get rstrip(): boolean;
get singleWord(): boolean;
}

View File

@@ -0,0 +1,65 @@
// updated against `enum llama_token_attr` from `llama.h`
export var TokenAttribute;
(function (TokenAttribute) {
TokenAttribute[TokenAttribute["undefined"] = 0] = "undefined";
TokenAttribute[TokenAttribute["unknown"] = 1] = "unknown";
TokenAttribute[TokenAttribute["unused"] = 2] = "unused";
TokenAttribute[TokenAttribute["normal"] = 4] = "normal";
TokenAttribute[TokenAttribute["control"] = 8] = "control";
TokenAttribute[TokenAttribute["userDefined"] = 16] = "userDefined";
TokenAttribute[TokenAttribute["byte"] = 32] = "byte";
TokenAttribute[TokenAttribute["normalized"] = 64] = "normalized";
TokenAttribute[TokenAttribute["lstrip"] = 128] = "lstrip";
TokenAttribute[TokenAttribute["rstrip"] = 256] = "rstrip";
TokenAttribute[TokenAttribute["singleWord"] = 512] = "singleWord";
})(TokenAttribute || (TokenAttribute = {}));
export class TokenAttributes {
token;
/** @internal */ _attributes;
constructor(token, attributes) {
this.token = token;
this._attributes = attributes;
}
get undefined() {
return this._attributes === TokenAttribute.undefined;
}
get unknown() {
return this._hasAttribute(TokenAttribute.unknown);
}
get unused() {
return this._hasAttribute(TokenAttribute.unused);
}
get normal() {
return this._hasAttribute(TokenAttribute.normal);
}
get control() {
return this._hasAttribute(TokenAttribute.control);
}
get userDefined() {
return this._hasAttribute(TokenAttribute.userDefined);
}
get byte() {
return this._hasAttribute(TokenAttribute.byte);
}
get normalized() {
return this._hasAttribute(TokenAttribute.normalized);
}
get lstrip() {
return this._hasAttribute(TokenAttribute.lstrip);
}
get rstrip() {
return this._hasAttribute(TokenAttribute.rstrip);
}
get singleWord() {
return this._hasAttribute(TokenAttribute.singleWord);
}
/** @internal */
_hasAttribute(attribute) {
return (this._attributes & attribute) === attribute;
}
/** @internal */
static _create(token, attributes) {
return new TokenAttributes(token, attributes);
}
}
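// Worked example (illustrative only): the attribute check above is a plain bitmask test.
// With _attributes === 24 (control | userDefined, i.e. 8 | 16):
//     (24 & 8) === 8    -> control is true
//     (24 & 16) === 16  -> userDefined is true
//     (24 & 4) === 4 is false, so normal is false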
//# sourceMappingURL=TokenAttributes.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TokenAttributes.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaModel/utils/TokenAttributes.ts"],"names":[],"mappings":"AAEA,yDAAyD;AACzD,MAAM,CAAN,IAAkB,cAYjB;AAZD,WAAkB,cAAc;IAC5B,6DAAa,CAAA;IACb,yDAAgB,CAAA;IAChB,uDAAe,CAAA;IACf,uDAAe,CAAA;IACf,yDAAgB,CAAA;IAChB,kEAAoB,CAAA;IACpB,oDAAa,CAAA;IACb,gEAAmB,CAAA;IACnB,yDAAe,CAAA;IACf,yDAAe,CAAA;IACf,iEAAmB,CAAA;AACvB,CAAC,EAZiB,cAAc,KAAd,cAAc,QAY/B;AAED,MAAM,OAAO,eAAe;IACR,KAAK,CAAQ;IAC7B,gBAAgB,CAAkB,WAAW,CAAiB;IAE9D,YAAoB,KAAY,EAAE,UAA0B;QACxD,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,WAAW,GAAG,UAAU,CAAC;IAClC,CAAC;IAED,IAAW,SAAS;QAChB,OAAO,IAAI,CAAC,WAAW,KAAK,cAAc,CAAC,SAAS,CAAC;IACzD,CAAC;IAED,IAAW,OAAO;QACd,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC;IACtD,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC;IAED,IAAW,OAAO;QACd,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC;IACtD,CAAC;IAED,IAAW,WAAW;QAClB,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,WAAW,CAAC,CAAC;IAC1D,CAAC;IAED,IAAW,IAAI;QACX,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IACnD,CAAC;IAED,IAAW,UAAU;QACjB,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,UAAU,CAAC,CAAC;IACzD,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC;IAED,IAAW,UAAU;QACjB,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,UAAU,CAAC,CAAC;IACzD,CAAC;IAED,gBAAgB;IACR,aAAa,CAAC,SAAyB;QAC3C,OAAO,CAAC,IAAI,CAAC,WAAW,GAAG,SAAS,CAAC,KAAK,SAAS,CAAC;IACxD,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,OAAO,CAAC,KAAY,EAAE,UAA0B;QAC1D,OAAO,IAAI,eAAe,CAAC,KAAK,EAAE,UAAU,CAAC,CAAC;IAClD,CAAC;CACJ"}

View File

@@ -0,0 +1,91 @@
import { EventRelay } from "lifecycle-utils";
import { Token } from "../types.js";
import { LlamaText } from "../utils/LlamaText.js";
import type { LlamaModel } from "./LlamaModel/LlamaModel.js";
export type LlamaRankingContextOptions = {
/**
* The number of tokens the model can see at once.
* - **`"auto"`** - adapt to the current VRAM state and attempt to set the context size as high as possible up to the size
* the model was trained on.
* - **`number`** - set the context size to a specific number of tokens.
* If there's not enough VRAM, an error will be thrown.
* Use with caution.
* - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attempt to set the context size as high as possible
* up to the size the model was trained on, but at least `min` and at most `max`.
*
* Defaults to `"auto"`.
*/
contextSize?: "auto" | number | {
min?: number;
max?: number;
};
/** prompt processing batch size */
batchSize?: number;
/**
* Number of threads to use to evaluate tokens.
* Set to `0` to use the maximum number of threads supported by the current machine hardware
*/
threads?: number;
/** An abort signal to abort the context creation */
createSignal?: AbortSignal;
/**
* The template to use for the ranking evaluation.
* If not provided, the model's template will be used by default.
*
* The template is tokenized with special tokens enabled, but the provided query and document are not.
*
* **<span v-pre>`{{query}}`</span>** is replaced with the query content.
*
* **<span v-pre>`{{document}}`</span>** is replaced with the document content.
*
* It's recommended to not set this option unless you know what you're doing.
*
* Defaults to the model's template.
*/
template?: `${string}{{query}}${string}{{document}}${string}` | `${string}{{document}}${string}{{query}}${string}`;
/**
* Ignore insufficient memory errors and continue with the context creation.
* Can cause the process to crash if there's not enough VRAM for the new context.
*
* Defaults to `false`.
*/
ignoreMemorySafetyChecks?: boolean;
};
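// Illustrative example of the `template` option format described above; the wording of the
// template itself is a made-up placeholder and the right phrasing is model-dependent.
//
//     const template = "Query: {{query}}\nDocument: {{document}}\nRelevant:";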
/**
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
*/
export declare class LlamaRankingContext {
readonly onDispose: EventRelay<void>;
private constructor();
/**
* Get the ranking score for a document for a query.
*
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
* @returns a ranking score between 0 and 1 representing the probability that the document is relevant to the query.
*/
rank(query: Token[] | string | LlamaText, document: Token[] | string | LlamaText): Promise<number>;
/**
* Get the ranking scores for all the given documents for a query.
*
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
* @returns an array of ranking scores between 0 and 1 representing the probability that the document is relevant to the query.
*/
rankAll(query: Token[] | string | LlamaText, documents: Array<Token[] | string | LlamaText>): Promise<number[]>;
/**
* Get the ranking scores for all the given documents for a query and sort them by score from highest to lowest.
*
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
*/
rankAndSort<const T extends string>(query: Token[] | string | LlamaText, documents: T[]): Promise<Array<{
document: T;
/**
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
*/
score: number;
}>>;
dispose(): Promise<void>;
/** @hidden */
[Symbol.asyncDispose](): Promise<void>;
get disposed(): boolean;
get model(): LlamaModel;
}
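// Usage sketch (illustrative only) of a minimal reranking flow, assuming a loaded `model`
// instance that supports ranking; the query and documents are placeholders.
//
//     const rankingContext = await model.createRankingContext();
//     const ranked = await rankingContext.rankAndSort("Which planet is the largest?", [
//         "Jupiter is the largest planet in the Solar System.",
//         "A recipe for pancakes."
//     ]);
//     console.log(ranked[0]?.document, ranked[0]?.score);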

View File

@@ -0,0 +1,178 @@
import { AsyncDisposeAggregator, EventRelay, splitText, withLock } from "lifecycle-utils";
import { tokenizeInput } from "../utils/tokenizeInput.js";
import { resolveBeginningTokenToPrepend, resolveEndTokenToAppend } from "../utils/tokenizerUtils.js";
import { isRankingTemplateValid, parseRankingTemplate } from "../gguf/insights/GgufInsights.js";
/**
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
*/
export class LlamaRankingContext {
/** @internal */ _llamaContext;
/** @internal */ _template;
/** @internal */ _sequence;
/** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
onDispose = new EventRelay();
constructor({ _llamaContext, _template }) {
this._llamaContext = _llamaContext;
this._template = _template;
this._sequence = this._llamaContext.getSequence();
this._disposeAggregator.add(this._llamaContext.onDispose.createListener(() => {
void this._disposeAggregator.dispose();
}));
this._disposeAggregator.add(this.onDispose.dispatchEvent);
this._disposeAggregator.add(async () => {
await this._llamaContext.dispose();
});
}
/**
* Get the ranking score for a document for a query.
*
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
* @returns a ranking score between 0 and 1 representing the probability that the document is relevant to the query.
*/
async rank(query, document) {
const resolvedInput = this._getEvaluationInput(query, document);
if (resolvedInput.length > this._llamaContext.contextSize)
throw new Error("The input length exceed the context size. " +
`Try to increase the context size to at least ${resolvedInput.length + 1} ` +
"or use another model that supports longer contexts.");
return this._evaluateRankingForInput(resolvedInput);
}
/**
* Get the ranking scores for all the given documents for a query.
*
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
* @returns an array of ranking scores between 0 and 1 representing the probability that the document is relevant to the query.
*/
async rankAll(query, documents) {
const resolvedTokens = documents.map((document) => this._getEvaluationInput(query, document));
const maxInputTokensLength = resolvedTokens.reduce((max, tokens) => Math.max(max, tokens.length), 0);
if (maxInputTokensLength > this._llamaContext.contextSize)
throw new Error("The input lengths of some of the given documents exceed the context size. " +
`Try to increase the context size to at least ${maxInputTokensLength + 1} ` +
"or use another model that supports longer contexts.");
else if (resolvedTokens.length === 0)
return [];
return await Promise.all(resolvedTokens.map((tokens) => this._evaluateRankingForInput(tokens)));
}
/**
* Get the ranking scores for all the given documents for a query and sort them by score from highest to lowest.
*
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
*/
async rankAndSort(query, documents) {
const scores = await this.rankAll(query, documents);
return documents
.map((document, index) => ({ document: document, score: scores[index] }))
.sort((a, b) => b.score - a.score);
}
async dispose() {
await this._disposeAggregator.dispose();
}
/** @hidden */
[Symbol.asyncDispose]() {
return this.dispose();
}
get disposed() {
return this._llamaContext.disposed;
}
get model() {
return this._llamaContext.model;
}
/** @internal */
_getEvaluationInput(query, document) {
if (this._template != null) {
const resolvedInput = splitText(this._template, ["{{query}}", "{{document}}"])
.flatMap((item) => {
if (typeof item === "string")
return this._llamaContext.model.tokenize(item, true, "trimLeadingSpace");
else if (item.separator === "{{query}}")
return tokenizeInput(query, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
else if (item.separator === "{{document}}")
return tokenizeInput(document, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
else
void item;
return [];
});
const beginningTokens = resolveBeginningTokenToPrepend(this.model.vocabularyType, this.model.tokens);
const endToken = resolveEndTokenToAppend(this.model.vocabularyType, this.model.tokens);
if (beginningTokens != null && resolvedInput.at(0) !== beginningTokens)
resolvedInput.unshift(beginningTokens);
if (endToken != null && resolvedInput.at(-1) !== endToken)
resolvedInput.push(endToken);
return resolvedInput;
}
if (this.model.tokens.eos == null && this.model.tokens.sep == null)
throw new Error("Computing rankings is not supported for this model.");
const resolvedQuery = tokenizeInput(query, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
const resolvedDocument = tokenizeInput(document, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
if (resolvedQuery.length === 0 && resolvedDocument.length === 0)
return [];
const resolvedInput = [
...(this.model.tokens.bos == null ? [] : [this.model.tokens.bos]),
...resolvedQuery,
...(this.model.tokens.eos == null ? [] : [this.model.tokens.eos]),
...(this.model.tokens.sep == null ? [] : [this.model.tokens.sep]),
...resolvedDocument,
...(this.model.tokens.eos == null ? [] : [this.model.tokens.eos])
];
return resolvedInput;
}
/** @internal */
_evaluateRankingForInput(input) {
if (input.length === 0)
return Promise.resolve(0);
return withLock([this, "evaluate"], async () => {
await this._sequence.eraseContextTokenRanges([{
start: 0,
end: this._sequence.nextTokenIndex
}]);
const iterator = this._sequence.evaluate(input, { _noSampling: true });
// eslint-disable-next-line @typescript-eslint/no-unused-vars
for await (const token of iterator) {
break; // only generate one token to get embeddings
}
const embedding = this._llamaContext._ctx.getEmbedding(input.length, 1);
if (embedding.length === 0)
return 0;
const logit = embedding[0];
const probability = logitToSigmoid(logit);
return probability;
});
}
/** @internal */
static async _create({ _model }, { contextSize, batchSize, threads = 6, createSignal, template, ignoreMemorySafetyChecks }) {
const resolvedTemplate = template ?? parseRankingTemplate(_model.fileInfo.metadata?.tokenizer?.["chat_template.rerank"]);
if (_model.tokens.eos == null && _model.tokens.sep == null) {
if (!isRankingTemplateValid(resolvedTemplate)) {
if (resolvedTemplate === _model.fileInfo.metadata?.tokenizer?.["chat_template.rerank"])
throw new Error("The model's builtin template is invalid. It must contain both {query} and {document} placeholders.");
else
throw new Error("The provided template is invalid. It must contain both {{query}} and {{document}} placeholders.");
}
else if (resolvedTemplate == null)
throw new Error("Computing rankings is not supported for this model.");
}
if (_model.fileInsights.hasEncoder && _model.fileInsights.hasDecoder)
throw new Error("Computing rankings is not supported for encoder-decoder models.");
if (!_model.fileInsights.supportsRanking)
throw new Error("Computing rankings is not supported for this model.");
const llamaContext = await _model.createContext({
contextSize,
batchSize,
threads,
createSignal,
ignoreMemorySafetyChecks,
_embeddings: true,
_ranking: true
});
return new LlamaRankingContext({
_llamaContext: llamaContext,
_template: resolvedTemplate
});
}
}
function logitToSigmoid(logit) {
return 1 / (1 + Math.exp(-logit));
}
//# sourceMappingURL=LlamaRankingContext.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,37 @@
import { Token, Tokenizer } from "../types.js";
import { LlamaText } from "../utils/LlamaText.js";
import type { LlamaModel } from "./LlamaModel/LlamaModel.js";
/**
* @see [Using Token Bias](https://node-llama-cpp.withcat.ai/guide/token-bias) tutorial
*/
export declare class TokenBias {
constructor(tokenizer: Tokenizer);
/**
* Adjust the bias of the given token(s).
*
* If a text is provided, the bias will be applied to each individual token in the text.
*
* Setting a bias to `"never"` will prevent the token from being generated, unless it is required to comply with a grammar.
*
* Setting the bias of the EOS or EOT tokens to `"never"` has no effect and will be ignored.
* @param input - The token(s) to apply the bias to
* @param bias - The probability bias to apply to the token(s).
*
* Setting to a positive number increases the probability of the token(s) being generated.
*
* Setting to a negative number decreases the probability of the token(s) being generated.
*
* Setting to `0` has no effect.
*
* For example, setting to `0.5` will increase the probability of the token(s) being generated by 50%.
* Setting to `-0.5` will decrease the probability of the token(s) being generated by 50%.
*
* Setting to `"never"` will prevent the token from being generated, unless it is required to comply with a grammar.
*
* Try to play around with values between `0.9` and `-0.9` to see what works for your use case.
*/
set(input: Token | Token[] | string | LlamaText, bias: "never" | number | {
logit: number;
}): this;
static for(modelOrTokenizer: LlamaModel | Tokenizer): TokenBias;
}
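// Usage sketch (illustrative only) of building a bias with the API declared above; the texts
// and bias values are placeholders, and passing the bias via a chat session's `tokenBias`
// prompt option is an assumption about how it is typically consumed.
//
//     const bias = TokenBias.for(model)
//         .set("Hello", -0.9)      // make this text less likely
//         .set("World", "never");  // never generate this text (unless a grammar requires it)
//     // e.g. session.prompt(text, {tokenBias: bias})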

View File

@@ -0,0 +1,68 @@
import { tokenizeInput } from "../utils/tokenizeInput.js";
/**
* @see [Using Token Bias](https://node-llama-cpp.withcat.ai/guide/token-bias) tutorial
*/
export class TokenBias {
/** @internal */ _tokenizer;
/** @internal */ _biases = new Map();
constructor(tokenizer) {
this._tokenizer = tokenizer;
}
/**
* Adjust the bias of the given token(s).
*
* If a text is provided, the bias will be applied to each individual token in the text.
*
* Setting a bias to `"never"` will prevent the token from being generated, unless it is required to comply with a grammar.
*
* Setting the bias of the EOS or EOT tokens to `"never"` has no effect and will be ignored.
* @param input - The token(s) to apply the bias to
* @param bias - The probability bias to apply to the token(s).
*
* Setting to a positive number increases the probability of the token(s) being generated.
*
* Setting to a negative number decreases the probability of the token(s) being generated.
*
* Setting to `0` has no effect.
*
* For example, setting to `0.5` will increase the probability of the token(s) being generated by 50%.
* Setting to `-0.5` will decrease the probability of the token(s) being generated by 50%.
*
* Setting to `"never"` will prevent the token from being generated, unless it is required to comply with a grammar.
*
* Try to play around with values between `0.9` and `-0.9` to see what works for your use case.
*/
set(input, bias) {
const resolvedLogit = bias === "never"
? -Infinity
: typeof bias === "number"
? probabilityToLogit(bias)
: bias.logit;
for (const token of tokenizeInput(input, this._tokenizer)) {
if (this._tokenizer.isEogToken(token))
continue;
this._biases.set(token, resolvedLogit);
}
for (const token of tokenizeInput(input, this._tokenizer, "trimLeadingSpace")) {
if (this._tokenizer.isEogToken(token))
continue;
this._biases.set(token, resolvedLogit);
}
return this;
}
static for(modelOrTokenizer) {
if (modelOrTokenizer.tokenizer != null)
return new TokenBias(modelOrTokenizer.tokenizer);
return new TokenBias(modelOrTokenizer);
}
}
function probabilityToLogit(probability) {
if (probability <= -1)
return -Infinity;
else if (probability >= 1)
return Infinity;
else if (probability === 0)
return 0;
return Math.log(probability / (1 - probability));
}
//# sourceMappingURL=TokenBias.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TokenBias.js","sourceRoot":"","sources":["../../src/evaluator/TokenBias.ts"],"names":[],"mappings":"AAEA,OAAO,EAAC,aAAa,EAAC,MAAM,2BAA2B,CAAC;AAGxD;;GAEG;AACH,MAAM,OAAO,SAAS;IAClB,gBAAgB,CAAiB,UAAU,CAAY;IACvD,gBAAgB,CAAiB,OAAO,GAAG,IAAI,GAAG,EAAiB,CAAC;IAEpE,YAAmB,SAAoB;QACnC,IAAI,CAAC,UAAU,GAAG,SAAS,CAAC;IAChC,CAAC;IAED;;;;;;;;;;;;;;;;;;;;;;;OAuBG;IACI,GAAG,CAAC,KAA2C,EAAE,IAAwC;QAC5F,MAAM,aAAa,GAAG,IAAI,KAAK,OAAO;YAClC,CAAC,CAAC,CAAC,QAAQ;YACX,CAAC,CAAC,OAAO,IAAI,KAAK,QAAQ;gBACtB,CAAC,CAAC,kBAAkB,CAAC,IAAI,CAAC;gBAC1B,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC;QAErB,KAAK,MAAM,KAAK,IAAI,aAAa,CAAC,KAAK,EAAE,IAAI,CAAC,UAAU,CAAC,EAAE,CAAC;YACxD,IAAI,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,KAAK,CAAC;gBACjC,SAAS;YAEb,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAC3C,CAAC;QAED,KAAK,MAAM,KAAK,IAAI,aAAa,CAAC,KAAK,EAAE,IAAI,CAAC,UAAU,EAAE,kBAAkB,CAAC,EAAE,CAAC;YAC5E,IAAI,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,KAAK,CAAC;gBACjC,SAAS;YAEb,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAC3C,CAAC;QAED,OAAO,IAAI,CAAC;IAChB,CAAC;IAEM,MAAM,CAAC,GAAG,CAAC,gBAAwC;QACtD,IAAK,gBAA+B,CAAC,SAAS,IAAI,IAAI;YAClD,OAAO,IAAI,SAAS,CAAE,gBAA+B,CAAC,SAAS,CAAC,CAAC;QAErE,OAAO,IAAI,SAAS,CAAC,gBAA6B,CAAC,CAAC;IACxD,CAAC;CACJ;AAED,SAAS,kBAAkB,CAAC,WAAmB;IAC3C,IAAI,WAAW,IAAI,CAAC,CAAC;QACjB,OAAO,CAAC,QAAQ,CAAC;SAChB,IAAI,WAAW,IAAI,CAAC;QACrB,OAAO,QAAQ,CAAC;SACf,IAAI,WAAW,KAAK,CAAC;QACtB,OAAO,CAAC,CAAC;IAEb,OAAO,IAAI,CAAC,GAAG,CAAC,WAAW,GAAG,CAAC,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC;AACrD,CAAC"}

View File

@@ -0,0 +1,45 @@
/**
* Tracks the usage of tokens.
*/
export declare class TokenMeter {
private _inputTokens;
private _outputTokens;
/**
* The number of input tokens used
*/
get usedInputTokens(): number;
/**
* The number of tokens generated by a model
*/
get usedOutputTokens(): number;
/**
* Get the current state of the token meter
*/
getState(): TokenMeterState;
/**
* Log the usage of tokens
*/
useTokens(tokens: number, type: "input" | "output"): void;
/**
* Get the difference between the current meter and another meter
*/
diff(meter: TokenMeter | TokenMeterState): {
usedInputTokens: number;
usedOutputTokens: number;
};
/**
* Log the usage of tokens on multiple meters
*/
static useTokens(meters: null | undefined | TokenMeter | readonly TokenMeter[] | ReadonlySet<TokenMeter>, tokens: number, type: "input" | "output"): void;
/**
* Get the difference between two meters
*/
static diff(meter1: TokenMeter | TokenMeterState, meter2: TokenMeter | TokenMeterState): {
usedInputTokens: number;
usedOutputTokens: number;
};
}
export type TokenMeterState = {
usedInputTokens: number;
usedOutputTokens: number;
};
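// Usage sketch (illustrative only) of the diff pattern for measuring the tokens used by a
// single operation; `sequence.tokenMeter` and `session.prompt` are assumptions about the
// surrounding API, not shown in this file.
//
//     const before = sequence.tokenMeter.getState();
//     await session.prompt("Hi there");
//     const used = TokenMeter.diff(sequence.tokenMeter, before);
//     console.log(used.usedInputTokens, used.usedOutputTokens);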

View File

@@ -0,0 +1,74 @@
/**
* Tracks the usage of tokens.
*/
export class TokenMeter {
_inputTokens = 0;
_outputTokens = 0;
/**
* The number of input tokens used
*/
get usedInputTokens() {
return this._inputTokens;
}
/**
* The number of tokens generated by a model
*/
get usedOutputTokens() {
return this._outputTokens;
}
/**
* Get the current state of the token meter
*/
getState() {
return {
usedInputTokens: this.usedInputTokens,
usedOutputTokens: this.usedOutputTokens
};
}
/**
* Log the usage of tokens
*/
useTokens(tokens, type) {
if (tokens < 0)
throw new RangeError("Tokens cannot be negative");
else if (tokens === 0)
return;
if (type === "input")
this._inputTokens += tokens;
else if (type === "output")
this._outputTokens += tokens;
else {
void type;
throw new TypeError(`Unknown token type: ${type}`);
}
}
/**
* Get the difference between the current meter and another meter
*/
diff(meter) {
return TokenMeter.diff(this, meter);
}
/**
* Log the usage of tokens on multiple meters
*/
static useTokens(meters, tokens, type) {
if (meters == null)
return;
if (meters instanceof TokenMeter)
meters.useTokens(tokens, type);
else {
for (const meter of meters)
meter.useTokens(tokens, type);
}
}
/**
* Get the difference between two meters
*/
static diff(meter1, meter2) {
return {
usedInputTokens: meter1.usedInputTokens - meter2.usedInputTokens,
usedOutputTokens: meter1.usedOutputTokens - meter2.usedOutputTokens
};
}
}
//# sourceMappingURL=TokenMeter.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TokenMeter.js","sourceRoot":"","sources":["../../src/evaluator/TokenMeter.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,MAAM,OAAO,UAAU;IACX,YAAY,GAAW,CAAC,CAAC;IACzB,aAAa,GAAW,CAAC,CAAC;IAElC;;OAEG;IACH,IAAW,eAAe;QACtB,OAAO,IAAI,CAAC,YAAY,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,IAAW,gBAAgB;QACvB,OAAO,IAAI,CAAC,aAAa,CAAC;IAC9B,CAAC;IAED;;OAEG;IACI,QAAQ;QACX,OAAO;YACH,eAAe,EAAE,IAAI,CAAC,eAAe;YACrC,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;SAC1C,CAAC;IACN,CAAC;IAED;;OAEG;IACI,SAAS,CAAC,MAAc,EAAE,IAAwB;QACrD,IAAI,MAAM,GAAG,CAAC;YACV,MAAM,IAAI,UAAU,CAAC,2BAA2B,CAAC,CAAC;aACjD,IAAI,MAAM,KAAK,CAAC;YACjB,OAAO;QAEX,IAAI,IAAI,KAAK,OAAO;YAChB,IAAI,CAAC,YAAY,IAAI,MAAM,CAAC;aAC3B,IAAI,IAAI,KAAK,QAAQ;YACtB,IAAI,CAAC,aAAa,IAAI,MAAM,CAAC;aAC5B,CAAC;YACF,KAAM,IAAqB,CAAC;YAC5B,MAAM,IAAI,SAAS,CAAC,uBAAuB,IAAI,EAAE,CAAC,CAAC;QACvD,CAAC;IACL,CAAC;IAED;;OAEG;IACI,IAAI,CAAC,KAAmC;QAC3C,OAAO,UAAU,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;IACxC,CAAC;IAED;;OAEG;IACI,MAAM,CAAC,SAAS,CACnB,MAAuF,EACvF,MAAc,EACd,IAAwB;QAExB,IAAI,MAAM,IAAI,IAAI;YACd,OAAO;QAEX,IAAI,MAAM,YAAY,UAAU;YAC5B,MAAM,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;aAC9B,CAAC;YACF,KAAK,MAAM,KAAK,IAAI,MAAM;gBACtB,KAAK,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;QACtC,CAAC;IACL,CAAC;IAED;;OAEG;IACI,MAAM,CAAC,IAAI,CACd,MAAoC,EACpC,MAAoC;QAEpC,OAAO;YACH,eAAe,EAAE,MAAM,CAAC,eAAe,GAAG,MAAM,CAAC,eAAe;YAChE,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,GAAG,MAAM,CAAC,gBAAgB;SACtE,CAAC;IACN,CAAC;CACJ"}

View File

@@ -0,0 +1,86 @@
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
import { Token, Tokenizer } from "../../types.js";
import { LlamaText } from "../../utils/LlamaText.js";
/**
* Chunk the given document using a given context sequence to use the chunks for RAG (Retrieval Augmented Generation) embeddings.
*
* This chunking method is fast and efficient, and utilizes as much parallelization as your hardware allows.
*
* Based on https://github.com/ZeroEntropy-AI/llama-chunk
* @experimental - this API is experimental and may change or be removed in subsequent releases
* @hidden
*/
export declare function experimentalChunkDocument(options: {
contextSequence: LlamaContextSequence;
document: string;
/**
* The tokens to use as separators for chunking the document.
* Passed to the `getSystemPrompt` function to generate the prompt.
*/
separatorTokens?: Token[];
getSystemPrompt?(options: {
separatorTokens: Token[];
tokenizer: Tokenizer;
maxChunkSize?: number;
}): LlamaText | string;
/**
* Maximum number of tokens to allow in a chunk.
*
* The closer a chunk's size gets to this limit, the higher the probability of a separator token being inserted.
*
* Set to `0` to disable this mechanism.
*
* Defaults to `500`.
*/
maxChunkSize?: number;
/**
* The alignment curve for the maximum chunk size mechanism.
*
* Adjust the value based on the behavior of the model.
*
* Play around with values between `1` and `4` to see what works best for you.
*
* Set to `1` to disable this mechanism.
*
* Defaults to `4`.
*/
maxChunkSizeAlignmentCurve?: number;
/**
* Append the next few tokens (up to `maxTokens`) to the current chunk if their trimmed content
* matches any of the texts in `trimmedTexts`
*/
syntaxAlignment?: {
/**
* The maximum number of tokens to append to the current chunk if their trimmed content matches any of the texts in `trimmedTexts`.
*
* Default: `4`
*/
maxTokens?: number;
/**
* The trimmed texts to match for, to append the token to the current chunk.
*
* Default: `["", ".", ";"]`
*/
trimmedTexts?: string[];
};
/**
* The number of tokens to skip before starting to use the generated separator tokens to split the document.
*/
skipFirstTokens?: number;
/**
* The number of recent probabilities to keep in the trail for normalization.
*
* Adjust the value based on the behavior of the model.
*
* Defaults to `200`.
*/
normalizationTrailSize?: number;
/**
* Called when a chunk is generated with the tokens that make up the chunk and the separator token used to split the chunk.
*/
onChunkTokens?(chunkTokens: Token[], usedSeparatorToken: Token): void;
/**
* Called when a chunk is generated with the text that makes up the chunk and the separator token used to split the chunk.
*/
onChunkText?(chunkText: string, usedSeparatorToken: Token): void;
}): Promise<string[]>;
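// Usage sketch (illustrative only) of a minimal call to the experimental chunker declared
// above, assuming a dedicated context sequence and a plain-text `documentText`; both are
// placeholders, and the option values are illustrative.
//
//     const chunks = await experimentalChunkDocument({
//         contextSequence: context.getSequence(),
//         document: documentText,
//         maxChunkSize: 400,
//         onChunkText(chunkText) {
//             console.log("chunk:", chunkText.length, "characters");
//         }
//     });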

View File

@@ -0,0 +1,212 @@
import { LlamaText, SpecialTokensText } from "../../utils/LlamaText.js";
import { resolveChatWrapper } from "../../chatWrappers/utils/resolveChatWrapper.js";
import { safeEventCallback } from "../../utils/safeEventCallback.js";
import { maxRecentDetokenizerTokens } from "../../consts.js";
/**
* Chunk the given document using a given context sequence to use the chunks for RAG (Retrieval Augmented Generation) embeddings.
*
* This chunking method is fast and efficient, and utilizes as much parallelization as your hardware allows.
*
* Based on https://github.com/ZeroEntropy-AI/llama-chunk
* @experimental - this API is experimental and may change or be removed in subsequent releases
* @hidden
*/
export async function experimentalChunkDocument(options) {
const { contextSequence, document, separatorTokens = findAppropriateSeparatorTokens(contextSequence.model), getSystemPrompt = getDefaultPrompt, maxChunkSize = 500, maxChunkSizeAlignmentCurve = 4, syntaxAlignment: { maxTokens: maxSyntaxAlignment = 4, trimmedTexts: syntaxAlignmentTrimmedTexts = ["", ".", ";"] } = {}, skipFirstTokens = 3, normalizationTrailSize = 100 } = options;
const onChunkTokens = safeEventCallback(options.onChunkTokens);
const onChunkText = safeEventCallback(options.onChunkText);
if (separatorTokens.length === 0)
throw new Error("Separator tokens must be provided");
const chatHistory = [{
type: "system",
text: LlamaText(getSystemPrompt({
separatorTokens,
tokenizer: contextSequence.model.tokenizer,
maxChunkSize: maxChunkSize <= 0
? undefined
: maxChunkSize
})).toJSON()
}, {
type: "user",
text: document
}, {
type: "model",
response: [""]
}];
const chatWrapper = resolveChatWrapper(contextSequence.model);
const { contextText } = chatWrapper.generateContextState({ chatHistory });
const initialContextTokens = contextText.tokenize(contextSequence.model.tokenizer, "trimLeadingSpace");
const documentTokens = contextSequence.model.tokenize(document, false, "trimLeadingSpace");
const syntaxAlignmentTrimmedTextsSet = new Set(syntaxAlignmentTrimmedTexts);
if (initialContextTokens.length + documentTokens.length > contextSequence.context.contextSize)
throw new Error("The context size is too small to chunk the given document");
const evaluateInput = initialContextTokens.slice();
for (let i = 0; i < documentTokens.length - 1; i++) {
const token = documentTokens[i];
evaluateInput.push([token, {
generateNext: {
probabilities: true
}
}]);
}
let weight = 1;
const recentProbabilitiesTrail = [];
let chunkStartIndex = 0;
let lastPushedSeparatorIndex = 0;
const chunks = [];
const res = [];
function pushSeparatorIndex(separateIndex, separatorToken) {
lastPushedSeparatorIndex = separateIndex;
if (separateIndex <= chunkStartIndex)
return;
let endIndex = separateIndex;
for (let i = 0; i < maxSyntaxAlignment && documentTokens[endIndex + i] != null; i++) {
const text = contextSequence.model.detokenize([documentTokens[endIndex + i]]);
if (!syntaxAlignmentTrimmedTextsSet.has(text.trim()))
break;
endIndex++;
}
const chunk = documentTokens.slice(chunkStartIndex, endIndex);
const text = contextSequence.model.detokenize(chunk, false, documentTokens.slice(chunkStartIndex - maxRecentDetokenizerTokens, chunkStartIndex));
chunks.push(chunk);
chunkStartIndex = endIndex;
onChunkTokens?.(chunk, separatorToken);
onChunkText?.(text, separatorToken);
res.push(text);
}
await contextSequence.controlledEvaluate(evaluateInput, {
onTokenResult(inputTokenIndex, result) {
const i = inputTokenIndex - initialContextTokens.length;
const nextProbabilities = result?.next?.probabilities;
const nextDocumentToken = documentTokens[i + 1];
if (nextProbabilities == null)
throw new Error("received no result for token " + i);
const topProbabilityScore = nextProbabilities.entries()
.next().value?.[1];
const [usedSeparatorToken, separatorProbability] = separatorTokens
.filter((token) => token !== nextDocumentToken) // avoid splitting on document tokens
.map((token) => [token, nextProbabilities.get(token)])
.filter((pair) => pair[1] != null)
.reduce(([tokenA, probabilityA], [tokenB, probabilityB]) => {
if (probabilityA >= probabilityB)
return [tokenA, probabilityA];
return [tokenB, probabilityB];
}, [separatorTokens[0], 0]);
if (topProbabilityScore == null || separatorProbability == null || separatorProbability === 0)
return;
// console.log(
// i, contextSequence.model.detokenize([documentTokens[i]!]),
// Array.from(nextProbabilities.entries()).slice(0, 5)
// .map(([token, probability]) => [contextSequence.model.detokenize([token], true), probability])
// );
if (separatorProbability >= topProbabilityScore)
pushSeparatorIndex(i + 1, usedSeparatorToken);
else if (i > skipFirstTokens) {
const adjustedProbability = separatorProbability + (weight * (1 - separatorProbability));
let maxChunkSizeAlignment = 0;
if (maxChunkSize !== 0 && adjustedProbability < topProbabilityScore) {
const leftProbability = 1 - adjustedProbability;
const currentChunkSize = Math.max(0, 1 + i - chunkStartIndex);
maxChunkSizeAlignment = currentChunkSize === 0
? 0
: adjustExponential(leftProbability * Math.min(1, currentChunkSize / maxChunkSize), maxChunkSizeAlignmentCurve <= 0
? 1
: maxChunkSizeAlignmentCurve, 0.8);
if (currentChunkSize === maxChunkSize)
maxChunkSizeAlignment = 1;
}
if (adjustedProbability + maxChunkSizeAlignment >= topProbabilityScore && adjustedProbability > 0) {
pushSeparatorIndex(i + 1, usedSeparatorToken);
// update the weight of the current token with the adjusted probability in the trail
if (recentProbabilitiesTrail.length > 1) {
weight /= recentProbabilitiesTrail.pop();
recentProbabilitiesTrail.push(adjustedProbability);
weight *= adjustedProbability;
}
}
}
const nextDocumentTokenProbability = nextDocumentToken == null
? undefined
: nextProbabilities.get(nextDocumentToken);
if (nextDocumentTokenProbability != null && nextDocumentTokenProbability > 0) {
recentProbabilitiesTrail.push(nextDocumentTokenProbability);
weight *= nextDocumentTokenProbability;
if (recentProbabilitiesTrail.length > normalizationTrailSize)
weight /= recentProbabilitiesTrail.shift();
}
}
});
if (lastPushedSeparatorIndex !== documentTokens.length)
pushSeparatorIndex(documentTokens.length, separatorTokens[0]);
return res;
}
const idealTokenTexts = [
"\u6bb5", // means "section" in Chinese (according to https://github.com/ZeroEntropy-AI/llama-chunk)
"\u987f", // means "pause" in Chinese (according to Llama 3.1 8B and Qwen 2.5 3B)
"\u00a1", // inverted exclamation mark
"|",
"_"
];
function findAppropriateSeparatorTokens(model, maxTokens = 2) {
const idealTextsSet = new Set(idealTokenTexts);
const foundTokens = [];
for (const token of model.iterateAllTokens()) {
if (model.isSpecialToken(token))
continue;
const text = model.detokenize([token]);
const trimmedText = text.trim();
if (idealTextsSet.has(trimmedText)) {
const textIndex = idealTokenTexts.findIndex((idealText) => idealText === trimmedText);
if (foundTokens[textIndex] == null || text === trimmedText)
foundTokens[textIndex] = token;
}
}
const res = [];
for (let i = 0; i < idealTokenTexts.length; i++) {
const token = foundTokens[i];
if (token != null)
res.push(token);
}
return res.slice(0, maxTokens);
}
function getDefaultPrompt({ separatorTokens, tokenizer, maxChunkSize = 500 }) {
if (separatorTokens.length === 0)
throw new Error("No separator tokens provided");
else if (separatorTokens.length > 2)
throw new Error("Maximum of 2 separator tokens are supported");
return LlamaText.joinValues("\n", [
'Your job is to act as a "Chunker", for usage in RAG pipelines. The user will provide a long document.',
"",
"You should repeat the exact same message verbatim. EXCEPT, you should insert split tokens throughout the document.",
"",
"# Instructions",
LlamaText([
"- For splits, use `",
new SpecialTokensText(tokenizer.detokenize([separatorTokens[0]])),
'` as the "big split token" separator.'
]),
separatorTokens.length > 1 && (LlamaText([
"- For small splits, use `",
new SpecialTokensText(tokenizer.detokenize([separatorTokens[1]])),
'` as the "small split token" separator.'
])),
"- For example, in text document, small splits will be per-sentence, and big splits will be per-section. Do a big split BEFORE the header that defines a section.",
LlamaText([
"- You may get a user message that is unstructured or not structured cleanly. " +
"Still try to split that input as best as you can, even if it means doing a small split every ", Math.ceil(maxChunkSize / 5),
" characters, and a big split every ", Math.floor(maxChunkSize), " characters."
]),
"- You should prefer to wait until the end of a newline or period to break, instead of breaking one or two tokens before that. If there are no newlines or periods, pick some other reasonable breakpoints instead.",
"- Your input could be anything - code, HTML, markdown, etc. You MUST try to output SOME split regardless of the input. Pick something reasonable! E.g. for nodejs, do a small split after every line or code block, and a big split after every function or class definitions.",
'- For HTML, add a small split token after every closing tag and sentence. Add a big split token after every closing tag of an "important" tag.',
"- Please note that you will sometimes not see your own splits in your previous output, that's OK, you MUST continue to try to output split tokens"
].filter((x) => x !== false));
}
function adjustExponential(value, exponent, weight) {
if (value < 0)
return 0;
else if (value > 1)
return 1;
return (value * (1 - weight)) + (weight * Math.pow(value, exponent));
}
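// Worked example (illustrative only):
//     adjustExponential(0.5, 4, 0.8) = (0.5 * 0.2) + (0.8 * 0.5^4) = 0.1 + 0.05 = 0.15
// so mid-range values are pushed down sharply, while values near 1 stay close to 1
// (adjustExponential(1, 4, 0.8) = 1).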
//# sourceMappingURL=chunkDocument.js.map

File diff suppressed because one or more lines are too long