First upload version 0.0.1
This commit is contained in:
245
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.d.ts
generated
vendored
Normal file
245
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.d.ts
generated
vendored
Normal file
@@ -0,0 +1,245 @@
|
||||
import { EventRelay } from "lifecycle-utils";
|
||||
import { Token } from "../../types.js";
|
||||
import { TokenMeter } from "../TokenMeter.js";
|
||||
import { LlamaModel } from "../LlamaModel/LlamaModel.js";
|
||||
import { ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem, EvaluationPriority, SequenceEvaluateMetadataOptions, SequenceEvaluateOptions, SequenceEvaluateOutput } from "./types.js";
|
||||
import { TokenPredictor } from "./TokenPredictor.js";
|
||||
export declare class LlamaContext {
|
||||
readonly onDispose: EventRelay<void>;
|
||||
private constructor();
|
||||
dispose(): Promise<void>;
|
||||
/** @hidden */
|
||||
[Symbol.asyncDispose](): Promise<void>;
|
||||
get disposed(): boolean;
|
||||
get model(): LlamaModel;
|
||||
get contextSize(): number;
|
||||
get batchSize(): number;
|
||||
get flashAttention(): boolean;
|
||||
/**
|
||||
* The actual size of the state in the memory in bytes.
|
||||
* This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
|
||||
*/
|
||||
get stateSize(): number;
|
||||
/** The number of threads currently used to evaluate tokens */
|
||||
get currentThreads(): number;
|
||||
/**
|
||||
* The number of threads that are preferred to be used to evaluate tokens.
|
||||
*
|
||||
* The actual number of threads used may be lower when other evaluations are running in parallel.
|
||||
*/
|
||||
get idealThreads(): number;
|
||||
getAllocatedContextSize(): number;
|
||||
get totalSequences(): number;
|
||||
get sequencesLeft(): number;
|
||||
/**
|
||||
* Before calling this method, make sure to call `sequencesLeft` to check if there are any sequences left.
|
||||
* When there are no sequences left, this method will throw an error.
|
||||
*/
|
||||
getSequence(options?: {
|
||||
contextShift?: ContextShiftOptions;
|
||||
/**
|
||||
* Token predictor to use for the sequence.
|
||||
* Don't share the same token predictor between multiple sequences.
|
||||
*
|
||||
* Using a token predictor doesn't affect the generation output itself -
|
||||
* it only allows for greater parallelization of the token evaluation to speed up the generation.
|
||||
*
|
||||
* > **Note:** that if a token predictor is too resource intensive,
|
||||
* > it can slow down the generation process due to the overhead of running the predictor.
|
||||
* >
|
||||
* > Testing the effectiveness of a token predictor on the target machine is recommended before using it in production.
|
||||
*
|
||||
* Automatically disposed when disposing the sequence.
|
||||
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction)
|
||||
*/
|
||||
tokenPredictor?: TokenPredictor;
|
||||
}): LlamaContextSequence;
|
||||
dispatchPendingBatch(): void;
|
||||
/**
|
||||
* Print the timings of token evaluation since that last print for this context.
|
||||
*
|
||||
* Requires the `performanceTracking` option to be enabled.
|
||||
*
|
||||
* > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
|
||||
* it won't print anything.
|
||||
*/
|
||||
printTimings(): Promise<void>;
|
||||
}
|
||||
export declare class LlamaContextSequence {
|
||||
readonly onDispose: EventRelay<void>;
|
||||
private constructor();
|
||||
dispose(): void;
|
||||
/** @hidden */
|
||||
[Symbol.dispose](): void;
|
||||
get disposed(): boolean;
|
||||
get context(): LlamaContext;
|
||||
get model(): LlamaModel;
|
||||
/** The maximum number of tokens that the sequence state can hold */
|
||||
get contextSize(): number;
|
||||
/** The index where the next evaluated token will be placed in the context */
|
||||
get nextTokenIndex(): number;
|
||||
/** The current context state tokens */
|
||||
get contextTokens(): Token[];
|
||||
get tokenMeter(): TokenMeter;
|
||||
/**
|
||||
* The token predictor used when creating this sequence.
|
||||
*/
|
||||
get tokenPredictor(): TokenPredictor | undefined;
|
||||
/**
|
||||
* Get the index of the first token in the KV cache.
|
||||
*
|
||||
* If you remove any tokens from the state that come before this index,
|
||||
* no cached prefix tokens evaluation state will be used for the next evaluation.
|
||||
*
|
||||
* For example, if `stateCellsStartIndex` is `10` and you remove the range `{start: 11, end: 16}`
|
||||
* then the cached state for range `0-10` will be used in the next evaluation,
|
||||
* but if you remove the range `{start: 10, end: 16}` (or `{start: 9, end: 16}`) then the cached state will not be used at all
|
||||
* and will be re-evaluated in the next evaluation.
|
||||
*
|
||||
* This index can be greater than `0` only when SWA (Sliding Window Attention) is used (only on supported models).
|
||||
*
|
||||
* When SWA is used, this index will usually be `Math.max(-1, .nextTokenIndex - .model.fileInsights.swaSize)` or larger.
|
||||
*
|
||||
* When the KV cache is empty, this index will be `-1`.
|
||||
*
|
||||
* You can disable SWA by setting the `swaFullCache` option to `true` when creating a context.
|
||||
*/
|
||||
get stateCellsStartIndex(): number;
|
||||
/**
|
||||
* Statistics of token predictions using the sequence's `tokenPredictor`.
|
||||
*
|
||||
* The statistics change only when token prediction is used in this sequence.
|
||||
*
|
||||
* `validated` + `refuted` = total number of evaluated predictions.
|
||||
*
|
||||
* Prefer using `validated` and `refuted` to evaluate the effectiveness of token prediction.
|
||||
*/
|
||||
get tokenPredictions(): {
|
||||
/** Number of token predictions that were actually used (tokens that were validated and then consumed) */
|
||||
used: number;
|
||||
/** Number of token predictions that were not used (tokens that were validated and were not consumed) */
|
||||
unused: number;
|
||||
/** Number of token predictions that were validated successfully */
|
||||
validated: number;
|
||||
/** Number of token predictions that were refuted */
|
||||
refuted: number;
|
||||
};
|
||||
get isLoadedToMemory(): boolean;
|
||||
compareContextTokens(tokens: Token[]): {
|
||||
firstDifferentIndex: number;
|
||||
};
|
||||
/**
|
||||
* Erase parts of the context state to align it with the given tokens.
|
||||
*
|
||||
* If the given tokens do not align with the current context state, the context state will be erased to align with the given tokens.
|
||||
*
|
||||
* To find the first different token index between the context state and the given tokens, access the `nextTokenIndex` property.
|
||||
*
|
||||
* If `allowShift` is `true` (the default), shifting tokens may happen to align the context state with the given tokens,
|
||||
* which incurs token evaluation of the shifted tokens.
|
||||
*/
|
||||
adaptStateToTokens(tokens: Token[], allowShift?: boolean): Promise<void>;
|
||||
/**
|
||||
* Clear the history of the sequence.
|
||||
*/
|
||||
clearHistory(): Promise<void>;
|
||||
/**
|
||||
* Erase context tokens in the provided ranges to free up space for new tokens to be generated.
|
||||
* The start of each range is inclusive, and the end of each range is exclusive.
|
||||
* For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
|
||||
*/
|
||||
eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]): Promise<void>;
|
||||
/**
|
||||
* Evaluate the provided tokens into the context sequence, and continue generating new tokens on iterator iterations.
|
||||
*
|
||||
* This method uses the token predictor (when provided) to generate new tokens faster.
|
||||
*/
|
||||
evaluate(tokens: Token[], options?: SequenceEvaluateOptions): AsyncGenerator<Token, void, void | Token | Token[]>;
|
||||
/**
|
||||
* Like {@link evaluate `.evaluate(...)`}, but with additional metadata for each generated token.
|
||||
*
|
||||
* Configure the additional metadata options to choose which metadata to include.
|
||||
*/
|
||||
evaluateWithMetadata<const Metadata extends SequenceEvaluateMetadataOptions>(tokens: Token[], metadata: Metadata, options?: SequenceEvaluateOptions): AsyncGenerator<SequenceEvaluateOutput<Metadata>, void, void | Token | Token[]>;
|
||||
/**
|
||||
* Evaluate the provided tokens into the context sequence without generating new tokens.
|
||||
*/
|
||||
evaluateWithoutGeneratingNewTokens(tokens: Token[], options?: {
|
||||
/**
|
||||
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
|
||||
* evaluated based on the strategy chosen for the context.
|
||||
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
|
||||
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
|
||||
* highest evaluation priority.
|
||||
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
|
||||
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
|
||||
*/
|
||||
evaluationPriority?: EvaluationPriority;
|
||||
/** Override the sequence context shift options for this evaluation */
|
||||
contextShift?: ContextShiftOptions;
|
||||
}): Promise<void>;
|
||||
/**
|
||||
* Evaluate the provided tokens into the context sequence with custom options for each token.
|
||||
*
|
||||
* This method allows for more precise control of the generation process.
|
||||
*
|
||||
* A next token will be generated for a given token only if any of the `generateNext` options for it are used.
|
||||
*
|
||||
* To generate more tokens after this method finishes,
|
||||
* use it again with token(s) you selected to add to the context from the previous evaluation.
|
||||
*
|
||||
* This method doesn't use the token predictor (when provided) since it cannot predict which tokens are actually needed.
|
||||
* Use the `evaluate` method when you need to use token prediction.
|
||||
* @returns An array where for each token in the input array, there can be an output item at the same index in the output array.
|
||||
* For indexes that have no output, there won't be any value at the corresponding index in the output array.
|
||||
*
|
||||
* It's recommended to iterate from `0` up to the length of the input array to check the results in the output array.
|
||||
*/
|
||||
controlledEvaluate(input: ControlledEvaluateInputItem[], options?: {
|
||||
/**
|
||||
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
|
||||
* evaluated based on the strategy chosen for the context.
|
||||
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
|
||||
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
|
||||
* highest evaluation priority.
|
||||
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
|
||||
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
|
||||
*/
|
||||
evaluationPriority?: EvaluationPriority;
|
||||
/** Override the sequence context shift options for this evaluation */
|
||||
contextShift?: ContextShiftOptions;
|
||||
/** Called on each token result after it's generated */
|
||||
onTokenResult?(inputTokenIndex: number, result: ControlledEvaluateIndexOutput): void;
|
||||
}): Promise<Array<undefined | ControlledEvaluateIndexOutput>>;
|
||||
/**
|
||||
* Save the current context sequence evaluation state to a file.
|
||||
* @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state)
|
||||
*/
|
||||
saveStateToFile(filePath: string): Promise<{
|
||||
fileSize: number;
|
||||
}>;
|
||||
/**
|
||||
* Load a context sequence evaluation state from a file.
|
||||
*
|
||||
* Trying to load a state file with a longer context size than the current sequence's context size will fail and throw an error.
|
||||
*
|
||||
* You must ensure that the file was created from the exact same model, otherwise, using this function may crash the process.
|
||||
* @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state)
|
||||
*/
|
||||
loadStateFromFile(filePath: string, acceptRisk: {
|
||||
/**
|
||||
* Loading a state file created using a different model may crash the process.
|
||||
*
|
||||
* You must accept this risk to use this feature.
|
||||
*/
|
||||
acceptRisk: true;
|
||||
}): Promise<void>;
|
||||
}
|
||||
export declare function getDefaultContextBatchSize({ contextSize, sequences }: {
|
||||
contextSize: number;
|
||||
sequences: number;
|
||||
}): number;
|
||||
export declare function getDefaultContextSequences(): number;
|
||||
export declare function getDefaultModelContextSize({ trainContextSize }: {
|
||||
trainContextSize?: number;
|
||||
}): number;
|
||||
1691
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.js
generated
vendored
Normal file
1691
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.js
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.d.ts
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.d.ts
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
export {};
|
||||
31
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.js
generated
vendored
Normal file
31
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.js
generated
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
/** @internal */
|
||||
export class LlamaSampler {
|
||||
/** @internal */ _llama;
|
||||
/** @internal */ _sampler;
|
||||
/** @internal */ disposed = false;
|
||||
constructor(model) {
|
||||
this._llama = model._llama;
|
||||
this._sampler = new this._llama._bindings.AddonSampler(model._model);
|
||||
this.asyncDispose = this.asyncDispose.bind(this);
|
||||
}
|
||||
dispose() {
|
||||
this.disposed = true;
|
||||
this._sampler.dispose();
|
||||
}
|
||||
async asyncDispose() {
|
||||
this.disposed = true;
|
||||
this._sampler.dispose();
|
||||
}
|
||||
applyConfig(config) {
|
||||
return this._sampler.applyConfig(config);
|
||||
}
|
||||
/** @internal */
|
||||
static _canBeNextTokenForGrammarEvaluationState(llama, grammarEvaluationState, token) {
|
||||
return llama._bindings.AddonSampler.canBeNextTokenForGrammarEvaluationState(grammarEvaluationState._state, token);
|
||||
}
|
||||
/** @internal */
|
||||
static _acceptTokenOnGrammarEvaluationState(llama, grammarEvaluationState, token) {
|
||||
llama._bindings.AddonSampler.acceptGrammarEvaluationStateToken(grammarEvaluationState._state, token);
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=LlamaSampler.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"LlamaSampler.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/LlamaSampler.ts"],"names":[],"mappings":"AAMA,gBAAgB;AAChB,MAAM,OAAO,YAAY;IACrB,gBAAgB,CAAiB,MAAM,CAAQ;IAC/C,gBAAgB,CAAiB,QAAQ,CAAe;IACxD,gBAAgB,CAAQ,QAAQ,GAAY,KAAK,CAAC;IAElD,YAAmB,KAAiB;QAChC,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;QAC3B,IAAI,CAAC,QAAQ,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAErE,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACrD,CAAC;IAEM,OAAO;QACV,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;QACrB,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;IAC5B,CAAC;IAEM,KAAK,CAAC,YAAY;QACrB,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;QACrB,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;IAC5B,CAAC;IAEM,WAAW,CAAC,MAAkD;QACjE,OAAO,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;IAC7C,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,wCAAwC,CAClD,KAAY,EACZ,sBAAmD,EACnD,KAAY;QAEZ,OAAO,KAAK,CAAC,SAAS,CAAC,YAAY,CAAC,uCAAuC,CACvE,sBAAsB,CAAC,MAAM,EAC7B,KAAK,CACR,CAAC;IACN,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,oCAAoC,CAC9C,KAAY,EACZ,sBAAmD,EACnD,KAAY;QAEZ,KAAK,CAAC,SAAS,CAAC,YAAY,CAAC,iCAAiC,CAAC,sBAAsB,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;IACzG,CAAC;CACJ"}
|
||||
55
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.d.ts
generated
vendored
Normal file
55
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.d.ts
generated
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
import { Token } from "../../types.js";
|
||||
import { SequenceEvaluateOptions } from "./types.js";
|
||||
import { LlamaContextSequence } from "./LlamaContext.js";
|
||||
/**
|
||||
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction#custom)
|
||||
*/
|
||||
export declare abstract class TokenPredictor {
|
||||
/**
|
||||
* Resets the state of the predictor.
|
||||
*
|
||||
* Called before the generation starts.
|
||||
*/
|
||||
abstract reset(params: {
|
||||
/** The target sequence that this token predictor is generating tokens for */
|
||||
targetSequence: LlamaContextSequence;
|
||||
/**
|
||||
* The tokens that are or will be loaded into the state.
|
||||
*
|
||||
* The initial predictions should be based on these tokens.
|
||||
*
|
||||
* When additional tokens are pushed into the state, the `pushTokens` method will be called with those tokens.
|
||||
*/
|
||||
stateTokens: Token[];
|
||||
/**
|
||||
* Options used for the evaluation on the target sequence.
|
||||
*
|
||||
* The `grammarEvaluationState` is cloned before being passed to the token predictor,
|
||||
* so it can be modified without affecting the original state.
|
||||
*/
|
||||
evaluateOptions: Readonly<SequenceEvaluateOptions>;
|
||||
}): Promise<void> | void;
|
||||
abstract pushTokens(tokens: Token[]): void;
|
||||
/**
|
||||
* Predicts the next tokens based on the current state.
|
||||
*
|
||||
* If the generation should wait until the minimum predications are ready,
|
||||
* this method should return a promise that resolves when the minimum predictions are ready.
|
||||
*
|
||||
* A background prediction process can be started when this function is called,
|
||||
* so that the next predictions will be ready when this function is called again.
|
||||
*/
|
||||
abstract predictTokens(): Promise<Token[]> | Token[];
|
||||
/**
|
||||
* Stops the prediction process when it runs in the background.
|
||||
* @param untilPredictionsExhausted - If true, the prediction process should not resume until the current predictions are exhausted.
|
||||
*/
|
||||
stop(untilPredictionsExhausted?: boolean): Promise<void> | void;
|
||||
/**
|
||||
* Called with the input tokens before the generation starts when using `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
|
||||
*/
|
||||
updateInputTokens(tokens: Token[]): void;
|
||||
dispose(): Promise<void> | void;
|
||||
/** @hidden */
|
||||
[Symbol.dispose](): void | Promise<void>;
|
||||
}
|
||||
20
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.js
generated
vendored
Normal file
20
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.js
generated
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
/**
|
||||
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction#custom)
|
||||
*/
|
||||
export class TokenPredictor {
|
||||
/**
|
||||
* Stops the prediction process when it runs in the background.
|
||||
* @param untilPredictionsExhausted - If true, the prediction process should not resume until the current predictions are exhausted.
|
||||
*/
|
||||
stop(untilPredictionsExhausted) { }
|
||||
/**
|
||||
* Called with the input tokens before the generation starts when using `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
|
||||
*/
|
||||
updateInputTokens(tokens) { }
|
||||
dispose() { }
|
||||
/** @hidden */
|
||||
[Symbol.dispose]() {
|
||||
return this.dispose();
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=TokenPredictor.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"TokenPredictor.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/TokenPredictor.ts"],"names":[],"mappings":"AAIA;;GAEG;AACH,MAAM,OAAgB,cAAc;IAwChC;;;OAGG;IACI,IAAI,CAAC,yBAAmC,IAAyB,CAAC;IAEzE;;OAEG;IACI,iBAAiB,CAAC,MAAe,IAAS,CAAC;IAE3C,OAAO,KAA0B,CAAC;IAEzC,cAAc;IACP,CAAC,MAAM,CAAC,OAAO,CAAC;QACnB,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;IAC1B,CAAC;CACJ"}
|
||||
56
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts
generated
vendored
Normal file
56
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts
generated
vendored
Normal file
@@ -0,0 +1,56 @@
|
||||
import { Token } from "../../../types.js";
|
||||
import { SequenceEvaluateOptions } from "../types.js";
|
||||
import { LlamaContextSequence } from "../LlamaContext.js";
|
||||
import { TokenPredictor } from "../TokenPredictor.js";
|
||||
/**
|
||||
* Predicts the next tokens by evaluating the current state of the target sequence
|
||||
* on a draft sequence from a smaller and faster draft model.
|
||||
* @see [Using Token Predictors: Draft Model Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#draft-model)
|
||||
*/
|
||||
export declare class DraftSequenceTokenPredictor extends TokenPredictor {
|
||||
constructor(draftSequence: LlamaContextSequence, options?: {
|
||||
/**
|
||||
* The minimum number of tokens to draft.
|
||||
*
|
||||
* Defaults to `0`.
|
||||
*/
|
||||
minTokens?: number;
|
||||
/**
|
||||
* Maximum number of tokens to draft.
|
||||
*
|
||||
* Defaults to `16`.
|
||||
*/
|
||||
maxTokens?: number;
|
||||
/**
|
||||
* Evaluate options default to the values of the target sequence.
|
||||
*
|
||||
* You can override any of the options for the prediction here.
|
||||
*/
|
||||
evaluateOptions?: Pick<SequenceEvaluateOptions, "temperature" | "minP" | "topK" | "topP" | "seed" | "repeatPenalty" | "tokenBias" | "evaluationPriority" | "contextShift">;
|
||||
/**
|
||||
* Minimum token confidence (probability of the token to be generated, assigned by the model) to consider the token as a prediction.
|
||||
* When the generated token confidence is lower than this value, the prediction process will stop until all the predicted tokens
|
||||
* are exhausted (either by a token that was not predicted being pushed, or all the generated predictions are consumed).
|
||||
*
|
||||
* A number between `0` and `1` representing the minimum probability of the token to be generated.
|
||||
*
|
||||
* Set to `0` to disable.
|
||||
*
|
||||
* Defaults to `0.6`.
|
||||
*/
|
||||
minConfidence?: number;
|
||||
});
|
||||
get draftSequence(): LlamaContextSequence;
|
||||
get minTokens(): number;
|
||||
get maxTokens(): number;
|
||||
get minConfidence(): number | undefined;
|
||||
reset({ targetSequence, stateTokens, evaluateOptions }: {
|
||||
targetSequence: LlamaContextSequence;
|
||||
stateTokens: Token[];
|
||||
evaluateOptions: Readonly<SequenceEvaluateOptions>;
|
||||
}): Promise<void>;
|
||||
pushTokens(tokens: Token[]): void;
|
||||
predictTokens(): Token[] | Promise<Token[]>;
|
||||
stop(untilPredictionsExhausted?: boolean): void;
|
||||
dispose(): void;
|
||||
}
|
||||
266
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js
generated
vendored
Normal file
266
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js
generated
vendored
Normal file
@@ -0,0 +1,266 @@
|
||||
import { withLock } from "lifecycle-utils";
|
||||
import { pushAll } from "../../../utils/pushAll.js";
|
||||
import { getConsoleLogPrefix } from "../../../utils/getConsoleLogPrefix.js";
|
||||
import { LlamaSampler } from "../LlamaSampler.js";
|
||||
import { TokenPredictor } from "../TokenPredictor.js";
|
||||
const defaultPredictionMinTokens = 0;
|
||||
const defaultPredictionMaxTokens = 16;
|
||||
const defaultPredictionMinConfidence = 0.6;
|
||||
/**
|
||||
* Predicts the next tokens by evaluating the current state of the target sequence
|
||||
* on a draft sequence from a smaller and faster draft model.
|
||||
* @see [Using Token Predictors: Draft Model Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#draft-model)
|
||||
*/
|
||||
export class DraftSequenceTokenPredictor extends TokenPredictor {
|
||||
/** @internal */ _draftSequence;
|
||||
/** @internal */ _minTokens;
|
||||
/** @internal */ _maxTokens;
|
||||
/** @internal */ _minConfidence;
|
||||
/** @internal */ _stateTokens = [];
|
||||
/** @internal */ _pendingEvalTokens = [];
|
||||
/** @internal */ _predictedTokens = [];
|
||||
/** @internal */ _evaluateOptions = {};
|
||||
/** @internal */ _overrideEvaluateOptions = {};
|
||||
/** @internal */ _grammarEvaluationStateOption;
|
||||
/** @internal */ _currentEvaluationAbortController = new AbortController();
|
||||
/** @internal */ _resetAbortController = new AbortController();
|
||||
/** @internal */ _stopped = true;
|
||||
/** @internal */ _waitForPredictionExhaustion = false;
|
||||
/** @internal */ _minTokensCallbacks = [];
|
||||
/** @internal */ _resetPredictions = false;
|
||||
/** @internal */ _iterator;
|
||||
/** @internal */ _active = false;
|
||||
/** @internal */ _disposed = false;
|
||||
constructor(draftSequence, options = {}) {
|
||||
super();
|
||||
this._draftSequence = draftSequence;
|
||||
this._minTokens = Math.floor(Math.max(0, options?.minTokens ?? defaultPredictionMinTokens));
|
||||
this._maxTokens = Math.floor(Math.max(this._minTokens, options?.maxTokens ?? defaultPredictionMaxTokens));
|
||||
this._overrideEvaluateOptions = options.evaluateOptions ?? {};
|
||||
this._minConfidence = Math.min(1, Math.max(0, options?.minConfidence ?? defaultPredictionMinConfidence));
|
||||
if (draftSequence.disposed)
|
||||
throw new Error("The draft sequence is disposed");
|
||||
}
|
||||
get draftSequence() {
|
||||
return this._draftSequence;
|
||||
}
|
||||
get minTokens() {
|
||||
return this._minTokens;
|
||||
}
|
||||
get maxTokens() {
|
||||
return this._maxTokens;
|
||||
}
|
||||
get minConfidence() {
|
||||
return this._minConfidence;
|
||||
}
|
||||
async reset({ targetSequence, stateTokens, evaluateOptions }) {
|
||||
this._currentEvaluationAbortController.abort();
|
||||
this._resetAbortController.abort();
|
||||
this._currentEvaluationAbortController = new AbortController();
|
||||
this._resetAbortController = new AbortController();
|
||||
this._stopped = true;
|
||||
this._waitForPredictionExhaustion = false;
|
||||
this._iterator?.return();
|
||||
this._iterator = undefined;
|
||||
const currentAbortSignal = this._resetAbortController.signal;
|
||||
targetSequence.context._ctx.ensureDraftContextIsCompatibleForSpeculative(this._draftSequence.context._ctx);
|
||||
try {
|
||||
await withLock([this, "evaluate"], currentAbortSignal, async () => {
|
||||
this._stateTokens = stateTokens.slice();
|
||||
this._pendingEvalTokens = [];
|
||||
this._predictedTokens = [];
|
||||
this._resetPredictions = false;
|
||||
while (this._minTokensCallbacks.length > 0)
|
||||
this._minTokensCallbacks.shift()?.();
|
||||
const lastToken = this._stateTokens.pop();
|
||||
if (lastToken != null)
|
||||
this._pendingEvalTokens.push(lastToken);
|
||||
this._evaluateOptions = evaluateOptions;
|
||||
this._grammarEvaluationStateOption = this._evaluateOptions.grammarEvaluationState instanceof Function
|
||||
? this._evaluateOptions.grammarEvaluationState()?.clone()
|
||||
: this._evaluateOptions.grammarEvaluationState?.clone();
|
||||
const newStateTokens = this._stateTokens.slice(-this._draftSequence.context.contextSize + 1);
|
||||
await this._draftSequence.adaptStateToTokens(newStateTokens, true);
|
||||
newStateTokens.splice(0, this._draftSequence.nextTokenIndex);
|
||||
await this._draftSequence.evaluateWithoutGeneratingNewTokens(newStateTokens, {
|
||||
contextShift: this._evaluateOptions.contextShift,
|
||||
evaluationPriority: this._evaluateOptions.evaluationPriority
|
||||
});
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (err !== currentAbortSignal.reason)
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
pushTokens(tokens) {
|
||||
const grammarEvaluationStateOption = this._evaluateOptions.grammarEvaluationState instanceof Function
|
||||
? this._evaluateOptions.grammarEvaluationState()?.clone()
|
||||
: this._evaluateOptions.grammarEvaluationState?.clone();
|
||||
void withLock([this, "pushTokens"], async () => {
|
||||
this._grammarEvaluationStateOption = grammarEvaluationStateOption;
|
||||
const tokensToPush = tokens.slice();
|
||||
while (!this._resetPredictions && tokensToPush.length > 0) {
|
||||
const token = tokensToPush.shift();
|
||||
if (this._predictedTokens.length > 0 && this._predictedTokens[0] === token) {
|
||||
this._predictedTokens.shift();
|
||||
}
|
||||
else {
|
||||
tokensToPush.unshift(token);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (tokensToPush.length === 0) {
|
||||
if (!this._waitForPredictionExhaustion || this._predictedTokens.length === 0)
|
||||
this._resume();
|
||||
return;
|
||||
}
|
||||
this._currentEvaluationAbortController.abort();
|
||||
this._currentEvaluationAbortController = new AbortController();
|
||||
pushAll(this._pendingEvalTokens, tokensToPush);
|
||||
this._resetPredictions = true;
|
||||
this._resume();
|
||||
});
|
||||
}
|
||||
predictTokens() {
|
||||
if (this._stopped && this._pendingEvalTokens.length === 0 && !this._resetPredictions)
|
||||
return this._predictedTokens;
|
||||
this._stopped = false;
|
||||
if (!this._waitForPredictionExhaustion || this._predictedTokens.length === 0) {
|
||||
this._waitForPredictionExhaustion = false;
|
||||
this._resume();
|
||||
}
|
||||
if (this._predictedTokens.length >= this._minTokens && !this._resetPredictions)
|
||||
return this._predictedTokens;
|
||||
if (!this._active || (this._waitForPredictionExhaustion && this._predictedTokens.length > 0)) {
|
||||
if (this._resetPredictions)
|
||||
return [];
|
||||
return this._predictedTokens;
|
||||
}
|
||||
return new Promise((accept) => void this._minTokensCallbacks.push(accept))
|
||||
.then(() => {
|
||||
if (this._resetPredictions)
|
||||
return [];
|
||||
return this._predictedTokens;
|
||||
});
|
||||
}
|
||||
stop(untilPredictionsExhausted = false) {
|
||||
this._stopped = true;
|
||||
this._currentEvaluationAbortController.abort();
|
||||
this._currentEvaluationAbortController = new AbortController();
|
||||
if (untilPredictionsExhausted)
|
||||
this._waitForPredictionExhaustion = true;
|
||||
void withLock([this, "evaluate"], async () => {
|
||||
this._iterator?.return();
|
||||
this._iterator = undefined;
|
||||
});
|
||||
}
|
||||
dispose() {
|
||||
this._disposed = true;
|
||||
this._stopped = true;
|
||||
this._resetAbortController.abort();
|
||||
this._currentEvaluationAbortController.abort();
|
||||
void withLock([this, "evaluate"], async () => {
|
||||
this._iterator?.return();
|
||||
this._iterator = undefined;
|
||||
});
|
||||
}
|
||||
/** @internal */
|
||||
_canIterate() {
|
||||
return !this._disposed && !this._stopped && (this._predictedTokens.length < this._maxTokens || this._resetPredictions);
|
||||
}
|
||||
/** @internal */
|
||||
_resume() {
|
||||
if (this._active || !this._canIterate())
|
||||
return;
|
||||
this._active = true;
|
||||
void withLock([this, "evaluate"], async () => {
|
||||
try {
|
||||
const abortSignal = this._currentEvaluationAbortController.signal;
|
||||
if (!this._canIterate() || abortSignal.aborted)
|
||||
return;
|
||||
const resetPredications = async () => {
|
||||
this._iterator?.return();
|
||||
this._iterator = undefined;
|
||||
this._waitForPredictionExhaustion = false;
|
||||
this._resetPredictions = false;
|
||||
const tokenToDelete = Math.max(0, Math.min(this._predictedTokens.length - 1, this._draftSequence.context.contextSize));
|
||||
this._predictedTokens = [];
|
||||
await this._draftSequence.eraseContextTokenRanges([{
|
||||
start: this._draftSequence.nextTokenIndex - tokenToDelete,
|
||||
end: this._draftSequence.nextTokenIndex
|
||||
}]);
|
||||
};
|
||||
const createIterator = () => {
|
||||
const tokens = this._pendingEvalTokens;
|
||||
this._pendingEvalTokens = [];
|
||||
return this.draftSequence.evaluateWithMetadata(tokens, { confidence: true }, {
|
||||
...this._evaluateOptions,
|
||||
...this._overrideEvaluateOptions,
|
||||
grammarEvaluationState: this._getGrammarEvaluationStateWithTokens(tokens)
|
||||
});
|
||||
};
|
||||
if (this._resetPredictions)
|
||||
await resetPredications();
|
||||
if (!this._canIterate() || abortSignal.aborted)
|
||||
return;
|
||||
let iterator = createIterator();
|
||||
this._iterator = iterator;
|
||||
while (this._canIterate() && !abortSignal.aborted) {
|
||||
const { value, done } = await iterator.next();
|
||||
let shouldBreak = done;
|
||||
if (value != null) {
|
||||
const { token, confidence } = value;
|
||||
if (this._minConfidence != null && this._minConfidence !== 0 && this._minConfidence !== 1 &&
|
||||
confidence < this._minConfidence) {
|
||||
this._iterator = undefined;
|
||||
await iterator.return();
|
||||
this._waitForPredictionExhaustion = true;
|
||||
shouldBreak = true;
|
||||
}
|
||||
else
|
||||
this._predictedTokens.push(token);
|
||||
}
|
||||
if (this._resetPredictions && !abortSignal.aborted) {
|
||||
await resetPredications();
|
||||
iterator = createIterator();
|
||||
this._iterator = iterator;
|
||||
continue;
|
||||
}
|
||||
if (this._predictedTokens.length >= this._minTokens) {
|
||||
while (this._minTokensCallbacks.length > 0)
|
||||
this._minTokensCallbacks.shift()?.();
|
||||
}
|
||||
if (shouldBreak) {
|
||||
this._iterator = undefined;
|
||||
await iterator.return();
|
||||
this._waitForPredictionExhaustion = true;
|
||||
while (this._minTokensCallbacks.length > 0)
|
||||
this._minTokensCallbacks.shift()?.();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
finally {
|
||||
this._active = false;
|
||||
}
|
||||
});
|
||||
}
|
||||
/** @internal */
|
||||
_getGrammarEvaluationStateWithTokens(tokens) {
|
||||
if (this._grammarEvaluationStateOption == null)
|
||||
return undefined;
|
||||
const clone = this._grammarEvaluationStateOption.clone();
|
||||
for (const token of tokens) {
|
||||
const canAddToken = LlamaSampler._canBeNextTokenForGrammarEvaluationState(this._draftSequence.model._llama, clone, token);
|
||||
if (!canAddToken) {
|
||||
console.warn(getConsoleLogPrefix(false, false), "The pushed tokens are incompatible with the grammar evaluation state. The grammar will be ignored.");
|
||||
this._grammarEvaluationStateOption = undefined;
|
||||
return undefined;
|
||||
}
|
||||
LlamaSampler._acceptTokenOnGrammarEvaluationState(this._draftSequence.model._llama, clone, token);
|
||||
}
|
||||
return clone;
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=DraftSequenceTokenPredictor.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
58
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.d.ts
generated
vendored
Normal file
58
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.d.ts
generated
vendored
Normal file
@@ -0,0 +1,58 @@
|
||||
import { Token } from "../../../types.js";
|
||||
import { TokenPredictor } from "../TokenPredictor.js";
|
||||
/**
|
||||
* Attempts to find the last few generated tokens in the input (prompt) tokens to predict the next tokens.
|
||||
*
|
||||
* This is useful in input-grounded tasks (when the model frequently repeats some of the input tokens in the output,
|
||||
* such as in text summarization or modifying code).
|
||||
*
|
||||
* This works in all completion classes, including `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
|
||||
*
|
||||
* Based on https://github.com/apoorvumang/prompt-lookup-decoding.
|
||||
* @see [Using Token Predictors: Input Lookup Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#input-lookup)
|
||||
*/
|
||||
export declare class InputLookupTokenPredictor extends TokenPredictor {
|
||||
constructor(options?: {
|
||||
patternLength?: {
|
||||
/**
|
||||
* Min pattern length to look for in the input tokens.
|
||||
*
|
||||
* Defaults to `1`.
|
||||
*/
|
||||
min?: number;
|
||||
/**
|
||||
* Max pattern length to look for in the input tokens.
|
||||
*
|
||||
* Set to `0` to disable the max pattern size.
|
||||
*
|
||||
* Defaults to `0`.
|
||||
*/
|
||||
max?: number;
|
||||
};
|
||||
predictionLength?: {
|
||||
/**
|
||||
* Minimum number of tokens to predict.
|
||||
*
|
||||
* Defaults to `1`.
|
||||
*/
|
||||
min?: number;
|
||||
/**
|
||||
* Maximum number of tokens to predict.
|
||||
*
|
||||
* Defaults to `3`.
|
||||
*/
|
||||
max?: number;
|
||||
};
|
||||
});
|
||||
get patternMinLength(): number;
|
||||
get patternMaxLength(): number;
|
||||
get predictionMinLength(): number;
|
||||
get predictionMaxLength(): number;
|
||||
reset({ stateTokens }: {
|
||||
stateTokens: Token[];
|
||||
}): void;
|
||||
updateInputTokens(tokens: Token[]): void;
|
||||
pushTokens(tokens: Token[]): void;
|
||||
predictTokens(): Token[];
|
||||
dispose(): void;
|
||||
}
|
||||
138
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js
generated
vendored
Normal file
138
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js
generated
vendored
Normal file
@@ -0,0 +1,138 @@
|
||||
import { DisposedError } from "lifecycle-utils";
|
||||
import { pushAll } from "../../../utils/pushAll.js";
|
||||
import { TokenPredictor } from "../TokenPredictor.js";
|
||||
const defaultPatternMinLength = 1;
|
||||
const defaultPatternMaxLength = 0;
|
||||
const defaultPredictionMinLength = 1;
|
||||
const defaultPredictionMaxLength = 3;
|
||||
/**
|
||||
* Attempts to find the last few generated tokens in the input (prompt) tokens to predict the next tokens.
|
||||
*
|
||||
* This is useful in input-grounded tasks (when the model frequently repeats some of the input tokens in the output,
|
||||
* such as in text summarization or modifying code).
|
||||
*
|
||||
* This works in all completion classes, including `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
|
||||
*
|
||||
* Based on https://github.com/apoorvumang/prompt-lookup-decoding.
|
||||
* @see [Using Token Predictors: Input Lookup Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#input-lookup)
|
||||
*/
|
||||
export class InputLookupTokenPredictor extends TokenPredictor {
|
||||
/** @internal */ _patternMinLength;
|
||||
/** @internal */ _patternMaxLength;
|
||||
/** @internal */ _predictionMinLength;
|
||||
/** @internal */ _predictionMaxLength;
|
||||
/** @internal */ _lastPredictionMatchStartIndex = undefined;
|
||||
/** @internal */ _lastPredictionMatchLength = undefined;
|
||||
/** @internal */ _stateTokens = [];
|
||||
/** @internal */ _inputTokens = [];
|
||||
/** @internal */ _disposed = false;
|
||||
constructor(options = {}) {
|
||||
super();
|
||||
this._patternMinLength = Math.floor(Math.max(1, options?.patternLength?.min ?? defaultPatternMinLength));
|
||||
this._patternMaxLength = Math.floor(Math.max(0, Math.max(this._patternMinLength, options?.patternLength?.max ?? defaultPatternMaxLength)));
|
||||
this._predictionMinLength = Math.floor(Math.max(1, options.predictionLength?.min ?? defaultPredictionMinLength));
|
||||
this._predictionMaxLength = Math.floor(Math.max(this._patternMinLength, options.predictionLength?.max ?? defaultPredictionMaxLength));
|
||||
}
|
||||
get patternMinLength() {
|
||||
return this._patternMinLength;
|
||||
}
|
||||
get patternMaxLength() {
|
||||
return this._patternMaxLength;
|
||||
}
|
||||
get predictionMinLength() {
|
||||
return this._predictionMinLength;
|
||||
}
|
||||
get predictionMaxLength() {
|
||||
return this._predictionMaxLength;
|
||||
}
|
||||
reset({ stateTokens }) {
|
||||
this._stateTokens = stateTokens.slice();
|
||||
delete this._lastPredictionMatchStartIndex;
|
||||
delete this._lastPredictionMatchLength;
|
||||
}
|
||||
updateInputTokens(tokens) {
|
||||
this._inputTokens = tokens.slice();
|
||||
delete this._lastPredictionMatchStartIndex;
|
||||
delete this._lastPredictionMatchLength;
|
||||
}
|
||||
pushTokens(tokens) {
|
||||
pushAll(this._stateTokens, tokens);
|
||||
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
|
||||
this._lastPredictionMatchLength += tokens.length;
|
||||
}
|
||||
}
|
||||
predictTokens() {
|
||||
if (this._disposed)
|
||||
throw new DisposedError();
|
||||
if (this._inputTokens.length === 0 || this._stateTokens.length === 0)
|
||||
return [];
|
||||
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
|
||||
for (let p = this._lastPredictionMatchStartIndex + this._lastPredictionMatchLength - 1, s = this._stateTokens.length - 1; p >= this._lastPredictionMatchStartIndex && s >= 0; p--, s--) {
|
||||
if (this._inputTokens[p] !== this._stateTokens[s]) {
|
||||
delete this._lastPredictionMatchStartIndex;
|
||||
delete this._lastPredictionMatchLength;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
|
||||
const predictionEndIndex = this._lastPredictionMatchStartIndex + this._lastPredictionMatchLength;
|
||||
if (predictionEndIndex < this._inputTokens.length) {
|
||||
return this._inputTokens.slice(predictionEndIndex, predictionEndIndex + this._predictionMaxLength);
|
||||
}
|
||||
}
|
||||
}
|
||||
const [matchStartIndex, matchLength] = this._findLongestPatternIndex(this._inputTokens, this._stateTokens);
|
||||
if (matchStartIndex == null || matchLength == null)
|
||||
return [];
|
||||
const predictionEndIndex = matchStartIndex + matchLength;
|
||||
const res = this._inputTokens.slice(predictionEndIndex, predictionEndIndex + this._predictionMaxLength);
|
||||
if (res.length >= this._predictionMinLength) {
|
||||
this._lastPredictionMatchStartIndex = matchStartIndex;
|
||||
this._lastPredictionMatchLength = matchLength;
|
||||
return res;
|
||||
}
|
||||
return [];
|
||||
}
|
||||
dispose() {
|
||||
this._disposed = true;
|
||||
this._stateTokens = [];
|
||||
this._inputTokens = [];
|
||||
delete this._lastPredictionMatchStartIndex;
|
||||
delete this._lastPredictionMatchLength;
|
||||
}
|
||||
/** @internal */
|
||||
_findLongestPatternIndex(findIn, lookupPattern) {
|
||||
const checkIndexes = [];
|
||||
let bestIndex = -1;
|
||||
let bestIndexDiff = -1;
|
||||
for (let i = findIn.length - this._predictionMinLength; i >= 0; i--) {
|
||||
const token = findIn[i];
|
||||
for (let j = checkIndexes.length - 1; j >= 0; j--) {
|
||||
const startIndex = checkIndexes[j];
|
||||
const indexDiff = startIndex - i;
|
||||
if (lookupPattern[lookupPattern.length - 1 - indexDiff] !== token || (this._patternMaxLength > 0 && indexDiff >= this._patternMaxLength)) {
|
||||
checkIndexes.splice(j, 1);
|
||||
if (indexDiff >= this._patternMinLength && indexDiff >= bestIndexDiff) {
|
||||
bestIndex = startIndex;
|
||||
bestIndexDiff = indexDiff;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (token === lookupPattern[lookupPattern.length - 1])
|
||||
checkIndexes.unshift(i);
|
||||
}
|
||||
for (let j = checkIndexes.length - 1; j >= 0; j--) {
|
||||
const startIndex = checkIndexes[j];
|
||||
const indexDiff = startIndex + 1;
|
||||
checkIndexes.splice(j, 1);
|
||||
if (indexDiff >= this._patternMinLength && indexDiff >= bestIndexDiff) {
|
||||
bestIndex = startIndex;
|
||||
bestIndexDiff = indexDiff;
|
||||
}
|
||||
}
|
||||
if (bestIndex >= 0)
|
||||
return [bestIndex - (bestIndexDiff - 1), bestIndexDiff];
|
||||
return [];
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=InputLookupTokenPredictor.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
458
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.d.ts
generated
vendored
Normal file
458
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.d.ts
generated
vendored
Normal file
@@ -0,0 +1,458 @@
|
||||
import { PickOptions } from "../../utils/utilTypes.js";
|
||||
import type { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
|
||||
import type { TokenBias } from "../TokenBias.js";
|
||||
import type { Token } from "../../types.js";
|
||||
import type { LlamaContextSequence } from "./LlamaContext.js";
|
||||
export type LlamaContextOptions = {
|
||||
/**
|
||||
* number of sequences for the context.
|
||||
* Each sequence is a different "text generation process" that can run in parallel to other sequences in the same context.
|
||||
* Although a single context has multiple sequences, the sequences are separate from each other and do not share data with each other.
|
||||
* This is beneficial for performance, as multiple sequences can be evaluated in parallel (on the same batch).
|
||||
*
|
||||
* Each sequence increases the memory usage of the context.
|
||||
*
|
||||
* Defaults to `1`.
|
||||
*/
|
||||
sequences?: number;
|
||||
/**
|
||||
* The number of tokens the model can see at once.
|
||||
* - **`"auto"`** - adapt to the current VRAM state and attemp to set the context size as high as possible up to the size
|
||||
* the model was trained on.
|
||||
* - **`number`** - set the context size to a specific number of tokens.
|
||||
* If there's not enough VRAM, an error will be thrown.
|
||||
* Use with caution.
|
||||
* - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attemp to set the context size as high as possible
|
||||
* up to the size the model was trained on, but at least `min` and at most `max`.
|
||||
*
|
||||
* The actual context size may be slightly larger than your request (by up to 256) due to the implementation in `llama.cpp` that
|
||||
* aligns the context size to multiples of 256 for performance reasons.
|
||||
* To check the actual context size that gets created, use the `.contextSize` property
|
||||
* of the created context instance or any of its sequences.
|
||||
*
|
||||
* Defaults to `"auto"`.
|
||||
*/
|
||||
contextSize?: "auto" | number | {
|
||||
min?: number;
|
||||
max?: number;
|
||||
};
|
||||
/**
|
||||
* The number of tokens that can be processed at once by the GPU.
|
||||
*
|
||||
* Defaults to `512` or `contextSize` if `contextSize` is less than `512`.
|
||||
*/
|
||||
batchSize?: number;
|
||||
/**
|
||||
* Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory.
|
||||
*
|
||||
* The support for flash attention is currently experimental and may not always work as expected.
|
||||
* Use with caution.
|
||||
*
|
||||
* This option will be ignored if flash attention is not supported by the model.
|
||||
*
|
||||
* Defaults to `false` (inherited from the model option `defaultContextFlashAttention`).
|
||||
*
|
||||
* Upon flash attention exiting the experimental status, the default value will become `true`
|
||||
* (the inherited value from the model option `defaultContextFlashAttention` will become `true`).
|
||||
*/
|
||||
flashAttention?: boolean;
|
||||
/**
|
||||
* number of threads to use to evaluate tokens.
|
||||
* set to 0 to use the maximum threads supported by the current machine hardware.
|
||||
*
|
||||
* This value is considered as a hint, and the actual number of threads used may be lower when other evaluations are running.
|
||||
* To ensure the minimum number of threads you want to use are always used,
|
||||
* set this to an object with a `min` property (see the `min` property description for more details).
|
||||
*
|
||||
* If `maxThreads` from the Llama instance is set to `0`, this value will always be the actual number of threads used.
|
||||
*
|
||||
* If `maxThreads` from the Llama instance is set to `0`, defaults to the `.cpuMathCores` value from the Llama instance,
|
||||
* otherwise defaults to `maxThreads` from the Llama instance (see the `maxThreads` option of `getLlama` method for more details).
|
||||
*/
|
||||
threads?: number | {
|
||||
/**
|
||||
* The ideal number of threads to use for evaluations.
|
||||
*
|
||||
* If other evaluations are running, the actual number of threads may be lower than this value.
|
||||
*
|
||||
* If `maxThreads` from the Llama instance is set to `0`, this value will always be the actual number of threads used.
|
||||
*
|
||||
* If `maxThreads` from the Llama instance is set to `0`, defaults to the `.cpuMathCores` value from the Llama instance,
|
||||
* otherwise defaults to `maxThreads` from the Llama instance (see the `maxThreads` option of `getLlama` method for more details).
|
||||
*/
|
||||
ideal?: number;
|
||||
/**
|
||||
* Ensure evaluations always use at least this number of threads.
|
||||
*
|
||||
* Use with caution, since setting this value too high can lead to the context waiting too much time
|
||||
* to reserve this number of threads before the evaluation can start.
|
||||
*/
|
||||
min?: number;
|
||||
};
|
||||
/**
|
||||
* Control the parallel sequences processing behavior.
|
||||
*
|
||||
* See {@link BatchingOptions} for more information.
|
||||
*/
|
||||
batching?: BatchingOptions;
|
||||
/**
|
||||
* When using SWA (Sliding Window Attention) on a supported model,
|
||||
* extend the sliding window size to the current context size (meaning practically disabling SWA).
|
||||
*
|
||||
* Enabling this option will consume more memory on models that support SWA (Sliding Window Attention),
|
||||
* but will allow reusing the evaluation cache of any prefix length of the context sequence state
|
||||
* (instead of just the size of the sliding window when SWA is used).
|
||||
*
|
||||
* This option has no effect on models that do not support SWA (Sliding Window Attention).
|
||||
*
|
||||
* > **Note:** you can check the SWA size using `model.fileInsights.swaSize`.
|
||||
*
|
||||
* Defaults to `false` (inherited from the model option `defaultContextSwaFullCache`);
|
||||
*/
|
||||
swaFullCache?: boolean;
|
||||
/**
|
||||
* Load the provided LoRA adapters onto the context.
|
||||
* LoRA adapters are used to modify the weights of a pretrained model to adapt to new tasks or domains
|
||||
* without the need for extensive retraining from scratch.
|
||||
*
|
||||
* If a string is provided, it will be treated as a path to a single LoRA adapter file.
|
||||
*
|
||||
* The adapters will be released from memory once the model (not just the context) is disposed.
|
||||
*/
|
||||
lora?: string | {
|
||||
adapters: Array<{
|
||||
filePath: string;
|
||||
/**
|
||||
* Defaults to `1`
|
||||
*/
|
||||
scale?: number;
|
||||
}>;
|
||||
/**
|
||||
* Called with the LoRA adapters load percentage when the LoRA adapters are being loaded.
|
||||
* @param loadProgress - a number between 0 (exclusive) and 1 (inclusive).
|
||||
*/
|
||||
onLoadProgress?(loadProgress: number): void;
|
||||
};
|
||||
/** An abort signal to abort the context creation */
|
||||
createSignal?: AbortSignal;
|
||||
/**
|
||||
* Ignore insufficient memory errors and continue with the context creation.
|
||||
* Can cause the process to crash if there's not enough VRAM for the new context.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
ignoreMemorySafetyChecks?: boolean;
|
||||
/**
|
||||
* On failed context creation, retry the creation with a smaller context size.
|
||||
*
|
||||
* Only works if `contextSize` is set to `"auto"`, left as default or set to an object with `min` and/or `max` properties.
|
||||
*
|
||||
* Set `retries` to `false` to disable.
|
||||
*/
|
||||
failedCreationRemedy?: false | {
|
||||
/**
|
||||
* Retries to attempt to create the context.
|
||||
*
|
||||
* Defaults to `6`.
|
||||
*/
|
||||
retries?: number;
|
||||
/**
|
||||
* The percentage to decrease the context size by on each retry.
|
||||
* Should be a number between `0` and `1`.
|
||||
*
|
||||
* If a function is provided, it will be called with the current context size and should return the new context size.
|
||||
*
|
||||
* Defaults to `0.16`.
|
||||
*/
|
||||
autoContextSizeShrink?: number | ((contextSize: number) => number);
|
||||
};
|
||||
/**
|
||||
* Track the inference performance of the context, so using `.printTimings()` will work.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
performanceTracking?: boolean;
|
||||
};
|
||||
export type LlamaContextSequenceRepeatPenalty = {
|
||||
/** Tokens to lower the predication probability of to be the next predicted token */
|
||||
punishTokens: Token[] | (() => Token[]);
|
||||
/**
|
||||
* The maximum number of tokens that will be provided in the `punishTokens` array.
|
||||
*
|
||||
* This is used as a hint for a performance optimization for avoiding frequent memory deallocation and reallocation.
|
||||
*
|
||||
* Don't set this value too high, as it can allocate too much memory.
|
||||
*
|
||||
* Defaults to `64`.
|
||||
*/
|
||||
maxPunishTokens?: number;
|
||||
/**
|
||||
* The relative amount to lower the probability of the tokens in `punishTokens` by.
|
||||
*
|
||||
* Defaults to `1.1`.
|
||||
* Set to `1` to disable.
|
||||
*/
|
||||
penalty?: number;
|
||||
/**
|
||||
* For n time a token is in the `punishTokens` array, lower its probability by `n * frequencyPenalty`.
|
||||
*
|
||||
* Disabled by default (`0`).
|
||||
* Set to a value between `0` and `1` to enable.
|
||||
*/
|
||||
frequencyPenalty?: number;
|
||||
/**
|
||||
* Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`.
|
||||
*
|
||||
* Disabled by default (`0`).
|
||||
* Set to a value between `0` and `1` to enable.
|
||||
*/
|
||||
presencePenalty?: number;
|
||||
};
|
||||
export type BatchingOptions = {
|
||||
/**
|
||||
* The strategy used to dispatch items to be processed when there are items pending to be processed.
|
||||
* - **`"nextCycle"`** - dispatch the items on the next event loop cycle.
|
||||
* You can provide a custom function to define a custom dispatch schedule.
|
||||
*
|
||||
* Defaults to `"nextCycle"`.
|
||||
*/
|
||||
dispatchSchedule?: "nextCycle" | CustomBatchingDispatchSchedule;
|
||||
/**
|
||||
* The strategy used to prioritize pending items to be processed.
|
||||
* - **`"maximumParallelism"`** - process as many different sequences in parallel as possible.
|
||||
* - **`"firstInFirstOut"`** - process items in the order they were added.
|
||||
* - **Custom prioritization function** - a custom function that prioritizes the items to be processed.
|
||||
* See the {@link CustomBatchingPrioritizationStrategy} type for more information.
|
||||
*
|
||||
* Defaults to `"maximumParallelism"`.
|
||||
*/
|
||||
itemPrioritizationStrategy?: "maximumParallelism" | "firstInFirstOut" | CustomBatchingPrioritizationStrategy;
|
||||
};
|
||||
/**
|
||||
* A function that schedules the dispatch of the batch items.
|
||||
* Call the `dispatch` function to dispatch the items.
|
||||
*/
|
||||
export type CustomBatchingDispatchSchedule = (dispatch: () => void) => void;
|
||||
/**
|
||||
* A function that prioritizes the batch items to be processed.
|
||||
* The function receives an array of `items` and the `size` of how many tokens can be processed in this batch.
|
||||
*
|
||||
* The function should return an array of prioritized items,
|
||||
* where the sum of `processAmount` of all the items is less or equal to the given `size` that the function received,
|
||||
* and where the `item` of each prioritized item is the same reference to an original item in the `items` array.
|
||||
*/
|
||||
export type CustomBatchingPrioritizationStrategy = (options: {
|
||||
items: readonly BatchItem[];
|
||||
size: number;
|
||||
}) => PrioritizedBatchItem[];
|
||||
export type ContextShiftOptions = {
|
||||
size?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
|
||||
strategy?: "eraseBeginning" | ((options: {
|
||||
sequence: LlamaContextSequence;
|
||||
size: number;
|
||||
}) => ContextTokensDeleteRange[] | Promise<ContextTokensDeleteRange[]>);
|
||||
};
|
||||
export type ContextTokensDeleteRange = {
|
||||
start: number;
|
||||
end: number;
|
||||
};
|
||||
export type SequenceEvaluateOptions = {
|
||||
temperature?: number;
|
||||
minP?: number;
|
||||
topK?: number;
|
||||
topP?: number;
|
||||
/**
|
||||
* Used to control the randomness of the generated text.
|
||||
*
|
||||
* Change the seed to get different results.
|
||||
*
|
||||
* Defaults to the current epoch time.
|
||||
*
|
||||
* Only relevant when using `temperature`.
|
||||
*/
|
||||
seed?: number;
|
||||
grammarEvaluationState?: LlamaGrammarEvaluationState | (() => LlamaGrammarEvaluationState | undefined);
|
||||
repeatPenalty?: LlamaContextSequenceRepeatPenalty;
|
||||
/**
|
||||
* Adjust the probability of tokens being generated.
|
||||
* Can be used to bias the model to generate tokens that you want it to lean towards,
|
||||
* or to avoid generating tokens that you want it to avoid.
|
||||
*/
|
||||
tokenBias?: TokenBias | (() => TokenBias);
|
||||
/**
|
||||
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
|
||||
* evaluated based on the strategy chosen for the context.
|
||||
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
|
||||
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
|
||||
* highest evaluation priority.
|
||||
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
|
||||
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
|
||||
*/
|
||||
evaluationPriority?: EvaluationPriority;
|
||||
/**
|
||||
* Override the sequence context shift options for this evaluation
|
||||
*
|
||||
* See {@link ContextShiftOptions} for more information.
|
||||
*/
|
||||
contextShift?: ContextShiftOptions;
|
||||
/**
|
||||
* Yield an EOG (End Of Generation) token (like EOS and EOT) when it's generated.
|
||||
* When `false` the generation will stop when an EOG token is generated and the token won't be yielded.
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
yieldEogToken?: boolean;
|
||||
};
|
||||
export type SequenceEvaluateMetadataOptions = {
|
||||
/**
|
||||
* Get the confidence (probability) of the selected token.
|
||||
*
|
||||
* Same as `probabilities.get(token)` from the output.
|
||||
*
|
||||
* If you need only this value, you can skip getting the full probabilities list to improve performance.
|
||||
*
|
||||
* This value might be slightly different when evaluated on different GPUs and configurations.
|
||||
*/
|
||||
readonly confidence?: boolean;
|
||||
/**
|
||||
* Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
|
||||
*
|
||||
* Only enable when needed, as it impacts the performance.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
readonly probabilities?: boolean;
|
||||
};
|
||||
export type SequenceEvaluateOutput<Options extends {
|
||||
readonly confidence?: boolean;
|
||||
readonly probabilities?: boolean;
|
||||
} = {
|
||||
readonly confidence: true;
|
||||
readonly probabilities: true;
|
||||
}> = PickOptions<{
|
||||
/**
|
||||
* The next token generated by the model and selected using the given options (such a temperature).
|
||||
*/
|
||||
token: Token;
|
||||
/**
|
||||
* The confidence (probability) of the selected token.
|
||||
*
|
||||
* Same as `probabilities.get(token)`.
|
||||
*
|
||||
* If you need only this value, you can skip getting the full probabilities list to improve performance.
|
||||
*
|
||||
* This value might be slightly different when evaluated on different GPUs and configurations.
|
||||
*/
|
||||
confidence: number;
|
||||
/**
|
||||
* The probabilities of the tokens from the vocabulary to be the next token.
|
||||
*
|
||||
* A probability is a number from `0` to `1`.
|
||||
*
|
||||
* The probabilities might be slightly different when evaluated on different GPUs and configurations.
|
||||
*
|
||||
* The map is sorted by the probability of the tokens from the highest to the lowest,
|
||||
* and is reflected in the order of the entries when iterating over the map.
|
||||
* Use `.entries().next().value` to get the top probability pair
|
||||
* ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
|
||||
*/
|
||||
probabilities: Map<Token, number>;
|
||||
}, Options & {
|
||||
token: true;
|
||||
}>;
|
||||
export type ControlledEvaluateInputItem = Token | [
|
||||
token: Token,
|
||||
options: {
|
||||
generateNext?: {
|
||||
/**
|
||||
* Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
|
||||
*
|
||||
* Only enable when needed, as it impacts the performance.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
probabilities?: boolean;
|
||||
/**
|
||||
* Get the confidence (probability) of the selected token.
|
||||
*
|
||||
* Same as `next.probabilities.get(next.token)` from the output.
|
||||
*
|
||||
* If you need only this value, you can skip getting the full probabilities list to improve performance.
|
||||
*
|
||||
* This value might be slightly different when evaluated on different GPUs and configurations.
|
||||
*/
|
||||
confidence?: boolean;
|
||||
/**
|
||||
* Generate the next token with the provided options using sampling.
|
||||
*
|
||||
* Setting this to `true` will generate probabilities for the next token and sample it.
|
||||
*/
|
||||
token?: boolean;
|
||||
options?: {
|
||||
temperature?: number;
|
||||
minP?: number;
|
||||
topK?: number;
|
||||
topP?: number;
|
||||
/**
|
||||
* Used to control the randomness of the generated text.
|
||||
*
|
||||
* Change the seed to get different results.
|
||||
*
|
||||
* Defaults to the current epoch time.
|
||||
*
|
||||
* Only relevant when using `temperature`.
|
||||
*/
|
||||
seed?: number;
|
||||
repeatPenalty?: LlamaContextSequenceRepeatPenalty;
|
||||
/**
|
||||
* Adjust the probability of tokens being generated.
|
||||
* Can be used to bias the model to generate tokens that you want it to lean towards,
|
||||
* or to avoid generating tokens that you want it to avoid.
|
||||
*/
|
||||
tokenBias?: TokenBias | (() => TokenBias);
|
||||
};
|
||||
};
|
||||
}
|
||||
];
|
||||
/**
 * Output for a single evaluated index of a controlled evaluation.
 */
export type ControlledEvaluateIndexOutput = {
    /** Results requested (via `ControlledEvaluateInputItem` options) for the token following this index. */
    next: {
        // NOTE(review): `null` presumably means no token was sampled for this position — confirm against the implementation
        token?: Token | null;
        /**
         * The confidence (probability) of the selected token (the `token` field in this object).
         *
         * Same as `next.probabilities.get(next.token)`.
         *
         * If you need only this value, you can skip getting the full probabilities list to improve performance.
         *
         * This value might be slightly different when evaluated on different GPUs and configurations.
         */
        confidence?: number;
        /**
         * The probabilities of the tokens from the vocabulary to be the next token.
         *
         * A probability is a number from `0` to `1`.
         *
         * The probabilities might be slightly different when evaluated on different GPUs and configurations.
         *
         * The map is sorted by the probability of the tokens from the highest to the lowest,
         * and is reflected in the order of the entries when iterating over the map.
         * Use `.entries().next().value` to get the top probability pair
         * ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
         */
        probabilities?: Map<Token, number>;
    };
};
|
||||
/**
 * Priority of a batch item when dispatching evaluations.
 *
 * 1 - low
 *
 * 5 - high
 */
export type EvaluationPriority = 1 | 2 | 3 | 4 | 5;
|
||||
/** A unit of work queued for batched evaluation. */
export type BatchItem = {
    /** Tokens to evaluate for this item. */
    readonly tokens: readonly Token[];
    // NOTE(review): `true` presumably marks positions whose logits output is needed — confirm against the evaluator
    readonly logits: readonly (true | undefined)[];
    /** 1 (low) to 5 (high); higher-priority items are served first by the prioritization strategies. */
    readonly evaluationPriority: EvaluationPriority;
};
|
||||
/** A batch item together with the portion of it selected for the current batch. */
export type PrioritizedBatchItem = {
    item: BatchItem;
    /** How many of `item.tokens` were granted a slot in the current batch. */
    processAmount: number;
};
|
||||
2
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.js
generated
vendored
Normal file
2
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.js
generated
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
// Type-only module: the empty export marks this compiled file as an ES module.
export {};
//# sourceMappingURL=types.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/types.ts"],"names":[],"mappings":""}
|
||||
5
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.d.ts
generated
vendored
Normal file
5
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.d.ts
generated
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
import { BatchItem, PrioritizedBatchItem } from "../../types.js";
/**
 * Batching prioritization strategy: serves items in order of
 * `evaluationPriority` (highest first), granting each item as many of the
 * remaining `size` tokens as it needs before moving on to the next item.
 */
export declare function firstInFirstOutStrategy({ items, size }: {
    items: readonly BatchItem[];
    size: number;
}): PrioritizedBatchItem[];
|
||||
16
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js
generated
vendored
Normal file
16
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js
generated
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
/**
 * Batching prioritization strategy: serves items in order of
 * `evaluationPriority` (highest first), giving each item as many of the
 * remaining `size` tokens as it needs before moving on to the next one.
 */
export function firstInFirstOutStrategy({ items, size }) {
    // Copy before sorting so the caller's array is not mutated.
    const byPriority = [...items]
        .sort((first, second) => second.evaluationPriority - first.evaluationPriority);

    const result = [];
    let remainingTokens = size;

    for (const item of byPriority) {
        // Grant the item as many of its tokens as still fit in the batch.
        const grantedTokens = Math.min(item.tokens.length, remainingTokens);
        result.push({ item, processAmount: grantedTokens });
        remainingTokens -= grantedTokens;

        if (remainingTokens === 0)
            break;
    }

    return result;
}
|
||||
//# sourceMappingURL=firstInFirstOutStrategy.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"firstInFirstOutStrategy.js","sourceRoot":"","sources":["../../../../../src/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,uBAAuB,CAAC,EAAC,KAAK,EAAE,IAAI,EAA8C;IAC9F,MAAM,GAAG,GAA2B,EAAE,CAAC;IAEvC,MAAM,WAAW,GAAG,KAAK;SACpB,KAAK,EAAE;SACP,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,kBAAkB,GAAG,CAAC,CAAC,kBAAkB,CAAC,CAAC;IAEjE,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;QAC7B,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;QACnE,GAAG,CAAC,IAAI,CAAC,EAAC,IAAI,EAAE,aAAa,EAAC,CAAC,CAAC;QAChC,cAAc,IAAI,aAAa,CAAC;QAEhC,IAAI,cAAc,KAAK,CAAC;YACpB,MAAM;IACd,CAAC;IAED,OAAO,GAAG,CAAC;AACf,CAAC"}
|
||||
5
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.d.ts
generated
vendored
Normal file
5
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.d.ts
generated
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
import { BatchItem, PrioritizedBatchItem } from "../../types.js";
/**
 * Batching prioritization strategy: splits the `size` token budget as evenly
 * as possible across all items so every sequence makes progress in the same
 * batch, then redistributes leftover budget to clipped items (favoring higher
 * `evaluationPriority` for the final remainder).
 */
export declare function maximumParallelismStrategy({ items, size }: {
    items: readonly BatchItem[];
    size: number;
}): PrioritizedBatchItem[];
|
||||
42
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js
generated
vendored
Normal file
42
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js
generated
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
/**
 * Batching prioritization strategy that splits the batch token budget (`size`)
 * as evenly as possible across all items, so every sequence makes progress in
 * the same batch. Leftover budget is then redistributed to items that were
 * clipped, with the final remainder going to the highest-`evaluationPriority`
 * items first.
 */
export function maximumParallelismStrategy({ items, size }) {
    let leftFreeTokens = size;
    // Equal share of the budget for every item in the first pass.
    const minTokensForEachItem = Math.floor(leftFreeTokens / items.length);
    const res = [];
    // Items that received fewer tokens than they have and can absorb more.
    const clippedItems = [];
    for (const item of items) {
        const processAmount = Math.min(item.tokens.length, leftFreeTokens, minTokensForEachItem);
        const prioritizeItem = { item, processAmount };
        res.push(prioritizeItem);
        leftFreeTokens -= processAmount;
        if (processAmount < item.tokens.length)
            clippedItems.push(prioritizeItem);
        if (leftFreeTokens === 0)
            break;
    }
    // Up to 3 even redistribution passes: hand each clipped item an equal share
    // of the remaining budget; fully satisfied items drop out of `clippedItems`.
    for (let passesLeft = 3; leftFreeTokens > 0 && clippedItems.length > 0 && passesLeft > 0; passesLeft--) {
        const minIncreaseAmount = Math.ceil(leftFreeTokens / clippedItems.length);
        for (let i = 0; i < clippedItems.length && leftFreeTokens > 0; i++) {
            const prioritizeItem = clippedItems[i];
            const unprocessedAmount = prioritizeItem.item.tokens.length - prioritizeItem.processAmount;
            const increaseAmount = Math.min(unprocessedAmount, leftFreeTokens, minIncreaseAmount);
            prioritizeItem.processAmount += increaseAmount;
            // Fix: consume the granted tokens from the budget. Without this,
            // the loop guards on `leftFreeTokens` never trip and the strategy
            // can allocate more than `size` tokens in total.
            leftFreeTokens -= increaseAmount;
            if (increaseAmount === unprocessedAmount) {
                clippedItems.splice(i, 1);
                i--; // compensate the index for the removal
            }
        }
    }
    // Final greedy pass: whatever budget is left goes to the highest-priority
    // clipped items first.
    clippedItems.sort((a, b) => b.item.evaluationPriority - a.item.evaluationPriority);
    for (let i = 0; i < clippedItems.length && leftFreeTokens > 0; i++) {
        const prioritizeItem = clippedItems[i];
        const unprocessedAmount = prioritizeItem.item.tokens.length - prioritizeItem.processAmount;
        const increaseAmount = Math.min(unprocessedAmount, leftFreeTokens);
        prioritizeItem.processAmount += increaseAmount;
        // Fix: consume the granted tokens from the budget here as well.
        leftFreeTokens -= increaseAmount;
        if (increaseAmount === unprocessedAmount) {
            clippedItems.splice(i, 1);
            i--;
        }
    }
    return res;
}
|
||||
//# sourceMappingURL=maximumParallelismStrategy.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"maximumParallelismStrategy.js","sourceRoot":"","sources":["../../../../../src/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,0BAA0B,CAAC,EAAC,KAAK,EAAE,IAAI,EAA8C;IACjG,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,MAAM,oBAAoB,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;IAEvE,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,MAAM,YAAY,GAA2B,EAAE,CAAC;IAEhD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACvB,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,cAAc,EAAE,oBAAoB,CAAC,CAAC;QACzF,MAAM,cAAc,GAAG,EAAC,IAAI,EAAE,aAAa,EAAC,CAAC;QAE7C,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QACzB,cAAc,IAAI,aAAa,CAAC;QAEhC,IAAI,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM;YAClC,YAAY,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAEtC,IAAI,cAAc,KAAK,CAAC;YACpB,MAAM;IACd,CAAC;IAED,KAAK,IAAI,UAAU,GAAG,CAAC,EAAE,cAAc,GAAG,CAAC,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,UAAU,GAAG,CAAC,EAAE,UAAU,EAAE,EAAE,CAAC;QACrG,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC;QAE1E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACjE,MAAM,cAAc,GAAG,YAAY,CAAC,CAAC,CAAE,CAAC;YACxC,MAAM,iBAAiB,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,cAAc,CAAC,aAAa,CAAC;YAC3F,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,iBAAiB,EAAE,cAAc,EAAE,iBAAiB,CAAC,CAAC;YACtF,cAAc,CAAC,aAAa,IAAI,cAAc,CAAC;YAE/C,IAAI,cAAc,KAAK,iBAAiB,EAAE,CAAC;gBACvC,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;gBAC1B,CAAC,EAAE,CAAC;YACR,CAAC;QACL,CAAC;IACL,CAAC;IAED,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,kBAAkB,GAAG,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;IAEnF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACjE,MAAM,cAAc,GAAG,YAAY,CAAC,CAAC,CAAE,CAAC;QACxC,MAAM,iBAAiB,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,cAAc,CAAC,aAAa,CAAC;QAC3F,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,iBAAiB,EAAE,cAAc,CAAC,CAAC;QACnE,cAAc,CAAC,aAAa,IAAI,cAAc,CAAC;Q
AE/C,IAAI,cAAc,KAAK,iBAAiB,EAAE,CAAC;YACvC,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAC1B,CAAC,EAAE,CAAC;QACR,CAAC;IACL,CAAC;IAED,OAAO,GAAG,CAAC;AACf,CAAC"}
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.d.ts
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.d.ts
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/**
 * Rounds `value` to a multiple of `padding` in the given `padDirection`
 * ("down" never goes below one padding unit; in that case it rounds up).
 * Uses bit-mask rounding, so `padding` is assumed to be a power of two.
 */
export declare function padSafeContextSize(value: number, padDirection: "up" | "down", padding?: number): number;
|
||||
18
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.js
generated
vendored
Normal file
18
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.js
generated
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
import { contextSizePad } from "../../../config.js";
|
||||
/**
 * Rounds `value` to a multiple of `padding` in the given `padDirection`.
 * Rounding "down" never produces a result below one padding unit; when it
 * would, the value is rounded up instead.
 */
export function padSafeContextSize(value, padDirection, padding = contextSizePad) {
    const roundedUp = ggmlPad(value, padding);

    // Already aligned — nothing to do.
    if (roundedUp === value)
        return value;

    if (padDirection === "down") {
        // Stepping one padding unit back and rounding up yields the nearest
        // aligned size below `value`; only usable if it is still at least
        // one padding unit large.
        const roundedDown = ggmlPad(value - padding, padding);
        if (roundedDown >= padding)
            return roundedDown;
    }

    // "up" direction, or "down" could not produce a usable smaller size.
    return roundedUp;
}
// Round `value` up to the next multiple of `padding`.
// Bit-mask trick: assumes `padding` is a power of two.
function ggmlPad(value, padding) {
    return ((value + padding - 1) & ~(padding - 1));
}
|
||||
//# sourceMappingURL=padSafeContextSize.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"padSafeContextSize.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaContext/utils/padSafeContextSize.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,cAAc,EAAC,MAAM,oBAAoB,CAAC;AAElD,MAAM,UAAU,kBAAkB,CAAC,KAAa,EAAE,YAA2B,EAAE,UAAkB,cAAc;IAC3G,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAE3C,IAAI,UAAU,KAAK,KAAK;QACpB,OAAO,KAAK,CAAC;SACZ,IAAI,YAAY,KAAK,IAAI;QAC1B,OAAO,UAAU,CAAC;SACjB,IAAI,YAAY,KAAK,MAAM,EAAE,CAAC;QAC/B,MAAM,iBAAiB,GAAG,OAAO,CAAC,KAAK,GAAG,OAAO,EAAE,OAAO,CAAC,CAAC;QAC5D,IAAI,iBAAiB,IAAI,OAAO;YAC5B,OAAO,iBAAiB,CAAC;IACjC,CAAC;IAED,OAAO,UAAU,CAAC;AACtB,CAAC;AACD,SAAS,OAAO,CAAC,KAAa,EAAE,OAAe;IAC3C,OAAO,CAAC,CAAC,KAAK,GAAG,OAAO,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC;AACpD,CAAC"}
|
||||
2
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.d.ts
generated
vendored
Normal file
2
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.d.ts
generated
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
import { BatchingOptions } from "../types.js";
/**
 * Resolves a batching prioritization strategy — either a custom strategy
 * function (returned as-is) or a built-in strategy name — to the concrete
 * strategy function to use.
 * @throws when the strategy name is not recognized
 */
export declare function resolveBatchItemsPrioritizationStrategy(strategy: Required<BatchingOptions>["itemPrioritizationStrategy"]): import("../types.js").CustomBatchingPrioritizationStrategy;
|
||||
13
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js
generated
vendored
Normal file
13
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js
generated
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
import { maximumParallelismStrategy } from "./batchItemsPrioritizationStrategies/maximumParallelismStrategy.js";
|
||||
import { firstInFirstOutStrategy } from "./batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js";
|
||||
/**
 * Resolves a batching prioritization strategy — a custom strategy function
 * (returned as-is) or a built-in strategy name — to the concrete strategy
 * function to use. Throws for unrecognized strategy names.
 */
export function resolveBatchItemsPrioritizationStrategy(strategy) {
    // A custom strategy function is used directly.
    if (strategy instanceof Function)
        return strategy;

    switch (strategy) {
        case "maximumParallelism":
            return maximumParallelismStrategy;
        case "firstInFirstOut":
            return firstInFirstOutStrategy;
    }

    void strategy;
    throw new Error(`Unknown batch items prioritize strategy: ${strategy}`);
}
|
||||
//# sourceMappingURL=resolveBatchItemsPrioritizationStrategy.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"resolveBatchItemsPrioritizationStrategy.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,0BAA0B,EAAC,MAAM,oEAAoE,CAAC;AAC9G,OAAO,EAAC,uBAAuB,EAAC,MAAM,iEAAiE,CAAC;AAExG,MAAM,UAAU,uCAAuC,CAAC,QAAiE;IACrH,IAAI,QAAQ,YAAY,QAAQ;QAC5B,OAAO,QAAQ,CAAC;SACf,IAAI,QAAQ,KAAK,oBAAoB;QACtC,OAAO,0BAA0B,CAAC;SACjC,IAAI,QAAQ,KAAK,iBAAiB;QACnC,OAAO,uBAAuB,CAAC;IAEnC,KAAM,QAAyB,CAAC;IAEhC,MAAM,IAAI,KAAK,CAAC,4CAA4C,QAAQ,EAAE,CAAC,CAAC;AAC5E,CAAC"}
|
||||
Reference in New Issue
Block a user