First upload version 0.0.1

node_modules/node-llama-cpp/dist/evaluator/LlamaChat/LlamaChat.d.ts (generated, vendored, new file, 459 lines added)
@@ -0,0 +1,459 @@
import { EventRelay } from "lifecycle-utils";
import { ChatWrapper } from "../../ChatWrapper.js";
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
import { ChatHistoryItem, ChatModelFunctions, ChatModelSegmentType, LLamaContextualRepeatPenalty, Token, Tokenizer } from "../../types.js";
import { GbnfJsonSchemaToType } from "../../utils/gbnfJson/types.js";
import { LlamaGrammar } from "../LlamaGrammar.js";
import { LlamaText, LlamaTextJSON } from "../../utils/LlamaText.js";
import { EvaluationPriority } from "../LlamaContext/types.js";
import { TokenBias } from "../TokenBias.js";
import { LlamaModel } from "../LlamaModel/LlamaModel.js";
export type LlamaChatOptions = {
    contextSequence: LlamaContextSequence;
    /** `"auto"` is used by default */
    chatWrapper?: "auto" | ChatWrapper;
    /**
     * Automatically dispose the sequence when the session is disposed
     *
     * Defaults to `false`.
     */
    autoDisposeSequence?: boolean;
};
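/*
 * Usage sketch (illustrative only, not part of the generated declarations).
 * It assumes the package's top-level `getLlama()`, `llama.loadModel()`,
 * `model.createContext()` and `context.getSequence()` helpers and a local GGUF model path:
 *
 *     import {getLlama, LlamaChat} from "node-llama-cpp";
 *
 *     const llama = await getLlama();
 *     const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
 *     const context = await model.createContext();
 *
 *     // LlamaChat wraps a context sequence; `chatWrapper` defaults to "auto"
 *     const chat = new LlamaChat({
 *         contextSequence: context.getSequence(),
 *         autoDisposeSequence: true
 *     });
 */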
export type LlamaChatResponseChunk = LlamaChatResponseTextChunk | LlamaChatResponseSegmentChunk;
export type LlamaChatResponseTextChunk = {
    /** When `type` is `undefined`, the chunk is part of the main response and is not a segment */
    type: undefined;
    /**
     * `segmentType` has no purpose when `type` is `undefined` (meaning that this chunk is part of the main response and is not a segment).
     */
    segmentType: undefined;
    /**
     * The generated text chunk.
     *
     * Detokenized from the `tokens` property,
     * but with the context of the previous generation (for better spacing of the text with some models).
     *
     * Prefer using this property over `tokens` when streaming the generated response as text.
     */
    text: string;
    /** The generated tokens */
    tokens: Token[];
};
export type LlamaChatResponseSegmentChunk = {
    type: "segment";
    /** Segment type */
    segmentType: ChatModelSegmentType;
    /**
     * The generated text chunk.
     *
     * Detokenized from the `tokens` property,
     * but with the context of the previous generation (for better spacing of the text with some models).
     *
     * Prefer using this property over `tokens` when streaming the generated response as text.
     */
    text: string;
    /** The generated tokens */
    tokens: Token[];
    /**
     * When the current chunk is the start of a segment, this field will be set.
     *
     * It's possible that a chunk with no tokens and empty text will be emitted just to set this field
     * to signify that the segment has started.
     */
    segmentStartTime?: Date;
    /**
     * When the current chunk is the last one of a segment (meaning the current segment has ended), this field will be set.
     *
     * It's possible that a chunk with no tokens and empty text will be emitted just to set this field
     * to signify that the segment has ended.
     */
    segmentEndTime?: Date;
};
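/*
 * Streaming sketch (illustrative only): routing `onResponseChunk` chunks by type.
 * It assumes a `chat` instance from the sketch above and a `history: ChatHistoryItem[]` array:
 *
 *     await chat.generateResponse(history, {
 *         onResponseChunk(chunk) {
 *             if (chunk.type === "segment") {
 *                 // segment text (e.g. a thought segment), with optional start/end markers
 *                 if (chunk.segmentStartTime != null)
 *                     console.log(`[${chunk.segmentType} started]`);
 *
 *                 process.stdout.write(chunk.text);
 *
 *                 if (chunk.segmentEndTime != null)
 *                     console.log(`[${chunk.segmentType} ended]`);
 *             } else {
 *                 // main response text
 *                 process.stdout.write(chunk.text);
 *             }
 *         }
 *     });
 */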
export type LlamaChatResponseFunctionCallParamsChunk = {
    /**
     * Each different function call has a different `callIndex`.
     *
     * When the previous function call has finished being generated, the `callIndex` of the next one will increment.
     *
     * Use this value to distinguish between different function calls.
     */
    callIndex: number;
    /**
     * The name of the function being called
     */
    functionName: string;
    /**
     * A chunk of the generated text used for the function call parameters.
     *
     * Collect all the chunks together to construct the full function call parameters.
     *
     * After the function call is finished, the entire constructed params text can be parsed as a JSON object,
     * according to the function parameters schema.
     */
    paramsChunk: string;
    /**
     * When this is `true`, the current chunk is the last chunk in the generation of the current function call parameters.
     */
    done: boolean;
};
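/*
 * Sketch of accumulating function call parameters from `onFunctionCallParamsChunk`
 * (illustrative only; the `pendingParams` map is a local helper, not part of the API):
 *
 *     const pendingParams = new Map<number, string>();
 *
 *     const onFunctionCallParamsChunk = (chunk: LlamaChatResponseFunctionCallParamsChunk) => {
 *         const text = (pendingParams.get(chunk.callIndex) ?? "") + chunk.paramsChunk;
 *         pendingParams.set(chunk.callIndex, text);
 *
 *         if (chunk.done) {
 *             // the constructed text should now parse according to the function's params schema
 *             const params = JSON.parse(text);
 *             console.log(chunk.functionName, params);
 *         }
 *     };
 */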
export type LLamaChatGenerateResponseOptions<Functions extends ChatModelFunctions | undefined = undefined> = {
    /**
     * Called as the model generates the main response with the generated text chunk.
     *
     * Useful for streaming the generated response as it's being generated.
     *
     * Includes only the main response without any text segments (like thoughts).
     * For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
     */
    onTextChunk?: (text: string) => void;
    /**
     * Called as the model generates the main response with the generated tokens.
     *
     * Preferably, you'd want to use {@link onTextChunk `onTextChunk`} instead of this.
     *
     * Includes only the main response without any segments (like thoughts).
     * For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
     */
    onToken?: (tokens: Token[]) => void;
    /**
     * Called as the model generates a response with the generated text and tokens,
     * including segment information (when the generated output is part of a segment).
     *
     * Useful for streaming the generated response as it's being generated, including the main response and all segments.
     *
     * Only use this function when you need the segmented texts, like thought segments (chain of thought text).
     */
    onResponseChunk?: (chunk: LlamaChatResponseChunk) => void;
    /**
     * An AbortSignal to later abort the generation.
     *
     * When the signal is aborted, the generation will stop and throw `signal.reason` as the error.
     *
     * > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`.
     */
    signal?: AbortSignal;
    /**
     * When a response already started being generated and then the signal is aborted,
     * the generation will stop and the response will be returned as is instead of throwing an error.
     *
     * Defaults to `false`.
     */
    stopOnAbortSignal?: boolean;
    /** Maximum number of tokens to generate */
    maxTokens?: number;
    /**
     * Temperature is a hyperparameter that controls the randomness of the generated text.
     * It affects the probability distribution of the model's output tokens.
     *
     * A higher temperature (e.g., 1.5) makes the output more random and creative,
     * while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
     *
     * The suggested temperature is 0.8, which provides a balance between randomness and determinism.
     *
     * At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
     *
     * Set to `0` to disable.
     * Disabled by default (set to `0`).
     */
    temperature?: number;
    /**
     * From the next token candidates, discard the percentage of tokens with the lowest probability.
     * For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
     * This is useful for generating more high-quality results when using a high temperature.
     * Set to a value between `0` and `1` to enable.
     *
     * Only relevant when `temperature` is set to a value greater than `0`.
     * Disabled by default.
     */
    minP?: number;
    /**
     * Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
     * An integer number between `1` and the size of the vocabulary.
     * Set to `0` to disable (which uses the full vocabulary).
     *
     * Only relevant when `temperature` is set to a value greater than `0`.
     */
    topK?: number;
    /**
     * Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
     * and samples the next token only from this set.
     * A float number between `0` and `1`.
     * Set to `1` to disable.
     *
     * Only relevant when `temperature` is set to a value greater than `0`.
     */
    topP?: number;
    /**
     * Used to control the randomness of the generated text.
     *
     * Change the seed to get different results.
     *
     * Only relevant when using `temperature`.
     */
    seed?: number;
    /**
     * Trim whitespace from the end of the generated text
     *
     * Defaults to `false`.
     */
    trimWhitespaceSuffix?: boolean;
    repeatPenalty?: false | LLamaContextualRepeatPenalty;
    /**
     * Adjust the probability of tokens being generated.
     * Can be used to bias the model to generate tokens that you want it to lean towards,
     * or to avoid generating tokens that you want it to avoid.
     */
    tokenBias?: TokenBias | (() => TokenBias);
    /**
     * See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
     */
    evaluationPriority?: EvaluationPriority;
    contextShift?: LLamaChatContextShiftOptions;
    /**
     * Custom stop triggers to stop the generation of the response when any of the provided triggers are found.
     */
    customStopTriggers?: readonly (LlamaText | string | readonly (string | Token)[])[];
    /**
     * The evaluation context window returned from the last evaluation.
     * This is an optimization to utilize existing context sequence state better when possible.
     */
    lastEvaluationContextWindow?: {
        /** The history of the last evaluation. */
        history?: ChatHistoryItem[];
        /**
         * Minimum overlap percentage with existing context sequence state to use the last evaluation context window.
         * If the last evaluation context window is not used, a new context will be generated based on the full history,
         * which will decrease the likelihood of another context shift happening so soon.
         *
         * A number between `0` (exclusive) and `1` (inclusive).
         */
        minimumOverlapPercentageToPreventContextShift?: number;
    };
    /**
     * Called as the model generates function calls with the generated parameters chunk for each function call.
     *
     * Useful for streaming the generated function call parameters as they're being generated.
     * Only useful in specific use cases,
     * such as showing the generated textual file content as it's being generated (note that doing this requires parsing incomplete JSON).
     *
     * The constructed text from all the params chunks of a given function call can be parsed as a JSON object,
     * according to the function parameters schema.
     *
     * Each function call has its own `callIndex` you can use to distinguish between them.
     *
     * Only relevant when using function calling (via passing the `functions` option).
     */
    onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
    /**
     * Set the maximum number of tokens the model is allowed to spend on various segmented responses.
     */
    budgets?: {
        /**
         * Whether to include the tokens already consumed by the current model response being completed in the budget.
         *
         * Defaults to `true`.
         */
        includeCurrentResponse?: boolean;
        /**
         * Budget for thought tokens.
         *
         * Defaults to `Infinity`.
         */
        thoughtTokens?: number;
        /**
         * Budget for comment tokens.
         *
         * Defaults to `Infinity`.
         */
        commentTokens?: number;
    };
    /**
     * Stop the generation when the model tries to generate a non-textual segment or call a function.
     *
     * Useful for generating completions in the form of a model response.
     *
     * Defaults to `false`.
     */
    abortOnNonText?: boolean;
} & ({
    grammar?: LlamaGrammar;
    functions?: never;
    documentFunctionParams?: never;
    maxParallelFunctionCalls?: never;
    onFunctionCall?: never;
    onFunctionCallParamsChunk?: never;
} | {
    grammar?: never;
    functions?: Functions | ChatModelFunctions;
    documentFunctionParams?: boolean;
    maxParallelFunctionCalls?: number;
    onFunctionCall?: (functionCall: LlamaChatResponseFunctionCall<Functions extends ChatModelFunctions ? Functions : ChatModelFunctions>) => void;
    onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
});
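/*
 * Generation sketch (illustrative only), assuming the `chat` instance from the earlier
 * sketch and the `ChatHistoryItem` shapes from "../../types.js" (`"system"`, `"user"` and
 * `"model"` items; the trailing empty model item is an assumption about how the history
 * is expected to end):
 *
 *     const abortController = new AbortController();
 *     const history: ChatHistoryItem[] = [
 *         {type: "system", text: "You are a helpful assistant."},
 *         {type: "user", text: "Hi there!"},
 *         {type: "model", response: []}
 *     ];
 *
 *     const {response, metadata} = await chat.generateResponse(history, {
 *         temperature: 0.8,
 *         maxTokens: 256,
 *         signal: abortController.signal,
 *         stopOnAbortSignal: true, // return the partial response instead of throwing on abort
 *         onTextChunk: (text) => process.stdout.write(text)
 *     });
 *     console.log("\nstopped because:", metadata.stopReason);
 */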
export type LLamaChatLoadAndCompleteUserMessageOptions<Functions extends ChatModelFunctions | undefined = undefined> = {
    /**
     * Complete the given user prompt without adding it or the completion to the returned context window.
     */
    initialUserPrompt?: string;
    /**
     * When a completion already started being generated and then the signal is aborted,
     * the generation will stop and the completion will be returned as is instead of throwing an error.
     *
     * Defaults to `false`.
     */
    stopOnAbortSignal?: boolean;
    /**
     * Called as the model generates a completion with the generated text chunk.
     *
     * Useful for streaming the generated completion as it's being generated.
     */
    onTextChunk?: LLamaChatGenerateResponseOptions<Functions>["onTextChunk"];
    /**
     * Called as the model generates a completion with the generated tokens.
     *
     * Preferably, you'd want to use `onTextChunk` instead of this.
     */
    onToken?: LLamaChatGenerateResponseOptions<Functions>["onToken"];
    signal?: LLamaChatGenerateResponseOptions<Functions>["signal"];
    maxTokens?: LLamaChatGenerateResponseOptions<Functions>["maxTokens"];
    temperature?: LLamaChatGenerateResponseOptions<Functions>["temperature"];
    minP?: LLamaChatGenerateResponseOptions<Functions>["minP"];
    topK?: LLamaChatGenerateResponseOptions<Functions>["topK"];
    topP?: LLamaChatGenerateResponseOptions<Functions>["topP"];
    seed?: LLamaChatGenerateResponseOptions<Functions>["seed"];
    trimWhitespaceSuffix?: LLamaChatGenerateResponseOptions<Functions>["trimWhitespaceSuffix"];
    repeatPenalty?: LLamaChatGenerateResponseOptions<Functions>["repeatPenalty"];
    tokenBias?: LLamaChatGenerateResponseOptions<Functions>["tokenBias"];
    evaluationPriority?: LLamaChatGenerateResponseOptions<Functions>["evaluationPriority"];
    contextShift?: LLamaChatGenerateResponseOptions<Functions>["contextShift"];
    customStopTriggers?: LLamaChatGenerateResponseOptions<Functions>["customStopTriggers"];
    lastEvaluationContextWindow?: LLamaChatGenerateResponseOptions<Functions>["lastEvaluationContextWindow"];
    grammar?: LlamaGrammar;
    /**
     * Functions are not used by the model here,
     * but are used for keeping the instructions given to the model about the functions in the current context state,
     * to avoid context shifts.
     *
     * It's best to provide the same functions that were used for the previous prompt here.
     */
    functions?: Functions | ChatModelFunctions;
    /**
     * Functions are not used by the model here,
     * but are used for keeping the instructions given to the model about the functions in the current context state,
     * to avoid context shifts.
     *
     * It's best to provide the same value that was used for the previous prompt here.
     */
    documentFunctionParams?: boolean;
};
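/*
 * Completion sketch (illustrative only): completing a partially-typed user message
 * rather than generating a model response, using the same `chat` and `history` as above:
 *
 *     const {completion} = await chat.loadChatAndCompleteUserMessage(history, {
 *         initialUserPrompt: "Write a poem about ", // completed, but not added to the returned context window
 *         maxTokens: 40,
 *         temperature: 0.8
 *     });
 *     console.log("suggested continuation:", completion);
 */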
export type LLamaChatContextShiftOptions = {
    /**
     * The number of tokens to delete from the context window to make space for new ones.
     * Defaults to 10% of the context size.
     */
    size?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
    /**
     * The strategy to use when deleting tokens from the context window.
     *
     * Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
     */
    strategy?: "eraseFirstResponseAndKeepFirstSystem" | ((options: {
        /** Full chat history */
        chatHistory: readonly ChatHistoryItem[];
        /** Maximum number of tokens that the new chat history should fit under when tokenized */
        maxTokensCount: number;
        /** Tokenizer used to tokenize the chat history */
        tokenizer: Tokenizer;
        /** Chat wrapper used to generate the context state */
        chatWrapper: ChatWrapper;
        /**
         * The metadata returned from the last context shift strategy call.
         * Will be `null` on the first call.
         */
        lastShiftMetadata?: object | null;
    }) => {
        chatHistory: ChatHistoryItem[];
        metadata?: object | null;
    } | Promise<{
        chatHistory: ChatHistoryItem[];
        metadata?: object | null;
    }>);
    /**
     * The `contextShiftMetadata` returned from the last evaluation.
     * This is an optimization to utilize the existing context state better when possible.
     */
    lastEvaluationMetadata?: object | undefined | null;
};
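/*
 * Context shift sketch (illustrative only): a custom `strategy` that keeps system items
 * and the most recent turns. A real implementation should use `tokenizer`, `chatWrapper`
 * and `maxTokensCount` to measure the tokenized size of the trimmed history; this sketch
 * only shows the expected return shape:
 *
 *     const contextShift: LLamaChatContextShiftOptions = {
 *         strategy({chatHistory}) {
 *             const trimmed = chatHistory.filter(
 *                 (item, index) => item.type === "system" || index >= chatHistory.length - 4
 *             );
 *             return {
 *                 chatHistory: trimmed,
 *                 metadata: {removedItems: chatHistory.length - trimmed.length}
 *             };
 *         }
 *     };
 */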
export declare class LlamaChat {
    readonly onDispose: EventRelay<void>;
    constructor({ contextSequence, chatWrapper, autoDisposeSequence }: LlamaChatOptions);
    dispose({ disposeSequence }?: {
        disposeSequence?: boolean;
    }): void;
    /** @hidden */
    [Symbol.dispose](): void;
    get disposed(): boolean;
    get chatWrapper(): ChatWrapper;
    get sequence(): LlamaContextSequence;
    get context(): import("../LlamaContext/LlamaContext.js").LlamaContext;
    get model(): LlamaModel;
    generateResponse<const Functions extends ChatModelFunctions | undefined = undefined>(history: ChatHistoryItem[], options?: LLamaChatGenerateResponseOptions<Functions>): Promise<LlamaChatResponse<Functions>>;
    loadChatAndCompleteUserMessage<const Functions extends ChatModelFunctions | undefined = undefined>(history: ChatHistoryItem[], options?: LLamaChatLoadAndCompleteUserMessageOptions<Functions>): Promise<LlamaChatLoadAndCompleteUserResponse>;
}
export type LlamaChatResponse<Functions extends ChatModelFunctions | undefined = undefined> = {
    /**
     * The response text only, _without_ any text segments (like thoughts).
     */
    response: string;
    /**
     * The full response, including all text and text segments (like thoughts).
     */
    fullResponse: Array<string | LlamaChatResponseSegment>;
    functionCalls?: Functions extends ChatModelFunctions ? LlamaChatResponseFunctionCall<Functions>[] : never;
    lastEvaluation: {
        cleanHistory: ChatHistoryItem[];
        contextWindow: ChatHistoryItem[];
        contextShiftMetadata: any;
    };
    metadata: {
        remainingGenerationAfterStop?: string | Token[];
        stopReason: "eogToken" | "stopGenerationTrigger" | "functionCalls" | "maxTokens" | "abort";
    } | {
        remainingGenerationAfterStop?: string | Token[];
        stopReason: "customStopTrigger";
        customStopTrigger: (string | Token)[];
    };
};
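/*
 * Result handling sketch (illustrative only): checking the stop reason and feeding
 * `lastEvaluation` back into the next call so the existing context sequence state can be
 * reused. The follow-up user message is a made-up example:
 *
 *     const res = await chat.generateResponse(history, {maxTokens: 128});
 *
 *     if (res.metadata.stopReason === "customStopTrigger")
 *         console.log("stopped on:", res.metadata.customStopTrigger);
 *
 *     const nextHistory: ChatHistoryItem[] = [
 *         ...res.lastEvaluation.cleanHistory,
 *         {type: "user", text: "Can you elaborate?"},
 *         {type: "model", response: []}
 *     ];
 *     const followUp = await chat.generateResponse(nextHistory, {
 *         lastEvaluationContextWindow: {
 *             history: res.lastEvaluation.contextWindow
 *         },
 *         contextShift: {
 *             lastEvaluationMetadata: res.lastEvaluation.contextShiftMetadata
 *         }
 *     });
 */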
export type LlamaChatResponseFunctionCall<Functions extends ChatModelFunctions, FunctionCallName extends keyof Functions & string = string & keyof Functions, Params = Functions[FunctionCallName]["params"] extends undefined | null | void ? undefined : GbnfJsonSchemaToType<Functions[FunctionCallName]["params"]>> = {
    functionName: FunctionCallName;
    params: Params;
    raw: LlamaTextJSON;
};
export type LlamaChatResponseSegment = {
    type: "segment";
    segmentType: ChatModelSegmentType;
    text: string;
    ended: boolean;
    raw: LlamaTextJSON;
    startTime?: string;
    endTime?: string;
};
export type LlamaChatLoadAndCompleteUserResponse = {
    completion: string;
    lastEvaluation: {
        /**
         * The completion and initial user prompt are not added to this context window result,
         * but are loaded to the current context sequence state as tokens
         */
        contextWindow: ChatHistoryItem[];
        contextShiftMetadata: any;
    };
    metadata: {
        remainingGenerationAfterStop?: string | Token[];
        stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" | "abort";
    } | {
        remainingGenerationAfterStop?: string | Token[];
        stopReason: "customStopTrigger";
        customStopTrigger: (string | Token)[];
    };
};