import { EventRelay } from "lifecycle-utils";
import { ChatWrapper } from "../../ChatWrapper.js";
import { ChatHistoryItem, ChatModelFunctionCall, ChatSessionModelFunctions, Token } from "../../types.js";
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
import { LlamaGrammar } from "../LlamaGrammar.js";
import { LLamaChatContextShiftOptions, LlamaChatResponseChunk, LlamaChatResponseFunctionCallParamsChunk } from "../LlamaChat/LlamaChat.js";
import { EvaluationPriority } from "../LlamaContext/types.js";
import { TokenBias } from "../TokenBias.js";
import { LlamaText } from "../../utils/LlamaText.js";
import { LLamaChatPromptCompletionEngineOptions, LlamaChatSessionPromptCompletionEngine } from "./utils/LlamaChatSessionPromptCompletionEngine.js";
export type LlamaChatSessionOptions = {
    contextSequence: LlamaContextSequence;
    /** `"auto"` is used by default */
    chatWrapper?: "auto" | ChatWrapper;
    systemPrompt?: string;
    /**
     * Add the system prompt even on models that don't support a system prompt.
     *
     * Each chat wrapper has its own workaround for adding a system prompt to a model that doesn't support it,
     * but forcing the system prompt on unsupported models may not always work as expected.
     *
     * Use with caution.
     */
    forceAddSystemPrompt?: boolean;
    /**
     * Automatically dispose the sequence when the session is disposed.
     *
     * Defaults to `false`.
     */
    autoDisposeSequence?: boolean;
    contextShift?: LlamaChatSessionContextShiftOptions;
};
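/*
 * Example (not part of the original declarations): a minimal sketch of creating a `LlamaChatSession`.
 * The model path is a placeholder; point it at a GGUF file available locally.
 *
 * ```ts
 * import {getLlama, LlamaChatSession} from "node-llama-cpp";
 *
 * const llama = await getLlama();
 * const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
 * const context = await model.createContext();
 *
 * const session = new LlamaChatSession({
 *     contextSequence: context.getSequence(),
 *     systemPrompt: "You are a helpful assistant.",
 *     autoDisposeSequence: true
 * });
 *
 * const answer = await session.prompt("Hi there, how are you?");
 * console.log(answer);
 * ```
 */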
export type LlamaChatSessionContextShiftOptions = {
    /**
     * The number of tokens to delete from the context window to make space for new ones.
     * Defaults to 10% of the context size.
     */
    size?: LLamaChatContextShiftOptions["size"];
    /**
     * The strategy to use when deleting tokens from the context window.
     *
     * Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
     */
    strategy?: LLamaChatContextShiftOptions["strategy"];
};
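/*
 * Example (not part of the original declarations): overriding the default context shift behavior.
 * The fixed `size` of 512 tokens is an arbitrary illustration; the `strategy` value shown is the documented default.
 *
 * ```ts
 * const session = new LlamaChatSession({
 *     contextSequence: context.getSequence(),
 *     contextShift: {
 *         size: 512,
 *         strategy: "eraseFirstResponseAndKeepFirstSystem"
 *     }
 * });
 * ```
 */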
export type LLamaChatPromptOptions<Functions extends ChatSessionModelFunctions | undefined = ChatSessionModelFunctions | undefined> = {
    /**
     * Called as the model generates the main response with the generated text chunk.
     *
     * Useful for streaming the generated response as it's being generated.
     *
     * Includes only the main response without any text segments (like thoughts).
     * For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
     */
    onTextChunk?: (text: string) => void;
    /**
     * Called as the model generates the main response with the generated tokens.
     *
     * Preferably, you'd want to use {@link onTextChunk `onTextChunk`} instead of this.
     *
     * Includes only the main response without any segments (like thoughts).
     * For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
     */
    onToken?: (tokens: Token[]) => void;
    /**
     * Called as the model generates a response with the generated text and tokens,
     * including segment information (when the generated output is part of a segment).
     *
     * Useful for streaming the generated response as it's being generated, including the main response and all segments.
     *
     * Only use this function when you need the segmented texts, like thought segments (chain of thought text).
     */
    onResponseChunk?: (chunk: LlamaChatResponseChunk) => void;
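    /*
     * Example (not part of the original declarations): streaming the main response text as it's generated.
     *
     * ```ts
     * const answer = await session.prompt("Explain what a GGUF file is", {
     *     onTextChunk(text) {
     *         // print the main response incrementally, without thought segments
     *         process.stdout.write(text);
     *     }
     * });
     * ```
     */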
    /**
     * An AbortSignal to later abort the generation.
     *
     * When the signal is aborted, the generation will stop and throw `signal.reason` as the error.
     *
     * > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`.
     */
    signal?: AbortSignal;
    /**
     * When a response already started being generated and then the signal is aborted,
     * the generation will stop and the response will be returned as is instead of throwing an error.
     *
     * Defaults to `false`.
     */
    stopOnAbortSignal?: boolean;
    /** Maximum number of tokens to generate */
    maxTokens?: number;
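    /*
     * Example (not part of the original declarations): aborting a generation after a timeout
     * while keeping the partial response instead of throwing.
     *
     * ```ts
     * const abortController = new AbortController();
     * setTimeout(() => abortController.abort(), 10_000);
     *
     * const partialAnswer = await session.prompt("Write a long story", {
     *     signal: abortController.signal,
     *     stopOnAbortSignal: true,
     *     maxTokens: 2048
     * });
     * ```
     */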
    /**
     * Temperature is a hyperparameter that controls the randomness of the generated text.
     * It affects the probability distribution of the model's output tokens.
     *
     * A higher temperature (e.g., 1.5) makes the output more random and creative,
     * while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
     *
     * The suggested temperature is 0.8, which provides a balance between randomness and determinism.
     *
     * At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
     *
     * Set to `0` to disable.
     * Disabled by default (set to `0`).
     */
    temperature?: number;
    /**
     * From the next token candidates, discard the percentage of tokens with the lowest probability.
     * For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
     * This is useful for generating more high-quality results when using a high temperature.
     * Set to a value between `0` and `1` to enable.
     *
     * Only relevant when `temperature` is set to a value greater than `0`.
     * Disabled by default.
     */
    minP?: number;
    /**
     * Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
     * An integer number between `1` and the size of the vocabulary.
     * Set to `0` to disable (which uses the full vocabulary).
     *
     * Only relevant when `temperature` is set to a value greater than 0.
     */
    topK?: number;
    /**
     * Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
     * and samples the next token only from this set.
     * A float number between `0` and `1`.
     * Set to `1` to disable.
     *
     * Only relevant when `temperature` is set to a value greater than `0`.
     */
    topP?: number;
    /**
     * Used to control the randomness of the generated text.
     *
     * Change the seed to get different results.
     *
     * Only relevant when using `temperature`.
     */
    seed?: number;
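    /*
     * Example (not part of the original declarations): enabling sampling with the suggested
     * temperature and a reproducible seed. The specific values are illustrative.
     *
     * ```ts
     * const answer = await session.prompt("Suggest a name for a coffee shop", {
     *     temperature: 0.8,
     *     minP: 0.05,
     *     topK: 40,
     *     topP: 0.9,
     *     seed: 1234,
     *     maxTokens: 128
     * });
     * ```
     */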
    /**
     * Trim whitespace from the end of the generated text.
     * Disabled by default.
     */
    trimWhitespaceSuffix?: boolean;
    /**
     * Force a given text prefix to be the start of the model response, to make the model follow a certain direction.
     *
     * May cause some models to not use the given functions in some scenarios where they would have been used otherwise,
     * so avoid using it together with function calling if you notice unexpected behavior.
     */
    responsePrefix?: string;
    /**
     * See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
     */
    evaluationPriority?: EvaluationPriority;
    repeatPenalty?: false | LlamaChatSessionRepeatPenalty;
    /**
     * Adjust the probability of tokens being generated.
     * Can be used to bias the model to generate tokens that you want it to lean towards,
     * or to avoid generating tokens that you want it to avoid.
     */
    tokenBias?: TokenBias | (() => TokenBias);
    /**
     * Custom stop triggers to stop the generation of the response when any of the provided triggers are found.
     */
    customStopTriggers?: (LlamaText | string | (string | Token)[])[];
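    /*
     * Example (not part of the original declarations): steering the response with a prefix and
     * custom stop triggers. The trigger strings are arbitrary illustrations.
     *
     * ```ts
     * const answer = await session.prompt("List three facts about TypeScript", {
     *     responsePrefix: "Sure! Here are three facts:\n",
     *     customStopTriggers: ["\n\n\n", "<END>"],
     *     trimWhitespaceSuffix: true
     * });
     * ```
     */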
    /**
     * Called as the model generates function calls with the generated parameters chunk for each function call.
     *
     * Useful for streaming the generated function call parameters as they're being generated.
     * Only useful in specific use cases,
     * such as showing the generated textual file content as it's being generated (note that doing this requires parsing incomplete JSON).
     *
     * The constructed text from all the params chunks of a given function call can be parsed as a JSON object,
     * according to the function parameters schema.
     *
     * Each function call has its own `callIndex` you can use to distinguish between them.
     *
     * Only relevant when using function calling (via passing the `functions` option).
     */
    onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
    /**
     * Set the maximum number of tokens that the model is allowed to spend on various segmented responses.
     */
    budgets?: {
        /**
         * Budget for thought tokens.
         *
         * Defaults to `Infinity`.
         */
        thoughtTokens?: number;
        /**
         * Budget for comment tokens.
         *
         * Defaults to `Infinity`.
         */
        commentTokens?: number;
    };
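    /*
     * Example (not part of the original declarations): capping thought tokens for a reasoning model
     * and observing segment chunks. The whole chunk is logged here, since its exact fields are
     * defined by `LlamaChatResponseChunk`.
     *
     * ```ts
     * const answer = await session.prompt("Plan a 3-day trip to Rome", {
     *     budgets: {
     *         thoughtTokens: 1024
     *     },
     *     onResponseChunk(chunk) {
     *         console.log(chunk);
     *     }
     * });
     * ```
     */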
} & ({
    grammar?: LlamaGrammar;
    functions?: never;
    documentFunctionParams?: never;
    maxParallelFunctionCalls?: never;
    onFunctionCallParamsChunk?: never;
} | {
    grammar?: never;
    functions?: Functions | ChatSessionModelFunctions;
    documentFunctionParams?: boolean;
    maxParallelFunctionCalls?: number;
    onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
});
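/*
 * Example (not part of the original declarations): function calling, which the union above makes
 * mutually exclusive with `grammar`. The function definition is an illustration.
 *
 * ```ts
 * import {defineChatSessionFunction} from "node-llama-cpp";
 *
 * const functions = {
 *     getCurrentTime: defineChatSessionFunction({
 *         description: "Get the current time as an ISO string",
 *         handler() {
 *             return new Date().toISOString();
 *         }
 *     })
 * };
 *
 * const answer = await session.prompt("What time is it right now?", {
 *     functions,
 *     maxParallelFunctionCalls: 2,
 *     onFunctionCallParamsChunk(chunk) {
 *         // `callIndex` distinguishes between parallel function calls
 *         console.log(chunk.callIndex, chunk);
 *     }
 * });
 * ```
 */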
export type LLamaChatCompletePromptOptions = {
    /**
     * Generate a completion for the given user prompt up to the given number of tokens.
     *
     * Defaults to `256` or half the context size, whichever is smaller.
     */
    maxTokens?: LLamaChatPromptOptions["maxTokens"];
    /**
     * When a completion already started being generated and then the given `signal` is aborted,
     * the generation will stop and the completion will be returned as-is instead of throwing an error.
     *
     * Defaults to `false`.
     */
    stopOnAbortSignal?: LLamaChatPromptOptions["stopOnAbortSignal"];
    /**
     * Called as the model generates a completion with the generated text chunk.
     *
     * Useful for streaming the generated completion as it's being generated.
     */
    onTextChunk?: LLamaChatPromptOptions["onTextChunk"];
    /**
     * Called as the model generates a completion with the generated tokens.
     *
     * Preferably, you'd want to use `onTextChunk` instead of this.
     */
    onToken?: LLamaChatPromptOptions["onToken"];
    signal?: LLamaChatPromptOptions["signal"];
    temperature?: LLamaChatPromptOptions["temperature"];
    minP?: LLamaChatPromptOptions["minP"];
    topK?: LLamaChatPromptOptions["topK"];
    topP?: LLamaChatPromptOptions["topP"];
    seed?: LLamaChatPromptOptions["seed"];
    trimWhitespaceSuffix?: LLamaChatPromptOptions["trimWhitespaceSuffix"];
    evaluationPriority?: LLamaChatPromptOptions["evaluationPriority"];
    repeatPenalty?: LLamaChatPromptOptions["repeatPenalty"];
    tokenBias?: LLamaChatPromptOptions["tokenBias"];
    customStopTriggers?: LLamaChatPromptOptions["customStopTriggers"];
    grammar?: LlamaGrammar;
    /**
     * Functions are not used by the model here,
     * but are used for keeping the instructions given to the model about the functions in the current context state,
     * to avoid context shifts.
     *
     * It's best to provide the same functions that were used for the previous prompt here.
     */
    functions?: ChatSessionModelFunctions;
    /**
     * Functions are not used by the model here,
     * but are used for keeping the instructions given to the model about the functions in the current context state,
     * to avoid context shifts.
     *
     * It's best to provide the same value that was used for the previous prompt here.
     */
    documentFunctionParams?: boolean;
    /**
     * Whether to complete the prompt as a model response.
     *
     * - **`"auto"`**: Automatically determine whether to complete as a model response based on the model used.
     * This is a good option to work around some models that don't support user prompt completions.
     * - **`true`**: Always complete as a model response
     * - **`false`**: Never complete as a model response
     *
     * Defaults to `"auto"`.
     */
    completeAsModel?: "auto" | boolean | {
        /**
         * Whether to complete the prompt as a model response.
         *
         * - **`"auto"`**: Automatically determine whether to complete as a model response based on the model used.
         * This is a good option to work around some models that don't support user prompt completions.
         * - **`true`**: Always complete as a model response
         * - **`false`**: Never complete as a model response
         *
         * Defaults to `"auto"`.
         */
        enabled?: "auto" | boolean;
        /**
         * The messages to append to the chat history to generate a completion as a model response.
         *
         * If the last message is a model message, the prompt will be pushed to it for the completion,
         * otherwise a new model message will be added with the prompt.
         *
         * It must contain a user message or a system message before the model message.
         *
         * Defaults to:
         * ```ts
         * [
         *     {
         *         type: "system",
         *         text: "For your next response predict what the user may send next. " +
         *             "No yapping, no whitespace. Match the user's language and tone."
         *     },
         *     {type: "user", text: ""},
         *     {type: "model", response: [""]}
         * ]
         * ```
         */
        appendedMessages?: ChatHistoryItem[];
    };
};
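/*
 * Example (not part of the original declarations): completing the text a user is currently typing,
 * based on the current chat state. `currentUserInput` is a placeholder for your input source.
 *
 * ```ts
 * const completion = await session.completePrompt(currentUserInput, {
 *     maxTokens: 64,
 *     temperature: 0.8,
 *     completeAsModel: "auto"
 * });
 * ```
 */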
export type LLamaChatPreloadPromptOptions = {
    signal?: LLamaChatCompletePromptOptions["signal"];
    evaluationPriority?: LLamaChatCompletePromptOptions["evaluationPriority"];
    functions?: LLamaChatCompletePromptOptions["functions"];
    documentFunctionParams?: LLamaChatCompletePromptOptions["documentFunctionParams"];
};
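/*
 * Example (not part of the original declarations): preloading a user prompt so the eventual
 * response starts generating sooner. The evaluation priority value is an illustration.
 *
 * ```ts
 * await session.preloadPrompt("Summarize the previous answer", {
 *     evaluationPriority: 5
 * });
 * ```
 */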
export type LlamaChatSessionRepeatPenalty = {
    /**
     * The number of recent tokens generated by the model to apply repetition penalties to.
     * Defaults to `64`.
     */
    lastTokens?: number;
    punishTokensFilter?: (tokens: Token[]) => Token[];
    /**
     * Penalize new line tokens.
     * Enabled by default.
     */
    penalizeNewLine?: boolean;
    /**
     * The relative amount to lower the probability of the tokens in `punishTokens` by.
     * Defaults to `1.1`.
     * Set to `1` to disable.
     */
    penalty?: number;
    /**
     * For a token that appears `n` times in the `punishTokens` array, lower its probability by `n * frequencyPenalty`.
     * Disabled by default (`0`).
     * Set to a value between `0` and `1` to enable.
     */
    frequencyPenalty?: number;
    /**
     * Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`.
     * Disabled by default (`0`).
     * Set to a value between `0` and `1` to enable.
     */
    presencePenalty?: number;
};
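/*
 * Example (not part of the original declarations): configuring repetition penalties for a prompt.
 * The specific values are illustrative.
 *
 * ```ts
 * const answer = await session.prompt("Write a short poem about the sea", {
 *     repeatPenalty: {
 *         lastTokens: 64,
 *         penalty: 1.1,
 *         frequencyPenalty: 0.2,
 *         presencePenalty: 0.2,
 *         penalizeNewLine: false
 *     }
 * });
 * ```
 */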
/**
 * @see [Using `LlamaChatSession`](https://node-llama-cpp.withcat.ai/guide/chat-session) tutorial
 */
export declare class LlamaChatSession {
    readonly onDispose: EventRelay<void>;
    constructor(options: LlamaChatSessionOptions);
    dispose({ disposeSequence }?: {
        disposeSequence?: boolean;
    }): void;
    /** @hidden */
    [Symbol.dispose](): void;
    get disposed(): boolean;
    get chatWrapper(): ChatWrapper;
    get sequence(): LlamaContextSequence;
    get context(): import("../LlamaContext/LlamaContext.js").LlamaContext;
    get model(): import("../LlamaModel/LlamaModel.js").LlamaModel;
    prompt<const Functions extends ChatSessionModelFunctions | undefined = undefined>(prompt: string, options?: LLamaChatPromptOptions<Functions>): Promise<string>;
    /**
     * @param prompt
     * @param [options]
     */
    promptWithMeta<const Functions extends ChatSessionModelFunctions | undefined = undefined>(prompt: string, { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority }?: LLamaChatPromptOptions<Functions>): Promise<{
        response: (string | ChatModelFunctionCall | import("../../types.js").ChatModelSegment)[];
        responseText: string;
        stopReason: "customStopTrigger";
        customStopTrigger: (string | Token)[];
        remainingGenerationAfterStop: string | Token[] | undefined;
    } | {
        response: (string | ChatModelFunctionCall | import("../../types.js").ChatModelSegment)[];
        responseText: string;
        stopReason: "abort" | "maxTokens" | "eogToken" | "stopGenerationTrigger" | "functionCalls";
        remainingGenerationAfterStop: string | Token[] | undefined;
        customStopTrigger?: undefined;
    }>;
    /**
     * Preload a user prompt into the current context sequence state to make later inference of the model response begin sooner
     * and feel faster.
     *
     * > **Note:** Preloading a long user prompt can incur context shifts, so consider limiting the length of prompts you preload
     * @param prompt - the prompt to preload
     * @param [options]
     */
    preloadPrompt(prompt: string, options?: LLamaChatPreloadPromptOptions): Promise<void>;
    /**
     * Preload a user prompt into the current context sequence state and generate a completion for it.
     *
     * > **Note:** Preloading a long user prompt and completing a user prompt with a high number of `maxTokens` can incur context shifts,
     * > so consider limiting the length of prompts you preload.
     * >
     * > Also, it's recommended to limit the number of tokens generated to a reasonable amount by configuring `maxTokens`.
     * @param prompt - the prompt to preload
     * @param [options]
     */
    completePrompt(prompt: string, options?: LLamaChatCompletePromptOptions): Promise<string>;
    /**
     * Create a smart completion engine that caches the prompt completions
     * and reuses them when the user prompt matches the beginning of the cached prompt or completion.
     *
     * Completions are generated for, and cached against, the current chat session state only.
     * You can create a single completion engine for an entire chat session.
     */
    createPromptCompletionEngine(options?: LLamaChatPromptCompletionEngineOptions): LlamaChatSessionPromptCompletionEngine;
    /**
     * See `completePrompt` for more information.
     * @param prompt
     * @param [options]
     */
    completePromptWithMeta(prompt: string, { maxTokens, stopOnAbortSignal, functions, documentFunctionParams, onTextChunk, onToken, signal, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority, completeAsModel }?: LLamaChatCompletePromptOptions): Promise<{
        completion: string;
        stopReason: "customStopTrigger";
        customStopTrigger: (string | Token)[];
        remainingGenerationAfterStop: string | Token[] | undefined;
    } | {
        completion: string;
        stopReason: "abort" | "maxTokens" | "eogToken" | "stopGenerationTrigger" | "functionCalls";
        remainingGenerationAfterStop: string | Token[] | undefined;
        customStopTrigger?: undefined;
    }>;
    getChatHistory(): ChatHistoryItem[];
    getLastEvaluationContextWindow(): ChatHistoryItem[] | null;
    setChatHistory(chatHistory: ChatHistoryItem[]): void;
    /** Clear the chat history and reset it to the initial state. */
    resetChatHistory(): void;
}
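/*
 * Example (not part of the original declarations): saving and restoring the chat history,
 * e.g. to persist a conversation between runs.
 *
 * ```ts
 * const firstAnswer = await session.prompt("Hi there, how are you?");
 *
 * // save the chat history so the conversation can be restored later
 * const chatHistory = session.getChatHistory();
 *
 * // ...later, on a session backed by the same model:
 * session.setChatHistory(chatHistory);
 * const secondAnswer = await session.prompt("Repeat what I asked you first");
 * ```
 */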