First upload version 0.0.1

Neyra
2026-02-05 15:27:49 +08:00
commit 8e9b7201ed
4182 changed files with 593136 additions and 0 deletions


@@ -0,0 +1,459 @@
import { EventRelay } from "lifecycle-utils";
import { ChatWrapper } from "../../ChatWrapper.js";
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
import { ChatHistoryItem, ChatModelFunctions, ChatModelSegmentType, LLamaContextualRepeatPenalty, Token, Tokenizer } from "../../types.js";
import { GbnfJsonSchemaToType } from "../../utils/gbnfJson/types.js";
import { LlamaGrammar } from "../LlamaGrammar.js";
import { LlamaText, LlamaTextJSON } from "../../utils/LlamaText.js";
import { EvaluationPriority } from "../LlamaContext/types.js";
import { TokenBias } from "../TokenBias.js";
import { LlamaModel } from "../LlamaModel/LlamaModel.js";
export type LlamaChatOptions = {
contextSequence: LlamaContextSequence;
/** `"auto"` is used by default */
chatWrapper?: "auto" | ChatWrapper;
/**
* Automatically dispose the sequence when the session is disposed
*
* Defaults to `false`.
*/
autoDisposeSequence?: boolean;
};
export type LlamaChatResponseChunk = LlamaChatResponseTextChunk | LlamaChatResponseSegmentChunk;
export type LlamaChatResponseTextChunk = {
/** When `type` is `undefined`, the chunk is part of the main response and is not a segment */
type: undefined;
/**
* `segmentType` has no purpose when `type` is `undefined` (meaning that this chunk is part of the main response and is not a segment).
*/
segmentType: undefined;
/**
* The generated text chunk.
*
* Detokenized from the `tokens` property,
* but with the context of the previous generation (for better spacing of the text with some models).
*
* Prefer using this property over `tokens` when streaming the generated response as text.
*/
text: string;
/** The generated tokens */
tokens: Token[];
};
export type LlamaChatResponseSegmentChunk = {
type: "segment";
/** Segment type */
segmentType: ChatModelSegmentType;
/**
* The generated text chunk.
*
* Detokenized from the `tokens` property,
* but with the context of the previous generation (for better spacing of the text with some models).
*
* Prefer using this property over `tokens` when streaming the generated response as text.
*/
text: string;
/** The generated tokens */
tokens: Token[];
/**
* When the current chunk is the start of a segment, this field will be set.
*
* It's possible that a chunk with no tokens and empty text will be emitted just to set this field
* to signify that the segment has started.
*/
segmentStartTime?: Date;
/**
* When the current chunk is the last one of a segment (meaning the current segment has ended), this field will be set.
*
* It's possible that a chunk with no tokens and empty text will be emitted just to set this field
* to signify that the segment has ended.
*/
segmentEndTime?: Date;
};
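A response stream interleaves these two chunk shapes, so a consumer typically branches on `type`. Below is a minimal sketch of such a handler, assuming only the chunk types declared above (and that they are re-exported from the package root); the buffers and labels are purely illustrative.

```ts
import type {LlamaChatResponseChunk} from "node-llama-cpp";

// Collect the main response separately from thought segments
let mainResponse = "";
let thoughtText = "";

function handleResponseChunk(chunk: LlamaChatResponseChunk) {
    if (chunk.type === undefined) {
        // Part of the main response, not a segment
        mainResponse += chunk.text;
    } else if (chunk.type === "segment" && chunk.segmentType === "thought") {
        // A chunk may carry no text and only mark the segment boundaries
        if (chunk.segmentStartTime != null)
            thoughtText += "[thought started]\n";

        thoughtText += chunk.text;

        if (chunk.segmentEndTime != null)
            thoughtText += "\n[thought ended]";
    }
}
```

Such a handler can be passed as the `onResponseChunk` option described below.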
export type LlamaChatResponseFunctionCallParamsChunk = {
/**
* Each different function call has a different `callIndex`.
*
* When the previous function call has finished being generated, the `callIndex` of the next one will increment.
*
* Use this value to distinguish between different function calls.
*/
callIndex: number;
/**
* The name of the function being called
*/
functionName: string;
/**
* A chunk of the generated text used for the function call parameters.
*
* Collect all the chunks together to construct the full function call parameters.
*
* After the function call is finished, the entire constructed params text can be parsed as a JSON object,
* according to the function parameters schema.
*/
paramsChunk: string;
/**
* When this is `true`, the current chunk is the last chunk in the generation of the current function call parameters.
*/
done: boolean;
};
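Since `paramsChunk` arrives incrementally and multiple calls can follow each other, a consumer usually buffers the text per `callIndex` and parses it once `done` is set. A hedged sketch (the `Map` buffer and logging are illustrative, and the type import assumes it is re-exported from the package root):

```ts
import type {LlamaChatResponseFunctionCallParamsChunk} from "node-llama-cpp";

// Buffer the generated params text of each function call by its callIndex
const paramsBuffers = new Map<number, string>();

function handleFunctionCallParamsChunk(chunk: LlamaChatResponseFunctionCallParamsChunk) {
    const paramsText = (paramsBuffers.get(chunk.callIndex) ?? "") + chunk.paramsChunk;
    paramsBuffers.set(chunk.callIndex, paramsText);

    if (chunk.done) {
        // The full text is JSON that conforms to the function's params schema.
        // Guard against functions that take no params at all.
        const params: unknown = paramsText.trim() === ""
            ? undefined
            : JSON.parse(paramsText);

        console.log(`call #${chunk.callIndex} -> ${chunk.functionName}`, params);
        paramsBuffers.delete(chunk.callIndex);
    }
}
```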
export type LLamaChatGenerateResponseOptions<Functions extends ChatModelFunctions | undefined = undefined> = {
/**
* Called as the model generates the main response with the generated text chunk.
*
* Useful for streaming the generated response as it's being generated.
*
* Includes only the main response without any text segments (like thoughts).
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
*/
onTextChunk?: (text: string) => void;
/**
* Called as the model generates the main response with the generated tokens.
*
* Preferably, you'd want to use {@link onTextChunk `onTextChunk`} instead of this.
*
* Includes only the main response without any segments (like thoughts).
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
*/
onToken?: (tokens: Token[]) => void;
/**
* Called as the model generates a response with the generated text and tokens,
* including segment information (when the generated output is part of a segment).
*
* Useful for streaming the generated response as it's being generated, including the main response and all segments.
*
* Only use this function when you need the segmented texts, like thought segments (chain of thought text).
*/
onResponseChunk?: (chunk: LlamaChatResponseChunk) => void;
/**
* An AbortSignal to later abort the generation.
*
* When the signal is aborted, the generation will stop and throw `signal.reason` as the error.
*
* > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`.
*/
signal?: AbortSignal;
/**
* When a response already started being generated and then the signal is aborted,
* the generation will stop and the response will be returned as is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: boolean;
/** Maximum number of tokens to generate */
maxTokens?: number;
/**
* Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
*
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
*
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
*
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
* Disabled by default (set to `0`).
*/
temperature?: number;
/**
* From the next token candidates, discard the percentage of tokens with the lowest probability.
* For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
* This is useful for generating more high-quality results when using a high temperature.
* Set to a value between `0` and `1` to enable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* Disabled by default.
*/
minP?: number;
/**
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
*/
topK?: number;
/**
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
*/
topP?: number;
/**
* Used to control the randomness of the generated text.
*
* Change the seed to get different results.
*
* Only relevant when using `temperature`.
*/
seed?: number;
/**
* Trim whitespace from the end of the generated text
*
* Defaults to `false`.
*/
trimWhitespaceSuffix?: boolean;
repeatPenalty?: false | LLamaContextualRepeatPenalty;
/**
* Adjust the probability of tokens being generated.
* Can be used to bias the model to generate tokens that you want it to lean towards,
* or to avoid generating tokens that you want it to avoid.
*/
tokenBias?: TokenBias | (() => TokenBias);
/**
* See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
*/
evaluationPriority?: EvaluationPriority;
contextShift?: LLamaChatContextShiftOptions;
/**
* Custom stop triggers to stop the generation of the response when any of the provided triggers are found.
*/
customStopTriggers?: readonly (LlamaText | string | readonly (string | Token)[])[];
/**
* The evaluation context window returned from the last evaluation.
* This is an optimization to utilize existing context sequence state better when possible.
*/
lastEvaluationContextWindow?: {
/** The history of the last evaluation. */
history?: ChatHistoryItem[];
/**
* Minimum overlap percentage with existing context sequence state to use the last evaluation context window.
* If the last evaluation context window is not used, a new context will be generated based on the full history,
* which will decrease the likelihood of another context shift happening so soon.
*
* A number between `0` (exclusive) and `1` (inclusive).
*/
minimumOverlapPercentageToPreventContextShift?: number;
};
/**
* Called as the model generates function calls with the generated parameters chunk for each function call.
*
* Useful for streaming the generated function call parameters as they're being generated.
* Only useful in specific use cases,
* such as showing the generated textual file content as it's being generated (note that doing this requires parsing incomplete JSON).
*
* The constructed text from all the params chunks of a given function call can be parsed as a JSON object,
* according to the function parameters schema.
*
* Each function call has its own `callIndex` you can use to distinguish between them.
*
* Only relevant when using function calling (via passing the `functions` option).
*/
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
/**
* Set the maximum number of tokens the model is allowed to spend on various segmented responses.
*/
budgets?: {
/**
* Whether to include the tokens already consumed by the current model response being completed in the budget.
*
* Defaults to `true`.
*/
includeCurrentResponse?: boolean;
/**
* Budget for thought tokens.
*
* Defaults to `Infinity`.
*/
thoughtTokens?: number;
/**
* Budget for comment tokens.
*
* Defaults to `Infinity`.
*/
commentTokens?: number;
};
/**
* Stop the generation when the model tries to generate a non-textual segment or call a function.
*
* Useful for generating completions in the form of a model response.
*
* Defaults to `false`.
*/
abortOnNonText?: boolean;
} & ({
grammar?: LlamaGrammar;
functions?: never;
documentFunctionParams?: never;
maxParallelFunctionCalls?: never;
onFunctionCall?: never;
onFunctionCallParamsChunk?: never;
} | {
grammar?: never;
functions?: Functions | ChatModelFunctions;
documentFunctionParams?: boolean;
maxParallelFunctionCalls?: number;
onFunctionCall?: (functionCall: LlamaChatResponseFunctionCall<Functions extends ChatModelFunctions ? Functions : ChatModelFunctions>) => void;
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
});
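As the `signal` and `stopOnAbortSignal` docs above describe, aborting normally throws `signal.reason`, while setting `stopOnAbortSignal: true` returns the partial response instead. A hedged sketch of the latter, assuming an already constructed `llamaChat: LlamaChat` (declared further down) and a prepared `history: ChatHistoryItem[]`:

```ts
const abortController = new AbortController();

// Stop generating after 5 seconds, but keep whatever was generated so far
setTimeout(() => abortController.abort(), 5_000);

const {response, metadata} = await llamaChat.generateResponse(history, {
    signal: abortController.signal,
    stopOnAbortSignal: true, // return the partial response instead of throwing signal.reason
    maxTokens: 512,
    temperature: 0.8,
    onTextChunk(text) {
        process.stdout.write(text);
    }
});

console.log("\nstop reason:", metadata.stopReason); // "abort" when the signal fired mid-generation
console.log("partial response:", response);
```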
export type LLamaChatLoadAndCompleteUserMessageOptions<Functions extends ChatModelFunctions | undefined = undefined> = {
/**
* Complete the given user prompt without adding it or the completion to the returned context window.
*/
initialUserPrompt?: string;
/**
* When a completion already started being generated and then the signal is aborted,
* the generation will stop and the completion will be returned as is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: boolean;
/**
* Called as the model generates a completion with the generated text chunk.
*
* Useful for streaming the generated completion as it's being generated.
*/
onTextChunk?: LLamaChatGenerateResponseOptions<Functions>["onTextChunk"];
/**
* Called as the model generates a completion with the generated tokens.
*
* Preferably, you'd want to use `onTextChunk` instead of this.
*/
onToken?: LLamaChatGenerateResponseOptions<Functions>["onToken"];
signal?: LLamaChatGenerateResponseOptions<Functions>["signal"];
maxTokens?: LLamaChatGenerateResponseOptions<Functions>["maxTokens"];
temperature?: LLamaChatGenerateResponseOptions<Functions>["temperature"];
minP?: LLamaChatGenerateResponseOptions<Functions>["minP"];
topK?: LLamaChatGenerateResponseOptions<Functions>["topK"];
topP?: LLamaChatGenerateResponseOptions<Functions>["topP"];
seed?: LLamaChatGenerateResponseOptions<Functions>["seed"];
trimWhitespaceSuffix?: LLamaChatGenerateResponseOptions<Functions>["trimWhitespaceSuffix"];
repeatPenalty?: LLamaChatGenerateResponseOptions<Functions>["repeatPenalty"];
tokenBias?: LLamaChatGenerateResponseOptions<Functions>["tokenBias"];
evaluationPriority?: LLamaChatGenerateResponseOptions<Functions>["evaluationPriority"];
contextShift?: LLamaChatGenerateResponseOptions<Functions>["contextShift"];
customStopTriggers?: LLamaChatGenerateResponseOptions<Functions>["customStopTriggers"];
lastEvaluationContextWindow?: LLamaChatGenerateResponseOptions<Functions>["lastEvaluationContextWindow"];
grammar?: LlamaGrammar;
/**
* Functions are not used by the model here,
* but are used for keeping the instructions given to the model about the functions in the current context state,
* to avoid context shifts.
*
* It's best to provide the same functions that were used for the previous prompt here.
*/
functions?: Functions | ChatModelFunctions;
/**
* Functions are not used by the model here,
* but are used for keeping the instructions given to the model about the functions in the current context state,
* to avoid context shifts.
*
* It's best to provide the same value that was used for the previous prompt here.
*/
documentFunctionParams?: boolean;
};
export type LLamaChatContextShiftOptions = {
/**
* The number of tokens to delete from the context window to make space for new ones.
* Defaults to 10% of the context size.
*/
size?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
/**
* The strategy to use when deleting tokens from the context window.
*
* Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
*/
strategy?: "eraseFirstResponseAndKeepFirstSystem" | ((options: {
/** Full chat history */
chatHistory: readonly ChatHistoryItem[];
/** Maximum number of tokens that the new chat history should fit under when tokenized */
maxTokensCount: number;
/** Tokenizer used to tokenize the chat history */
tokenizer: Tokenizer;
/** Chat wrapper used to generate the context state */
chatWrapper: ChatWrapper;
/**
* The metadata returned from the last context shift strategy call.
* Will be `null` on the first call.
*/
lastShiftMetadata?: object | null;
}) => {
chatHistory: ChatHistoryItem[];
metadata?: object | null;
} | Promise<{
chatHistory: ChatHistoryItem[];
metadata?: object | null;
}>);
/**
* The `contextShiftMetadata` returned from the last evaluation.
* This is an optimization to utilize the existing context state better when possible.
*/
lastEvaluationMetadata?: object | undefined | null;
};
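The `strategy` signature above also allows plugging in a custom context shift strategy. Below is a hedged sketch of a naive one that keeps the first system message and drops the oldest items until the history fits; it assumes that `chatWrapper.generateContextState({chatHistory})` returns a `contextText` LlamaText that can be tokenized to measure a candidate history, which may differ from the actual internals.

```ts
import type {ChatHistoryItem, LLamaChatContextShiftOptions} from "node-llama-cpp";

const dropOldestStrategy: LLamaChatContextShiftOptions["strategy"] = ({
    chatHistory, maxTokensCount, tokenizer, chatWrapper, lastShiftMetadata
}) => {
    const newHistory: ChatHistoryItem[] = chatHistory.map((item) => structuredClone(item));
    const previousMetadata = (lastShiftMetadata ?? {}) as {removedItems?: number};
    let removedItems = previousMetadata.removedItems ?? 0;

    // Assumption: measure a candidate history by tokenizing the chat wrapper's context text
    const measureTokens = (history: ChatHistoryItem[]) => chatWrapper
        .generateContextState({chatHistory: history})
        .contextText
        .tokenize(tokenizer)
        .length;

    // Keep the first system message; drop the item right after it while over budget
    while (newHistory.length > 2 && measureTokens(newHistory) > maxTokensCount) {
        const removeIndex = newHistory[0]?.type === "system" ? 1 : 0;
        newHistory.splice(removeIndex, 1);
        removedItems++;
    }

    return {
        chatHistory: newHistory,
        metadata: {removedItems}
    };
};
```

Such a strategy could then be passed as `contextShift: {strategy: dropOldestStrategy}` in the generation options above; in most cases the default `"eraseFirstResponseAndKeepFirstSystem"` strategy is preferable.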
export declare class LlamaChat {
readonly onDispose: EventRelay<void>;
constructor({ contextSequence, chatWrapper, autoDisposeSequence }: LlamaChatOptions);
dispose({ disposeSequence }?: {
disposeSequence?: boolean;
}): void;
/** @hidden */
[Symbol.dispose](): void;
get disposed(): boolean;
get chatWrapper(): ChatWrapper;
get sequence(): LlamaContextSequence;
get context(): import("../LlamaContext/LlamaContext.js").LlamaContext;
get model(): LlamaModel;
generateResponse<const Functions extends ChatModelFunctions | undefined = undefined>(history: ChatHistoryItem[], options?: LLamaChatGenerateResponseOptions<Functions>): Promise<LlamaChatResponse<Functions>>;
loadChatAndCompleteUserMessage<const Functions extends ChatModelFunctions | undefined = undefined>(history: ChatHistoryItem[], options?: LLamaChatLoadAndCompleteUserMessageOptions<Functions>): Promise<LlamaChatLoadAndCompleteUserResponse>;
}
export type LlamaChatResponse<Functions extends ChatModelFunctions | undefined = undefined> = {
/**
* The response text only, _without_ any text segments (like thoughts).
*/
response: string;
/**
* The full response, including all text and text segments (like thoughts).
*/
fullResponse: Array<string | LlamaChatResponseSegment>;
functionCalls?: Functions extends ChatModelFunctions ? LlamaChatResponseFunctionCall<Functions>[] : never;
lastEvaluation: {
cleanHistory: ChatHistoryItem[];
contextWindow: ChatHistoryItem[];
contextShiftMetadata: any;
};
metadata: {
remainingGenerationAfterStop?: string | Token[];
stopReason: "eogToken" | "stopGenerationTrigger" | "functionCalls" | "maxTokens" | "abort";
} | {
remainingGenerationAfterStop?: string | Token[];
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
};
};
export type LlamaChatResponseFunctionCall<Functions extends ChatModelFunctions, FunctionCallName extends keyof Functions & string = string & keyof Functions, Params = Functions[FunctionCallName]["params"] extends undefined | null | void ? undefined : GbnfJsonSchemaToType<Functions[FunctionCallName]["params"]>> = {
functionName: FunctionCallName;
params: Params;
raw: LlamaTextJSON;
};
export type LlamaChatResponseSegment = {
type: "segment";
segmentType: ChatModelSegmentType;
text: string;
ended: boolean;
raw: LlamaTextJSON;
startTime?: string;
endTime?: string;
};
export type LlamaChatLoadAndCompleteUserResponse = {
completion: string;
lastEvaluation: {
/**
* The completion and initial user prompt are not added to this context window result,
* but are loaded into the current context sequence state as tokens
*/
contextWindow: ChatHistoryItem[];
contextShiftMetadata: any;
};
metadata: {
remainingGenerationAfterStop?: string | Token[];
stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" | "abort";
} | {
remainingGenerationAfterStop?: string | Token[];
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
};
};
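Putting the declarations above together, a typical flow is to obtain a context sequence, wrap it in `LlamaChat`, and stream a response. A minimal sketch, assuming the package root exports `getLlama`/`LlamaChat` and the usual `loadModel()`/`createContext()` entry points; the model path is hypothetical, and the trailing empty model item mirrors what `LlamaChatSession` does internally before generating.

```ts
import {getLlama, LlamaChat} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // hypothetical path
const context = await model.createContext();

const llamaChat = new LlamaChat({
    contextSequence: context.getSequence()
});

// Build an initial history with the chat wrapper, then append the user turn
const history = llamaChat.chatWrapper.generateInitialChatHistory({
    systemPrompt: "You are a helpful assistant."
});
history.push({type: "user", text: "Hi there!"});
history.push({type: "model", response: []}); // empty model item for the response being generated

const {response, lastEvaluation} = await llamaChat.generateResponse(history, {
    maxTokens: 256,
    onTextChunk: (text) => process.stdout.write(text)
});

console.log("\nfull response:", response);
// `lastEvaluation.cleanHistory` can seed the next turn, and `lastEvaluation.contextWindow`
// can be passed as `lastEvaluationContextWindow.history` to better reuse the context state
```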

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long


@@ -0,0 +1,11 @@
import { LlamaGrammar } from "../../LlamaGrammar.js";
import { ChatModelFunctions } from "../../../types.js";
import { ChatWrapper } from "../../../ChatWrapper.js";
import { Llama } from "../../../bindings/Llama.js";
export declare class FunctionCallNameGrammar<const Functions extends ChatModelFunctions> extends LlamaGrammar {
private readonly _functions;
private readonly _chatWrapper;
constructor(llama: Llama, functions: Functions, chatWrapper: ChatWrapper);
parseFunctionName(generatedFunctionName: string): keyof Functions & string;
private _validateFunctions;
}


@@ -0,0 +1,55 @@
import { LlamaGrammar } from "../../LlamaGrammar.js";
import { LlamaText } from "../../../utils/LlamaText.js";
import { GbnfGrammarGenerator } from "../../../utils/gbnfJson/GbnfGrammarGenerator.js";
import { GbnfGrammar } from "../../../utils/gbnfJson/terminals/GbnfGrammar.js";
import { GbnfOr } from "../../../utils/gbnfJson/terminals/GbnfOr.js";
import { GbnfVerbatimText } from "../../../utils/gbnfJson/terminals/GbnfVerbatimText.js";
import { LlamaFunctionCallValidationError } from "./LlamaFunctionCallValidationError.js";
export class FunctionCallNameGrammar extends LlamaGrammar {
_functions;
_chatWrapper;
constructor(llama, functions, chatWrapper) {
const grammar = getGbnfGrammarForFunctionName(functions, chatWrapper);
super(llama, {
grammar,
stopGenerationTriggers: [LlamaText("\n")],
trimWhitespaceSuffix: true
});
this._functions = functions;
this._chatWrapper = chatWrapper;
this._validateFunctions();
}
parseFunctionName(generatedFunctionName) {
if (this._chatWrapper.settings.functions.call.optionalPrefixSpace && generatedFunctionName[0] === " ")
generatedFunctionName = generatedFunctionName.slice(1);
const newlineIndex = generatedFunctionName.indexOf("\n");
const functionName = generatedFunctionName.slice(0, newlineIndex < 0
? generatedFunctionName.length
: newlineIndex);
if (!Object.hasOwn(this._functions, functionName))
throw new LlamaFunctionCallValidationError(`Function name "${functionName}" is not in the supplied functions object`, this._functions, this._chatWrapper, generatedFunctionName);
return functionName;
}
_validateFunctions() {
for (const functionsName of Object.keys(this._functions)) {
if (functionsName.includes(" ") || functionsName.includes("\n") || functionsName.includes("\t"))
throw new Error(`Function name "${functionsName}" contains spaces, new lines or tabs`);
else if (functionsName === "")
throw new Error("Function name cannot be an empty string");
}
}
}
function getGbnfGrammarForFunctionName(functions, chatWrapper) {
const grammarGenerator = new GbnfGrammarGenerator();
const functionNameGrammars = [];
for (const functionName of Object.keys(functions))
functionNameGrammars.push(new GbnfVerbatimText(functionName));
const callGrammar = new GbnfOr(functionNameGrammars);
const rootTerminal = new GbnfGrammar([
...(chatWrapper.settings.functions.call.optionalPrefixSpace ? ["[ ]?"] : []),
callGrammar.resolve(grammarGenerator)
]);
const rootGrammar = rootTerminal.getGrammar();
return grammarGenerator.generateGbnfFile(rootGrammar + " [\\n]");
}
//# sourceMappingURL=FunctionCallNameGrammar.js.map


@@ -0,0 +1 @@
{"version":3,"file":"FunctionCallNameGrammar.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaChat/utils/FunctionCallNameGrammar.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,YAAY,EAAC,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAC,SAAS,EAAC,MAAM,6BAA6B,CAAC;AAEtD,OAAO,EAAC,oBAAoB,EAAC,MAAM,iDAAiD,CAAC;AAErF,OAAO,EAAC,WAAW,EAAC,MAAM,kDAAkD,CAAC;AAE7E,OAAO,EAAC,MAAM,EAAC,MAAM,6CAA6C,CAAC;AACnE,OAAO,EAAC,gBAAgB,EAAC,MAAM,uDAAuD,CAAC;AAEvF,OAAO,EAAC,gCAAgC,EAAC,MAAM,uCAAuC,CAAC;AAGvF,MAAM,OAAO,uBAAoE,SAAQ,YAAY;IAChF,UAAU,CAAY;IACtB,YAAY,CAAc;IAE3C,YAAmB,KAAY,EAAE,SAAoB,EAAE,WAAwB;QAC3E,MAAM,OAAO,GAAG,6BAA6B,CAAC,SAAS,EAAE,WAAW,CAAC,CAAC;QAEtE,KAAK,CAAC,KAAK,EAAE;YACT,OAAO;YACP,sBAAsB,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YACzC,oBAAoB,EAAE,IAAI;SAC7B,CAAC,CAAC;QAEH,IAAI,CAAC,UAAU,GAAG,SAAS,CAAC;QAC5B,IAAI,CAAC,YAAY,GAAG,WAAW,CAAC;QAEhC,IAAI,CAAC,kBAAkB,EAAE,CAAC;IAC9B,CAAC;IAEM,iBAAiB,CAAC,qBAA6B;QAClD,IAAI,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,IAAI,qBAAqB,CAAC,CAAC,CAAC,KAAK,GAAG;YACjG,qBAAqB,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAE3D,MAAM,YAAY,GAAG,qBAAqB,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAEzD,MAAM,YAAY,GAAG,qBAAqB,CAAC,KAAK,CAC5C,CAAC,EACD,YAAY,GAAG,CAAC;YACZ,CAAC,CAAC,qBAAqB,CAAC,MAAM;YAC9B,CAAC,CAAC,YAAY,CACO,CAAC;QAE9B,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,UAAU,EAAE,YAAY,CAAC;YAC7C,MAAM,IAAI,gCAAgC,CACtC,kBAAkB,YAAY,2CAA2C,EACzE,IAAI,CAAC,UAAU,EACf,IAAI,CAAC,YAAY,EACjB,qBAAqB,CACxB,CAAC;QAEN,OAAO,YAAY,CAAC;IACxB,CAAC;IAEO,kBAAkB;QACtB,KAAK,MAAM,aAAa,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,EAAE,CAAC;YACvD,IAAI,aAAa,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,aAAa,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,aAAa,CAAC,QAAQ,CAAC,IAAI,CAAC;gBAC3F,MAAM,IAAI,KAAK,CAAC,kBAAkB,aAAa,sCAAsC,CAAC,CAAC;iBACtF,IAAI,aAAa,KAAK,EAAE;gBACzB,MAAM,IAAI,KAAK,CAAC,yCAAyC,CAAC,CAAC;QACnE,CAAC;IACL,CAAC;CACJ;AAED,SAAS,6BAA6B,CAClC,SAAoB,EAAE,WAAwB;IAE9C,MAAM,gBAAgB,GAAG,IAAI,oBAAoB,EAAE,CAAC;IAEpD,MAAM,oBAAoB,GAAmB,EAAE,CAAC;IAEhD,KAAK,MAAM,YAAY,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC;QAC7C,oBAAoB,CAAC,IAAI,CAAC,IAAI,gBAAgB,CAAC,YAAY,CAAC,CAAC,CAAC;IAElE,MAAM,WAAW,GAAG,IAAI,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAErD,MAAM,YAAY,GAAG,IAAI,WAAW,CAAC;QACjC,GAAG,CAAC,WAAW,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC5E,WAAW,CAAC,OAAO,CAAC,gBAAgB,CAAC;KACxC,CAAC,CAAC;IAEH,MAAM,WAAW,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC;IAE9C,OAAO,gBAAgB,CAAC,gBAAgB,CAAC,WAAW,GAAG,QAAQ,CAAC,CAAC;AACrE,CAAC"}


@@ -0,0 +1,16 @@
import { LlamaGrammar } from "../../LlamaGrammar.js";
import { ChatModelFunctions } from "../../../types.js";
import { ChatWrapper } from "../../../ChatWrapper.js";
import { Llama } from "../../../bindings/Llama.js";
import { GbnfJsonSchema } from "../../../utils/gbnfJson/types.js";
export declare class FunctionCallParamsGrammar<const Functions extends ChatModelFunctions> extends LlamaGrammar {
private readonly _functions;
private readonly _chatWrapper;
private readonly _functionName;
private readonly _paramsSchema;
constructor(llama: Llama, functions: Functions, chatWrapper: ChatWrapper, functionName: string, paramsSchema: GbnfJsonSchema);
parseParams(callText: string): {
params: any;
raw: string;
};
}


@@ -0,0 +1,45 @@
import { LlamaGrammar } from "../../LlamaGrammar.js";
import { LlamaText } from "../../../utils/LlamaText.js";
import { validateObjectAgainstGbnfSchema } from "../../../utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js";
import { GbnfGrammarGenerator } from "../../../utils/gbnfJson/GbnfGrammarGenerator.js";
import { getGbnfJsonTerminalForGbnfJsonSchema } from "../../../utils/gbnfJson/utils/getGbnfJsonTerminalForGbnfJsonSchema.js";
import { LlamaFunctionCallValidationError } from "./LlamaFunctionCallValidationError.js";
export class FunctionCallParamsGrammar extends LlamaGrammar {
_functions;
_chatWrapper;
_functionName;
_paramsSchema;
constructor(llama, functions, chatWrapper, functionName, paramsSchema) {
const grammar = getGbnfGrammarForFunctionParams(paramsSchema);
super(llama, {
grammar,
stopGenerationTriggers: [LlamaText("\n".repeat(4))],
trimWhitespaceSuffix: true
});
this._functions = functions;
this._chatWrapper = chatWrapper;
this._functionName = functionName;
this._paramsSchema = paramsSchema;
}
parseParams(callText) {
const endIndex = callText.lastIndexOf("\n".repeat(4));
if (endIndex < 0)
throw new LlamaFunctionCallValidationError(`Expected function call params for function "${this._functionName}" to end with stop generation trigger`, this._functions, this._chatWrapper, callText);
const paramsString = callText.slice(0, endIndex);
if (paramsString.trim().length === 0)
throw new LlamaFunctionCallValidationError(`Expected function call params for function "${this._functionName}" to not be empty`, this._functions, this._chatWrapper, callText);
const params = JSON.parse(paramsString);
validateObjectAgainstGbnfSchema(params, this._paramsSchema);
return {
params: params, // prevent infinite TS type instantiation
raw: paramsString
};
}
}
function getGbnfGrammarForFunctionParams(paramsSchema) {
const grammarGenerator = new GbnfGrammarGenerator();
const rootTerminal = getGbnfJsonTerminalForGbnfJsonSchema(paramsSchema, grammarGenerator);
const rootGrammar = rootTerminal.resolve(grammarGenerator, true);
return grammarGenerator.generateGbnfFile(rootGrammar + ` "${"\\n".repeat(4)}"`);
}
//# sourceMappingURL=FunctionCallParamsGrammar.js.map


@@ -0,0 +1 @@
{"version":3,"file":"FunctionCallParamsGrammar.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaChat/utils/FunctionCallParamsGrammar.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,YAAY,EAAC,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAC,SAAS,EAAC,MAAM,6BAA6B,CAAC;AACtD,OAAO,EAAC,+BAA+B,EAAC,MAAM,kEAAkE,CAAC;AAEjH,OAAO,EAAC,oBAAoB,EAAC,MAAM,iDAAiD,CAAC;AACrF,OAAO,EAAC,oCAAoC,EAAC,MAAM,uEAAuE,CAAC;AAI3H,OAAO,EAAC,gCAAgC,EAAC,MAAM,uCAAuC,CAAC;AAGvF,MAAM,OAAO,yBAAsE,SAAQ,YAAY;IAClF,UAAU,CAAY;IACtB,YAAY,CAAc;IAC1B,aAAa,CAAS;IACtB,aAAa,CAAiB;IAE/C,YAAmB,KAAY,EAAE,SAAoB,EAAE,WAAwB,EAAE,YAAoB,EAAE,YAA4B;QAC/H,MAAM,OAAO,GAAG,+BAA+B,CAAC,YAAY,CAAC,CAAC;QAE9D,KAAK,CAAC,KAAK,EAAE;YACT,OAAO;YACP,sBAAsB,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;YACnD,oBAAoB,EAAE,IAAI;SAC7B,CAAC,CAAC;QAEH,IAAI,CAAC,UAAU,GAAG,SAAS,CAAC;QAC5B,IAAI,CAAC,YAAY,GAAG,WAAW,CAAC;QAChC,IAAI,CAAC,aAAa,GAAG,YAAY,CAAC;QAClC,IAAI,CAAC,aAAa,GAAG,YAAY,CAAC;IACtC,CAAC;IAEM,WAAW,CAAC,QAAgB;QAC/B,MAAM,QAAQ,GAAG,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QAEtD,IAAI,QAAQ,GAAG,CAAC;YACZ,MAAM,IAAI,gCAAgC,CACtC,+CAA+C,IAAI,CAAC,aAAa,uCAAuC,EACxG,IAAI,CAAC,UAAU,EACf,IAAI,CAAC,YAAY,EACjB,QAAQ,CACX,CAAC;QAEN,MAAM,YAAY,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;QAEjD,IAAI,YAAY,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAChC,MAAM,IAAI,gCAAgC,CACtC,+CAA+C,IAAI,CAAC,aAAa,mBAAmB,EACpF,IAAI,CAAC,UAAU,EACf,IAAI,CAAC,YAAY,EACjB,QAAQ,CACX,CAAC;QAEN,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QAExC,+BAA+B,CAAC,MAAM,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC;QAE5D,OAAO;YACH,MAAM,EAAE,MAAa,EAAE,yCAAyC;YAChE,GAAG,EAAE,YAAY;SACpB,CAAC;IACN,CAAC;CACJ;AAED,SAAS,+BAA+B,CAAC,YAA4B;IACjE,MAAM,gBAAgB,GAAG,IAAI,oBAAoB,EAAE,CAAC;IACpD,MAAM,YAAY,GAAG,oCAAoC,CAAC,YAAY,EAAE,gBAAgB,CAAC,CAAC;IAC1F,MAAM,WAAW,GAAG,YAAY,CAAC,OAAO,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC;IAEjE,OAAO,gBAAgB,CAAC,gBAAgB,CAAC,WAAW,GAAG,KAAK,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;AACpF,CAAC"}


@@ -0,0 +1,8 @@
import { ChatModelFunctions } from "../../../types.js";
import { ChatWrapper } from "../../../ChatWrapper.js";
export declare class LlamaFunctionCallValidationError<const Functions extends ChatModelFunctions> extends Error {
readonly functions: Functions;
readonly chatWrapper: ChatWrapper;
readonly callText: string;
constructor(message: string, functions: Functions, chatWrapper: ChatWrapper, callText: string);
}


@@ -0,0 +1,12 @@
export class LlamaFunctionCallValidationError extends Error {
functions;
chatWrapper;
callText;
constructor(message, functions, chatWrapper, callText) {
super(message);
this.functions = functions;
this.chatWrapper = chatWrapper;
this.callText = callText;
}
}
//# sourceMappingURL=LlamaFunctionCallValidationError.js.map


@@ -0,0 +1 @@
{"version":3,"file":"LlamaFunctionCallValidationError.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaChat/utils/LlamaFunctionCallValidationError.ts"],"names":[],"mappings":"AAIA,MAAM,OAAO,gCAA6E,SAAQ,KAAK;IACnF,SAAS,CAAY;IACrB,WAAW,CAAc;IACzB,QAAQ,CAAS;IAEjC,YAAmB,OAAe,EAAE,SAAoB,EAAE,WAAwB,EAAE,QAAgB;QAChG,KAAK,CAAC,OAAO,CAAC,CAAC;QAEf,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;QAC3B,IAAI,CAAC,WAAW,GAAG,WAAW,CAAC;QAC/B,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC7B,CAAC;CACJ"}


@@ -0,0 +1,16 @@
import { ChatHistoryItem, Tokenizer } from "../../../../types.js";
import { ChatWrapper } from "../../../../ChatWrapper.js";
export declare function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy({ chatHistory, maxTokensCount, tokenizer, chatWrapper, lastShiftMetadata }: {
chatHistory: ChatHistoryItem[];
maxTokensCount: number;
tokenizer: Tokenizer;
chatWrapper: ChatWrapper;
lastShiftMetadata?: object | null;
}): Promise<{
chatHistory: ChatHistoryItem[];
metadata: CalculationMetadata;
}>;
type CalculationMetadata = {
removedCharactersNumber: number;
};
export {};


@@ -0,0 +1,254 @@
import { isChatModelResponseFunctionCall, isChatModelResponseSegment } from "../../../../types.js";
import { findCharacterRemovalCountToFitChatHistoryInContext } from "../../../../utils/findCharacterRemovalCountToFitChatHistoryInContext.js";
import { truncateLlamaTextAndRoundToWords, truncateTextAndRoundToWords } from "../../../../utils/truncateTextAndRoundToWords.js";
import { LlamaText } from "../../../../utils/LlamaText.js";
export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy({ chatHistory, maxTokensCount, tokenizer, chatWrapper, lastShiftMetadata }) {
let initialCharactersRemovalCount = 0;
if (isCalculationMetadata(lastShiftMetadata))
initialCharactersRemovalCount = lastShiftMetadata.removedCharactersNumber;
const { removedCharactersCount, compressedChatHistory } = await findCharacterRemovalCountToFitChatHistoryInContext({
chatHistory,
tokensCountToFit: maxTokensCount,
initialCharactersRemovalCount,
tokenizer,
chatWrapper,
failedCompressionErrorMessage: "Failed to compress chat history for context shift due to a too long prompt or system message that cannot be compressed without affecting the generation quality. " +
"Consider increasing the context size or shortening the long prompt or system message.",
compressChatHistory({ chatHistory, charactersToRemove, estimatedCharactersPerToken }) {
const res = chatHistory.map((item) => structuredClone(item));
let charactersLeftToRemove = charactersToRemove;
function compressFunctionCalls() {
for (let i = res.length - 1; i >= 0 && charactersLeftToRemove > 0; i--) {
const historyItem = res[i];
if (historyItem.type !== "model")
continue;
for (let t = historyItem.response.length - 1; t >= 0 && charactersLeftToRemove > 0; t--) {
const item = historyItem.response[t];
if (typeof item === "string" || item.type !== "functionCall")
continue;
if (item.rawCall == null)
continue;
const originalRawCallTokensLength = LlamaText.fromJSON(item.rawCall).tokenize(tokenizer, "trimLeadingSpace").length;
const newRawCallText = chatWrapper.generateFunctionCall(item.name, item.params);
const newRawCallTextTokensLength = newRawCallText.tokenize(tokenizer, "trimLeadingSpace").length;
if (newRawCallTextTokensLength < originalRawCallTokensLength) {
item.rawCall = newRawCallText.toJSON();
charactersLeftToRemove -= ((originalRawCallTokensLength - newRawCallTextTokensLength) * estimatedCharactersPerToken);
}
}
}
}
function removeHistoryThatLedToModelResponseAtIndex(index) {
let removedItems = 0;
for (let i = index - 1; i >= 0; i--) {
const historyItem = res[i];
if (historyItem == null)
continue;
if (historyItem.type === "model")
break; // stop removing history items if we reach another model response
if (i === 0 && historyItem.type === "system")
break; // keep the first system message
if (historyItem.type === "user" || historyItem.type === "system") {
const newText = truncateLlamaTextAndRoundToWords(LlamaText.fromJSON(historyItem.text), charactersLeftToRemove, undefined, false);
const newTextString = newText.toString();
const historyItemString = LlamaText.fromJSON(historyItem.text).toString();
if (newText.values.length === 0) {
res.splice(i, 1);
i++;
removedItems++;
charactersLeftToRemove -= historyItemString.length;
}
else if (newTextString.length < historyItemString.length) {
charactersLeftToRemove -= historyItemString.length - newTextString.length;
if (historyItem.type === "user")
historyItem.text = newText.toString();
else
historyItem.text = newText.toJSON();
}
}
else {
void historyItem;
}
}
return removedItems;
}
function compressHistoryThatLedToModelResponseAtIndex(index, keepTokensCount = 0) {
let removedItems = 0;
let promptStartIndex = undefined;
for (let i = index - 1; i >= 0; i--) {
const historyItem = res[i];
if (historyItem == null)
continue;
if (historyItem.type === "model") {
promptStartIndex = i + 1;
break;
}
if (i === 0 && historyItem.type === "system") {
promptStartIndex = i + 1;
break; // keep the first system message
}
}
if (promptStartIndex == null || promptStartIndex >= index)
return 0;
for (let i = promptStartIndex; i < index && charactersLeftToRemove > 0; i++) {
const historyItem = res[i];
if (historyItem == null || historyItem.type !== "user")
continue;
let removeChars = Math.min(charactersLeftToRemove, historyItem.text.length);
if (keepTokensCount > 0) {
removeChars -= Math.floor(keepTokensCount * estimatedCharactersPerToken);
if (removeChars < 0)
removeChars = 0;
keepTokensCount -= Math.min(keepTokensCount, Math.max(0, historyItem.text.length - removeChars) / estimatedCharactersPerToken);
}
const newText = truncateTextAndRoundToWords(historyItem.text, removeChars, undefined, false);
if (newText.length === 0) {
res.splice(i, 1);
i--;
index--;
removedItems++;
charactersLeftToRemove -= historyItem.text.length;
}
else {
charactersLeftToRemove -= historyItem.text.length - newText.length;
historyItem.text = newText;
}
}
return removedItems;
}
function removeEmptySegmentsFromModelResponse(modelResponse) {
const stack = [];
for (let t = 0; t < modelResponse.length && charactersLeftToRemove > 0; t++) {
const item = modelResponse[t];
const isLastItem = t === modelResponse.length - 1;
if (!isChatModelResponseSegment(item))
continue;
const type = item.segmentType;
const topStack = stack.at(-1);
if (topStack?.type === type) {
if (item.ended && item.text === "" && topStack.canRemove) {
modelResponse.splice(t, 1);
t--;
modelResponse.splice(topStack.startIndex, 1);
t--;
stack.pop();
}
else if (!item.ended && item.text === "" && !isLastItem) {
modelResponse.splice(t, 1);
t--;
}
else if (!item.ended && item.text !== "")
topStack.canRemove = false;
else if (item.ended)
stack.pop();
}
else if (!item.ended)
stack.push({
type,
startIndex: t,
canRemove: item.text === ""
});
}
}
function compressFirstModelResponse() {
for (let i = 0; i < res.length && charactersLeftToRemove > 0; i++) {
const historyItem = res[i];
const isLastHistoryItem = i === res.length - 1;
if (historyItem.type !== "model")
continue;
for (let t = 0; t < historyItem.response.length && charactersLeftToRemove > 0; t++) {
const item = historyItem.response[t];
const isLastText = t === historyItem.response.length - 1;
if (isLastHistoryItem && isLastText)
continue;
if (typeof item === "string") {
const newText = truncateTextAndRoundToWords(item, charactersLeftToRemove, undefined, true);
if (newText === "") {
historyItem.response.splice(t, 1);
t--;
charactersLeftToRemove -= item.length;
}
else if (newText.length < item.length) {
historyItem.response[t] = newText;
charactersLeftToRemove -= item.length - newText.length;
}
}
else if (isChatModelResponseFunctionCall(item)) {
historyItem.response.splice(t, 1);
t--;
const functionCallAndResultTokenUsage = chatWrapper.generateFunctionCallsAndResults([item], true)
.tokenize(tokenizer, "trimLeadingSpace").length;
charactersLeftToRemove -= functionCallAndResultTokenUsage * estimatedCharactersPerToken;
}
else if (isChatModelResponseSegment(item)) {
if (item.text !== "") {
const newText = truncateTextAndRoundToWords(item.text, charactersLeftToRemove, undefined, true);
if (newText === "" && item.ended) {
const emptySegmentTokenUsage = chatWrapper.generateModelResponseText([{ ...item, text: "" }], true)
.tokenize(tokenizer, "trimLeadingSpace").length;
historyItem.response.splice(t, 1);
t--;
charactersLeftToRemove -= item.text.length + emptySegmentTokenUsage * estimatedCharactersPerToken;
}
else {
charactersLeftToRemove -= item.text.length - newText.length;
item.text = newText;
}
}
}
else
void item;
}
removeEmptySegmentsFromModelResponse(historyItem.response);
if (historyItem.response.length === 0) {
// if the model response is removed from the history,
// the things that led to it are not important anymore
i -= removeHistoryThatLedToModelResponseAtIndex(i);
res.splice(i, 1);
i--;
}
}
}
function compressLastModelResponse(minCharactersToKeep = 60) {
const lastHistoryItem = res[res.length - 1];
if (lastHistoryItem == null || lastHistoryItem.type !== "model")
return;
const lastResponseItem = lastHistoryItem.response[lastHistoryItem.response.length - 1];
if (lastResponseItem == null || typeof lastResponseItem !== "string")
return;
compressHistoryThatLedToModelResponseAtIndex(res.length - 1, maxTokensCount / 4);
if (charactersLeftToRemove <= 0)
return;
const nextTextLength = Math.max(Math.min(lastResponseItem.length, minCharactersToKeep), lastResponseItem.length - charactersLeftToRemove);
const charactersToRemoveFromText = lastResponseItem.length - nextTextLength;
const newText = truncateTextAndRoundToWords(lastResponseItem, charactersToRemoveFromText, undefined, true);
if (newText.length < lastResponseItem.length) {
lastHistoryItem.response[lastHistoryItem.response.length - 1] = newText;
charactersLeftToRemove -= lastResponseItem.length - newText.length;
}
if (charactersLeftToRemove <= 0)
return;
compressHistoryThatLedToModelResponseAtIndex(res.length - 1);
}
compressFunctionCalls();
if (charactersLeftToRemove <= 0)
return res;
compressFirstModelResponse();
if (charactersLeftToRemove <= 0)
return res;
compressLastModelResponse();
return res;
}
});
const newMetadata = {
removedCharactersNumber: removedCharactersCount
};
return {
chatHistory: compressedChatHistory,
metadata: newMetadata
};
}
function isCalculationMetadata(metadata) {
return metadata != null && typeof metadata === "object" && typeof metadata.removedCharactersNumber === "number";
}
//# sourceMappingURL=eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js.map


@@ -0,0 +1,433 @@
import { EventRelay } from "lifecycle-utils";
import { ChatWrapper } from "../../ChatWrapper.js";
import { ChatHistoryItem, ChatModelFunctionCall, ChatSessionModelFunctions, Token } from "../../types.js";
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
import { LlamaGrammar } from "../LlamaGrammar.js";
import { LLamaChatContextShiftOptions, LlamaChatResponseChunk, LlamaChatResponseFunctionCallParamsChunk } from "../LlamaChat/LlamaChat.js";
import { EvaluationPriority } from "../LlamaContext/types.js";
import { TokenBias } from "../TokenBias.js";
import { LlamaText } from "../../utils/LlamaText.js";
import { LLamaChatPromptCompletionEngineOptions, LlamaChatSessionPromptCompletionEngine } from "./utils/LlamaChatSessionPromptCompletionEngine.js";
export type LlamaChatSessionOptions = {
contextSequence: LlamaContextSequence;
/** `"auto"` is used by default */
chatWrapper?: "auto" | ChatWrapper;
systemPrompt?: string;
/**
* Add the system prompt even on models that don't support a system prompt.
*
* Each chat wrapper has its own workaround for adding a system prompt to a model that doesn't support it,
* but forcing the system prompt on unsupported models may not always work as expected.
*
* Use with caution.
*/
forceAddSystemPrompt?: boolean;
/**
* Automatically dispose the sequence when the session is disposed.
*
* Defaults to `false`.
*/
autoDisposeSequence?: boolean;
contextShift?: LlamaChatSessionContextShiftOptions;
};
export type LlamaChatSessionContextShiftOptions = {
/**
* The number of tokens to delete from the context window to make space for new ones.
* Defaults to 10% of the context size.
*/
size?: LLamaChatContextShiftOptions["size"];
/**
* The strategy to use when deleting tokens from the context window.
*
* Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
*/
strategy?: LLamaChatContextShiftOptions["strategy"];
};
export type LLamaChatPromptOptions<Functions extends ChatSessionModelFunctions | undefined = ChatSessionModelFunctions | undefined> = {
/**
* Called as the model generates the main response with the generated text chunk.
*
* Useful for streaming the generated response as it's being generated.
*
* Includes only the main response without any text segments (like thoughts).
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
*/
onTextChunk?: (text: string) => void;
/**
* Called as the model generates the main response with the generated tokens.
*
* Preferably, you'd want to use {@link onTextChunk `onTextChunk`} instead of this.
*
* Includes only the main response without any segments (like thoughts).
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
*/
onToken?: (tokens: Token[]) => void;
/**
* Called as the model generates a response with the generated text and tokens,
* including segment information (when the generated output is part of a segment).
*
* Useful for streaming the generated response as it's being generated, including the main response and all segments.
*
* Only use this function when you need the segmented texts, like thought segments (chain of thought text).
*/
onResponseChunk?: (chunk: LlamaChatResponseChunk) => void;
/**
* An AbortSignal to later abort the generation.
*
* When the signal is aborted, the generation will stop and throw `signal.reason` as the error.
*
* > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`.
*/
signal?: AbortSignal;
/**
* When a response already started being generated and then the signal is aborted,
* the generation will stop and the response will be returned as is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: boolean;
/** Maximum number of tokens to generate */
maxTokens?: number;
/**
* Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
*
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
*
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
*
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
* Disabled by default (set to `0`).
*/
temperature?: number;
/**
* From the next token candidates, discard the percentage of tokens with the lowest probability.
* For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
* This is useful for generating more high-quality results when using a high temperature.
* Set to a value between `0` and `1` to enable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* Disabled by default.
*/
minP?: number;
/**
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
*/
topK?: number;
/**
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
*/
topP?: number;
/**
* Used to control the randomness of the generated text.
*
* Change the seed to get different results.
*
* Only relevant when using `temperature`.
*/
seed?: number;
/**
* Trim whitespace from the end of the generated text
* Disabled by default.
*/
trimWhitespaceSuffix?: boolean;
/**
* Force a given text prefix to be the start of the model response, to make the model follow a certain direction.
*
* May cause some models to not use the given functions in some scenarios where they would have been used otherwise,
* so avoid using it together with function calling if you notice unexpected behavior.
*/
responsePrefix?: string;
/**
* See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
*/
evaluationPriority?: EvaluationPriority;
repeatPenalty?: false | LlamaChatSessionRepeatPenalty;
/**
* Adjust the probability of tokens being generated.
* Can be used to bias the model to generate tokens that you want it to lean towards,
* or to avoid generating tokens that you want it to avoid.
*/
tokenBias?: TokenBias | (() => TokenBias);
/**
* Custom stop triggers to stop the generation of the response when any of the provided triggers are found.
*/
customStopTriggers?: (LlamaText | string | (string | Token)[])[];
/**
* Called as the model generates function calls with the generated parameters chunk for each function call.
*
* Useful for streaming the generated function call parameters as they're being generated.
* Only useful in specific use cases,
* such as showing the generated textual file content as it's being generated (note that doing this requires parsing incomplete JSON).
*
* The constructed text from all the params chunks of a given function call can be parsed as a JSON object,
* according to the function parameters schema.
*
* Each function call has its own `callIndex` you can use to distinguish between them.
*
* Only relevant when using function calling (via passing the `functions` option).
*/
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
/**
* Set the maximum number of tokens that the model is allowed to spend on various segmented responses.
*/
budgets?: {
/**
* Budget for thought tokens.
*
* Defaults to `Infinity`.
*/
thoughtTokens?: number;
/**
* Budget for comment tokens.
*
* Defaults to `Infinity`.
*/
commentTokens?: number;
};
} & ({
grammar?: LlamaGrammar;
functions?: never;
documentFunctionParams?: never;
maxParallelFunctionCalls?: never;
onFunctionCallParamsChunk?: never;
} | {
grammar?: never;
functions?: Functions | ChatSessionModelFunctions;
documentFunctionParams?: boolean;
maxParallelFunctionCalls?: number;
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
});
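These per-prompt options are what `LlamaChatSession.prompt()` (declared further down) accepts. A hedged sketch of a session prompt that streams the main response and caps thought tokens, assuming a `contextSequence` obtained the same way as in the `LlamaChat` example earlier:

```ts
import {LlamaChatSession} from "node-llama-cpp";

const session = new LlamaChatSession({
    contextSequence, // a LlamaContextSequence from context.getSequence()
    systemPrompt: "You are a helpful assistant."
});

const answer = await session.prompt("Summarize the plot of Hamlet in two sentences.", {
    temperature: 0.7,
    maxTokens: 300,
    budgets: {
        thoughtTokens: 128 // cap chain-of-thought tokens on models that emit thought segments
    },
    onTextChunk(text) {
        process.stdout.write(text); // main response only, without thought segments
    }
});

console.log("\nanswer:", answer);
```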
export type LLamaChatCompletePromptOptions = {
/**
* Generate a completion for the given user prompt up to the given number of tokens.
*
* Defaults to `256` or half the context size, whichever is smaller.
*/
maxTokens?: LLamaChatPromptOptions["maxTokens"];
/**
* When a completion already started being generated and then the given `signal` is aborted,
* the generation will stop and the completion will be returned as-is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: LLamaChatPromptOptions["stopOnAbortSignal"];
/**
* Called as the model generates a completion with the generated text chunk.
*
* Useful for streaming the generated completion as it's being generated.
*/
onTextChunk?: LLamaChatPromptOptions["onTextChunk"];
/**
* Called as the model generates a completion with the generated tokens.
*
* Preferably, you'd want to use `onTextChunk` instead of this.
*/
onToken?: LLamaChatPromptOptions["onToken"];
signal?: LLamaChatPromptOptions["signal"];
temperature?: LLamaChatPromptOptions["temperature"];
minP?: LLamaChatPromptOptions["minP"];
topK?: LLamaChatPromptOptions["topK"];
topP?: LLamaChatPromptOptions["topP"];
seed?: LLamaChatPromptOptions["seed"];
trimWhitespaceSuffix?: LLamaChatPromptOptions["trimWhitespaceSuffix"];
evaluationPriority?: LLamaChatPromptOptions["evaluationPriority"];
repeatPenalty?: LLamaChatPromptOptions["repeatPenalty"];
tokenBias?: LLamaChatPromptOptions["tokenBias"];
customStopTriggers?: LLamaChatPromptOptions["customStopTriggers"];
grammar?: LlamaGrammar;
/**
* Functions are not used by the model here,
* but are used for keeping the instructions given to the model about the functions in the current context state,
* to avoid context shifts.
*
* It's best to provide the same functions that were used for the previous prompt here.
*/
functions?: ChatSessionModelFunctions;
/**
* Functions are not used by the model here,
* but are used for keeping the instructions given to the model about the functions in the current context state,
* to avoid context shifts.
*
* It's best to provide the same value that was used for the previous prompt here.
*/
documentFunctionParams?: boolean;
/**
* Whether to complete the prompt as a model response.
*
* - **`"auto"`**: Automatically determine whether to complete as a model response based on the model used.
* This is a good option to work around some models that don't support user prompt completions.
* - **`true`**: Always complete as a model response
* - **`false`**: Never complete as a model response
*
* Defaults to `"auto"`.
*/
completeAsModel?: "auto" | boolean | {
/**
* Whether to complete the prompt as a model response.
*
* - **`"auto"`**: Automatically determine whether to complete as a model response based on the model used.
* This is a good option to work around some models that don't support user prompt completions.
* - **`true`**: Always complete as a model response
* - **`false`**: Never complete as a model response
*
* Defaults to `"auto"`.
*/
enabled?: "auto" | boolean;
/**
* The messages to append to the chat history to generate a completion as a model response.
*
* If the last message is a model message, the prompt will be pushed to it for the completion,
* otherwise a new model message will be added with the prompt.
*
* It must contain a user message or a system message before the model message.
*
* Defaults to:
* ```ts
* [
* {
* type: "system",
* text: "For your next response predict what the user may send next. " +
* "No yapping, no whitespace. Match the user's language and tone."
* },
* {type: "user", text: ""},
* {type: "model", response: [""]}
* ]
* ```
*/
appendedMessages?: ChatHistoryItem[];
};
};
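These options pair with `LlamaChatSession.completePrompt()` (declared below), which predicts how the user might continue their current message rather than responding to it. A hedged sketch, assuming an existing `session: LlamaChatSession`:

```ts
// Suggest a continuation of what the user has typed so far
const completion = await session.completePrompt("How do I cook a ", {
    maxTokens: 24,
    temperature: 0.6,
    completeAsModel: "auto" // fall back to completing as a model response on models that need it
});

console.log("suggested continuation:", completion);
```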
export type LLamaChatPreloadPromptOptions = {
signal?: LLamaChatCompletePromptOptions["signal"];
evaluationPriority?: LLamaChatCompletePromptOptions["evaluationPriority"];
functions?: LLamaChatCompletePromptOptions["functions"];
documentFunctionParams?: LLamaChatCompletePromptOptions["documentFunctionParams"];
};
export type LlamaChatSessionRepeatPenalty = {
/**
* The number of most recent tokens generated by the model to which repetition penalties are applied.
* Defaults to `64`.
*/
lastTokens?: number;
punishTokensFilter?: (tokens: Token[]) => Token[];
/**
* Penalize new line tokens.
* Enabled by default.
*/
penalizeNewLine?: boolean;
/**
* The relative amount to lower the probability of the tokens in `punishTokens` by
* Defaults to `1.1`.
* Set to `1` to disable.
*/
penalty?: number;
/**
* If a token appears n times in the `punishTokens` array, lower its probability by `n * frequencyPenalty`.
* Disabled by default (`0`).
* Set to a value between `0` and `1` to enable.
*/
frequencyPenalty?: number;
/**
* Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`
* Disabled by default (`0`).
* Set to a value between `0` and `1` to enable.
*/
presencePenalty?: number;
};
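A hedged sketch of plugging these repeat-penalty knobs into a session prompt; the values are illustrative and `session` is assumed to be an existing `LlamaChatSession`:

```ts
const poem = await session.prompt("Write a short poem about the sea.", {
    repeatPenalty: {
        lastTokens: 64,        // window of recent tokens to penalize
        penalty: 1.15,         // relative probability reduction for repeated tokens
        frequencyPenalty: 0.1, // scales with how many times a token already appeared
        presencePenalty: 0.1,  // flat reduction for any token that already appeared
        penalizeNewLine: false // keep newline tokens unpenalized (useful for verse)
    }
});

console.log(poem);
```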
/**
* @see [Using `LlamaChatSession`](https://node-llama-cpp.withcat.ai/guide/chat-session) tutorial
*/
export declare class LlamaChatSession {
readonly onDispose: EventRelay<void>;
constructor(options: LlamaChatSessionOptions);
dispose({ disposeSequence }?: {
disposeSequence?: boolean;
}): void;
/** @hidden */
[Symbol.dispose](): void;
get disposed(): boolean;
get chatWrapper(): ChatWrapper;
get sequence(): LlamaContextSequence;
get context(): import("../LlamaContext/LlamaContext.js").LlamaContext;
get model(): import("../LlamaModel/LlamaModel.js").LlamaModel;
prompt<const Functions extends ChatSessionModelFunctions | undefined = undefined>(prompt: string, options?: LLamaChatPromptOptions<Functions>): Promise<string>;
/**
* @param prompt
* @param [options]
*/
promptWithMeta<const Functions extends ChatSessionModelFunctions | undefined = undefined>(prompt: string, { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority }?: LLamaChatPromptOptions<Functions>): Promise<{
response: (string | ChatModelFunctionCall | import("../../types.js").ChatModelSegment)[];
responseText: string;
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
remainingGenerationAfterStop: string | Token[] | undefined;
} | {
response: (string | ChatModelFunctionCall | import("../../types.js").ChatModelSegment)[];
responseText: string;
stopReason: "abort" | "maxTokens" | "eogToken" | "stopGenerationTrigger" | "functionCalls";
remainingGenerationAfterStop: string | Token[] | undefined;
customStopTrigger?: undefined;
}>;
/**
* Preload a user prompt into the current context sequence state to make later inference of the model response begin sooner
* and feel faster.
*
* > **Note:** Preloading a long user prompt can incur context shifts, so consider limiting the length of prompts you preload
* @param prompt - the prompt to preload
* @param [options]
*/
preloadPrompt(prompt: string, options?: LLamaChatPreloadPromptOptions): Promise<void>;
/**
* Preload a user prompt into the current context sequence state and generate a completion for it.
*
* > **Note:** Preloading a long user prompt and completing a user prompt with a high number of `maxTokens` can incur context shifts,
* > so consider limiting the length of prompts you preload.
* >
* > Also, it's recommended to limit the number of tokens generated to a reasonable amount by configuring `maxTokens`.
* @param prompt - the prompt to preload
* @param [options]
*/
completePrompt(prompt: string, options?: LLamaChatCompletePromptOptions): Promise<string>;
/**
* Create a smart completion engine that caches the prompt completions
* and reuses them when the user prompt matches the beginning of the cached prompt or completion.
*
* All completions are made and the cache is used only for the current chat session state.
* You can create a single completion engine for an entire chat session.
*/
createPromptCompletionEngine(options?: LLamaChatPromptCompletionEngineOptions): LlamaChatSessionPromptCompletionEngine;
/**
* See `completePrompt` for more information.
* @param prompt
* @param [options]
*/
completePromptWithMeta(prompt: string, { maxTokens, stopOnAbortSignal, functions, documentFunctionParams, onTextChunk, onToken, signal, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority, completeAsModel }?: LLamaChatCompletePromptOptions): Promise<{
completion: string;
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
remainingGenerationAfterStop: string | Token[] | undefined;
} | {
completion: string;
stopReason: "abort" | "maxTokens" | "eogToken" | "stopGenerationTrigger" | "functionCalls";
remainingGenerationAfterStop: string | Token[] | undefined;
customStopTrigger?: undefined;
}>;
getChatHistory(): ChatHistoryItem[];
getLastEvaluationContextWindow(): ChatHistoryItem[] | null;
setChatHistory(chatHistory: ChatHistoryItem[]): void;
/** Clear the chat history and reset it to the initial state. */
resetChatHistory(): void;
}
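
A minimal usage sketch of the session API declared above, assuming the package is consumed as `node-llama-cpp` and that a local GGUF model is available (the model path below is a placeholder):

import {getLlama, LlamaChatSession} from "node-llama-cpp";

// load a model and open a chat session on a fresh context sequence
const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // placeholder path
const context = await model.createContext();
const session = new LlamaChatSession({contextSequence: context.getSequence()});

// `prompt` resolves to the response text
const answer = await session.prompt("Summarize what a context shift is in one sentence.");
console.log(answer);

// `promptWithMeta` also exposes the structured response and the stop reason
const {responseText, stopReason} = await session.promptWithMeta("And in two sentences?", {maxTokens: 128});
console.log(stopReason, responseText);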

View File

@@ -0,0 +1,622 @@
import { DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { appendUserMessageToChatHistory } from "../../utils/appendUserMessageToChatHistory.js";
import { LlamaChat } from "../LlamaChat/LlamaChat.js";
import { wrapAbortSignal } from "../../utils/wrapAbortSignal.js";
import { safeEventCallback } from "../../utils/safeEventCallback.js";
import { GgufArchitectureType } from "../../gguf/types/GgufMetadataTypes.js";
import { LlamaChatSessionPromptCompletionEngine } from "./utils/LlamaChatSessionPromptCompletionEngine.js";
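// Default `completeAsModel` behavior: append a hidden exchange that asks the model to predict
// what the user may send next, and then complete that prediction as the model's response.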
const defaultCompleteAsModel = {
enabled: "auto",
appendedMessages: [
{
type: "system",
text: "For your next response predict what the user may send next. No yapping, no whitespace. Match the user's language and tone."
},
{ type: "user", text: "" },
{ type: "model", response: [""] }
]
};
/**
* @see [Using `LlamaChatSession`](https://node-llama-cpp.withcat.ai/guide/chat-session) tutorial
*/
export class LlamaChatSession {
/** @internal */ _disposeAggregator = new DisposeAggregator();
/** @internal */ _autoDisposeSequence;
/** @internal */ _contextShift;
/** @internal */ _forceAddSystemPrompt;
/** @internal */ _systemPrompt;
/** @internal */ _chatLock = {};
/** @internal */ _chatHistory;
/** @internal */ _lastEvaluation;
/** @internal */ _canUseContextWindowForCompletion = true;
/** @internal */ _chat;
/** @internal */ _chatHistoryStateRef = {};
/** @internal */ _preloadAndCompleteAbortControllers = new Set();
onDispose = new EventRelay();
constructor(options) {
const { contextSequence, chatWrapper = "auto", systemPrompt, forceAddSystemPrompt = false, autoDisposeSequence = false, contextShift } = options;
if (contextSequence == null)
throw new Error("contextSequence cannot be null");
if (contextSequence.disposed)
throw new DisposedError();
this._contextShift = contextShift;
this._forceAddSystemPrompt = forceAddSystemPrompt;
this._systemPrompt = systemPrompt;
this._chat = new LlamaChat({
autoDisposeSequence,
chatWrapper,
contextSequence
});
const chatWrapperSupportsSystemMessages = this._chat.chatWrapper.settings.supportsSystemMessages;
if (chatWrapperSupportsSystemMessages == null || chatWrapperSupportsSystemMessages || this._forceAddSystemPrompt)
this._chatHistory = this._chat.chatWrapper.generateInitialChatHistory({ systemPrompt: this._systemPrompt });
else
this._chatHistory = [];
this._autoDisposeSequence = autoDisposeSequence;
this._disposeAggregator.add(this._chat.onDispose.createListener(() => {
this.dispose();
}));
this._disposeAggregator.add(this.onDispose.dispatchEvent);
}
dispose({ disposeSequence = this._autoDisposeSequence } = {}) {
if (this._chat == null)
return;
this._chat.dispose({ disposeSequence });
this._chat = null;
this._disposeAggregator.dispose();
}
/** @hidden */
[Symbol.dispose]() {
return this.dispose();
}
get disposed() {
return this._chat == null || this._chat.disposed;
}
get chatWrapper() {
if (this._chat == null)
throw new DisposedError();
return this._chat.chatWrapper;
}
get sequence() {
if (this._chat == null)
throw new DisposedError();
return this._chat.sequence;
}
get context() {
return this.sequence.context;
}
get model() {
return this.sequence.model;
}
async prompt(prompt, options = {}) {
const { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = false, responsePrefix, repeatPenalty, tokenBias, customStopTriggers } = options;
const { responseText } = await this.promptWithMeta(prompt, {
// this is a workaround to allow passing both `functions` and `grammar`
functions: functions,
grammar: grammar,
documentFunctionParams: documentFunctionParams,
maxParallelFunctionCalls: maxParallelFunctionCalls,
onFunctionCallParamsChunk: onFunctionCallParamsChunk,
onTextChunk, onToken, onResponseChunk, budgets, signal, stopOnAbortSignal, maxTokens,
temperature, minP, topK, topP, seed,
trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers
});
return responseText;
}
/**
* @param prompt
* @param [options]
*/
async promptWithMeta(prompt, { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = false, responsePrefix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority } = {}) {
this._ensureNotDisposed();
if (grammar != null && grammar._llama !== this.model._llama)
throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
this._stopAllPreloadAndPromptCompletions();
return await withLock([this._chatLock, "evaluation"], signal, async () => {
this._ensureNotDisposed();
this._stopAllPreloadAndPromptCompletions();
if (this._chat == null)
throw new DisposedError();
const supportsParallelFunctionCalling = this._chat.chatWrapper.settings.functions.parallelism != null;
const [abortController, disposeAbortController] = wrapAbortSignal(signal);
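            // reuse the last evaluation's context window only while it is still in sync with the current chat history state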
let lastEvaluation = this._canUseContextWindowForCompletion
? this._lastEvaluation
: undefined;
let newChatHistory = appendUserMessageToChatHistory(this._chatHistory, prompt);
let newContextWindowChatHistory = lastEvaluation?.contextWindow == null
? undefined
: appendUserMessageToChatHistory(lastEvaluation?.contextWindow, prompt);
let previousFunctionCalls = 0;
const resolvedResponsePrefix = (responsePrefix != null && responsePrefix !== "")
? responsePrefix
: undefined;
newChatHistory.push({
type: "model",
response: resolvedResponsePrefix != null
? [resolvedResponsePrefix]
: []
});
if (newContextWindowChatHistory != null)
newContextWindowChatHistory.push({
type: "model",
response: resolvedResponsePrefix != null
? [resolvedResponsePrefix]
: []
});
if (resolvedResponsePrefix != null) {
safeEventCallback(onToken)?.(this.model.tokenize(resolvedResponsePrefix));
safeEventCallback(onTextChunk)?.(resolvedResponsePrefix);
safeEventCallback(onResponseChunk)?.({
type: undefined,
segmentType: undefined,
text: resolvedResponsePrefix,
tokens: this.model.tokenize(resolvedResponsePrefix)
});
}
try {
while (true) {
const functionCallsAndResults = [];
let canThrowFunctionCallingErrors = false;
let abortedOnFunctionCallError = false;
const initialOutputTokens = this._chat.sequence.tokenMeter.usedOutputTokens;
const { lastEvaluation: currentLastEvaluation, metadata } = await this._chat.generateResponse(newChatHistory, {
functions,
documentFunctionParams,
maxParallelFunctionCalls,
grammar: grammar, // this is a workaround to allow passing both `functions` and `grammar`
onTextChunk: safeEventCallback(onTextChunk),
onToken: safeEventCallback(onToken),
onResponseChunk: safeEventCallback(onResponseChunk),
onFunctionCallParamsChunk: onFunctionCallParamsChunk == null
? undefined
: safeEventCallback((chunk) => onFunctionCallParamsChunk?.({
callIndex: previousFunctionCalls + chunk.callIndex,
functionName: chunk.functionName,
paramsChunk: chunk.paramsChunk,
done: chunk.done
})),
budgets: {
includeCurrentResponse: true,
thoughtTokens: budgets?.thoughtTokens,
commentTokens: budgets?.commentTokens
},
signal: abortController.signal,
stopOnAbortSignal,
repeatPenalty,
minP,
topK,
topP,
seed,
tokenBias,
customStopTriggers,
maxTokens,
temperature,
trimWhitespaceSuffix,
contextShift: {
...this._contextShift,
lastEvaluationMetadata: lastEvaluation?.contextShiftMetadata
},
evaluationPriority,
lastEvaluationContextWindow: {
history: newContextWindowChatHistory,
minimumOverlapPercentageToPreventContextShift: 0.5
},
onFunctionCall: async (functionCall) => {
functionCallsAndResults.push((async () => {
try {
const functionDefinition = functions?.[functionCall.functionName];
if (functionDefinition == null)
throw new Error(`The model tried to call function "${functionCall.functionName}" which is not defined`);
const functionCallResult = await functionDefinition.handler(functionCall.params);
return {
functionCall,
functionDefinition,
functionCallResult
};
}
catch (err) {
if (!abortController.signal.aborted) {
abortedOnFunctionCallError = true;
abortController.abort(err);
}
if (canThrowFunctionCallingErrors)
throw err;
return null;
}
})());
}
});
this._ensureNotDisposed();
if (abortController.signal.aborted && (abortedOnFunctionCallError || !stopOnAbortSignal))
throw abortController.signal.reason;
if (maxTokens != null)
maxTokens = Math.max(0, maxTokens - (this._chat.sequence.tokenMeter.usedOutputTokens - initialOutputTokens));
lastEvaluation = currentLastEvaluation;
newChatHistory = lastEvaluation.cleanHistory;
if (functionCallsAndResults.length > 0) {
canThrowFunctionCallingErrors = true;
const functionCallResultsPromise = Promise.all(functionCallsAndResults);
const raceEventAbortController = new AbortController();
await Promise.race([
functionCallResultsPromise,
new Promise((accept, reject) => {
abortController.signal.addEventListener("abort", () => {
if (abortedOnFunctionCallError || !stopOnAbortSignal)
reject(abortController.signal.reason);
else
accept();
}, { signal: raceEventAbortController.signal });
if (abortController.signal.aborted) {
if (abortedOnFunctionCallError || !stopOnAbortSignal)
reject(abortController.signal.reason);
else
accept();
}
})
]);
raceEventAbortController.abort();
this._ensureNotDisposed();
if (!abortController.signal.aborted) {
const functionCallResults = (await functionCallResultsPromise)
.filter((result) => result != null);
this._ensureNotDisposed();
if (abortController.signal.aborted && (abortedOnFunctionCallError || !stopOnAbortSignal))
throw abortController.signal.reason;
newContextWindowChatHistory = lastEvaluation.contextWindow;
let startNewChunk = supportsParallelFunctionCalling;
for (const { functionCall, functionDefinition, functionCallResult } of functionCallResults) {
newChatHistory = addFunctionCallToChatHistory({
chatHistory: newChatHistory,
functionName: functionCall.functionName,
functionDescription: functionDefinition.description,
callParams: functionCall.params,
callResult: functionCallResult,
rawCall: functionCall.raw,
startsNewChunk: startNewChunk
});
newContextWindowChatHistory = addFunctionCallToChatHistory({
chatHistory: newContextWindowChatHistory,
functionName: functionCall.functionName,
functionDescription: functionDefinition.description,
callParams: functionCall.params,
callResult: functionCallResult,
rawCall: functionCall.raw,
startsNewChunk: startNewChunk
});
startNewChunk = false;
previousFunctionCalls++;
}
lastEvaluation.cleanHistory = newChatHistory;
lastEvaluation.contextWindow = newContextWindowChatHistory;
if (abortController.signal.aborted && !abortedOnFunctionCallError && stopOnAbortSignal) {
metadata.stopReason = "abort";
metadata.remainingGenerationAfterStop = undefined;
}
else
continue;
}
}
this._lastEvaluation = lastEvaluation;
this._canUseContextWindowForCompletion = true;
this._chatHistory = newChatHistory;
this._chatHistoryStateRef = {};
const lastModelResponseItem = getLastModelResponseItem(newChatHistory);
const responseText = lastModelResponseItem.response
.filter((item) => typeof item === "string")
.join("");
if (metadata.stopReason === "customStopTrigger")
return {
response: lastModelResponseItem.response,
responseText,
stopReason: metadata.stopReason,
customStopTrigger: metadata.customStopTrigger,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
return {
response: lastModelResponseItem.response,
responseText,
stopReason: metadata.stopReason,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
}
}
finally {
disposeAbortController();
}
});
}
/**
* Preload a user prompt into the current context sequence state to make later inference of the model response begin sooner
* and feel faster.
*
     * > **Note:** Preloading a long user prompt can incur context shifts, so consider limiting the length of prompts you preload.
* @param prompt - the prompt to preload
* @param [options]
*/
async preloadPrompt(prompt, options = {}) {
await this.completePromptWithMeta(prompt, {
...options,
completeAsModel: false,
maxTokens: 0
});
}
/**
* Preload a user prompt into the current context sequence state and generate a completion for it.
*
* > **Note:** Preloading a long user prompt and completing a user prompt with a high number of `maxTokens` can incur context shifts,
* > so consider limiting the length of prompts you preload.
* >
* > Also, it's recommended to limit the number of tokens generated to a reasonable amount by configuring `maxTokens`.
* @param prompt - the prompt to preload
* @param [options]
*/
async completePrompt(prompt, options = {}) {
const { completion } = await this.completePromptWithMeta(prompt, options);
return completion;
}
/**
* Create a smart completion engine that caches the prompt completions
* and reuses them when the user prompt matches the beginning of the cached prompt or completion.
*
     * Completions are generated for, and cached against, the current chat session state only.
* You can create a single completion engine for an entire chat session.
*/
createPromptCompletionEngine(options) {
return LlamaChatSessionPromptCompletionEngine._create(this, options);
}
/**
* See `completePrompt` for more information.
* @param prompt
* @param [options]
*/
async completePromptWithMeta(prompt, { maxTokens, stopOnAbortSignal = false, functions, documentFunctionParams, onTextChunk, onToken, signal, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = false, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority, completeAsModel } = {}) {
this._ensureNotDisposed();
if (grammar != null) {
if (grammar._llama == null)
throw new Error("The grammar passed to this function is not a LlamaGrammar instance.");
else if (grammar._llama !== this.model._llama)
throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
}
const [abortController, disposeAbortController] = wrapAbortSignal(signal);
this._preloadAndCompleteAbortControllers.add(abortController);
const completeAsModelEnabled = typeof completeAsModel == "boolean"
? completeAsModel
: completeAsModel === "auto"
? "auto"
: completeAsModel?.enabled ?? defaultCompleteAsModel.enabled;
const modelArchitecture = this.model.fileInfo.metadata?.general?.architecture;
const shouldCompleteAsModel = completeAsModelEnabled === "auto"
? modelArchitecture === GgufArchitectureType.gptOss
: completeAsModelEnabled;
try {
return await withLock([this._chatLock, "evaluation"], abortController.signal, async () => {
this._ensureNotDisposed();
if (this._chat == null)
throw new DisposedError();
if (shouldCompleteAsModel) {
const messagesToAppendOption = (typeof completeAsModel == "boolean" || completeAsModel === "auto")
? defaultCompleteAsModel.appendedMessages
: completeAsModel?.appendedMessages ?? defaultCompleteAsModel.appendedMessages;
const messagesToAppend = messagesToAppendOption.length === 0
? defaultCompleteAsModel.appendedMessages
: messagesToAppendOption;
const addMessageToChatHistory = (chatHistory) => {
const newHistory = chatHistory.slice();
if (messagesToAppend.at(0)?.type === "model")
newHistory.push({ type: "user", text: "" });
for (let i = 0; i < messagesToAppend.length; i++) {
const item = messagesToAppend[i];
const isLastItem = i === messagesToAppend.length - 1;
if (item == null)
continue;
if (isLastItem && item.type === "model") {
const newResponse = item.response.slice();
if (typeof newResponse.at(-1) === "string")
newResponse.push(newResponse.pop() + prompt);
else
newResponse.push(prompt);
newHistory.push({
type: "model",
response: newResponse
});
}
else
newHistory.push(item);
}
if (messagesToAppend.at(-1)?.type !== "model")
newHistory.push({ type: "model", response: [prompt] });
return {
history: newHistory,
addedCount: newHistory.length - chatHistory.length
};
};
const { history: messagesWithPrompt, addedCount } = addMessageToChatHistory(this._chatHistory);
const { response, lastEvaluation, metadata } = await this._chat.generateResponse(messagesWithPrompt, {
abortOnNonText: true,
functions,
documentFunctionParams,
grammar: grammar, // this is allowed only because `abortOnNonText` is enabled
onTextChunk,
onToken,
signal: abortController.signal,
stopOnAbortSignal: true,
repeatPenalty,
minP,
topK,
topP,
seed,
tokenBias,
customStopTriggers,
maxTokens: maxTokens == null
? undefined
                        : Math.max(1, maxTokens), // ensure at least 1 token is allowed, since regular prompting ignores `maxTokens: 0`
temperature,
trimWhitespaceSuffix,
contextShift: {
...this._contextShift,
lastEvaluationMetadata: this._lastEvaluation?.contextShiftMetadata
},
evaluationPriority,
lastEvaluationContextWindow: {
history: this._lastEvaluation?.contextWindow == null
? undefined
: addMessageToChatHistory(this._lastEvaluation?.contextWindow).history,
minimumOverlapPercentageToPreventContextShift: 0.8
}
});
this._ensureNotDisposed();
this._lastEvaluation = {
cleanHistory: this._chatHistory,
contextWindow: lastEvaluation.contextWindow.slice(0, -addedCount),
contextShiftMetadata: lastEvaluation.contextShiftMetadata
};
this._canUseContextWindowForCompletion = this._chatHistory.at(-1)?.type === "user";
if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted)
throw abortController.signal.reason;
if (metadata.stopReason === "customStopTrigger")
return {
completion: response,
stopReason: metadata.stopReason,
customStopTrigger: metadata.customStopTrigger,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
return {
completion: response,
stopReason: metadata.stopReason,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
}
else {
const { completion, lastEvaluation, metadata } = await this._chat.loadChatAndCompleteUserMessage(asWithLastUserMessageRemoved(this._chatHistory), {
initialUserPrompt: prompt,
functions,
documentFunctionParams,
grammar,
onTextChunk,
onToken,
signal: abortController.signal,
stopOnAbortSignal: true,
repeatPenalty,
minP,
topK,
topP,
seed,
tokenBias,
customStopTriggers,
maxTokens,
temperature,
trimWhitespaceSuffix,
contextShift: {
...this._contextShift,
lastEvaluationMetadata: this._lastEvaluation?.contextShiftMetadata
},
evaluationPriority,
lastEvaluationContextWindow: {
history: asWithLastUserMessageRemoved(this._lastEvaluation?.contextWindow),
minimumOverlapPercentageToPreventContextShift: 0.8
}
});
this._ensureNotDisposed();
this._lastEvaluation = {
cleanHistory: this._chatHistory,
contextWindow: asWithLastUserMessageRemoved(lastEvaluation.contextWindow),
contextShiftMetadata: lastEvaluation.contextShiftMetadata
};
this._canUseContextWindowForCompletion = this._chatHistory.at(-1)?.type === "user";
if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted)
throw abortController.signal.reason;
if (metadata.stopReason === "customStopTrigger")
return {
completion: completion,
stopReason: metadata.stopReason,
customStopTrigger: metadata.customStopTrigger,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
return {
completion: completion,
stopReason: metadata.stopReason,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
}
});
}
finally {
this._preloadAndCompleteAbortControllers.delete(abortController);
disposeAbortController();
}
}
getChatHistory() {
return structuredClone(this._chatHistory);
}
getLastEvaluationContextWindow() {
if (this._lastEvaluation == null)
return null;
return structuredClone(this._lastEvaluation?.contextWindow);
}
setChatHistory(chatHistory) {
this._chatHistory = structuredClone(chatHistory);
this._chatHistoryStateRef = {};
this._lastEvaluation = undefined;
this._canUseContextWindowForCompletion = false;
}
/** Clear the chat history and reset it to the initial state. */
resetChatHistory() {
if (this._chat == null || this.disposed)
throw new DisposedError();
const chatWrapperSupportsSystemMessages = this._chat.chatWrapper.settings.supportsSystemMessages;
if (chatWrapperSupportsSystemMessages == null || chatWrapperSupportsSystemMessages || this._forceAddSystemPrompt)
this.setChatHistory(this._chat.chatWrapper.generateInitialChatHistory({ systemPrompt: this._systemPrompt }));
else
this.setChatHistory([]);
}
/** @internal */
_stopAllPreloadAndPromptCompletions() {
for (const abortController of this._preloadAndCompleteAbortControllers)
abortController.abort();
this._preloadAndCompleteAbortControllers.clear();
}
/** @internal */
_ensureNotDisposed() {
if (this.disposed)
throw new DisposedError();
}
}
function addFunctionCallToChatHistory({ chatHistory, functionName, functionDescription, callParams, callResult, rawCall, startsNewChunk }) {
const newChatHistory = chatHistory.slice();
if (newChatHistory.length === 0 || newChatHistory[newChatHistory.length - 1].type !== "model")
newChatHistory.push({
type: "model",
response: []
});
const lastModelResponseItem = newChatHistory[newChatHistory.length - 1];
const newLastModelResponseItem = { ...lastModelResponseItem };
newChatHistory[newChatHistory.length - 1] = newLastModelResponseItem;
const modelResponse = newLastModelResponseItem.response.slice();
newLastModelResponseItem.response = modelResponse;
const functionCall = {
type: "functionCall",
name: functionName,
description: functionDescription,
params: callParams,
result: callResult,
rawCall
};
if (startsNewChunk)
functionCall.startsNewChunk = true;
modelResponse.push(functionCall);
return newChatHistory;
}
function getLastModelResponseItem(chatHistory) {
if (chatHistory.length === 0 || chatHistory[chatHistory.length - 1].type !== "model")
throw new Error("Expected chat history to end with a model response");
return chatHistory[chatHistory.length - 1];
}
function asWithLastUserMessageRemoved(chatHistory) {
if (chatHistory == null)
return chatHistory;
const newChatHistory = chatHistory.slice();
while (newChatHistory.at(-1)?.type === "user")
newChatHistory.pop();
return newChatHistory;
}
//# sourceMappingURL=LlamaChatSession.js.map
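
A hedged sketch of combining the preloading, prompt-completion, and chat-history helpers implemented above; `session` is assumed to be a `LlamaChatSession` created as in the earlier example:

// preload a user prompt so the eventual response starts sooner; no tokens are generated yet
await session.preloadPrompt("Here is the bug report I want you to triage:");

// suggest a continuation of a partially typed user message without committing it to the chat history
const suggestion = await session.completePrompt("The app crashes when I", {maxTokens: 40});
console.log(suggestion);

// snapshot and restore the conversation state
const history = session.getChatHistory();
await session.prompt("Thanks, that's all for now.");
session.setChatHistory(history); // roll back to the snapshot
session.resetChatHistory();      // or reset to the initial state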

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,43 @@
import type { LlamaContextSequence } from "../../LlamaContext/LlamaContext.js";
import type { LLamaChatCompletePromptOptions } from "../LlamaChatSession.js";
export type LLamaChatPromptCompletionEngineOptions = {
/**
* Max tokens to allow for preloading a prompt and generating a completion for it.
*
* Defaults to `256` or half of the context size, whichever is smaller.
*/
maxPreloadTokens?: number;
onGeneration?(prompt: string, completion: string): void;
/**
* Max number of completions to cache.
*
* Defaults to `100`.
*/
maxCachedCompletions?: number;
temperature?: LLamaChatCompletePromptOptions["temperature"];
minP?: LLamaChatCompletePromptOptions["minP"];
topK?: LLamaChatCompletePromptOptions["topK"];
topP?: LLamaChatCompletePromptOptions["topP"];
seed?: LLamaChatCompletePromptOptions["seed"];
trimWhitespaceSuffix?: LLamaChatCompletePromptOptions["trimWhitespaceSuffix"];
evaluationPriority?: LLamaChatCompletePromptOptions["evaluationPriority"];
repeatPenalty?: LLamaChatCompletePromptOptions["repeatPenalty"];
tokenBias?: LLamaChatCompletePromptOptions["tokenBias"];
customStopTriggers?: LLamaChatCompletePromptOptions["customStopTriggers"];
grammar?: LLamaChatCompletePromptOptions["grammar"];
functions?: LLamaChatCompletePromptOptions["functions"];
documentFunctionParams?: LLamaChatCompletePromptOptions["documentFunctionParams"];
completeAsModel?: LLamaChatCompletePromptOptions["completeAsModel"];
};
export declare const defaultMaxPreloadTokens: (sequence: LlamaContextSequence) => number;
export declare class LlamaChatSessionPromptCompletionEngine {
private constructor();
dispose(): void;
/**
* Get completion for the prompt from the cache,
* and begin preloading this prompt into the context sequence and completing it.
*
* On completion progress, `onGeneration` (configured for this engine instance) will be called.
*/
complete(prompt: string): string;
}
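
A sketch of wiring the completion engine declared above to a text-input handler; `session` is assumed to be an existing `LlamaChatSession`, the option values are illustrative, and `renderGhostText` is a hypothetical UI helper:

const renderGhostText = (prompt: string, completion: string) => console.log(prompt + "|" + completion);

const completionEngine = session.createPromptCompletionEngine({
    maxPreloadTokens: 128,    // illustrative value
    maxCachedCompletions: 50, // illustrative value
    onGeneration(prompt, completion) {
        // called as a completion is generated in the background for `prompt`
        renderGhostText(prompt, completion);
    }
});

function onUserTyped(currentInput: string) {
    // returns whatever is already cached for this input (possibly an empty string)
    // and keeps preloading/completing it in the background
    const cached = completionEngine.complete(currentInput);
    renderGhostText(currentInput, cached);
}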

View File

@@ -0,0 +1,191 @@
import { DisposeAggregator, DisposedError } from "lifecycle-utils";
import { getConsoleLogPrefix } from "../../../utils/getConsoleLogPrefix.js";
import { LruCache } from "../../../utils/LruCache.js";
import { safeEventCallback } from "../../../utils/safeEventCallback.js";
export const defaultMaxPreloadTokens = (sequence) => {
const defaultValue = 256;
return sequence.model.fileInsights.swaSize != null
? Math.min(Math.ceil(sequence.model.fileInsights.swaSize / 2), defaultValue, Math.ceil(sequence.contextSize / 2))
: Math.min(defaultValue, Math.ceil(sequence.contextSize / 2));
};
const defaultMaxCachedCompletions = 100;
export class LlamaChatSessionPromptCompletionEngine {
/** @internal */ _chatSession;
/** @internal */ _maxPreloadTokens;
/** @internal */ _maxCachedCompletions;
/** @internal */ _onGeneration;
/** @internal */ _completionOptions;
/** @internal */ _completionCaches = new WeakMap();
/** @internal */ _disposeAggregator = new DisposeAggregator();
/** @internal */ _currentCompletionAbortController = new AbortController();
/** @internal */ _lastPrompt;
/** @internal */ _disposed = false;
constructor(chatSession, { maxPreloadTokens = defaultMaxPreloadTokens(chatSession.sequence), onGeneration, maxCachedCompletions = defaultMaxCachedCompletions, ...options }) {
this._chatSession = chatSession;
this._maxPreloadTokens = Math.max(1, maxPreloadTokens);
this._maxCachedCompletions = Math.max(1, maxCachedCompletions);
this._onGeneration = safeEventCallback(onGeneration);
this._completionOptions = options;
this.dispose = this.dispose.bind(this);
this._disposeAggregator.add(this._chatSession.onDispose.createListener(this.dispose));
this._disposeAggregator.add(() => {
this._disposed = true;
this._currentCompletionAbortController.abort();
});
}
dispose() {
if (this._disposed)
return;
this._disposeAggregator.dispose();
}
/**
* Get completion for the prompt from the cache,
* and begin preloading this prompt into the context sequence and completing it.
*
* On completion progress, `onGeneration` (configured for this engine instance) will be called.
*/
complete(prompt) {
if (this._disposed)
throw new DisposedError();
const completionCache = this._getCurrentCompletionCache();
const completion = completionCache.getCompletion(prompt);
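        // restart the background completion when the new prompt no longer continues what is already being completed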
if (this._lastPrompt == null || !(this._lastPrompt + (completion ?? "")).startsWith(prompt)) {
this._lastPrompt = prompt;
this._restartCompletion(completionCache);
}
this._lastPrompt = prompt;
return completion ?? "";
}
/** @internal */
_getCurrentCompletionCache() {
const completionCache = this._completionCaches.get(this._chatSession._chatHistoryStateRef);
if (completionCache != null)
return completionCache;
const newCompletionCache = new CompletionCache(this._maxCachedCompletions);
this._completionCaches.set(this._chatSession._chatHistoryStateRef, newCompletionCache);
return newCompletionCache;
}
/** @internal */
_restartCompletion(completionCache) {
if (this._disposed)
return;
this._currentCompletionAbortController.abort();
this._currentCompletionAbortController = new AbortController();
const prompt = this._lastPrompt;
if (prompt == null)
return;
const existingCompletion = completionCache.getCompletion(prompt);
const promptToComplete = prompt + (existingCompletion ?? "");
const currentPromptTokens = this._chatSession.model.tokenize(promptToComplete, false, "trimLeadingSpace").length;
const leftTokens = Math.max(0, this._maxPreloadTokens - currentPromptTokens);
if (leftTokens === 0)
return;
const currentAbortController = this._currentCompletionAbortController;
const currentAbortSignal = this._currentCompletionAbortController.signal;
let currentCompletion = "";
void this._chatSession.completePrompt(promptToComplete, {
...this._completionOptions,
stopOnAbortSignal: false,
maxTokens: leftTokens,
signal: currentAbortSignal,
onTextChunk: (chunk) => {
currentCompletion += chunk;
const completion = (existingCompletion ?? "") + currentCompletion;
completionCache.putCompletion(prompt, completion);
if (this._getCurrentCompletionCache() !== completionCache) {
currentAbortController.abort();
return;
}
if (this._lastPrompt === prompt)
this._onGeneration?.(prompt, completion);
}
})
.then(() => {
if (this._lastPrompt !== prompt && this._getCurrentCompletionCache() === completionCache)
return this._restartCompletion(completionCache);
})
.catch((err) => {
if ((currentAbortSignal.aborted && err === currentAbortSignal.reason) || err instanceof DOMException)
return;
console.error(getConsoleLogPrefix(false, false), err);
});
}
/** @internal */
static _create(chatSession, options = {}) {
return new LlamaChatSessionPromptCompletionEngine(chatSession, options);
}
}
class CompletionCache {
/** @internal */ _cache;
/** @internal */ _rootNode = [new Map()];
constructor(maxInputs) {
this._cache = new LruCache(maxInputs, {
onDelete: (key) => {
this._deleteInput(key);
}
});
}
get maxInputs() {
return this._cache.maxSize;
}
getCompletion(input) {
let node = this._rootNode;
for (let i = 0; i < input.length; i++) {
if (node == null)
return null;
const [next, completion] = node;
const char = input[i];
if (!next.has(char)) {
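                // dead end in the trie; if the completion stored at this node covers the rest of the input, return its remainder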
if (completion != null && completion.startsWith(input.slice(i))) {
this._cache.get(input.slice(0, i));
return completion.slice(input.length - i);
}
}
node = next.get(char);
}
if (node == null)
return null;
const [, possibleCompletion] = node;
if (possibleCompletion != null) {
this._cache.get(input);
return possibleCompletion;
}
return null;
}
putCompletion(input, completion) {
this._cache.set(input, null);
let node = this._rootNode;
for (let i = 0; i < input.length; i++) {
const [next] = node;
const char = input[i];
if (!next.has(char))
next.set(char, [new Map()]);
node = next.get(char);
}
const currentCompletion = node[1];
if (currentCompletion != null && currentCompletion.startsWith(completion))
return currentCompletion;
node[1] = completion;
return completion;
}
/** @internal */
_deleteInput(input) {
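        // walk the trie along the input while remembering the deepest node that branches to other inputs,
        // so that only the suffix unique to this input gets pruned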
let lastNodeWithMultipleChildren = this._rootNode;
let lastNodeWithMultipleChildrenDeleteChar = input[0];
let node = this._rootNode;
for (let i = 0; i < input.length; i++) {
const [next] = node;
const char = input[i];
if (next.size > 1) {
lastNodeWithMultipleChildren = node;
lastNodeWithMultipleChildrenDeleteChar = char;
}
if (!next.has(char))
return;
node = next.get(char);
}
if (lastNodeWithMultipleChildrenDeleteChar !== "")
lastNodeWithMultipleChildren[0].delete(lastNodeWithMultipleChildrenDeleteChar);
}
}
//# sourceMappingURL=LlamaChatSessionPromptCompletionEngine.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,15 @@
import { GbnfJsonDefList, GbnfJsonSchema, GbnfJsonSchemaToType } from "../../../utils/gbnfJson/types.js";
import { ChatSessionModelFunction } from "../../../types.js";
/**
* Define a function that can be used by the model in a chat session, and return it.
*
* This is a helper function to facilitate defining functions with full TypeScript type information.
*
* The handler function can return a Promise, and the return value will be awaited before being returned to the model.
* @param functionDefinition
*/
export declare function defineChatSessionFunction<const Params extends GbnfJsonSchema<Defs>, const Defs extends GbnfJsonDefList<Defs>>({ description, params, handler }: {
description?: string;
params?: Readonly<Params> & GbnfJsonSchema<Defs>;
handler: (params: GbnfJsonSchemaToType<NoInfer<Params>>) => Promise<any> | any;
}): ChatSessionModelFunction<NoInfer<Params>>;
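
A sketch of defining a function with the helper declared above and exposing it to a chat session; `session` is assumed to be an existing `LlamaChatSession`, and the returned weather data is hard-coded for illustration:

import {defineChatSessionFunction} from "node-llama-cpp";

const functions = {
    getCityTemperature: defineChatSessionFunction({
        description: "Get the current temperature in a given city, in Celsius",
        params: {
            type: "object",
            properties: {
                city: {type: "string"}
            }
        },
        handler({city}) {
            // a real implementation would query a weather API here
            return {city, temperature: 21};
        }
    })
};

const answer = await session.prompt("How warm is it in Paris right now?", {functions});
console.log(answer);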

View File

@@ -0,0 +1,16 @@
/**
* Define a function that can be used by the model in a chat session, and return it.
*
* This is a helper function to facilitate defining functions with full TypeScript type information.
*
* The handler function can return a Promise, and the return value will be awaited before being returned to the model.
* @param functionDefinition
*/
export function defineChatSessionFunction({ description, params, handler }) {
return {
description,
params,
handler
};
}
//# sourceMappingURL=defineChatSessionFunction.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"defineChatSessionFunction.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaChatSession/utils/defineChatSessionFunction.ts"],"names":[],"mappings":"AAGA;;;;;;;GAOG;AACH,MAAM,UAAU,yBAAyB,CAGvC,EACE,WAAW,EACX,MAAM,EACN,OAAO,EAKV;IACG,OAAO;QACH,WAAW;QACX,MAAM;QACN,OAAO;KACV,CAAC;AACN,CAAC"}

View File

@@ -0,0 +1,186 @@
import { EventRelay } from "lifecycle-utils";
import { LLamaContextualRepeatPenalty, Token } from "../types.js";
import { LlamaText } from "../utils/LlamaText.js";
import { LlamaGrammar } from "./LlamaGrammar.js";
import { EvaluationPriority } from "./LlamaContext/types.js";
import { LlamaContextSequence } from "./LlamaContext/LlamaContext.js";
import { TokenBias } from "./TokenBias.js";
export type LlamaCompletionOptions = {
contextSequence: LlamaContextSequence;
/**
* Automatically dispose the sequence when the object is disposed.
*
* Defaults to `false`.
*/
autoDisposeSequence?: boolean;
};
export type LlamaCompletionGenerationOptions = {
/**
     * Called with the generated text chunk as the model generates the completion.
*
* Useful for streaming the generated completion as it's being generated.
*/
onTextChunk?: (text: string) => void;
/**
     * Called with the generated tokens as the model generates the completion.
*
* Preferably, you'd want to use `onTextChunk` instead of this.
*/
onToken?: (tokens: Token[]) => void;
/**
* An AbortSignal to later abort the generation.
*
* When the signal is aborted, the generation will stop and throw `signal.reason` as the error.
*
* > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`.
*/
signal?: AbortSignal;
/**
* When a completion already started being generated and then the signal is aborted,
* the generation will stop and the completion will be returned as is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: boolean;
/** Maximum number of tokens to generate */
maxTokens?: number;
/**
* Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
*
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
*
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
*
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
* Disabled by default (set to `0`).
*/
temperature?: number;
/**
* From the next token candidates, discard the percentage of tokens with the lowest probability.
* For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
* This is useful for generating more high-quality results when using a high temperature.
* Set to a value between `0` and `1` to enable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* Disabled by default.
*/
minP?: number;
/**
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
*/
topK?: number;
/**
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
*/
topP?: number;
/**
* Used to control the randomness of the generated text.
*
* Change the seed to get different results.
*
* Only relevant when using `temperature`.
*/
seed?: number;
/**
     * Trim whitespace from the end of the generated text.
* Disabled by default.
*/
trimWhitespaceSuffix?: boolean;
repeatPenalty?: false | LLamaContextualRepeatPenalty;
/**
* Adjust the probability of tokens being generated.
* Can be used to bias the model to generate tokens that you want it to lean towards,
* or to avoid generating tokens that you want it to avoid.
*/
tokenBias?: TokenBias | (() => TokenBias);
/**
* See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
*/
evaluationPriority?: EvaluationPriority;
grammar?: LlamaGrammar;
/**
* Custom stop triggers to stop the completion when any of the provided triggers are found.
*/
customStopTriggers?: readonly (LlamaText | string | readonly (string | Token)[])[];
/**
* The number of tokens to delete from the context window to make space for new ones.
* Defaults to 10% of the context size.
*/
contextShiftSize?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
/**
     * Context shift reconstructs the context with partially relevant data to continue generation when the context fills up.
     * This flag disables that behavior.
     * Instead, generation will stop when the context fills up, by setting an appropriate `maxTokens` value
     * or lowering the given `maxTokens` value when needed,
     * and it will fail if there's no space at all for generating new tokens with the given inputs.
*
* Disabled by default. Not recommended unless you know what you're doing.
*/
disableContextShift?: boolean;
};
export type LlamaInfillGenerationOptions = LlamaCompletionGenerationOptions & {
/**
* The minimum number of tokens to keep from the prefix input when making a context shift.
* Defaults to 10% of the context size.
*/
minPrefixKeepTokens?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
};
export type LlamaCompletionResponse = {
response: string;
metadata: {
remainingGenerationAfterStop?: string | Token[];
stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" | "abort";
} | {
remainingGenerationAfterStop?: string | Token[];
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
};
};
/**
* @see [Text Completion](https://node-llama-cpp.withcat.ai/guide/text-completion) tutorial
*/
export declare class LlamaCompletion {
readonly onDispose: EventRelay<void>;
constructor({ contextSequence, autoDisposeSequence }: LlamaCompletionOptions);
dispose({ disposeSequence }?: {
disposeSequence?: boolean;
}): void;
/** @hidden */
[Symbol.dispose](): void;
get disposed(): boolean;
get infillSupported(): boolean;
/**
* Generate a completion for an input.
*/
generateCompletion(input: Token[] | string | LlamaText, options?: LlamaCompletionGenerationOptions): Promise<string>;
/**
* Same as `generateCompletion`, but returns additional metadata about the generation.
* See `generateCompletion` for more information.
*/
generateCompletionWithMeta(input: Token[] | string | LlamaText, { onTextChunk, onToken, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix, repeatPenalty, tokenBias, evaluationPriority, grammar, customStopTriggers, contextShiftSize, disableContextShift }?: LlamaCompletionGenerationOptions): Promise<LlamaCompletionResponse>;
/**
* Infill (also known as Fill-In-Middle), generates a completion for an input (`prefixInput`) that
* should connect to a given continuation (`suffixInput`).
* For example, for `prefixInput: "123"` and `suffixInput: "789"`, the model is expected to generate `456`
* to make the final text be `123456789`.
*/
generateInfillCompletion(prefixInput: Token[] | string | LlamaText, suffixInput: Token[] | string | LlamaText, options?: LlamaInfillGenerationOptions): Promise<string>;
/**
* Same as `generateInfillCompletion`, but returns additional metadata about the generation.
* See `generateInfillCompletion` for more information.
*/
generateInfillCompletionWithMeta(prefixInput: Token[] | string | LlamaText, suffixInput: Token[] | string | LlamaText, { onTextChunk, onToken, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix, repeatPenalty, tokenBias, evaluationPriority, grammar, contextShiftSize, customStopTriggers, minPrefixKeepTokens, disableContextShift }?: LlamaInfillGenerationOptions): Promise<LlamaCompletionResponse>;
}
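
A sketch of using the completion API declared above for plain text completion and infill; it assumes a `context` with a free sequence, created as in the earlier chat example, and uses placeholder prompts:

import {LlamaCompletion} from "node-llama-cpp";

const completion = new LlamaCompletion({contextSequence: context.getSequence()});

// plain text completion
const text = await completion.generateCompletion("Here is a list of sweet fruits:\n* ", {
    maxTokens: 64,
    temperature: 0.8
});
console.log(text);

// fill-in-middle completion, only when the model ships the required infill tokens
if (completion.infillSupported) {
    const middle = await completion.generateInfillCompletion(
        "function add(a, b) {\n    return ", // prefix
        ";\n}",                              // suffix
        {maxTokens: 16}
    );
    console.log(middle);
}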

View File

@@ -0,0 +1,495 @@
import { DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { tokenizeInput } from "../utils/tokenizeInput.js";
import { UnsupportedError } from "../utils/UnsupportedError.js";
import { removeNullFields } from "../utils/removeNullFields.js";
import { TokenStreamRegulator } from "../utils/TokenStreamRegulator.js";
import { StopGenerationDetector } from "../utils/StopGenerationDetector.js";
import { UNKNOWN_UNICODE_CHAR } from "../consts.js";
import { getQueuedTokensBeforeStopTrigger } from "../utils/getQueuedTokensBeforeStopTrigger.js";
import { safeEventCallback } from "../utils/safeEventCallback.js";
import { pushAll } from "../utils/pushAll.js";
import { GgufArchitectureType } from "../gguf/types/GgufMetadataTypes.js";
import { resolveBeginningTokenToPrepend } from "../utils/tokenizerUtils.js";
import { LlamaGrammarEvaluationState } from "./LlamaGrammarEvaluationState.js";
const defaultContextShiftSize = ((sequence) => Math.max(1, Math.floor(sequence.context.contextSize / 10)));
const defaultMinPrefixKeepTokens = ((sequence) => Math.max(1, Math.floor(sequence.context.contextSize / 10)));
/**
* @see [Text Completion](https://node-llama-cpp.withcat.ai/guide/text-completion) tutorial
*/
export class LlamaCompletion {
/** @internal */ _disposeAggregator = new DisposeAggregator();
/** @internal */ _autoDisposeSequence;
/** @internal */ _sequence;
onDispose = new EventRelay();
constructor({ contextSequence, autoDisposeSequence = false }) {
this._sequence = contextSequence;
this._autoDisposeSequence = autoDisposeSequence;
this._disposeAggregator.add(this._sequence.onDispose.createListener(() => {
this.dispose();
}));
this._disposeAggregator.add(this.onDispose.dispatchEvent);
}
dispose({ disposeSequence = this._autoDisposeSequence } = {}) {
if (this._sequence == null || this.disposed)
return;
if (disposeSequence)
this._sequence.dispose();
this._sequence = null;
this._disposeAggregator.dispose();
}
/** @hidden */
[Symbol.dispose]() {
return this.dispose();
}
get disposed() {
return this._sequence == null || this._sequence.disposed;
}
get infillSupported() {
if (this._sequence == null)
throw new DisposedError();
return this._sequence.model.tokens.infill.prefix != null &&
this._sequence.model.tokens.infill.suffix != null;
}
/**
* Generate a completion for an input.
*/
async generateCompletion(input, options = {}) {
const { response } = await this.generateCompletionWithMeta(input, options);
return response;
}
/**
* Same as `generateCompletion`, but returns additional metadata about the generation.
* See `generateCompletion` for more information.
*/
async generateCompletionWithMeta(input, { onTextChunk, onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, grammar, customStopTriggers, contextShiftSize = defaultContextShiftSize, disableContextShift } = {}) {
if (this._sequence == null || this.disposed)
throw new DisposedError();
const beginningTokenToPrepend = resolveBeginningTokenToPrepend(this._sequence.model.vocabularyType, this._sequence.model.tokens);
const extraEosTokens = getExtraCompletionEosTokens(this._sequence.model);
async function fitInputIntoContext({ maxTokens, tokens }) {
const res = [];
if (beginningTokenToPrepend != null)
res.push(beginningTokenToPrepend);
const inputTokensSize = Math.max(0, Math.min(maxTokens - res.length, tokens.length));
if (inputTokensSize === 0 && tokens.length > 0)
throw new Error("The context size is too small to generate a response for the given input");
const slicedTokens = tokens.slice(-inputTokensSize);
pushAll(res, slicedTokens);
return res;
}
const ensureNotAborted = () => {
if (signal?.aborted && !stopOnAbortSignal)
throw signal.reason;
if (this.disposed)
throw new DisposedError();
};
return await withLock([this, "generateCompletion"], signal, async () => {
ensureNotAborted();
if (this._sequence == null || this.disposed)
throw new DisposedError();
const resolvedInput = tokenizeInput(input, this._sequence.model.tokenizer, beginningTokenToPrepend != null
? "trimLeadingSpace"
: undefined);
const resolvedContextShiftSize = await resolveContextShiftSize(contextShiftSize, this._sequence);
ensureNotAborted();
const inputTokens = await fitInputIntoContext({
maxTokens: this._sequence.context.contextSize - resolvedContextShiftSize,
tokens: resolvedInput
});
ensureNotAborted();
const resolvedMaxTokens = !disableContextShift
? maxTokens
: (maxTokens != null && maxTokens > 0)
? Math.min(maxTokens, this._sequence.context.contextSize - inputTokens.length)
: this._sequence.context.contextSize - inputTokens.length;
this._sequence.tokenPredictor?.updateInputTokens?.(inputTokens.slice());
return await this._generateResponse(inputTokens, {
onTextChunk: safeEventCallback(onTextChunk),
onToken: safeEventCallback(onToken),
signal,
stopOnAbortSignal,
maxTokens: resolvedMaxTokens,
temperature,
minP,
topK,
topP,
seed,
trimWhitespaceSuffix,
repeatPenalty,
tokenBias,
evaluationPriority,
grammar,
contextShiftSize,
customStopTriggers
}, {
async contextShift({ shiftSize, res, pendingTokens, sequence }) {
return {
newContextState: await fitInputIntoContext({
maxTokens: sequence.context.contextSize - shiftSize,
tokens: [...resolvedInput, ...res, ...pendingTokens]
})
};
},
extraEosTokens
});
});
}
/**
* Infill (also known as Fill-In-Middle), generates a completion for an input (`prefixInput`) that
* should connect to a given continuation (`suffixInput`).
* For example, for `prefixInput: "123"` and `suffixInput: "789"`, the model is expected to generate `456`
* to make the final text be `123456789`.
*/
async generateInfillCompletion(prefixInput, suffixInput, options = {}) {
const { response } = await this.generateInfillCompletionWithMeta(prefixInput, suffixInput, options);
return response;
}
/**
* Same as `generateInfillCompletion`, but returns additional metadata about the generation.
* See `generateInfillCompletion` for more information.
*/
async generateInfillCompletionWithMeta(prefixInput, suffixInput, { onTextChunk, onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, grammar, contextShiftSize = defaultContextShiftSize, customStopTriggers, minPrefixKeepTokens = defaultMinPrefixKeepTokens, disableContextShift = false } = {}) {
if (this._sequence == null || this.disposed)
throw new DisposedError();
const prefixToken = this._sequence.model.tokens.infill.prefix;
const suffixToken = this._sequence.model.tokens.infill.suffix;
const middleToken = this._sequence.model.tokens.infill.middle;
const beginningTokenToPrepend = resolveBeginningTokenToPrepend(this._sequence.model.vocabularyType, this._sequence.model.tokens);
if (prefixToken == null || suffixToken == null)
throw new UnsupportedError("Infill completions are not supported by this model");
const extraEosTokens = getExtraInfillEosTokens(this._sequence.model);
async function fitInputIntoContext({ maxTokens, prefixTokens, suffixTokens, sequence }) {
if (prefixToken == null || suffixToken == null)
throw new UnsupportedError("Infill completions are not supported by this model");
// 2 - InfillPrefix token, InfillSuffix token
const specialTokensInContext = 2 +
(middleToken != null ? 1 : 0) +
(beginningTokenToPrepend != null ? 1 : 0);
const resolvedMaxTokens = maxTokens - specialTokensInContext;
let sizeLeftToFill = resolvedMaxTokens;
let suffixTokensSize = Math.min(sizeLeftToFill, suffixTokens.length);
sizeLeftToFill -= suffixTokensSize;
let prefixTokensSize = Math.min(sizeLeftToFill, prefixTokens.length);
sizeLeftToFill -= prefixTokensSize;
if (sizeLeftToFill <= 0 && disableContextShift)
throw new Error("The context size is too small to generate a response for the given input, and context shift is disabled. " +
"Consider removing `disableContextShift` or reducing the input size.");
const resolvedMinPrefixKeepTokens = Math.min(Math.min(resolvedMaxTokens, prefixTokens.length), Math.max(1, Math.floor(minPrefixKeepTokens instanceof Function
? await minPrefixKeepTokens(sequence)
: minPrefixKeepTokens)));
if (prefixTokensSize < resolvedMinPrefixKeepTokens) {
const diffToFill = Math.min(suffixTokensSize, resolvedMinPrefixKeepTokens - prefixTokensSize);
prefixTokensSize += diffToFill;
suffixTokensSize -= diffToFill;
}
const resolvedPrefixTokens = prefixTokens.slice(-prefixTokensSize);
const resolvedSuffixTokens = suffixTokens.slice(0, suffixTokensSize);
const newContextState = [];
if (beginningTokenToPrepend != null)
newContextState.push(beginningTokenToPrepend);
if (middleToken != null) {
newContextState.push(prefixToken);
pushAll(newContextState, resolvedPrefixTokens);
newContextState.push(suffixToken);
pushAll(newContextState, resolvedSuffixTokens);
newContextState.push(middleToken);
}
else {
newContextState.push(suffixToken);
pushAll(newContextState, resolvedSuffixTokens);
newContextState.push(prefixToken);
pushAll(newContextState, resolvedPrefixTokens);
}
return newContextState;
}
const ensureNotAborted = () => {
if (signal?.aborted && !stopOnAbortSignal)
throw signal.reason;
if (this.disposed)
throw new DisposedError();
};
return await withLock([this, "generateCompletion"], signal, async () => {
ensureNotAborted();
if (this._sequence == null || this.disposed)
throw new DisposedError();
const resolvedPrefixInputTokens = tokenizeInput(prefixInput, this._sequence.model.tokenizer, "trimLeadingSpace");
const resolvedSuffixInputTokens = tokenizeInput(suffixInput, this._sequence.model.tokenizer, "trimLeadingSpace");
const resolvedContextShiftSize = await resolveContextShiftSize(contextShiftSize, this._sequence);
ensureNotAborted();
const inputTokens = await fitInputIntoContext({
maxTokens: this._sequence.context.contextSize - resolvedContextShiftSize,
prefixTokens: resolvedPrefixInputTokens,
suffixTokens: resolvedSuffixInputTokens,
sequence: this._sequence
});
ensureNotAborted();
const resolvedMaxTokens = !disableContextShift
? maxTokens
: (maxTokens != null && maxTokens > 0)
? Math.min(maxTokens, this._sequence.context.contextSize - inputTokens.length)
: this._sequence.context.contextSize - inputTokens.length;
this._sequence.tokenPredictor?.updateInputTokens?.(inputTokens.slice());
return await this._generateResponse(inputTokens, {
onTextChunk: safeEventCallback(onTextChunk),
onToken: safeEventCallback(onToken),
signal,
stopOnAbortSignal,
maxTokens: resolvedMaxTokens,
temperature,
minP,
topK,
topP,
seed,
trimWhitespaceSuffix,
repeatPenalty,
tokenBias,
evaluationPriority,
grammar,
contextShiftSize,
customStopTriggers
}, {
async contextShift({ shiftSize, res, pendingTokens, sequence }) {
return {
newContextState: await fitInputIntoContext({
maxTokens: sequence.context.contextSize - shiftSize,
prefixTokens: [...resolvedPrefixInputTokens, ...res, ...pendingTokens],
suffixTokens: resolvedSuffixInputTokens,
sequence
})
};
},
extraEosTokens
});
});
}
/** @internal */
async _generateResponse(tokens, { onTextChunk, onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, grammar, contextShiftSize = defaultContextShiftSize, customStopTriggers }, { contextShift, extraEosTokens = new Set() }) {
if (this._sequence == null)
throw new DisposedError();
const sequence = this._sequence;
const model = sequence.model;
const context = sequence.context;
const res = [];
const pendingTokens = [];
const grammarEvaluationState = grammar != null
? new LlamaGrammarEvaluationState({ model, grammar })
: undefined;
const { lastTokens: repeatPenaltyLastTokens = 64, punishTokensFilter, penalizeNewLine, penalty, frequencyPenalty, presencePenalty } = repeatPenalty === false
? { lastTokens: 0 }
: repeatPenalty;
const streamRegulator = new TokenStreamRegulator();
const stopGenerationDetector = new StopGenerationDetector();
const customStopGenerationTriggersDetector = new StopGenerationDetector();
const locksToReleaseOnValidGeneration = [];
const repeatPenaltyEnabled = repeatPenaltyLastTokens > 0;
let inputTokens = tokens;
let generatedTokens = 0;
if (grammar != null)
StopGenerationDetector.resolveStopTriggers(grammar.stopGenerationTriggers, model.tokenizer)
.map((stopTrigger) => stopGenerationDetector.addStopTrigger(stopTrigger));
if (customStopTriggers != null)
StopGenerationDetector.resolveStopTriggers(customStopTriggers, model.tokenizer)
.map((stopTrigger) => customStopGenerationTriggersDetector.addStopTrigger(stopTrigger));
const ensureNotAborted = () => {
if (signal?.aborted && !stopOnAbortSignal)
throw signal.reason;
if (this.disposed)
throw new DisposedError();
};
const getPenaltyTokens = () => {
if (this._sequence == null)
throw new DisposedError();
let punishTokens = res.slice(-repeatPenaltyLastTokens);
if (punishTokensFilter != null)
punishTokens = punishTokensFilter(punishTokens);
if (penalizeNewLine == null || !penalizeNewLine) {
const nlToken = model.tokens.nl;
if (nlToken != null)
punishTokens = punishTokens.filter((token) => token !== nlToken);
}
return punishTokens;
};
while (true) {
ensureNotAborted();
let shouldContextShift = false;
if (inputTokens.length === 1 && sequence.nextTokenIndex !== 0)
await sequence.eraseContextTokenRanges([{
start: 0,
end: sequence.nextTokenIndex
}]);
else {
const lastToken = inputTokens[inputTokens.length - 1];
// we need to decode at least one token to generate a response
inputTokens.pop();
await sequence.adaptStateToTokens(inputTokens, false);
inputTokens.push(lastToken);
ensureNotAborted();
const firstDifferentIndex = sequence.nextTokenIndex;
inputTokens.splice(0, firstDifferentIndex);
}
const evaluationIterator = sequence.evaluate(inputTokens, removeNullFields({
temperature, minP, topK, topP, seed,
grammarEvaluationState,
repeatPenalty: !repeatPenaltyEnabled ? undefined : {
punishTokens: getPenaltyTokens,
maxPunishTokens: repeatPenaltyLastTokens,
penalty,
frequencyPenalty,
presencePenalty
},
tokenBias,
evaluationPriority,
yieldEogToken: true
}));
const pendingPartialTokens = [];
for await (const token of evaluationIterator) {
ensureNotAborted();
generatedTokens++;
const tokens = pendingPartialTokens.length === 0
? [token]
: [...pendingPartialTokens, token];
const text = model.detokenize([token]);
if (pendingPartialTokens.length === 0 &&
text.endsWith(UNKNOWN_UNICODE_CHAR) &&
!model.isSpecialToken(token) &&
!model.isEogToken(token)) {
pendingPartialTokens.push(token);
continue;
}
else {
pendingPartialTokens.length = 0;
const queuedTokenRelease = streamRegulator.addChunk({ tokens, text });
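                    // hold back chunks that may need trimming (unknown-unicode placeholders or whitespace-only text)
                    // until later generated text confirms they should be kept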
if (text.endsWith(UNKNOWN_UNICODE_CHAR) || ((grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && text.trim() === "") || (text === "" && locksToReleaseOnValidGeneration.length > 0 && !model.isSpecialToken(token))) {
locksToReleaseOnValidGeneration.push(queuedTokenRelease.createTextIndexLock(0));
}
else {
while (locksToReleaseOnValidGeneration.length > 0)
locksToReleaseOnValidGeneration.shift().dispose();
}
stopGenerationDetector.recordGeneration({ text, tokens, queuedTokenRelease });
customStopGenerationTriggersDetector.recordGeneration({ text, tokens, queuedTokenRelease });
if (model.isEogToken(token) || extraEosTokens.has(token))
queuedTokenRelease.createTokenIndexLock(0);
pushAll(pendingTokens, streamRegulator.popFreeChunkTokens());
if (stopGenerationDetector.hasTriggeredStops || customStopGenerationTriggersDetector.hasTriggeredStops ||
model.isEogToken(token) || extraEosTokens.has(token)) {
const triggeredStops = stopGenerationDetector.hasTriggeredStops
? stopGenerationDetector.getTriggeredStops()
: customStopGenerationTriggersDetector.getTriggeredStops();
const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk(model.tokenizer);
const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
pushAll(pendingTokens, queuedTokensBeforeStopTrigger);
const { firstRemainingGenerationAfterStop } = StopGenerationDetector.getFirstRemainingGenerationAfterStop(triggeredStops);
if (pendingTokens.length > 0) {
onToken?.(pendingTokens.slice());
onTextChunk?.(model.detokenize(pendingTokens, false, res));
}
pushAll(res, pendingTokens);
pendingTokens.length = 0;
let modelResponse = model.detokenize(res);
if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix)
modelResponse = modelResponse.trimEnd();
const isEogToken = model.isEogToken(token) || extraEosTokens.has(token);
if (isEogToken || stopGenerationDetector.hasTriggeredStops)
return {
response: modelResponse,
metadata: {
remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
stopReason: isEogToken
? "eogToken"
: "stopGenerationTrigger"
}
};
return {
response: modelResponse,
metadata: {
remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
stopReason: "customStopTrigger",
customStopTrigger: triggeredStops[0].stopTrigger
}
};
}
if (pendingTokens.length > 0) {
onToken?.(pendingTokens.slice());
onTextChunk?.(model.detokenize(pendingTokens, false, res));
pushAll(res, pendingTokens);
pendingTokens.length = 0;
}
}
const aborted = (signal?.aborted ?? false) && stopOnAbortSignal;
const maxTokensReached = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens;
if (aborted || maxTokensReached) {
let modelResponse = model.detokenize(res);
if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix)
modelResponse = modelResponse.trimEnd();
return {
response: modelResponse,
metadata: {
stopReason: aborted
? "abort"
: "maxTokens"
}
};
}
if (sequence.nextTokenIndex >= context.contextSize - 1) {
shouldContextShift = true;
break;
}
}
if (shouldContextShift) {
const resolvedContextShiftSize = await resolveContextShiftSize(contextShiftSize, sequence);
ensureNotAborted();
const { newContextState } = await contextShift({
shiftSize: resolvedContextShiftSize,
res,
pendingTokens,
sequence
});
ensureNotAborted();
inputTokens = newContextState;
continue;
}
break;
}
throw new Error("The context size is too small to generate a response");
}
}
async function resolveContextShiftSize(contextShiftSize, sequence) {
    if (typeof contextShiftSize === "number")
        return contextShiftSize;
    else if (contextShiftSize instanceof Function)
        return Math.min(sequence.context.contextSize, Math.max(1, Math.floor(await contextShiftSize(sequence))));
    return defaultContextShiftSize(sequence);
}
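// For Gemma models, tokens such as <|file_separator|> and <|fim_prefix|> are collected as extra EOS tokens,
// so generating one of them ends the completion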
function getExtraCompletionEosTokens(model) {
const extraEosTokens = new Set();
if (model.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma ||
model.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2) {
for (const token of model.iterateAllTokens()) {
const tokenText = model.detokenize([token], true);
if (tokenText === "<|file_separator|>" || tokenText === "<|fim_prefix|>") {
extraEosTokens.add(token);
if (extraEosTokens.size === 2)
break;
}
}
}
return extraEosTokens;
}
function getExtraInfillEosTokens(model) {
const extraEosTokens = new Set();
if (model.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma ||
model.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2) {
for (const token of model.iterateAllTokens()) {
const tokenText = model.detokenize([token], true);
if (tokenText === "<|file_separator|>") {
extraEosTokens.add(token);
break;
}
}
}
return extraEosTokens;
}
//# sourceMappingURL=LlamaCompletion.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,245 @@
import { EventRelay } from "lifecycle-utils";
import { Token } from "../../types.js";
import { TokenMeter } from "../TokenMeter.js";
import { LlamaModel } from "../LlamaModel/LlamaModel.js";
import { ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem, EvaluationPriority, SequenceEvaluateMetadataOptions, SequenceEvaluateOptions, SequenceEvaluateOutput } from "./types.js";
import { TokenPredictor } from "./TokenPredictor.js";
export declare class LlamaContext {
readonly onDispose: EventRelay<void>;
private constructor();
dispose(): Promise<void>;
/** @hidden */
[Symbol.asyncDispose](): Promise<void>;
get disposed(): boolean;
get model(): LlamaModel;
get contextSize(): number;
get batchSize(): number;
get flashAttention(): boolean;
/**
* The actual size of the state in the memory in bytes.
* This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
*/
get stateSize(): number;
/** The number of threads currently used to evaluate tokens */
get currentThreads(): number;
/**
* The number of threads that are preferred to be used to evaluate tokens.
*
* The actual number of threads used may be lower when other evaluations are running in parallel.
*/
get idealThreads(): number;
getAllocatedContextSize(): number;
get totalSequences(): number;
get sequencesLeft(): number;
/**
* Before calling this method, check `sequencesLeft` to make sure there are sequences left.
* When there are no sequences left, this method will throw an error.
*/
getSequence(options?: {
contextShift?: ContextShiftOptions;
/**
* Token predictor to use for the sequence.
* Don't share the same token predictor between multiple sequences.
*
* Using a token predictor doesn't affect the generation output itself -
* it only allows for greater parallelization of the token evaluation to speed up the generation.
*
* > **Note:** if a token predictor is too resource intensive,
* > it can slow down the generation process due to the overhead of running the predictor.
* >
* > Testing the effectiveness of a token predictor on the target machine is recommended before using it in production.
*
* Automatically disposed when disposing the sequence.
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction)
*/
tokenPredictor?: TokenPredictor;
}): LlamaContextSequence;
dispatchPendingBatch(): void;
/**
* Print the timings of token evaluation since that last print for this context.
*
* Requires the `performanceTracking` option to be enabled.
*
* > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
* > it won't print anything.
*/
printTimings(): Promise<void>;
}
export declare class LlamaContextSequence {
readonly onDispose: EventRelay<void>;
private constructor();
dispose(): void;
/** @hidden */
[Symbol.dispose](): void;
get disposed(): boolean;
get context(): LlamaContext;
get model(): LlamaModel;
/** The maximum number of tokens that the sequence state can hold */
get contextSize(): number;
/** The index where the next evaluated token will be placed in the context */
get nextTokenIndex(): number;
/** The current context state tokens */
get contextTokens(): Token[];
get tokenMeter(): TokenMeter;
/**
* The token predictor used when creating this sequence.
*/
get tokenPredictor(): TokenPredictor | undefined;
/**
* Get the index of the first token in the KV cache.
*
* If you remove any tokens from the state that come before this index,
* no cached prefix tokens evaluation state will be used for the next evaluation.
*
* For example, if `stateCellsStartIndex` is `10` and you remove the range `{start: 11, end: 16}`
* then the cached state for range `0-10` will be used in the next evaluation,
* but if you remove the range `{start: 10, end: 16}` (or `{start: 9, end: 16}`) then the cached state will not be used at all
* and will be re-evaluated in the next evaluation.
*
* This index can be greater than `0` only when SWA (Sliding Window Attention) is used (only on supported models).
*
* When SWA is used, this index will usually be `Math.max(-1, .nextTokenIndex - .model.fileInsights.swaSize)` or larger.
*
* When the KV cache is empty, this index will be `-1`.
*
* You can disable SWA by setting the `swaFullCache` option to `true` when creating a context.
*/
get stateCellsStartIndex(): number;
/**
* Statistics of token predictions using the sequence's `tokenPredictor`.
*
* The statistics change only when token prediction is used in this sequence.
*
* `validated` + `refuted` = total number of evaluated predictions.
*
* Prefer using `validated` and `refuted` to evaluate the effectiveness of token prediction.
*/
get tokenPredictions(): {
/** Number of token predictions that were actually used (tokens that were validated and then consumed) */
used: number;
/** Number of token predictions that were not used (tokens that were validated and were not consumed) */
unused: number;
/** Number of token predictions that were validated successfully */
validated: number;
/** Number of token predictions that were refuted */
refuted: number;
};
get isLoadedToMemory(): boolean;
compareContextTokens(tokens: Token[]): {
firstDifferentIndex: number;
};
/**
* Erase parts of the context state to align it with the given tokens.
*
* If the given tokens do not align with the current context state, the context state will be erased to align with the given tokens.
*
* To find the first different token index between the context state and the given tokens, access the `nextTokenIndex` property.
*
* If `allowShift` is `true` (the default), shifting tokens may happen to align the context state with the given tokens,
* which incurs token evaluation of the shifted tokens.
*/
adaptStateToTokens(tokens: Token[], allowShift?: boolean): Promise<void>;
/**
* Clear the history of the sequence.
*/
clearHistory(): Promise<void>;
/**
* Erase context tokens in the provided ranges to free up space for new tokens to be generated.
* The start of each range is inclusive, and the end of each range is exclusive.
* For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
*/
eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]): Promise<void>;
/**
* Evaluate the provided tokens into the context sequence, and continue generating new tokens on iterator iterations.
*
* This method uses the token predictor (when provided) to generate new tokens faster.
*/
evaluate(tokens: Token[], options?: SequenceEvaluateOptions): AsyncGenerator<Token, void, void | Token | Token[]>;
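/*
 * A minimal generation-loop sketch (assuming `model` is the already-loaded `LlamaModel` this sequence
 * belongs to, and `sequence` is a `LlamaContextSequence` obtained from a context created on it):
 *
 *     const tokens = model.tokenize("The best way to");
 *     const res: Token[] = [];
 *     for await (const token of sequence.evaluate(tokens, {temperature: 0.8})) {
 *         res.push(token);
 *         if (res.length >= 16)
 *             break;
 *     }
 *     console.log(model.detokenize(res));
 */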
/**
* Like {@link evaluate `.evaluate(...)`}, but with additional metadata for each generated token.
*
* Configure the additional metadata options to choose which metadata to include.
*/
evaluateWithMetadata<const Metadata extends SequenceEvaluateMetadataOptions>(tokens: Token[], metadata: Metadata, options?: SequenceEvaluateOptions): AsyncGenerator<SequenceEvaluateOutput<Metadata>, void, void | Token | Token[]>;
/**
* Evaluate the provided tokens into the context sequence without generating new tokens.
*/
evaluateWithoutGeneratingNewTokens(tokens: Token[], options?: {
/**
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
* evaluated based on the strategy chosen for the context.
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
* highest evaluation priority.
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
*/
evaluationPriority?: EvaluationPriority;
/** Override the sequence context shift options for this evaluation */
contextShift?: ContextShiftOptions;
}): Promise<void>;
/**
* Evaluate the provided tokens into the context sequence with custom options for each token.
*
* This method allows for more precise control of the generation process.
*
* A next token will be generated for a given token only if any of the `generateNext` options for it are used.
*
* To generate more tokens after this method finishes,
* use it again with token(s) you selected to add to the context from the previous evaluation.
*
* This method doesn't use the token predictor (when provided) since it cannot predict which tokens are actually needed.
* Use the `evaluate` method when you need to use token prediction.
* @returns An array where for each token in the input array, there can be an output item at the same index in the output array.
* For indexes that have no output, there won't be any value at the corresponding index in the output array.
*
* It's recommended to iterate from `0` up to the length of the input array to check the results in the output array.
*/
controlledEvaluate(input: ControlledEvaluateInputItem[], options?: {
/**
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
* evaluated based on the strategy chosen for the context.
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
* highest evaluation priority.
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
*/
evaluationPriority?: EvaluationPriority;
/** Override the sequence context shift options for this evaluation */
contextShift?: ContextShiftOptions;
/** Called on each token result after it's generated */
onTokenResult?(inputTokenIndex: number, result: ControlledEvaluateIndexOutput): void;
}): Promise<Array<undefined | ControlledEvaluateIndexOutput>>;
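/*
 * A rough `controlledEvaluate` sketch (assuming `sequence` is a `LlamaContextSequence` and `tokens`
 * is a non-empty `Token[]` produced by the model's tokenizer); only the last input token asks for
 * a generated follow-up token:
 *
 *     const input: ControlledEvaluateInputItem[] = tokens.slice(0, -1);
 *     input.push([tokens[tokens.length - 1], {generateNext: {token: true, confidence: true}}]);
 *     const results = await sequence.controlledEvaluate(input);
 *     const next = results[input.length - 1]?.next; // {token, confidence} for the last input token
 */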
/**
* Save the current context sequence evaluation state to a file.
* @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state)
*/
saveStateToFile(filePath: string): Promise<{
fileSize: number;
}>;
/**
* Load a context sequence evaluation state from a file.
*
* Trying to load a state file with a longer context size than the current sequence's context size will fail and throw an error.
*
* You must ensure that the file was created from the exact same model, otherwise, using this function may crash the process.
* @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state)
*/
loadStateFromFile(filePath: string, acceptRisk: {
/**
* Loading a state file created using a different model may crash the process.
*
* You must accept this risk to use this feature.
*/
acceptRisk: true;
}): Promise<void>;
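/*
 * A minimal save/restore sketch (assuming `model` is an already-loaded `LlamaModel`; a state file
 * must only be loaded with the exact same model it was created from):
 *
 *     const context = await model.createContext({sequences: 2});
 *     const sequence = context.getSequence();
 *     await sequence.evaluateWithoutGeneratingNewTokens(model.tokenize("A prompt worth caching"));
 *     const {fileSize} = await sequence.saveStateToFile("state.bin");
 *
 *     const otherSequence = context.getSequence();
 *     await otherSequence.loadStateFromFile("state.bin", {acceptRisk: true});
 */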
}
export declare function getDefaultContextBatchSize({ contextSize, sequences }: {
contextSize: number;
sequences: number;
}): number;
export declare function getDefaultContextSequences(): number;
export declare function getDefaultModelContextSize({ trainContextSize }: {
trainContextSize?: number;
}): number;

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1 @@
export {};

View File

@@ -0,0 +1,31 @@
/** @internal */
export class LlamaSampler {
/** @internal */ _llama;
/** @internal */ _sampler;
/** @internal */ disposed = false;
constructor(model) {
this._llama = model._llama;
this._sampler = new this._llama._bindings.AddonSampler(model._model);
this.asyncDispose = this.asyncDispose.bind(this);
}
dispose() {
this.disposed = true;
this._sampler.dispose();
}
async asyncDispose() {
this.disposed = true;
this._sampler.dispose();
}
applyConfig(config) {
return this._sampler.applyConfig(config);
}
/** @internal */
static _canBeNextTokenForGrammarEvaluationState(llama, grammarEvaluationState, token) {
return llama._bindings.AddonSampler.canBeNextTokenForGrammarEvaluationState(grammarEvaluationState._state, token);
}
/** @internal */
static _acceptTokenOnGrammarEvaluationState(llama, grammarEvaluationState, token) {
llama._bindings.AddonSampler.acceptGrammarEvaluationStateToken(grammarEvaluationState._state, token);
}
}
//# sourceMappingURL=LlamaSampler.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LlamaSampler.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/LlamaSampler.ts"],"names":[],"mappings":"AAMA,gBAAgB;AAChB,MAAM,OAAO,YAAY;IACrB,gBAAgB,CAAiB,MAAM,CAAQ;IAC/C,gBAAgB,CAAiB,QAAQ,CAAe;IACxD,gBAAgB,CAAQ,QAAQ,GAAY,KAAK,CAAC;IAElD,YAAmB,KAAiB;QAChC,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;QAC3B,IAAI,CAAC,QAAQ,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAErE,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACrD,CAAC;IAEM,OAAO;QACV,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;QACrB,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;IAC5B,CAAC;IAEM,KAAK,CAAC,YAAY;QACrB,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;QACrB,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;IAC5B,CAAC;IAEM,WAAW,CAAC,MAAkD;QACjE,OAAO,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;IAC7C,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,wCAAwC,CAClD,KAAY,EACZ,sBAAmD,EACnD,KAAY;QAEZ,OAAO,KAAK,CAAC,SAAS,CAAC,YAAY,CAAC,uCAAuC,CACvE,sBAAsB,CAAC,MAAM,EAC7B,KAAK,CACR,CAAC;IACN,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,oCAAoC,CAC9C,KAAY,EACZ,sBAAmD,EACnD,KAAY;QAEZ,KAAK,CAAC,SAAS,CAAC,YAAY,CAAC,iCAAiC,CAAC,sBAAsB,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;IACzG,CAAC;CACJ"}

View File

@@ -0,0 +1,55 @@
import { Token } from "../../types.js";
import { SequenceEvaluateOptions } from "./types.js";
import { LlamaContextSequence } from "./LlamaContext.js";
/**
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction#custom)
*/
export declare abstract class TokenPredictor {
/**
* Resets the state of the predictor.
*
* Called before the generation starts.
*/
abstract reset(params: {
/** The target sequence that this token predictor is generating tokens for */
targetSequence: LlamaContextSequence;
/**
* The tokens that are or will be loaded into the state.
*
* The initial predictions should be based on these tokens.
*
* When additional tokens are pushed into the state, the `pushTokens` method will be called with those tokens.
*/
stateTokens: Token[];
/**
* Options used for the evaluation on the target sequence.
*
* The `grammarEvaluationState` is cloned before being passed to the token predictor,
* so it can be modified without affecting the original state.
*/
evaluateOptions: Readonly<SequenceEvaluateOptions>;
}): Promise<void> | void;
abstract pushTokens(tokens: Token[]): void;
/**
* Predicts the next tokens based on the current state.
*
* If the generation should wait until the minimum predictions are ready,
* this method should return a promise that resolves when the minimum predictions are ready.
*
* A background prediction process can be started when this function is called,
* so that the next predictions will be ready when this function is called again.
*/
abstract predictTokens(): Promise<Token[]> | Token[];
/**
* Stops the prediction process when it runs in the background.
* @param untilPredictionsExhausted - If true, the prediction process should not resume until the current predictions are exhausted.
*/
stop(untilPredictionsExhausted?: boolean): Promise<void> | void;
/**
* Called with the input tokens before the generation starts when using `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
*/
updateInputTokens(tokens: Token[]): void;
dispose(): Promise<void> | void;
/** @hidden */
[Symbol.dispose](): void | Promise<void>;
}
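/*
 * A minimal custom predictor sketch (an illustrative, hypothetical predictor that always offers the
 * same fixed tokens as predictions; not part of the library):
 *
 *     class StaticTokenPredictor extends TokenPredictor {
 *         public constructor(private readonly staticTokens: Token[]) {
 *             super();
 *         }
 *         public reset() {}
 *         public pushTokens(tokens: Token[]) {}
 *         public predictTokens() {
 *             return this.staticTokens;
 *         }
 *     }
 */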

View File

@@ -0,0 +1,20 @@
/**
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction#custom)
*/
export class TokenPredictor {
/**
* Stops the prediction process when it runs in the background.
* @param untilPredictionsExhausted - If true, the prediction process should not resume until the current predictions are exhausted.
*/
stop(untilPredictionsExhausted) { }
/**
* Called with the input tokens before the generation starts when using `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
*/
updateInputTokens(tokens) { }
dispose() { }
/** @hidden */
[Symbol.dispose]() {
return this.dispose();
}
}
//# sourceMappingURL=TokenPredictor.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TokenPredictor.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/TokenPredictor.ts"],"names":[],"mappings":"AAIA;;GAEG;AACH,MAAM,OAAgB,cAAc;IAwChC;;;OAGG;IACI,IAAI,CAAC,yBAAmC,IAAyB,CAAC;IAEzE;;OAEG;IACI,iBAAiB,CAAC,MAAe,IAAS,CAAC;IAE3C,OAAO,KAA0B,CAAC;IAEzC,cAAc;IACP,CAAC,MAAM,CAAC,OAAO,CAAC;QACnB,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;IAC1B,CAAC;CACJ"}

View File

@@ -0,0 +1,56 @@
import { Token } from "../../../types.js";
import { SequenceEvaluateOptions } from "../types.js";
import { LlamaContextSequence } from "../LlamaContext.js";
import { TokenPredictor } from "../TokenPredictor.js";
/**
* Predicts the next tokens by evaluating the current state of the target sequence
* on a draft sequence from a smaller and faster draft model.
* @see [Using Token Predictors: Draft Model Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#draft-model)
*/
export declare class DraftSequenceTokenPredictor extends TokenPredictor {
constructor(draftSequence: LlamaContextSequence, options?: {
/**
* The minimum number of tokens to draft.
*
* Defaults to `0`.
*/
minTokens?: number;
/**
* Maximum number of tokens to draft.
*
* Defaults to `16`.
*/
maxTokens?: number;
/**
* Evaluate options default to the values of the target sequence.
*
* You can override any of the options for the prediction here.
*/
evaluateOptions?: Pick<SequenceEvaluateOptions, "temperature" | "minP" | "topK" | "topP" | "seed" | "repeatPenalty" | "tokenBias" | "evaluationPriority" | "contextShift">;
/**
* Minimum token confidence (probability of the token to be generated, assigned by the model) to consider the token as a prediction.
* When the generated token confidence is lower than this value, the prediction process will stop until all the predicted tokens
* are exhausted (either because a token that was not predicted is pushed, or because all the generated predictions are consumed).
*
* A number between `0` and `1` representing the minimum probability of the token to be generated.
*
* Set to `0` to disable.
*
* Defaults to `0.6`.
*/
minConfidence?: number;
});
get draftSequence(): LlamaContextSequence;
get minTokens(): number;
get maxTokens(): number;
get minConfidence(): number | undefined;
reset({ targetSequence, stateTokens, evaluateOptions }: {
targetSequence: LlamaContextSequence;
stateTokens: Token[];
evaluateOptions: Readonly<SequenceEvaluateOptions>;
}): Promise<void>;
pushTokens(tokens: Token[]): void;
predictTokens(): Token[] | Promise<Token[]>;
stop(untilPredictionsExhausted?: boolean): void;
dispose(): void;
}
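/*
 * A rough usage sketch (assuming `model` is the main `LlamaModel` and `draftModel` is a smaller,
 * faster `LlamaModel` that shares the same tokenizer):
 *
 *     const draftContext = await draftModel.createContext();
 *     const context = await model.createContext();
 *     const sequence = context.getSequence({
 *         tokenPredictor: new DraftSequenceTokenPredictor(draftContext.getSequence(), {maxTokens: 8})
 *     });
 */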

View File

@@ -0,0 +1,266 @@
import { withLock } from "lifecycle-utils";
import { pushAll } from "../../../utils/pushAll.js";
import { getConsoleLogPrefix } from "../../../utils/getConsoleLogPrefix.js";
import { LlamaSampler } from "../LlamaSampler.js";
import { TokenPredictor } from "../TokenPredictor.js";
const defaultPredictionMinTokens = 0;
const defaultPredictionMaxTokens = 16;
const defaultPredictionMinConfidence = 0.6;
/**
* Predicts the next tokens by evaluating the current state of the target sequence
* on a draft sequence from a smaller and faster draft model.
* @see [Using Token Predictors: Draft Model Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#draft-model)
*/
export class DraftSequenceTokenPredictor extends TokenPredictor {
/** @internal */ _draftSequence;
/** @internal */ _minTokens;
/** @internal */ _maxTokens;
/** @internal */ _minConfidence;
/** @internal */ _stateTokens = [];
/** @internal */ _pendingEvalTokens = [];
/** @internal */ _predictedTokens = [];
/** @internal */ _evaluateOptions = {};
/** @internal */ _overrideEvaluateOptions = {};
/** @internal */ _grammarEvaluationStateOption;
/** @internal */ _currentEvaluationAbortController = new AbortController();
/** @internal */ _resetAbortController = new AbortController();
/** @internal */ _stopped = true;
/** @internal */ _waitForPredictionExhaustion = false;
/** @internal */ _minTokensCallbacks = [];
/** @internal */ _resetPredictions = false;
/** @internal */ _iterator;
/** @internal */ _active = false;
/** @internal */ _disposed = false;
constructor(draftSequence, options = {}) {
super();
this._draftSequence = draftSequence;
this._minTokens = Math.floor(Math.max(0, options?.minTokens ?? defaultPredictionMinTokens));
this._maxTokens = Math.floor(Math.max(this._minTokens, options?.maxTokens ?? defaultPredictionMaxTokens));
this._overrideEvaluateOptions = options.evaluateOptions ?? {};
this._minConfidence = Math.min(1, Math.max(0, options?.minConfidence ?? defaultPredictionMinConfidence));
if (draftSequence.disposed)
throw new Error("The draft sequence is disposed");
}
get draftSequence() {
return this._draftSequence;
}
get minTokens() {
return this._minTokens;
}
get maxTokens() {
return this._maxTokens;
}
get minConfidence() {
return this._minConfidence;
}
async reset({ targetSequence, stateTokens, evaluateOptions }) {
this._currentEvaluationAbortController.abort();
this._resetAbortController.abort();
this._currentEvaluationAbortController = new AbortController();
this._resetAbortController = new AbortController();
this._stopped = true;
this._waitForPredictionExhaustion = false;
this._iterator?.return();
this._iterator = undefined;
const currentAbortSignal = this._resetAbortController.signal;
targetSequence.context._ctx.ensureDraftContextIsCompatibleForSpeculative(this._draftSequence.context._ctx);
try {
await withLock([this, "evaluate"], currentAbortSignal, async () => {
this._stateTokens = stateTokens.slice();
this._pendingEvalTokens = [];
this._predictedTokens = [];
this._resetPredictions = false;
while (this._minTokensCallbacks.length > 0)
this._minTokensCallbacks.shift()?.();
const lastToken = this._stateTokens.pop();
if (lastToken != null)
this._pendingEvalTokens.push(lastToken);
this._evaluateOptions = evaluateOptions;
this._grammarEvaluationStateOption = this._evaluateOptions.grammarEvaluationState instanceof Function
? this._evaluateOptions.grammarEvaluationState()?.clone()
: this._evaluateOptions.grammarEvaluationState?.clone();
const newStateTokens = this._stateTokens.slice(-this._draftSequence.context.contextSize + 1);
await this._draftSequence.adaptStateToTokens(newStateTokens, true);
newStateTokens.splice(0, this._draftSequence.nextTokenIndex);
await this._draftSequence.evaluateWithoutGeneratingNewTokens(newStateTokens, {
contextShift: this._evaluateOptions.contextShift,
evaluationPriority: this._evaluateOptions.evaluationPriority
});
});
}
catch (err) {
if (err !== currentAbortSignal.reason)
throw err;
}
}
pushTokens(tokens) {
const grammarEvaluationStateOption = this._evaluateOptions.grammarEvaluationState instanceof Function
? this._evaluateOptions.grammarEvaluationState()?.clone()
: this._evaluateOptions.grammarEvaluationState?.clone();
void withLock([this, "pushTokens"], async () => {
this._grammarEvaluationStateOption = grammarEvaluationStateOption;
const tokensToPush = tokens.slice();
while (!this._resetPredictions && tokensToPush.length > 0) {
const token = tokensToPush.shift();
if (this._predictedTokens.length > 0 && this._predictedTokens[0] === token) {
this._predictedTokens.shift();
}
else {
tokensToPush.unshift(token);
break;
}
}
if (tokensToPush.length === 0) {
if (!this._waitForPredictionExhaustion || this._predictedTokens.length === 0)
this._resume();
return;
}
this._currentEvaluationAbortController.abort();
this._currentEvaluationAbortController = new AbortController();
pushAll(this._pendingEvalTokens, tokensToPush);
this._resetPredictions = true;
this._resume();
});
}
predictTokens() {
if (this._stopped && this._pendingEvalTokens.length === 0 && !this._resetPredictions)
return this._predictedTokens;
this._stopped = false;
if (!this._waitForPredictionExhaustion || this._predictedTokens.length === 0) {
this._waitForPredictionExhaustion = false;
this._resume();
}
if (this._predictedTokens.length >= this._minTokens && !this._resetPredictions)
return this._predictedTokens;
if (!this._active || (this._waitForPredictionExhaustion && this._predictedTokens.length > 0)) {
if (this._resetPredictions)
return [];
return this._predictedTokens;
}
return new Promise((accept) => void this._minTokensCallbacks.push(accept))
.then(() => {
if (this._resetPredictions)
return [];
return this._predictedTokens;
});
}
stop(untilPredictionsExhausted = false) {
this._stopped = true;
this._currentEvaluationAbortController.abort();
this._currentEvaluationAbortController = new AbortController();
if (untilPredictionsExhausted)
this._waitForPredictionExhaustion = true;
void withLock([this, "evaluate"], async () => {
this._iterator?.return();
this._iterator = undefined;
});
}
dispose() {
this._disposed = true;
this._stopped = true;
this._resetAbortController.abort();
this._currentEvaluationAbortController.abort();
void withLock([this, "evaluate"], async () => {
this._iterator?.return();
this._iterator = undefined;
});
}
/** @internal */
_canIterate() {
return !this._disposed && !this._stopped && (this._predictedTokens.length < this._maxTokens || this._resetPredictions);
}
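// Runs the draft-sequence generation in the background, filling `_predictedTokens` until `_maxTokens` is reached,
// a token's confidence drops below `_minConfidence`, or the predictions are stopped or reset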
/** @internal */
_resume() {
if (this._active || !this._canIterate())
return;
this._active = true;
void withLock([this, "evaluate"], async () => {
try {
const abortSignal = this._currentEvaluationAbortController.signal;
if (!this._canIterate() || abortSignal.aborted)
return;
const resetPredications = async () => {
this._iterator?.return();
this._iterator = undefined;
this._waitForPredictionExhaustion = false;
this._resetPredictions = false;
const tokenToDelete = Math.max(0, Math.min(this._predictedTokens.length - 1, this._draftSequence.context.contextSize));
this._predictedTokens = [];
await this._draftSequence.eraseContextTokenRanges([{
start: this._draftSequence.nextTokenIndex - tokenToDelete,
end: this._draftSequence.nextTokenIndex
}]);
};
const createIterator = () => {
const tokens = this._pendingEvalTokens;
this._pendingEvalTokens = [];
return this.draftSequence.evaluateWithMetadata(tokens, { confidence: true }, {
...this._evaluateOptions,
...this._overrideEvaluateOptions,
grammarEvaluationState: this._getGrammarEvaluationStateWithTokens(tokens)
});
};
if (this._resetPredictions)
await resetPredications();
if (!this._canIterate() || abortSignal.aborted)
return;
let iterator = createIterator();
this._iterator = iterator;
while (this._canIterate() && !abortSignal.aborted) {
const { value, done } = await iterator.next();
let shouldBreak = done;
if (value != null) {
const { token, confidence } = value;
if (this._minConfidence != null && this._minConfidence !== 0 && this._minConfidence !== 1 &&
confidence < this._minConfidence) {
this._iterator = undefined;
await iterator.return();
this._waitForPredictionExhaustion = true;
shouldBreak = true;
}
else
this._predictedTokens.push(token);
}
if (this._resetPredictions && !abortSignal.aborted) {
await resetPredications();
iterator = createIterator();
this._iterator = iterator;
continue;
}
if (this._predictedTokens.length >= this._minTokens) {
while (this._minTokensCallbacks.length > 0)
this._minTokensCallbacks.shift()?.();
}
if (shouldBreak) {
this._iterator = undefined;
await iterator.return();
this._waitForPredictionExhaustion = true;
while (this._minTokensCallbacks.length > 0)
this._minTokensCallbacks.shift()?.();
break;
}
}
}
finally {
this._active = false;
}
});
}
/** @internal */
_getGrammarEvaluationStateWithTokens(tokens) {
if (this._grammarEvaluationStateOption == null)
return undefined;
const clone = this._grammarEvaluationStateOption.clone();
for (const token of tokens) {
const canAddToken = LlamaSampler._canBeNextTokenForGrammarEvaluationState(this._draftSequence.model._llama, clone, token);
if (!canAddToken) {
console.warn(getConsoleLogPrefix(false, false), "The pushed tokens are incompatible with the grammar evaluation state. The grammar will be ignored.");
this._grammarEvaluationStateOption = undefined;
return undefined;
}
LlamaSampler._acceptTokenOnGrammarEvaluationState(this._draftSequence.model._llama, clone, token);
}
return clone;
}
}
//# sourceMappingURL=DraftSequenceTokenPredictor.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,58 @@
import { Token } from "../../../types.js";
import { TokenPredictor } from "../TokenPredictor.js";
/**
* Attempts to find the last few generated tokens in the input (prompt) tokens to predict the next tokens.
*
* This is useful in input-grounded tasks (when the model frequently repeats some of the input tokens in the output,
* such as in text summarization or modifying code).
*
* This works in all completion classes, including `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
*
* Based on https://github.com/apoorvumang/prompt-lookup-decoding.
* @see [Using Token Predictors: Input Lookup Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#input-lookup)
*/
export declare class InputLookupTokenPredictor extends TokenPredictor {
constructor(options?: {
patternLength?: {
/**
* Min pattern length to look for in the input tokens.
*
* Defaults to `1`.
*/
min?: number;
/**
* Max pattern length to look for in the input tokens.
*
* Set to `0` to disable the max pattern size.
*
* Defaults to `0`.
*/
max?: number;
};
predictionLength?: {
/**
* Minimum number of tokens to predict.
*
* Defaults to `1`.
*/
min?: number;
/**
* Maximum number of tokens to predict.
*
* Defaults to `3`.
*/
max?: number;
};
});
get patternMinLength(): number;
get patternMaxLength(): number;
get predictionMinLength(): number;
get predictionMaxLength(): number;
reset({ stateTokens }: {
stateTokens: Token[];
}): void;
updateInputTokens(tokens: Token[]): void;
pushTokens(tokens: Token[]): void;
predictTokens(): Token[];
dispose(): void;
}
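/*
 * A rough usage sketch (assuming `model` is an already-loaded `LlamaModel`; most useful for
 * input-grounded tasks such as summarization or code modification):
 *
 *     const context = await model.createContext();
 *     const sequence = context.getSequence({
 *         tokenPredictor: new InputLookupTokenPredictor({
 *             patternLength: {min: 2},
 *             predictionLength: {max: 3}
 *         })
 *     });
 */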

View File

@@ -0,0 +1,138 @@
import { DisposedError } from "lifecycle-utils";
import { pushAll } from "../../../utils/pushAll.js";
import { TokenPredictor } from "../TokenPredictor.js";
const defaultPatternMinLength = 1;
const defaultPatternMaxLength = 0;
const defaultPredictionMinLength = 1;
const defaultPredictionMaxLength = 3;
/**
* Attempts to find the last few generated tokens in the input (prompt) tokens to predict the next tokens.
*
* This is useful in input-grounded tasks (when the model frequently repeats some of the input tokens in the output,
* such as in text summarization or modifying code).
*
* This works in all completion classes, including `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
*
* Based on https://github.com/apoorvumang/prompt-lookup-decoding.
* @see [Using Token Predictors: Input Lookup Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#input-lookup)
*/
export class InputLookupTokenPredictor extends TokenPredictor {
/** @internal */ _patternMinLength;
/** @internal */ _patternMaxLength;
/** @internal */ _predictionMinLength;
/** @internal */ _predictionMaxLength;
/** @internal */ _lastPredictionMatchStartIndex = undefined;
/** @internal */ _lastPredictionMatchLength = undefined;
/** @internal */ _stateTokens = [];
/** @internal */ _inputTokens = [];
/** @internal */ _disposed = false;
constructor(options = {}) {
super();
this._patternMinLength = Math.floor(Math.max(1, options?.patternLength?.min ?? defaultPatternMinLength));
this._patternMaxLength = Math.floor(Math.max(0, Math.max(this._patternMinLength, options?.patternLength?.max ?? defaultPatternMaxLength)));
this._predictionMinLength = Math.floor(Math.max(1, options.predictionLength?.min ?? defaultPredictionMinLength));
this._predictionMaxLength = Math.floor(Math.max(this._patternMinLength, options.predictionLength?.max ?? defaultPredictionMaxLength));
}
get patternMinLength() {
return this._patternMinLength;
}
get patternMaxLength() {
return this._patternMaxLength;
}
get predictionMinLength() {
return this._predictionMinLength;
}
get predictionMaxLength() {
return this._predictionMaxLength;
}
reset({ stateTokens }) {
this._stateTokens = stateTokens.slice();
delete this._lastPredictionMatchStartIndex;
delete this._lastPredictionMatchLength;
}
updateInputTokens(tokens) {
this._inputTokens = tokens.slice();
delete this._lastPredictionMatchStartIndex;
delete this._lastPredictionMatchLength;
}
pushTokens(tokens) {
pushAll(this._stateTokens, tokens);
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
this._lastPredictionMatchLength += tokens.length;
}
}
predictTokens() {
if (this._disposed)
throw new DisposedError();
if (this._inputTokens.length === 0 || this._stateTokens.length === 0)
return [];
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
for (let p = this._lastPredictionMatchStartIndex + this._lastPredictionMatchLength - 1, s = this._stateTokens.length - 1; p >= this._lastPredictionMatchStartIndex && s >= 0; p--, s--) {
if (this._inputTokens[p] !== this._stateTokens[s]) {
delete this._lastPredictionMatchStartIndex;
delete this._lastPredictionMatchLength;
break;
}
}
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
const predictionEndIndex = this._lastPredictionMatchStartIndex + this._lastPredictionMatchLength;
if (predictionEndIndex < this._inputTokens.length) {
return this._inputTokens.slice(predictionEndIndex, predictionEndIndex + this._predictionMaxLength);
}
}
}
const [matchStartIndex, matchLength] = this._findLongestPatternIndex(this._inputTokens, this._stateTokens);
if (matchStartIndex == null || matchLength == null)
return [];
const predictionEndIndex = matchStartIndex + matchLength;
const res = this._inputTokens.slice(predictionEndIndex, predictionEndIndex + this._predictionMaxLength);
if (res.length >= this._predictionMinLength) {
this._lastPredictionMatchStartIndex = matchStartIndex;
this._lastPredictionMatchLength = matchLength;
return res;
}
return [];
}
dispose() {
this._disposed = true;
this._stateTokens = [];
this._inputTokens = [];
delete this._lastPredictionMatchStartIndex;
delete this._lastPredictionMatchLength;
}
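// Scans the input tokens backwards, tracking candidate positions whose tokens keep matching the tail of the state tokens,
// and returns the start index and length of the longest such match; the tokens that follow the match in the input are the prediction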
/** @internal */
_findLongestPatternIndex(findIn, lookupPattern) {
const checkIndexes = [];
let bestIndex = -1;
let bestIndexDiff = -1;
for (let i = findIn.length - this._predictionMinLength; i >= 0; i--) {
const token = findIn[i];
for (let j = checkIndexes.length - 1; j >= 0; j--) {
const startIndex = checkIndexes[j];
const indexDiff = startIndex - i;
if (lookupPattern[lookupPattern.length - 1 - indexDiff] !== token || (this._patternMaxLength > 0 && indexDiff >= this._patternMaxLength)) {
checkIndexes.splice(j, 1);
if (indexDiff >= this._patternMinLength && indexDiff >= bestIndexDiff) {
bestIndex = startIndex;
bestIndexDiff = indexDiff;
}
}
}
if (token === lookupPattern[lookupPattern.length - 1])
checkIndexes.unshift(i);
}
for (let j = checkIndexes.length - 1; j >= 0; j--) {
const startIndex = checkIndexes[j];
const indexDiff = startIndex + 1;
checkIndexes.splice(j, 1);
if (indexDiff >= this._patternMinLength && indexDiff >= bestIndexDiff) {
bestIndex = startIndex;
bestIndexDiff = indexDiff;
}
}
if (bestIndex >= 0)
return [bestIndex - (bestIndexDiff - 1), bestIndexDiff];
return [];
}
}
//# sourceMappingURL=InputLookupTokenPredictor.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,458 @@
import { PickOptions } from "../../utils/utilTypes.js";
import type { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
import type { TokenBias } from "../TokenBias.js";
import type { Token } from "../../types.js";
import type { LlamaContextSequence } from "./LlamaContext.js";
export type LlamaContextOptions = {
/**
* number of sequences for the context.
* Each sequence is a different "text generation process" that can run in parallel to other sequences in the same context.
* Although a single context has multiple sequences, the sequences are separate from each other and do not share data with each other.
* This is beneficial for performance, as multiple sequences can be evaluated in parallel (on the same batch).
*
* Each sequence increases the memory usage of the context.
*
* Defaults to `1`.
*/
sequences?: number;
/**
* The number of tokens the model can see at once.
* - **`"auto"`** - adapt to the current VRAM state and attempt to set the context size as high as possible up to the size
* the model was trained on.
* - **`number`** - set the context size to a specific number of tokens.
* If there's not enough VRAM, an error will be thrown.
* Use with caution.
* - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attempt to set the context size as high as possible
* up to the size the model was trained on, but at least `min` and at most `max`.
*
* The actual context size may be slightly larger than your request (by up to 256) due to the implementation in `llama.cpp` that
* aligns the context size to multiples of 256 for performance reasons.
* To check the actual context size that gets created, use the `.contextSize` property
* of the created context instance or any of its sequences.
*
* Defaults to `"auto"`.
*/
contextSize?: "auto" | number | {
min?: number;
max?: number;
};
/**
* The number of tokens that can be processed at once by the GPU.
*
* Defaults to `512` or `contextSize` if `contextSize` is less than `512`.
*/
batchSize?: number;
/**
* Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory.
*
* The support for flash attention is currently experimental and may not always work as expected.
* Use with caution.
*
* This option will be ignored if flash attention is not supported by the model.
*
* Defaults to `false` (inherited from the model option `defaultContextFlashAttention`).
*
* Upon flash attention exiting the experimental status, the default value will become `true`
* (the inherited value from the model option `defaultContextFlashAttention` will become `true`).
*/
flashAttention?: boolean;
/**
* number of threads to use to evaluate tokens.
* set to 0 to use the maximum threads supported by the current machine hardware.
*
* This value is considered as a hint, and the actual number of threads used may be lower when other evaluations are running.
* To ensure the minimum number of threads you want to use are always used,
* set this to an object with a `min` property (see the `min` property description for more details).
*
* If `maxThreads` from the Llama instance is set to `0`, this value will always be the actual number of threads used.
*
* If `maxThreads` from the Llama instance is set to `0`, defaults to the `.cpuMathCores` value from the Llama instance,
* otherwise defaults to `maxThreads` from the Llama instance (see the `maxThreads` option of `getLlama` method for more details).
*/
threads?: number | {
/**
* The ideal number of threads to use for evaluations.
*
* If other evaluations are running, the actual number of threads may be lower than this value.
*
* If `maxThreads` from the Llama instance is set to `0`, this value will always be the actual number of threads used.
*
* If `maxThreads` from the Llama instance is set to `0`, defaults to the `.cpuMathCores` value from the Llama instance,
* otherwise defaults to `maxThreads` from the Llama instance (see the `maxThreads` option of `getLlama` method for more details).
*/
ideal?: number;
/**
* Ensure evaluations always use at least this number of threads.
*
* Use with caution, since setting this value too high can lead to the context waiting too much time
* to reserve this number of threads before the evaluation can start.
*/
min?: number;
};
/**
* Control the parallel sequences processing behavior.
*
* See {@link BatchingOptions} for more information.
*/
batching?: BatchingOptions;
/**
* When using SWA (Sliding Window Attention) on a supported model,
* extend the sliding window size to the current context size (meaning practically disabling SWA).
*
* Enabling this option will consume more memory on models that support SWA (Sliding Window Attention),
* but will allow reusing the evaluation cache of any prefix length of the context sequence state
* (instead of just the size of the sliding window when SWA is used).
*
* This option has no effect on models that do not support SWA (Sliding Window Attention).
*
* > **Note:** you can check the SWA size using `model.fileInsights.swaSize`.
*
* Defaults to `false` (inherited from the model option `defaultContextSwaFullCache`).
*/
swaFullCache?: boolean;
/**
* Load the provided LoRA adapters onto the context.
* LoRA adapters are used to modify the weights of a pretrained model to adapt to new tasks or domains
* without the need for extensive retraining from scratch.
*
* If a string is provided, it will be treated as a path to a single LoRA adapter file.
*
* The adapters will be released from memory once the model (not just the context) is disposed.
*/
lora?: string | {
adapters: Array<{
filePath: string;
/**
* Defaults to `1`
*/
scale?: number;
}>;
/**
* Called with the LoRA adapters load percentage when the LoRA adapters are being loaded.
* @param loadProgress - a number between 0 (exclusive) and 1 (inclusive).
*/
onLoadProgress?(loadProgress: number): void;
};
/** An abort signal to abort the context creation */
createSignal?: AbortSignal;
/**
* Ignore insufficient memory errors and continue with the context creation.
* Can cause the process to crash if there's not enough VRAM for the new context.
*
* Defaults to `false`.
*/
ignoreMemorySafetyChecks?: boolean;
/**
* On failed context creation, retry the creation with a smaller context size.
*
* Only works if `contextSize` is set to `"auto"`, left as default or set to an object with `min` and/or `max` properties.
*
* Set `retries` to `false` to disable.
*/
failedCreationRemedy?: false | {
/**
* Retries to attempt to create the context.
*
* Defaults to `6`.
*/
retries?: number;
/**
* The percentage to decrease the context size by on each retry.
* Should be a number between `0` and `1`.
*
* If a function is provided, it will be called with the current context size and should return the new context size.
*
* Defaults to `0.16`.
*/
autoContextSizeShrink?: number | ((contextSize: number) => number);
};
/**
* Track the inference performance of the context, so using `.printTimings()` will work.
*
* Defaults to `false`.
*/
performanceTracking?: boolean;
};
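/*
 * A minimal sketch of passing these options when creating a context
 * (assuming `model` is an already-loaded `LlamaModel`):
 *
 *     const context = await model.createContext({
 *         contextSize: {max: 8192},
 *         sequences: 2,
 *         flashAttention: false
 *     });
 */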
export type LlamaContextSequenceRepeatPenalty = {
/** Tokens to lower the probability of being predicted as the next token */
punishTokens: Token[] | (() => Token[]);
/**
* The maximum number of tokens that will be provided in the `punishTokens` array.
*
* This is used as a hint for a performance optimization for avoiding frequent memory deallocation and reallocation.
*
* Don't set this value too high, as it can allocate too much memory.
*
* Defaults to `64`.
*/
maxPunishTokens?: number;
/**
* The relative amount to lower the probability of the tokens in `punishTokens` by.
*
* Defaults to `1.1`.
* Set to `1` to disable.
*/
penalty?: number;
/**
* For n time a token is in the `punishTokens` array, lower its probability by `n * frequencyPenalty`.
*
* Disabled by default (`0`).
* Set to a value between `0` and `1` to enable.
*/
frequencyPenalty?: number;
/**
* Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`.
*
* Disabled by default (`0`).
* Set to a value between `0` and `1` to enable.
*/
presencePenalty?: number;
};
export type BatchingOptions = {
/**
* The strategy used to dispatch items to be processed when there are items pending to be processed.
* - **`"nextCycle"`** - dispatch the items on the next event loop cycle.
* You can provide a custom function to define a custom dispatch schedule.
*
* Defaults to `"nextCycle"`.
*/
dispatchSchedule?: "nextCycle" | CustomBatchingDispatchSchedule;
/**
* The strategy used to prioritize pending items to be processed.
* - **`"maximumParallelism"`** - process as many different sequences in parallel as possible.
* - **`"firstInFirstOut"`** - process items in the order they were added.
* - **Custom prioritization function** - a custom function that prioritizes the items to be processed.
* See the {@link CustomBatchingPrioritizationStrategy} type for more information.
*
* Defaults to `"maximumParallelism"`.
*/
itemPrioritizationStrategy?: "maximumParallelism" | "firstInFirstOut" | CustomBatchingPrioritizationStrategy;
};
/**
* A function that schedules the dispatch of the batch items.
* Call the `dispatch` function to dispatch the items.
*/
export type CustomBatchingDispatchSchedule = (dispatch: () => void) => void;
/**
* A function that prioritizes the batch items to be processed.
* The function receives an array of `items` and the `size` of how many tokens can be processed in this batch.
*
* The function should return an array of prioritized items,
* where the sum of `processAmount` of all the items is less than or equal to the given `size` that the function received,
* and where the `item` of each prioritized item is the same reference to an original item in the `items` array.
*/
export type CustomBatchingPrioritizationStrategy = (options: {
items: readonly BatchItem[];
size: number;
}) => PrioritizedBatchItem[];
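/*
 * A minimal custom prioritization strategy sketch; it greedily fills the batch in descending
 * evaluation-priority order, mirroring the built-in `firstInFirstOutStrategy` implementation
 * included further below in this commit:
 *
 *     const byPriority: CustomBatchingPrioritizationStrategy = ({items, size}) => {
 *         const res: PrioritizedBatchItem[] = [];
 *         let leftFreeTokens = size;
 *         for (const item of items.slice().sort((a, b) => b.evaluationPriority - a.evaluationPriority)) {
 *             const processAmount = Math.min(item.tokens.length, leftFreeTokens);
 *             if (processAmount === 0)
 *                 break;
 *             res.push({item, processAmount});
 *             leftFreeTokens -= processAmount;
 *         }
 *         return res;
 *     };
 */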
export type ContextShiftOptions = {
size?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
strategy?: "eraseBeginning" | ((options: {
sequence: LlamaContextSequence;
size: number;
}) => ContextTokensDeleteRange[] | Promise<ContextTokensDeleteRange[]>);
};
export type ContextTokensDeleteRange = {
start: number;
end: number;
};
export type SequenceEvaluateOptions = {
temperature?: number;
minP?: number;
topK?: number;
topP?: number;
/**
* Used to control the randomness of the generated text.
*
* Change the seed to get different results.
*
* Defaults to the current epoch time.
*
* Only relevant when using `temperature`.
*/
seed?: number;
grammarEvaluationState?: LlamaGrammarEvaluationState | (() => LlamaGrammarEvaluationState | undefined);
repeatPenalty?: LlamaContextSequenceRepeatPenalty;
/**
* Adjust the probability of tokens being generated.
* Can be used to bias the model to generate tokens that you want it to lean towards,
* or to avoid generating tokens that you want it to avoid.
*/
tokenBias?: TokenBias | (() => TokenBias);
/**
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
* evaluated based on the strategy chosen for the context.
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
* highest evaluation priority.
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
*/
evaluationPriority?: EvaluationPriority;
/**
* Override the sequence context shift options for this evaluation
*
* See {@link ContextShiftOptions} for more information.
*/
contextShift?: ContextShiftOptions;
/**
* Yield an EOG (End Of Generation) token (like EOS and EOT) when it's generated.
* When `false` the generation will stop when an EOG token is generated and the token won't be yielded.
* Defaults to `false`.
*/
yieldEogToken?: boolean;
};
export type SequenceEvaluateMetadataOptions = {
/**
* Get the confidence (probability) of the selected token.
*
* Same as `probabilities.get(token)` from the output.
*
* If you need only this value, you can skip getting the full probabilities list to improve performance.
*
* This value might be slightly different when evaluated on different GPUs and configurations.
*/
readonly confidence?: boolean;
/**
* Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
*
* Only enable when needed, as it impacts the performance.
*
* Defaults to `false`.
*/
readonly probabilities?: boolean;
};
export type SequenceEvaluateOutput<Options extends {
readonly confidence?: boolean;
readonly probabilities?: boolean;
} = {
readonly confidence: true;
readonly probabilities: true;
}> = PickOptions<{
/**
* The next token generated by the model and selected using the given options (such as temperature).
*/
token: Token;
/**
* The confidence (probability) of the selected token.
*
* Same as `probabilities.get(token)`.
*
* If you need only this value, you can skip getting the full probabilities list to improve performance.
*
* This value might be slightly different when evaluated on different GPUs and configurations.
*/
confidence: number;
/**
* The probabilities of the tokens from the vocabulary to be the next token.
*
* A probability is a number from `0` to `1`.
*
* The probabilities might be slightly different when evaluated on different GPUs and configurations.
*
* The map is sorted by the probability of the tokens from the highest to the lowest,
* and is reflected in the order of the entries when iterating over the map.
* Use `.entries().next().value` to get the top probability pair
* ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
*/
probabilities: Map<Token, number>;
}, Options & {
token: true;
}>;
export type ControlledEvaluateInputItem = Token | [
token: Token,
options: {
generateNext?: {
/**
* Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
*
* Only enable when needed, as it impacts the performance.
*
* Defaults to `false`.
*/
probabilities?: boolean;
/**
* Get the confidence (probability) of the selected token.
*
* Same as `next.probabilities.get(next.token)` from the output.
*
* If you need only this value, you can skip getting the full probabilities list to improve performance.
*
* This value might be slightly different when evaluated on different GPUs and configurations.
*/
confidence?: boolean;
/**
* Generate the next token with the provided options using sampling.
*
* Setting this to `true` will generate probabilities for the next token and sample it.
*/
token?: boolean;
options?: {
temperature?: number;
minP?: number;
topK?: number;
topP?: number;
/**
* Used to control the randomness of the generated text.
*
* Change the seed to get different results.
*
* Defaults to the current epoch time.
*
* Only relevant when using `temperature`.
*/
seed?: number;
repeatPenalty?: LlamaContextSequenceRepeatPenalty;
/**
* Adjust the probability of tokens being generated.
* Can be used to bias the model to generate tokens that you want it to lean towards,
* or to avoid generating tokens that you want it to avoid.
*/
tokenBias?: TokenBias | (() => TokenBias);
};
};
}
];
export type ControlledEvaluateIndexOutput = {
next: {
token?: Token | null;
/**
* The confidence (probability) of the selected token (the `token` field in this object).
*
* Same as `next.probabilities.get(next.token)`.
*
* If you need only this value, you can skip getting the full probabilities list to improve performance.
*
* This value might be slightly different when evaluated on different GPUs and configurations.
*/
confidence?: number;
/**
* The probabilities of the tokens from the vocabulary to be the next token.
*
* A probability is a number from `0` to `1`.
*
* The probabilities might be slightly different when evaluated on different GPUs and configurations.
*
* The map is sorted by the probability of the tokens from the highest to the lowest,
* and is reflected in the order of the entries when iterating over the map.
* Use `.entries().next().value` to get the top probability pair
* ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
*/
probabilities?: Map<Token, number>;
};
};
/**
* 1 - low
*
* 5 - high
*/
export type EvaluationPriority = 1 | 2 | 3 | 4 | 5;
export type BatchItem = {
readonly tokens: readonly Token[];
readonly logits: readonly (true | undefined)[];
readonly evaluationPriority: EvaluationPriority;
};
export type PrioritizedBatchItem = {
item: BatchItem;
processAmount: number;
};

View File

@@ -0,0 +1,2 @@
export {};
//# sourceMappingURL=types.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/types.ts"],"names":[],"mappings":""}

View File

@@ -0,0 +1,5 @@
import { BatchItem, PrioritizedBatchItem } from "../../types.js";
export declare function firstInFirstOutStrategy({ items, size }: {
items: readonly BatchItem[];
size: number;
}): PrioritizedBatchItem[];

View File

@@ -0,0 +1,16 @@
export function firstInFirstOutStrategy({ items, size }) {
const res = [];
const sortedItems = items
.slice()
.sort((a, b) => b.evaluationPriority - a.evaluationPriority);
let leftFreeTokens = size;
for (const item of sortedItems) {
const processAmount = Math.min(item.tokens.length, leftFreeTokens);
res.push({ item, processAmount });
leftFreeTokens -= processAmount;
if (leftFreeTokens === 0)
break;
}
return res;
}
//# sourceMappingURL=firstInFirstOutStrategy.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"firstInFirstOutStrategy.js","sourceRoot":"","sources":["../../../../../src/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,uBAAuB,CAAC,EAAC,KAAK,EAAE,IAAI,EAA8C;IAC9F,MAAM,GAAG,GAA2B,EAAE,CAAC;IAEvC,MAAM,WAAW,GAAG,KAAK;SACpB,KAAK,EAAE;SACP,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,kBAAkB,GAAG,CAAC,CAAC,kBAAkB,CAAC,CAAC;IAEjE,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;QAC7B,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;QACnE,GAAG,CAAC,IAAI,CAAC,EAAC,IAAI,EAAE,aAAa,EAAC,CAAC,CAAC;QAChC,cAAc,IAAI,aAAa,CAAC;QAEhC,IAAI,cAAc,KAAK,CAAC;YACpB,MAAM;IACd,CAAC;IAED,OAAO,GAAG,CAAC;AACf,CAAC"}

View File

@@ -0,0 +1,5 @@
import { BatchItem, PrioritizedBatchItem } from "../../types.js";
export declare function maximumParallelismStrategy({ items, size }: {
items: readonly BatchItem[];
size: number;
}): PrioritizedBatchItem[];

View File

@@ -0,0 +1,42 @@
export function maximumParallelismStrategy({ items, size }) {
// phase 1: give every item an equal share of the batch, clipped to its own token count and to the remaining budget
let leftFreeTokens = size;
const minTokensForEachItem = Math.floor(leftFreeTokens / items.length);
const res = [];
const clippedItems = [];
for (const item of items) {
const processAmount = Math.min(item.tokens.length, leftFreeTokens, minTokensForEachItem);
const prioritizeItem = { item, processAmount };
res.push(prioritizeItem);
leftFreeTokens -= processAmount;
if (processAmount < item.tokens.length)
clippedItems.push(prioritizeItem);
if (leftFreeTokens === 0)
break;
}
// phase 2: distribute the leftover budget among the clipped items over up to three passes
for (let passesLeft = 3; leftFreeTokens > 0 && clippedItems.length > 0 && passesLeft > 0; passesLeft--) {
const minIncreaseAmount = Math.ceil(leftFreeTokens / clippedItems.length);
for (let i = 0; i < clippedItems.length && leftFreeTokens > 0; i++) {
const prioritizeItem = clippedItems[i];
const unprocessedAmount = prioritizeItem.item.tokens.length - prioritizeItem.processAmount;
const increaseAmount = Math.min(unprocessedAmount, leftFreeTokens, minIncreaseAmount);
prioritizeItem.processAmount += increaseAmount;
if (increaseAmount === unprocessedAmount) {
clippedItems.splice(i, 1);
i--;
}
}
}
// phase 3: give whatever budget still remains to the clipped items with the highest evaluation priority first
clippedItems.sort((a, b) => b.item.evaluationPriority - a.item.evaluationPriority);
for (let i = 0; i < clippedItems.length && leftFreeTokens > 0; i++) {
const prioritizeItem = clippedItems[i];
const unprocessedAmount = prioritizeItem.item.tokens.length - prioritizeItem.processAmount;
const increaseAmount = Math.min(unprocessedAmount, leftFreeTokens);
prioritizeItem.processAmount += increaseAmount;
if (increaseAmount === unprocessedAmount) {
clippedItems.splice(i, 1);
i--;
}
}
return res;
}
//# sourceMappingURL=maximumParallelismStrategy.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"maximumParallelismStrategy.js","sourceRoot":"","sources":["../../../../../src/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,0BAA0B,CAAC,EAAC,KAAK,EAAE,IAAI,EAA8C;IACjG,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,MAAM,oBAAoB,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;IAEvE,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,MAAM,YAAY,GAA2B,EAAE,CAAC;IAEhD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACvB,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,cAAc,EAAE,oBAAoB,CAAC,CAAC;QACzF,MAAM,cAAc,GAAG,EAAC,IAAI,EAAE,aAAa,EAAC,CAAC;QAE7C,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QACzB,cAAc,IAAI,aAAa,CAAC;QAEhC,IAAI,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM;YAClC,YAAY,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAEtC,IAAI,cAAc,KAAK,CAAC;YACpB,MAAM;IACd,CAAC;IAED,KAAK,IAAI,UAAU,GAAG,CAAC,EAAE,cAAc,GAAG,CAAC,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,UAAU,GAAG,CAAC,EAAE,UAAU,EAAE,EAAE,CAAC;QACrG,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC;QAE1E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACjE,MAAM,cAAc,GAAG,YAAY,CAAC,CAAC,CAAE,CAAC;YACxC,MAAM,iBAAiB,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,cAAc,CAAC,aAAa,CAAC;YAC3F,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,iBAAiB,EAAE,cAAc,EAAE,iBAAiB,CAAC,CAAC;YACtF,cAAc,CAAC,aAAa,IAAI,cAAc,CAAC;YAE/C,IAAI,cAAc,KAAK,iBAAiB,EAAE,CAAC;gBACvC,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;gBAC1B,CAAC,EAAE,CAAC;YACR,CAAC;QACL,CAAC;IACL,CAAC;IAED,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,kBAAkB,GAAG,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;IAEnF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACjE,MAAM,cAAc,GAAG,YAAY,CAAC,CAAC,CAAE,CAAC;QACxC,MAAM,iBAAiB,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,cAAc,CAAC,aAAa,CAAC;QAC3F,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,iBAAiB,EAAE,cAAc,CAAC,CAAC;QACnE,cAAc,CAAC,aAAa,IAAI,cAAc,CAAC;QAE/C,IAAI,cAAc,KAAK,iBAAiB,EAAE,CAAC;YACvC,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAC1B,CAAC,EAAE,CAAC;QACR,CAAC;IACL,CAAC;IAED,OAAO,GAAG,CAAC;AACf,CAAC"}

View File

@@ -0,0 +1 @@
export declare function padSafeContextSize(value: number, padDirection: "up" | "down", padding?: number): number;

View File

@@ -0,0 +1,18 @@
import { contextSizePad } from "../../../config.js";
export function padSafeContextSize(value, padDirection, padding = contextSizePad) {
const paddedSize = ggmlPad(value, padding);
if (paddedSize === value)
return value;
else if (padDirection === "up")
return paddedSize;
else if (padDirection === "down") {
const smallerPaddedSize = ggmlPad(value - padding, padding);
if (smallerPaddedSize >= padding)
return smallerPaddedSize;
}
return paddedSize;
}
// rounds `value` up to the nearest multiple of `padding` (`padding` is assumed to be a power of two)
function ggmlPad(value, padding) {
return ((value + padding - 1) & ~(padding - 1));
}
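// Illustration only, using an explicit padding of 32 (the real default comes from
// `contextSizePad` in the config):
//   padSafeContextSize(1024, "up", 32)   === 1024  (already a multiple of 32)
//   padSafeContextSize(1000, "up", 32)   === 1024  (rounded up)
//   padSafeContextSize(1000, "down", 32) === 992   (rounded down, but never below the padding)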
//# sourceMappingURL=padSafeContextSize.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"padSafeContextSize.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaContext/utils/padSafeContextSize.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,cAAc,EAAC,MAAM,oBAAoB,CAAC;AAElD,MAAM,UAAU,kBAAkB,CAAC,KAAa,EAAE,YAA2B,EAAE,UAAkB,cAAc;IAC3G,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAE3C,IAAI,UAAU,KAAK,KAAK;QACpB,OAAO,KAAK,CAAC;SACZ,IAAI,YAAY,KAAK,IAAI;QAC1B,OAAO,UAAU,CAAC;SACjB,IAAI,YAAY,KAAK,MAAM,EAAE,CAAC;QAC/B,MAAM,iBAAiB,GAAG,OAAO,CAAC,KAAK,GAAG,OAAO,EAAE,OAAO,CAAC,CAAC;QAC5D,IAAI,iBAAiB,IAAI,OAAO;YAC5B,OAAO,iBAAiB,CAAC;IACjC,CAAC;IAED,OAAO,UAAU,CAAC;AACtB,CAAC;AACD,SAAS,OAAO,CAAC,KAAa,EAAE,OAAe;IAC3C,OAAO,CAAC,CAAC,KAAK,GAAG,OAAO,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC;AACpD,CAAC"}

View File

@@ -0,0 +1,2 @@
import { BatchingOptions } from "../types.js";
export declare function resolveBatchItemsPrioritizationStrategy(strategy: Required<BatchingOptions>["itemPrioritizationStrategy"]): import("../types.js").CustomBatchingPrioritizationStrategy;

View File

@@ -0,0 +1,13 @@
import { maximumParallelismStrategy } from "./batchItemsPrioritizationStrategies/maximumParallelismStrategy.js";
import { firstInFirstOutStrategy } from "./batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js";
export function resolveBatchItemsPrioritizationStrategy(strategy) {
if (strategy instanceof Function)
return strategy;
else if (strategy === "maximumParallelism")
return maximumParallelismStrategy;
else if (strategy === "firstInFirstOut")
return firstInFirstOutStrategy;
void strategy;
throw new Error(`Unknown batch items prioritization strategy: ${strategy}`);
}
//# sourceMappingURL=resolveBatchItemsPrioritizationStrategy.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"resolveBatchItemsPrioritizationStrategy.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,0BAA0B,EAAC,MAAM,oEAAoE,CAAC;AAC9G,OAAO,EAAC,uBAAuB,EAAC,MAAM,iEAAiE,CAAC;AAExG,MAAM,UAAU,uCAAuC,CAAC,QAAiE;IACrH,IAAI,QAAQ,YAAY,QAAQ;QAC5B,OAAO,QAAQ,CAAC;SACf,IAAI,QAAQ,KAAK,oBAAoB;QACtC,OAAO,0BAA0B,CAAC;SACjC,IAAI,QAAQ,KAAK,iBAAiB;QACnC,OAAO,uBAAuB,CAAC;IAEnC,KAAM,QAAyB,CAAC;IAEhC,MAAM,IAAI,KAAK,CAAC,4CAA4C,QAAQ,EAAE,CAAC,CAAC;AAC5E,CAAC"}

View File

@@ -0,0 +1,21 @@
export type LlamaEmbeddingOptions = {
vector: readonly number[];
};
export type LlamaEmbeddingJSON = {
type: "embedding";
vector: readonly number[];
};
export declare class LlamaEmbedding {
readonly vector: readonly number[];
constructor(options: LlamaEmbeddingOptions);
toJSON(): LlamaEmbeddingJSON;
/**
* Calculates the cosine similarity between this embedding and another embedding.
*
* Note that you should only compare embeddings created by the exact same model file.
* @returns A value between 0 and 1 representing the similarity between the embedding vectors,
* where 1 means the embeddings are identical.
*/
calculateCosineSimilarity(other: LlamaEmbedding | LlamaEmbeddingJSON | readonly number[]): number;
static fromJSON(json: LlamaEmbeddingJSON): LlamaEmbedding;
}
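// Usage sketch (illustration only): comparing a query embedding against stored embeddings that
// were serialized with `toJSON()`. All embeddings are assumed to come from the same model file,
// as required by `calculateCosineSimilarity`.
function findMostSimilar(query: LlamaEmbedding, documents: Array<{text: string, embedding: LlamaEmbeddingJSON}>) {
    let best: {text: string, similarity: number} | undefined;

    for (const {text, embedding} of documents) {
        // `calculateCosineSimilarity` also accepts the JSON form (or a raw vector) directly
        const similarity = query.calculateCosineSimilarity(embedding);
        if (best == null || similarity > best.similarity)
            best = {text, similarity};
    }

    return best;
}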

View File

@@ -0,0 +1,53 @@
export class LlamaEmbedding {
vector;
constructor(options) {
this.vector = Object.freeze(options.vector.slice());
}
toJSON() {
return {
type: "embedding",
vector: this.vector
};
}
/**
* Calculates the cosine similarity between this embedding and another embedding.
*
* Note that you should only compare embeddings created by the exact same model file.
* @returns A value between 0 and 1 representing the similarity between the embedding vectors,
* where 1 means the embeddings are identical.
*/
calculateCosineSimilarity(other) {
const otherVector = other instanceof Array
? other
: other.vector;
if (otherVector == null)
throw new Error("Other vector is null");
else if (otherVector.length !== this.vector.length) {
if (otherVector.length === 0 || this.vector.length === 0)
return 0;
else
throw new Error("Vectors have different lengths");
}
let dotProduct = 0;
let thisMagnitude = 0;
let otherMagnitude = 0;
for (let i = 0; i < this.vector.length; i++) {
dotProduct += this.vector[i] * otherVector[i];
thisMagnitude += Math.pow(this.vector[i], 2);
otherMagnitude += Math.pow(otherVector[i], 2);
}
if (thisMagnitude === 0 && otherMagnitude === 0)
return 1;
else if (thisMagnitude === 0 || otherMagnitude === 0)
return 0;
const thisNorm = Math.sqrt(thisMagnitude);
const otherNorm = Math.sqrt(otherMagnitude);
return dotProduct / (thisNorm * otherNorm);
}
static fromJSON(json) {
return new LlamaEmbedding({
vector: json.vector
});
}
}
//# sourceMappingURL=LlamaEmbedding.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LlamaEmbedding.js","sourceRoot":"","sources":["../../src/evaluator/LlamaEmbedding.ts"],"names":[],"mappings":"AASA,MAAM,OAAO,cAAc;IACP,MAAM,CAAoB;IAE1C,YAAmB,OAA8B;QAC7C,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;IACxD,CAAC;IAEM,MAAM;QACT,OAAO;YACH,IAAI,EAAE,WAAW;YACjB,MAAM,EAAE,IAAI,CAAC,MAAM;SACtB,CAAC;IACN,CAAC;IAED;;;;;;OAMG;IACI,yBAAyB,CAAC,KAA8D;QAC3F,MAAM,WAAW,GAAG,KAAK,YAAY,KAAK;YACtC,CAAC,CAAC,KAAK;YACP,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC;QAEnB,IAAI,WAAW,IAAI,IAAI;YACnB,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;aACvC,IAAI,WAAW,CAAC,MAAM,KAAK,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YACjD,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,MAAM,KAAK,CAAC;gBACpD,OAAO,CAAC,CAAC;;gBAET,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;QAC1D,CAAC;QAED,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,cAAc,GAAG,CAAC,CAAC;QACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,UAAU,IAAI,IAAI,CAAC,MAAM,CAAC,CAAC,CAAE,GAAG,WAAW,CAAC,CAAC,CAAE,CAAC;YAChD,aAAa,IAAI,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC,CAAC;YAC9C,cAAc,IAAI,IAAI,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC,CAAC;QACnD,CAAC;QAED,IAAI,aAAa,KAAK,CAAC,IAAI,cAAc,KAAK,CAAC;YAC3C,OAAO,CAAC,CAAC;aACR,IAAI,aAAa,KAAK,CAAC,IAAI,cAAc,KAAK,CAAC;YAChD,OAAO,CAAC,CAAC;QAEb,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAE5C,OAAO,UAAU,GAAG,CAAC,QAAQ,GAAG,SAAS,CAAC,CAAC;IAC/C,CAAC;IAEM,MAAM,CAAC,QAAQ,CAAC,IAAwB;QAC3C,OAAO,IAAI,cAAc,CAAC;YACtB,MAAM,EAAE,IAAI,CAAC,MAAM;SACtB,CAAC,CAAC;IACP,CAAC;CACJ"}

View File

@@ -0,0 +1,52 @@
import { EventRelay } from "lifecycle-utils";
import { Token } from "../types.js";
import { LlamaText } from "../utils/LlamaText.js";
import { LlamaEmbedding } from "./LlamaEmbedding.js";
import type { LlamaModel } from "./LlamaModel/LlamaModel.js";
export type LlamaEmbeddingContextOptions = {
/**
* The number of tokens the model can see at once.
* - **`"auto"`** - adapt to the current VRAM state and attemp to set the context size as high as possible up to the size
* the model was trained on.
* - **`number`** - set the context size to a specific number of tokens.
* If there's not enough VRAM, an error will be thrown.
* Use with caution.
* - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attempt to set the context size as high as possible
* up to the size the model was trained on, but at least `min` and at most `max`.
*
* Defaults to `"auto"`.
*/
contextSize?: "auto" | number | {
min?: number;
max?: number;
};
/** Prompt processing batch size */
batchSize?: number;
/**
* The number of threads to use to evaluate tokens.
* Set to `0` to use the maximum number of threads supported by the current machine hardware.
*/
threads?: number;
/** An abort signal to abort the context creation */
createSignal?: AbortSignal;
/**
* Ignore insufficient memory errors and continue with the context creation.
* Can cause the process to crash if there's not enough VRAM for the new context.
*
* Defaults to `false`.
*/
ignoreMemorySafetyChecks?: boolean;
};
/**
* @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
*/
export declare class LlamaEmbeddingContext {
readonly onDispose: EventRelay<void>;
private constructor();
getEmbeddingFor(input: Token[] | string | LlamaText): Promise<LlamaEmbedding>;
dispose(): Promise<void>;
/** @hidden */
[Symbol.asyncDispose](): Promise<void>;
get disposed(): boolean;
get model(): LlamaModel;
}
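// Usage sketch (illustration only): creating an embedding context and comparing two inputs.
// `getLlama()` and `llama.loadModel(...)` are assumed from the package's public API, and the
// model path is a placeholder - see the Using Embedding tutorial linked above.
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
const embeddingContext = await model.createEmbeddingContext({contextSize: "auto"});

const helloEmbedding = await embeddingContext.getEmbeddingFor("Hello world");
const hiEmbedding = await embeddingContext.getEmbeddingFor("Hi there");
console.log("similarity:", helloEmbedding.calculateCosineSimilarity(hiEmbedding));

await embeddingContext.dispose();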

View File

@@ -0,0 +1,86 @@
import { AsyncDisposeAggregator, EventRelay, withLock } from "lifecycle-utils";
import { tokenizeInput } from "../utils/tokenizeInput.js";
import { resolveBeginningTokenToPrepend, resolveEndTokenToAppend } from "../utils/tokenizerUtils.js";
import { LlamaEmbedding } from "./LlamaEmbedding.js";
/**
* @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
*/
export class LlamaEmbeddingContext {
/** @internal */ _llamaContext;
/** @internal */ _sequence;
/** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
onDispose = new EventRelay();
constructor({ _llamaContext }) {
this._llamaContext = _llamaContext;
this._sequence = this._llamaContext.getSequence();
this._disposeAggregator.add(this._llamaContext.onDispose.createListener(() => {
void this._disposeAggregator.dispose();
}));
this._disposeAggregator.add(this.onDispose.dispatchEvent);
this._disposeAggregator.add(async () => {
await this._llamaContext.dispose();
});
}
async getEmbeddingFor(input) {
const resolvedInput = tokenizeInput(input, this._llamaContext.model.tokenizer, undefined, true);
if (resolvedInput.length > this._llamaContext.contextSize)
throw new Error("Input is longer than the context size. " +
"Try to increase the context size or use another model that supports longer contexts.");
else if (resolvedInput.length === 0)
return new LlamaEmbedding({
vector: []
});
const beginningToken = resolveBeginningTokenToPrepend(this.model.vocabularyType, this.model.tokens);
if (beginningToken != null && resolvedInput[0] !== beginningToken)
resolvedInput.unshift(beginningToken);
const endToken = resolveEndTokenToAppend(this.model.vocabularyType, this.model.tokens);
if (endToken != null && resolvedInput.at(-1) !== endToken)
resolvedInput.push(endToken);
return await withLock([this, "evaluate"], async () => {
await this._sequence.eraseContextTokenRanges([{
start: 0,
end: this._sequence.nextTokenIndex
}]);
const iterator = this._sequence.evaluate(resolvedInput, { _noSampling: true });
// eslint-disable-next-line @typescript-eslint/no-unused-vars
for await (const token of iterator) {
break; // only generate one token to get embeddings
}
const embedding = this._llamaContext._ctx.getEmbedding(resolvedInput.length);
const embeddingVector = Array.from(embedding);
return new LlamaEmbedding({
vector: embeddingVector
});
});
}
async dispose() {
await this._disposeAggregator.dispose();
}
/** @hidden */
[Symbol.asyncDispose]() {
return this.dispose();
}
get disposed() {
return this._llamaContext.disposed;
}
get model() {
return this._llamaContext.model;
}
/** @internal */
static async _create({ _model }, { contextSize, batchSize, threads = 6, createSignal, ignoreMemorySafetyChecks }) {
if (_model.fileInsights.hasEncoder && _model.fileInsights.hasDecoder)
throw new Error("Computing embeddings is not supported for encoder-decoder models.");
const llamaContext = await _model.createContext({
contextSize,
batchSize,
threads,
createSignal,
ignoreMemorySafetyChecks,
_embeddings: true
});
return new LlamaEmbeddingContext({
_llamaContext: llamaContext
});
}
}
//# sourceMappingURL=LlamaEmbeddingContext.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LlamaEmbeddingContext.js","sourceRoot":"","sources":["../../src/evaluator/LlamaEmbeddingContext.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,sBAAsB,EAAE,UAAU,EAAE,QAAQ,EAAC,MAAM,iBAAiB,CAAC;AAG7E,OAAO,EAAC,aAAa,EAAC,MAAM,2BAA2B,CAAC;AACxD,OAAO,EAAC,8BAA8B,EAAE,uBAAuB,EAAC,MAAM,4BAA4B,CAAC;AACnG,OAAO,EAAC,cAAc,EAAC,MAAM,qBAAqB,CAAC;AA2CnD;;GAEG;AACH,MAAM,OAAO,qBAAqB;IAC9B,gBAAgB,CAAkB,aAAa,CAAe;IAC9D,gBAAgB,CAAkB,SAAS,CAAuB;IAClE,gBAAgB,CAAkB,kBAAkB,GAAG,IAAI,sBAAsB,EAAE,CAAC;IAEpE,SAAS,GAAG,IAAI,UAAU,EAAQ,CAAC;IAEnD,YAAoB,EAChB,aAAa,EAGhB;QACG,IAAI,CAAC,aAAa,GAAG,aAAa,CAAC;QACnC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,WAAW,EAAE,CAAC;QAElD,IAAI,CAAC,kBAAkB,CAAC,GAAG,CACvB,IAAI,CAAC,aAAa,CAAC,SAAS,CAAC,cAAc,CAAC,GAAG,EAAE;YAC7C,KAAK,IAAI,CAAC,kBAAkB,CAAC,OAAO,EAAE,CAAC;QAC3C,CAAC,CAAC,CACL,CAAC;QACF,IAAI,CAAC,kBAAkB,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC,CAAC;QAC1D,IAAI,CAAC,kBAAkB,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE;YACnC,MAAM,IAAI,CAAC,aAAa,CAAC,OAAO,EAAE,CAAC;QACvC,CAAC,CAAC,CAAC;IACP,CAAC;IAEM,KAAK,CAAC,eAAe,CAAC,KAAmC;QAC5D,MAAM,aAAa,GAAG,aAAa,CAAC,KAAK,EAAE,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,SAAS,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;QAEhG,IAAI,aAAa,CAAC,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,WAAW;YACrD,MAAM,IAAI,KAAK,CACX,yCAAyC;gBACzC,sFAAsF,CACzF,CAAC;aACD,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC;YAC/B,OAAO,IAAI,cAAc,CAAC;gBACtB,MAAM,EAAE,EAAE;aACb,CAAC,CAAC;QAEP,MAAM,cAAc,GAAG,8BAA8B,CAAC,IAAI,CAAC,KAAK,CAAC,cAAc,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACpG,IAAI,cAAc,IAAI,IAAI,IAAI,aAAa,CAAC,CAAC,CAAC,KAAK,cAAc;YAC7D,aAAa,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;QAE1C,MAAM,QAAQ,GAAG,uBAAuB,CAAC,IAAI,CAAC,KAAK,CAAC,cAAc,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACvF,IAAI,QAAQ,IAAI,IAAI,IAAI,aAAa,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,QAAQ;YACrD,aAAa,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAEjC,OAAO,MAAM,QAAQ,CAAC,CAAC,IAA6B,EAAE,UAAU,CAAC,EAAE,KAAK,IAAI,EAAE;YAC1E,MAAM,IAAI,CAAC,SAAS,CAAC,uBAAuB,CAAC,CAAC;oBAC1C,KAAK,EAAE,CAAC;oBACR,GAAG,EAAE,IAAI,CAAC,SAAS,CAAC,cAAc;iBACrC,CAAC,CAAC,CAAC;YAEJ,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,aAAa,EAAE,EAAC,WAAW,EAAE,IAAI,EAAC,CAAC,CAAC;YAC7E,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,QAAQ,EAAE,CAAC;gBACjC,MAAM,CAAC,4CAA4C;YACvD,CAAC;YAED,MAAM,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAC7E,MAAM,eAAe,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAE9C,OAAO,IAAI,cAAc,CAAC;gBACtB,MAAM,EAAE,eAAe;aAC1B,CAAC,CAAC;QACP,CAAC,CAAC,CAAC;IACP,CAAC;IAEM,KAAK,CAAC,OAAO;QAChB,MAAM,IAAI,CAAC,kBAAkB,CAAC,OAAO,EAAE,CAAC;IAC5C,CAAC;IAED,cAAc;IACP,CAAC,MAAM,CAAC,YAAY,CAAC;QACxB,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;IAC1B,CAAC;IAED,IAAW,QAAQ;QACf,OAAO,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC;IACvC,CAAC;IAED,IAAW,KAAK;QACZ,OAAO,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC;IACpC,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,EACxB,MAAM,EAGT,EAAE,EACC,WAAW,EACX,SAAS,EACT,OAAO,GAAG,CAAC,EACX,YAAY,EACZ,wBAAwB,EACG;QAC3B,IAAI,MAAM,CAAC,YAAY,CAAC,UAAU,IAAI,MAAM,CAAC,YAAY,CAAC,UAAU;YAChE,MAAM,IAAI,KAAK,CAAC,mEAAmE,CAAC,CAAC;QAEzF,MAAM,YAAY,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC;YAC5C,WAAW;YACX,SAAS;YACT,OAAO;YACP,YAAY;YACZ,wBAAwB;YACxB,WAAW,EAAE,IAAI;SACpB,CAAC,CAAC;QAEH,OAAO,IAAI,qBAAqB,CAAC;YAC7B,aAAa,EAAE,YAAY;SAC9B,CAAC,CAAC;IACP,CAAC;CACJ"}

View File

@@ -0,0 +1,39 @@
import { LlamaText } from "../utils/LlamaText.js";
import { Llama } from "../bindings/Llama.js";
import { Token } from "../types.js";
export type LlamaGrammarOptions = {
/** GBNF grammar */
grammar: string;
/** Consider any of these as EOS for the generated text. Only supported by `LlamaChat` and `LlamaChatSession` */
stopGenerationTriggers?: readonly (LlamaText | string | readonly (string | Token)[])[];
/** Trim whitespace from the end of the generated text. Only supported by `LlamaChat` and `LlamaChatSession` */
trimWhitespaceSuffix?: boolean;
/**
* Root rule name.
*
* Defaults to `"root"`.
*/
rootRuleName?: string;
};
/**
* @see [Using Grammar](https://node-llama-cpp.withcat.ai/guide/grammar) tutorial
*/
export declare class LlamaGrammar {
/**
* > GBNF files are supported.
* > More info here: [
* github:ggml-org/llama.cpp:grammars/README.md
* ](https://github.com/ggml-org/llama.cpp/blob/f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26/grammars/README.md)
*
* Prefer to create a new instance of this class by using `llama.createGrammar(...)`.
* @deprecated Use `llama.createGrammar(...)` instead.
* @param llama
* @param options
*/
constructor(llama: Llama, { grammar, stopGenerationTriggers, trimWhitespaceSuffix, rootRuleName }: LlamaGrammarOptions);
get grammar(): string;
get rootRuleName(): string;
get stopGenerationTriggers(): readonly (string | import("../utils/LlamaText.js")._LlamaText | readonly (string | Token)[])[];
get trimWhitespaceSuffix(): boolean;
static getFor(llama: Llama, type: "json" | "json_arr" | "english" | "list" | "c" | "arithmetic" | "japanese" | "chess"): Promise<LlamaGrammar>;
}
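// Usage sketch (illustration only): loading one of the bundled grammars and constraining a chat
// response with it. `getLlama()`, `llama.loadModel(...)`, `LlamaChatSession` and the `grammar`
// prompt option are assumed from the package's public API (see the Using Grammar tutorial linked
// above); the model path is a placeholder.
import {getLlama, LlamaChatSession} from "node-llama-cpp";

const llama = await getLlama();
const grammar = await LlamaGrammar.getFor(llama, "json");

const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
const context = await model.createContext();
const session = new LlamaChatSession({contextSequence: context.getSequence()});

const res = await session.prompt("Describe the weather in JSON", {grammar});
console.log(JSON.parse(res));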

View File

@@ -0,0 +1,72 @@
import path from "path";
import fs from "fs-extra";
import { getGrammarsFolder } from "../utils/getGrammarsFolder.js";
import { LlamaText } from "../utils/LlamaText.js";
/**
* @see [Using Grammar](https://node-llama-cpp.withcat.ai/guide/grammar) tutorial
*/
export class LlamaGrammar {
/** @internal */ _llama;
/** @internal */ _grammar;
/** @internal */ _stopGenerationTriggers;
/** @internal */ _trimWhitespaceSuffix;
/** @internal */ _grammarText;
/** @internal */ _rootRuleName;
/**
* > GBNF files are supported.
* > More info here: [
* github:ggml-org/llama.cpp:grammars/README.md
* ](https://github.com/ggml-org/llama.cpp/blob/f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26/grammars/README.md)
*
* Prefer to create a new instance of this class by using `llama.createGrammar(...)`.
* @deprecated Use `llama.createGrammar(...)` instead.
* @param llama
* @param options
*/
constructor(llama, { grammar, stopGenerationTriggers = [], trimWhitespaceSuffix = false, rootRuleName = "root" }) {
this._llama = llama;
this._grammar = new this._llama._bindings.AddonGrammar(grammar, {
addonExports: this._llama._bindings,
rootRuleName
});
this._stopGenerationTriggers = stopGenerationTriggers ?? [];
this._trimWhitespaceSuffix = trimWhitespaceSuffix;
this._grammarText = grammar;
this._rootRuleName = rootRuleName;
}
get grammar() {
return this._grammarText;
}
get rootRuleName() {
return this._rootRuleName;
}
get stopGenerationTriggers() {
return this._stopGenerationTriggers;
}
get trimWhitespaceSuffix() {
return this._trimWhitespaceSuffix;
}
/**
* Test if the given text is compatible with the grammar.
* @internal
*/
_testText(text) {
return this._grammar.isTextCompatible(String(text));
}
static async getFor(llama, type) {
const grammarsFolder = await getGrammarsFolder(llama.buildType);
const grammarFile = path.join(grammarsFolder, type + ".gbnf");
if (await fs.pathExists(grammarFile)) {
const grammar = await fs.readFile(grammarFile, "utf8");
return new LlamaGrammar(llama, {
grammar,
stopGenerationTriggers: [LlamaText(["\n".repeat((type === "json" || type === "json_arr")
? 4
: 10)])], // this is a workaround for models that don't stop generating text
trimWhitespaceSuffix: true
});
}
throw new Error(`Grammar file for type "${type}" was not found in "${grammarsFolder}"`);
}
}
//# sourceMappingURL=LlamaGrammar.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LlamaGrammar.js","sourceRoot":"","sources":["../../src/evaluator/LlamaGrammar.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,MAAM,UAAU,CAAC;AAC1B,OAAO,EAAC,iBAAiB,EAAC,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAC,SAAS,EAAC,MAAM,uBAAuB,CAAC;AAwBhD;;GAEG;AACH,MAAM,OAAO,YAAY;IACrB,gBAAgB,CAAiB,MAAM,CAAQ;IAC/C,gBAAgB,CAAiB,QAAQ,CAAe;IACxD,gBAAgB,CAAkB,uBAAuB,CAAgE;IACzH,gBAAgB,CAAkB,qBAAqB,CAAU;IACjE,gBAAgB,CAAkB,YAAY,CAAS;IACvD,gBAAgB,CAAkB,aAAa,CAAS;IAExD;;;;;;;;;;OAUG;IACH,YAAmB,KAAY,EAAE,EAC7B,OAAO,EAAE,sBAAsB,GAAG,EAAE,EAAE,oBAAoB,GAAG,KAAK,EAAE,YAAY,GAAG,MAAM,EACvE;QAClB,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC;QACpB,IAAI,CAAC,QAAQ,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,OAAO,EAAE;YAC5D,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;YACnC,YAAY;SACf,CAAC,CAAC;QACH,IAAI,CAAC,uBAAuB,GAAG,sBAAsB,IAAI,EAAE,CAAC;QAC5D,IAAI,CAAC,qBAAqB,GAAG,oBAAoB,CAAC;QAClD,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC;QAC5B,IAAI,CAAC,aAAa,GAAG,YAAY,CAAC;IACtC,CAAC;IAED,IAAW,OAAO;QACd,OAAO,IAAI,CAAC,YAAY,CAAC;IAC7B,CAAC;IAED,IAAW,YAAY;QACnB,OAAO,IAAI,CAAC,aAAa,CAAC;IAC9B,CAAC;IAED,IAAW,sBAAsB;QAC7B,OAAO,IAAI,CAAC,uBAAuB,CAAC;IACxC,CAAC;IAED,IAAW,oBAAoB;QAC3B,OAAO,IAAI,CAAC,qBAAqB,CAAC;IACtC,CAAC;IAED;;;OAGG;IACI,SAAS,CAAC,IAAY;QACzB,OAAO,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;IACxD,CAAC;IAEM,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,KAAY,EAAE,IAA0F;QAC/H,MAAM,cAAc,GAAG,MAAM,iBAAiB,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;QAEhE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,EAAE,IAAI,GAAG,OAAO,CAAC,CAAC;QAE9D,IAAI,MAAM,EAAE,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;YACnC,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;YACvD,OAAO,IAAI,YAAY,CAAC,KAAK,EAAE;gBAC3B,OAAO;gBACP,sBAAsB,EAAE,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,MAAM,CAC3C,CAAC,IAAI,KAAK,MAAM,IAAI,IAAI,KAAK,UAAU,CAAC;4BACpC,CAAC,CAAC,CAAC;4BACH,CAAC,CAAC,EAAE,CACX,CAAC,CAAC,CAAC,EAAE,oEAAoE;gBAC1E,oBAAoB,EAAE,IAAI;aAC7B,CAAC,CAAC;QACP,CAAC;QAED,MAAM,IAAI,KAAK,CAAC,0BAA0B,IAAI,uBAAuB,cAAc,GAAG,CAAC,CAAC;IAC5F,CAAC;CACJ"}

View File

@@ -0,0 +1,19 @@
import type { LlamaGrammar } from "./LlamaGrammar.js";
import type { LlamaModel } from "./LlamaModel/LlamaModel.js";
export type LlamaGrammarEvaluationStateOptions = {
model: LlamaModel;
grammar: LlamaGrammar;
};
/**
* Grammar evaluation state is used to track the model response to determine the next allowed characters for the model to generate.
*
* Create a new grammar evaluation state for every response you generate with the model.
*
* This is only needed when using the `LlamaContext` class directly, since `LlamaChatSession` already handles this for you.
*/
export declare class LlamaGrammarEvaluationState {
constructor(options: LlamaGrammarEvaluationStateOptions);
constructor(existingState: LlamaGrammarEvaluationState);
/** Clone the grammar evaluation state */
clone(): LlamaGrammarEvaluationState;
}
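// Usage sketch (illustration only): when driving a context sequence directly, create a fresh
// state for every generated response, and clone it when branching a generation. Passing the
// state to the low-level evaluation call is not shown here - `LlamaChatSession` normally
// manages this for you.
function createStatesForResponse(model: LlamaModel, grammar: LlamaGrammar) {
    const state = new LlamaGrammarEvaluationState({model, grammar});
    const branchedState = state.clone(); // independent copy for a branched generation
    return {state, branchedState};
}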

View File

@@ -0,0 +1,29 @@
/**
* Grammar evaluation state is used to track the model response to determine the next allowed characters for the model to generate.
*
* Create a new grammar evaluation state for every response you generate with the model.
*
* This is only needed when using the `LlamaContext` class directly, since `LlamaChatSession` already handles this for you.
*/
export class LlamaGrammarEvaluationState {
/** @internal */ _llama;
/** @internal */ _state;
constructor(existingStateOrOptions) {
if (existingStateOrOptions instanceof LlamaGrammarEvaluationState) {
this._llama = existingStateOrOptions._llama;
this._state = new this._llama._bindings.AddonGrammarEvaluationState(existingStateOrOptions._state);
}
else {
const { model, grammar } = existingStateOrOptions;
this._llama = model._llama;
if (model._llama !== grammar._llama)
throw new Error("The given LlamaModel and LlamaGrammar must be from the same Llama instance");
this._state = new model._llama._bindings.AddonGrammarEvaluationState(model._model, grammar._grammar);
}
}
/** Clone the grammar evaluation state */
clone() {
return new LlamaGrammarEvaluationState(this);
}
}
//# sourceMappingURL=LlamaGrammarEvaluationState.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LlamaGrammarEvaluationState.js","sourceRoot":"","sources":["../../src/evaluator/LlamaGrammarEvaluationState.ts"],"names":[],"mappings":"AAWA;;;;;;GAMG;AACH,MAAM,OAAO,2BAA2B;IACpC,gBAAgB,CAAiB,MAAM,CAAQ;IAC/C,gBAAgB,CAAiB,MAAM,CAA8B;IAIrE,YAAmB,sBAAwF;QACvG,IAAI,sBAAsB,YAAY,2BAA2B,EAAE,CAAC;YAChE,IAAI,CAAC,MAAM,GAAG,sBAAsB,CAAC,MAAM,CAAC;YAC5C,IAAI,CAAC,MAAM,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,2BAA2B,CAAC,sBAAsB,CAAC,MAAM,CAAC,CAAC;QACvG,CAAC;aAAM,CAAC;YACJ,MAAM,EAAC,KAAK,EAAE,OAAO,EAAC,GAAG,sBAAsB,CAAC;YAChD,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;YAE3B,IAAI,KAAK,CAAC,MAAM,KAAK,OAAO,CAAC,MAAM;gBAC/B,MAAM,IAAI,KAAK,CAAC,4EAA4E,CAAC,CAAC;YAElG,IAAI,CAAC,MAAM,GAAG,IAAI,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,2BAA2B,CAAC,KAAK,CAAC,MAAM,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzG,CAAC;IACL,CAAC;IAED,yCAAyC;IAClC,KAAK;QACR,OAAO,IAAI,2BAA2B,CAAC,IAAI,CAAC,CAAC;IACjD,CAAC;CACJ"}

View File

@@ -0,0 +1,17 @@
import { GbnfJsonDefList, GbnfJsonSchema, GbnfJsonSchemaToType } from "../utils/gbnfJson/types.js";
import { Llama } from "../bindings/Llama.js";
import { LlamaGrammar } from "./LlamaGrammar.js";
/**
* @see [Using a JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#json-schema) tutorial
* @see [Reducing Hallucinations When Using JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#reducing-json-schema-hallucinations) tutorial
*/
export declare class LlamaJsonSchemaGrammar<const T extends GbnfJsonSchema<Defs>, const Defs extends GbnfJsonDefList<Defs> = Record<any, any>> extends LlamaGrammar {
private readonly _schema;
/**
* Prefer to create a new instance of this class by using `llama.createGrammarForJsonSchema(...)`.
* @deprecated Use `llama.createGrammarForJsonSchema(...)` instead.
*/
constructor(llama: Llama, schema: Readonly<T> & GbnfJsonSchema<Defs>);
get schema(): Readonly<T>;
parse(json: string): GbnfJsonSchemaToType<T>;
}
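// Usage sketch (illustration only): the non-deprecated path goes through
// `llama.createGrammarForJsonSchema(...)` as noted above. The exact schema shape and the
// `grammar` prompt option are assumptions for illustration - see the JSON Schema Grammar
// tutorial linked above for the supported schema format.
import {getLlama, LlamaChatSession} from "node-llama-cpp";

const llama = await getLlama();
const grammar = await llama.createGrammarForJsonSchema({
    type: "object",
    properties: {
        title: {type: "string"},
        positiveReview: {type: "boolean"}
    }
});

const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
const context = await model.createContext();
const session = new LlamaChatSession({contextSequence: context.getSequence()});

const res = await session.prompt("Summarize this product review as JSON: ...", {grammar});
const parsed = grammar.parse(res); // validated against the schema and typed accordingly
console.log(parsed.title, parsed.positiveReview);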

View File

@@ -0,0 +1,35 @@
import { getGbnfGrammarForGbnfJsonSchema } from "../utils/gbnfJson/getGbnfGrammarForGbnfJsonSchema.js";
import { validateObjectAgainstGbnfSchema } from "../utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js";
import { LlamaText } from "../utils/LlamaText.js";
import { LlamaGrammar } from "./LlamaGrammar.js";
/* eslint-disable @stylistic/max-len */
/**
* @see [Using a JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#json-schema) tutorial
* @see [Reducing Hallucinations When Using JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#reducing-json-schema-hallucinations) tutorial
*/
export class LlamaJsonSchemaGrammar extends LlamaGrammar {
_schema;
/**
* Prefer to create a new instance of this class by using `llama.createGrammarForJsonSchema(...)`.
* @deprecated Use `llama.createGrammarForJsonSchema(...)` instead.
*/
constructor(llama, schema) {
const grammar = getGbnfGrammarForGbnfJsonSchema(schema);
super(llama, {
grammar,
stopGenerationTriggers: [LlamaText(["\n".repeat(4)])],
trimWhitespaceSuffix: true
});
this._schema = schema;
}
get schema() {
return this._schema;
}
parse(json) {
const parsedJson = JSON.parse(json);
validateObjectAgainstGbnfSchema(parsedJson, this._schema);
return parsedJson;
}
}
/* eslint-enable @stylistic/max-len */
//# sourceMappingURL=LlamaJsonSchemaGrammar.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LlamaJsonSchemaGrammar.js","sourceRoot":"","sources":["../../src/evaluator/LlamaJsonSchemaGrammar.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,+BAA+B,EAAC,MAAM,sDAAsD,CAAC;AACrG,OAAO,EAAC,+BAA+B,EAAC,MAAM,4DAA4D,CAAC;AAC3G,OAAO,EAAC,SAAS,EAAC,MAAM,uBAAuB,CAAC;AAEhD,OAAO,EAAC,YAAY,EAAC,MAAM,mBAAmB,CAAC;AAE/C,uCAAuC;AACvC;;;GAGG;AACH,MAAM,OAAO,sBAGX,SAAQ,YAAY;IACD,OAAO,CAAI;IAE5B;;;OAGG;IACH,YAAmB,KAAY,EAAE,MAA0C;QACvE,MAAM,OAAO,GAAG,+BAA+B,CAAC,MAAM,CAAC,CAAC;QAExD,KAAK,CAAC,KAAK,EAAE;YACT,OAAO;YACP,sBAAsB,EAAE,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACrD,oBAAoB,EAAE,IAAI;SAC7B,CAAC,CAAC;QAEH,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC;IAC1B,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,OAAO,CAAC;IACxB,CAAC;IAEM,KAAK,CAAC,IAAY;QACrB,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAEpC,+BAA+B,CAAC,UAAU,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;QAE1D,OAAO,UAAU,CAAC;IACtB,CAAC;CACJ;AACD,sCAAsC"}

View File

@@ -0,0 +1,311 @@
import { EventRelay } from "lifecycle-utils";
import { Token, Tokenizer } from "../../types.js";
import { ModelTypeDescription } from "../../bindings/AddonTypes.js";
import { LlamaVocabularyType } from "../../bindings/types.js";
import { GgufFileInfo } from "../../gguf/types/GgufFileInfoTypes.js";
import { GgufInsights } from "../../gguf/insights/GgufInsights.js";
import { LlamaContextOptions } from "../LlamaContext/types.js";
import { LlamaContext } from "../LlamaContext/LlamaContext.js";
import { LlamaEmbeddingContext, LlamaEmbeddingContextOptions } from "../LlamaEmbeddingContext.js";
import { GgufMetadata } from "../../gguf/types/GgufMetadataTypes.js";
import { OverridesObject } from "../../utils/OverridesObject.js";
import { LlamaRankingContext, LlamaRankingContextOptions } from "../LlamaRankingContext.js";
import { TokenAttributes } from "./utils/TokenAttributes.js";
import type { Llama } from "../../bindings/Llama.js";
import type { BuiltinSpecialTokenValue } from "../../utils/LlamaText.js";
export type LlamaModelOptions = {
/** path to the model on the filesystem */
modelPath: string;
/**
* Number of layers to store in VRAM.
* - **`"auto"`** - adapt to the current VRAM state and try to fit as many layers as possible in it.
* Takes into account the VRAM required to create a context with a `contextSize` set to `"auto"`.
* - **`"max"`** - store all layers in VRAM. If there's not enough VRAM, an error will be thrown. Use with caution.
* - **`number`** - store the specified number of layers in VRAM. If there's not enough VRAM, an error will be thrown. Use with caution.
* - **`{min?: number, max?: number, fitContext?: {contextSize: number}}`** - adapt to the current VRAM state and try to fit as
* many layers as possible in it, but at least `min` and at most `max` layers. Set `fitContext` to the parameters of a context you
* intend to create with the model, so it'll take it into account in the calculations and leave enough memory for such a context.
*
* If GPU support is disabled, will be set to `0` automatically.
*
* Defaults to `"auto"`.
*/
gpuLayers?: "auto" | "max" | number | {
min?: number;
max?: number;
fitContext?: {
contextSize?: number;
/**
* Defaults to `false`.
*/
embeddingContext?: boolean;
};
};
/**
* Only load the vocabulary, not weight tensors.
*
* Useful when you only want to use the model's tokenizer but not use it for evaluation.
*
* Defaults to `false`.
*/
vocabOnly?: boolean;
/**
* Use mmap (memory-mapped file) to load the model.
*
* Using mmap allows the OS to load the model tensors directly from the file on the filesystem,
* and makes it easier for the system to manage memory.
*
* When using mmap, you might notice a delay the first time you actually use the model,
* which is caused by the OS itself loading the model into memory.
*
* Defaults to `true` if the current system supports it.
*/
useMmap?: boolean;
/**
* Direct I/O is a method of reading and writing data to and from the storage device directly to the application memory,
* bypassing OS in-memory caches.
*
* It leads to improved model loading times and reduced RAM usage,
* at the expense of higher loading times when the model is unloaded and loaded again repeatedly in a short period of time.
*
* When this option is enabled, if Direct I/O is supported by the system (and for the given file)
* it will be used and mmap will be disabled.
*
* Unsupported on macOS.
*
* Defaults to `true`.
*/
useDirectIo?: boolean;
/**
* Force the system to keep the model in the RAM/VRAM.
* Use with caution as this can crash your system if the available resources are insufficient.
*/
useMlock?: boolean;
/**
* Check for tensor validity before actually loading the model.
* Using it increases the time it takes to load the model.
*
* Defaults to `false`.
*/
checkTensors?: boolean;
/**
* Enable flash attention by default for contexts created with this model.
* Only works with models that support flash attention.
*
* Flash attention is an optimization in the attention mechanism that makes inference faster and more efficient, and uses less memory.
*
* The support for flash attention is currently experimental and may not always work as expected.
* Use with caution.
*
* This option will be ignored if flash attention is not supported by the model.
*
* Enabling this affects the calculations of default values for the model and contexts created with it
* as flash attention reduces the amount of memory required,
* which allows for more layers to be offloaded to the GPU and for context sizes to be bigger.
*
* Defaults to `false`.
*
* Once flash attention exits its experimental status, the default value will become `true`.
*/
defaultContextFlashAttention?: boolean;
/**
* When using SWA (Sliding Window Attention) on a supported model,
* extend the sliding window size to the current context size (meaning practically disabling SWA)
* by default for contexts created with this model.
*
* See the `swaFullCache` option of the `.createContext()` method for more information.
*
* Defaults to `false`.
*/
defaultContextSwaFullCache?: boolean;
/**
* Called with the load percentage when the model is being loaded.
* @param loadProgress - a number between 0 (exclusive) and 1 (inclusive).
*/
onLoadProgress?(loadProgress: number): void;
/** An abort signal to abort the model load */
loadSignal?: AbortSignal;
/**
* Ignore insufficient memory errors and continue with the model load.
* Can cause the process to crash if there's not enough VRAM to fit the model.
*
* Defaults to `false`.
*/
ignoreMemorySafetyChecks?: boolean;
/**
* Metadata overrides to load the model with.
*
* > **Note:** Most metadata value overrides aren't supported and overriding them will have no effect on `llama.cpp`.
* > Only use this for metadata values that are explicitly documented to be supported by `llama.cpp` to be overridden,
* > and only in cases when this is crucial, as this is not guaranteed to always work as expected.
*/
metadataOverrides?: OverridesObject<GgufMetadata, number | bigint | boolean | string>;
};
export declare class LlamaModel {
readonly tokenizer: Tokenizer;
readonly onDispose: EventRelay<void>;
private constructor();
dispose(): Promise<void>;
/** @hidden */
[Symbol.asyncDispose](): Promise<void>;
get disposed(): boolean;
get llama(): Llama;
get tokens(): LlamaModelTokens;
get filename(): string | undefined;
get fileInfo(): GgufFileInfo;
get fileInsights(): GgufInsights;
/**
* Number of layers offloaded to the GPU.
* If GPU support is disabled, this will always be `0`.
*/
get gpuLayers(): number;
/**
* Total model size in memory in bytes.
*
* When using mmap, actual memory usage may be higher than this value due to `llama.cpp`'s performance optimizations.
*/
get size(): number;
get flashAttentionSupported(): boolean;
get defaultContextFlashAttention(): boolean;
get defaultContextSwaFullCache(): boolean;
/**
* Transform text into tokens that can be fed to the model
* @param text - the text to tokenize
* @param [specialTokens] - if set to `true`, text that corresponds to special tokens will be tokenized to those tokens.
* For example, `<s>` will be tokenized to the BOS token if `specialTokens` is set to `true`,
* otherwise it will be tokenized to tokens that correspond to the plaintext `<s>` string.
* @param [options] - additional options for tokenization.
* If set to `"trimLeadingSpace"`, a leading space will be trimmed from the tokenized output if the output has an
* additional space at the beginning.
*/
tokenize(text: string, specialTokens?: boolean, options?: "trimLeadingSpace"): Token[];
tokenize(text: BuiltinSpecialTokenValue, specialTokens: "builtin"): Token[];
/**
* Transform tokens into text
* @param tokens - the tokens to detokenize.
* @param [specialTokens] - if set to `true`, special tokens will be detokenized to their corresponding token text representation.
*
* Recommended for debugging purposes only.
*
* > **Note:** there may be additional spaces around special tokens that were not present in the original text - this is not a bug,
* this is [how the tokenizer is supposed to work](https://github.com/ggml-org/llama.cpp/pull/7697#issuecomment-2144003246).
*
* Defaults to `false`.
* @param [lastTokens] - the last few tokens that preceded the tokens to detokenize.
* If provided, the last few tokens will be used to determine whether a space has to be added before the current tokens or not,
* and apply other detokenizer-specific heuristics to provide the correct text continuation to the existing tokens.
*
* Using it may have no effect with some models, but it is still recommended.
*/
detokenize(tokens: readonly Token[], specialTokens?: boolean, lastTokens?: readonly Token[]): string;
getTokenAttributes(token: Token): TokenAttributes;
/** Check whether the given token is a special token (a control-type token or a token with no normal text representation) */
isSpecialToken(token: Token | undefined): boolean;
iterateAllTokens(): Generator<Token, void, unknown>;
/** Check whether the given token is an EOG (End Of Generation) token, like EOS or EOT. */
isEogToken(token: Token | undefined): boolean;
createContext(options?: LlamaContextOptions): Promise<LlamaContext>;
/**
* @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
*/
createEmbeddingContext(options?: LlamaEmbeddingContextOptions): Promise<LlamaEmbeddingContext>;
/**
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
*/
createRankingContext(options?: LlamaRankingContextOptions): Promise<LlamaRankingContext>;
/**
* Get warnings about the model file that would affect its usage.
*
* These warnings include all the warnings generated by `GgufInsights`, but are more comprehensive.
*/
getWarnings(): string[];
/** @hidden `ModelTypeDescription` type alias is too long in the documentation */
get typeDescription(): ModelTypeDescription;
/** The context size the model was trained on */
get trainContextSize(): number;
/** The size of an embedding vector the model can produce */
get embeddingVectorSize(): number;
get vocabularyType(): LlamaVocabularyType;
}
export declare class LlamaModelTokens {
private constructor();
/**
* @returns infill tokens
*/
get infill(): LlamaModelInfillTokens;
/**
* @returns The BOS (Beginning Of Sequence) token.
*/
get bos(): Token | null;
/**
* @returns The EOS (End Of Sequence) token.
*/
get eos(): Token | null;
/**
* @returns The EOT (End Of Turn) token.
*/
get eot(): Token | null;
/**
* @returns The SEP (Sentence Separator) token.
*/
get sep(): Token | null;
/**
* @returns The NL (New Line) token.
*/
get nl(): Token | null;
/**
* @returns The BOS (Beginning Of Sequence) token text representation.
*/
get bosString(): string | null;
/**
* @returns The EOS (End Of Sequence) token text representation.
*/
get eosString(): string | null;
/**
* @returns The EOT (End Of Turn) token text representation.
*/
get eotString(): string | null;
/**
* @returns The SEP (Sentence Separator) token text representation.
*/
get sepString(): string | null;
/**
* @returns The NL (New Line) token text representation.
*/
get nlString(): string | null;
/**
* @returns Whether we should prepend a BOS (Beginning Of Sequence) token for evaluations with this model.
*/
get shouldPrependBosToken(): boolean;
/**
* @returns Whether we should append an EOS (End Of Sequence) token for evaluations with this model.
*/
get shouldAppendEosToken(): boolean;
}
export declare class LlamaModelInfillTokens {
private constructor();
/**
* @returns The beginning of infill prefix token.
*/
get prefix(): Token | null;
/**
* @returns The beginning of infill middle token.
*/
get middle(): Token | null;
/**
* @returns The beginning of infill suffix token.
*/
get suffix(): Token | null;
/**
* @returns The beginning of infill prefix token as a string.
*/
get prefixString(): string | null;
/**
* @returns The beginning of infill middle token as a string.
*/
get middleString(): string | null;
/**
* @returns The beginning of infill suffix token as a string.
*/
get suffixString(): string | null;
}
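// Usage sketch (illustration only): loading a model with explicit memory-related options and
// round-tripping text through the tokenizer. `getLlama()` and `llama.loadModel(...)` are assumed
// from the package's public API; the model path and option values are placeholders.
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({
    modelPath: "path/to/model.gguf",
    gpuLayers: "auto", // offload as many layers as the current VRAM state allows
    useMmap: true,
    onLoadProgress(loadProgress) {
        console.log(`loading: ${Math.round(loadProgress * 100)}%`);
    }
});

console.log(model.getWarnings());
console.log("train context size:", model.trainContextSize);

const tokens = model.tokenize("Hello world");
console.log(tokens.length, "tokens");
console.log(model.detokenize(tokens)); // "Hello world"

// when detokenizing a continuation, pass the preceding tokens so spacing stays correct
const firstPart = tokens.slice(0, 1);
const rest = tokens.slice(1);
console.log(model.detokenize(firstPart) + model.detokenize(rest, false, firstPart));

await model.dispose();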

View File

@@ -0,0 +1,832 @@
import process from "process";
import path from "path";
import { AsyncDisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { removeNullFields } from "../../utils/removeNullFields.js";
import { DisposeGuard } from "../../utils/DisposeGuard.js";
import { LlamaLocks, LlamaLogLevel, LlamaVocabularyType, LlamaVocabularyTypeValues } from "../../bindings/types.js";
import { readGgufFileInfo } from "../../gguf/readGgufFileInfo.js";
import { GgufInsights } from "../../gguf/insights/GgufInsights.js";
import { getConsoleLogPrefix } from "../../utils/getConsoleLogPrefix.js";
import { getReadablePath } from "../../cli/utils/getReadablePath.js";
import { LlamaContext } from "../LlamaContext/LlamaContext.js";
import { LlamaEmbeddingContext } from "../LlamaEmbeddingContext.js";
import { GgufArchitectureType } from "../../gguf/types/GgufMetadataTypes.js";
import { maxRecentDetokenizerTokens } from "../../consts.js";
import { LlamaRankingContext } from "../LlamaRankingContext.js";
import { TokenAttribute, TokenAttributes } from "./utils/TokenAttributes.js";
const defaultUseMmap = true;
const defaultUseDirectIo = true;
const defaultContextFlashAttentionEnabled = false;
const defaultContextSwaFullCache = false;
export class LlamaModel {
/** @internal */ _llama;
/** @internal */ _model;
/** @internal */ _backendModelDisposeGuard;
/** @internal */ _tokens;
/** @internal */ _modelPath;
/** @internal */ _fileInfo;
/** @internal */ _fileInsights;
/** @internal */ _gpuLayers;
/** @internal */ _vocabOnly;
/** @internal */ _filename;
/** @internal */ _disposedState = { disposed: false };
/** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
/** @internal */ _llamaPreventDisposalHandle;
/** @internal */ _defaultContextFlashAttentionOptionEnabled;
/** @internal */ _defaultContextFlashAttention;
/** @internal */ _defaultContextSwaFullCache;
/** @internal */ _flashAttentionSupported;
/** @internal */ _loraAdapters = new Map();
/** @internal */ _typeDescription;
/** @internal */ _trainContextSize;
/** @internal */ _embeddingVectorSize;
/** @internal */ _vocabularyType;
tokenizer;
onDispose = new EventRelay();
constructor({ modelPath, gpuLayers, vocabOnly = false, useMmap, useDirectIo, useMlock, checkTensors, onLoadProgress, loadSignal, metadataOverrides }, { _llama, _fileInfo, _fileInsights, _defaultContextFlashAttentionOptionEnabled, _defaultContextFlashAttention, _defaultContextSwaFullCache, _flashAttentionSupported }) {
this._llama = _llama;
this._fileInfo = _fileInfo;
this._modelPath = path.resolve(process.cwd(), modelPath);
this._fileInsights = _fileInsights;
this._gpuLayers = gpuLayers;
this._vocabOnly = vocabOnly ?? false;
this._backendModelDisposeGuard = new DisposeGuard([this._llama._backendDisposeGuard]);
this._llamaPreventDisposalHandle = this._llama._backendDisposeGuard.createPreventDisposalHandle();
this._defaultContextFlashAttentionOptionEnabled = _defaultContextFlashAttentionOptionEnabled;
this._defaultContextFlashAttention = _defaultContextFlashAttention;
this._defaultContextSwaFullCache = _defaultContextSwaFullCache;
this._flashAttentionSupported = _flashAttentionSupported;
const overridesList = ggufMetadataOverridesToList(metadataOverrides);
this._model = new this._llama._bindings.AddonModel(this._modelPath, removeNullFields({
addonExports: this._llama._bindings,
gpuLayers,
vocabOnly: this._vocabOnly,
useMmap,
useDirectIo,
useMlock: _llama.supportsMlock
? useMlock
: undefined,
checkTensors: checkTensors ?? false,
onLoadProgress: onLoadProgress == null
? undefined
: (loadPercentage) => {
try {
onLoadProgress(loadPercentage);
}
catch (err) {
// the native addon code calls this function, so there's no use to throw an error here
console.error(err);
}
},
hasLoadAbortSignal: loadSignal != null,
overridesList: overridesList.length > 0
? overridesList
: undefined
}));
this._tokens = LlamaModelTokens._create(this._model, this._disposedState);
this._filename = path.basename(modelPath);
this._disposeAggregator.add(() => {
this._disposedState.disposed = true;
});
this._disposeAggregator.add(this.onDispose.dispatchEvent);
this._disposeAggregator.add(this._llama.onDispose.createListener(disposeModelIfReferenced.bind(null, new WeakRef(this))));
this._disposeAggregator.add(async () => {
await this._backendModelDisposeGuard.acquireDisposeLock();
await this._model.dispose();
this._llamaPreventDisposalHandle.dispose();
});
this.tokenize = this.tokenize.bind(this);
this.detokenize = this.detokenize.bind(this);
this.isSpecialToken = this.isSpecialToken.bind(this);
this.isEogToken = this.isEogToken.bind(this);
this.tokenize.detokenize = this.detokenize;
this.tokenize.isSpecialToken = this.isSpecialToken;
this.tokenize.isEogToken = this.isEogToken;
Object.freeze(this.tokenize);
this.tokenizer = this.tokenize;
}
async dispose() {
if (this._disposedState.disposed)
return;
this._disposedState.disposed = true;
await this._disposeAggregator.dispose();
}
/** @hidden */
async [Symbol.asyncDispose]() {
await this.dispose();
}
get disposed() {
return this._disposedState.disposed;
}
get llama() {
return this._llama;
}
get tokens() {
return this._tokens;
}
get filename() {
return this._filename;
}
get fileInfo() {
return this._fileInfo;
}
get fileInsights() {
return this._fileInsights;
}
/**
* Number of layers offloaded to the GPU.
* If GPU support is disabled, this will always be `0`.
*/
get gpuLayers() {
return this._gpuLayers;
}
/**
* Total model size in memory in bytes.
*
* When using mmap, actual memory usage may be higher than this value due to `llama.cpp`'s performance optimizations.
*/
get size() {
this._ensureNotDisposed();
return this._model.getModelSize();
}
get flashAttentionSupported() {
return this._flashAttentionSupported;
}
get defaultContextFlashAttention() {
return this._defaultContextFlashAttention;
}
get defaultContextSwaFullCache() {
return this._defaultContextSwaFullCache;
}
tokenize(text, specialTokens = false, options) {
this._ensureNotDisposed();
if (text === "")
return [];
if (specialTokens === "builtin") {
const builtinToken = text;
switch (builtinToken) {
case "BOS": return this.tokens.bos == null ? [] : [this.tokens.bos];
case "EOS": return this.tokens.eos == null ? [] : [this.tokens.eos];
case "NL": return this.tokens.nl == null ? [] : [this.tokens.nl];
case "EOT": return this.tokens.eot == null ? [] : [this.tokens.eot];
case "SEP": return this.tokens.sep == null ? [] : [this.tokens.sep];
}
void builtinToken;
throw new Error(`Unknown builtin special token: ${builtinToken}`);
}
if (options === "trimLeadingSpace") {
if (specialTokens) {
const countLeadingSpaces = (text) => {
let count = 0;
for (; count < text.length; count++) {
if (text[count] !== " ")
break;
}
return count;
};
const textLeadingSpaces = countLeadingSpaces(text);
const [workaroundToken, workaroundTokenString] = (this.tokens.bos != null && this.tokens.bosString != null)
? [this.tokens.bos, this.tokens.bosString]
: (this.tokens.eos != null && this.tokens.eosString != null)
? [this.tokens.eos, this.tokens.eosString]
: (this.tokens.nl != null && this.tokens.nlString != null)
? [this.tokens.nl, this.tokens.nlString]
: (this.tokens.eot != null && this.tokens.eotString != null)
? [this.tokens.eot, this.tokens.eotString]
: [null, null];
if (workaroundToken != null && workaroundTokenString != null) {
const tokens = Array.from(this._model.tokenize(workaroundTokenString + text, true));
const workaroundTokenIndex = tokens.indexOf(workaroundToken);
// only use the tokenized output if it can be corrected, otherwise fallback to the default tokenization
if (workaroundTokenIndex >= 0 && workaroundTokenIndex <= 1) {
tokens.splice(0, workaroundTokenIndex + 1);
if (countLeadingSpaces(this.detokenize(tokens, true)) === textLeadingSpaces)
return tokens;
}
}
const workaroundTokensString = "\n";
const workaroundTokens = Array.from(this._model.tokenize(workaroundTokensString, true));
if (text.startsWith(workaroundTokensString)) {
const tokens = Array.from(this._model.tokenize(text, true));
if (this.detokenize(tokens, true).startsWith(workaroundTokensString))
return tokens;
}
const tokens = Array.from(this._model.tokenize(workaroundTokensString + text, true));
// only use the tokenized output if it can be corrected, otherwise fallback to the default tokenization
if (workaroundTokens.length > 0 && workaroundTokens.every((token, index) => tokens[index] === token)) {
tokens.splice(0, workaroundTokens.length);
if (countLeadingSpaces(this.detokenize(tokens, true)) === textLeadingSpaces)
return tokens;
}
}
else {
const workaroundTokensString = "\n";
const workaroundTokens = Array.from(this._model.tokenize(workaroundTokensString, false));
if (text.startsWith(workaroundTokensString)) {
const tokens = Array.from(this._model.tokenize(text, false));
if (this.detokenize(tokens, false).startsWith(workaroundTokensString))
return tokens;
}
const tokens = Array.from(this._model.tokenize(workaroundTokensString + text, false));
// only use the tokenized output if it can be corrected, otherwise fallback to the default tokenization
if (workaroundTokens.length > 0 && workaroundTokens.every((token, index) => tokens[index] === token)) {
tokens.splice(0, workaroundTokens.length);
return tokens;
}
}
}
return Array.from(this._model.tokenize(text, specialTokens));
}
/**
* Transform tokens into text
* @param tokens - the tokens to detokenize.
* @param [specialTokens] - if set to `true`, special tokens will be detokenized to their corresponding token text representation.
*
* Recommended for debugging purposes only.
*
* > **Note:** there may be additional spaces around special tokens that were not present in the original text - this is not a bug,
* this is [how the tokenizer is supposed to work](https://github.com/ggml-org/llama.cpp/pull/7697#issuecomment-2144003246).
*
* Defaults to `false`.
* @param [lastTokens] - the last few tokens that preceded the tokens to detokenize.
* If provided, the last few tokens will be used to determine whether a space has to be added before the current tokens or not,
* and apply other detokenizer-specific heuristics to provide the correct text continuation to the existing tokens.
*
* Using it may have no effect with some models, but it is still recommended.
*/
detokenize(tokens, specialTokens = false, lastTokens) {
this._ensureNotDisposed();
if (tokens.length === 0)
return "";
if (lastTokens == null || lastTokens.length === 0)
return this._model.detokenize(Uint32Array.from(tokens), Boolean(specialTokens));
const addedTokens = lastTokens.slice(-maxRecentDetokenizerTokens);
const addedTokensText = this._model.detokenize(Uint32Array.from(addedTokens), Boolean(specialTokens));
if (addedTokensText === "")
return this._model.detokenize(Uint32Array.from(tokens), Boolean(specialTokens));
const text = this._model.detokenize(Uint32Array.from([...addedTokens, ...tokens]), Boolean(specialTokens));
if (text.startsWith(addedTokensText))
return text.slice(addedTokensText.length);
return this._model.detokenize(Uint32Array.from(tokens), Boolean(specialTokens));
}
getTokenAttributes(token) {
if (token == null)
throw new Error("Token cannot be null");
if (this.vocabularyType === LlamaVocabularyType.none)
return TokenAttributes._create(token, TokenAttribute.undefined);
return TokenAttributes._create(token, this._model.getTokenAttributes(token));
}
/** Check whether the given token is a special token (a control-type token or a token with no normal text representation) */
isSpecialToken(token) {
if (token == null)
return false;
if (this.getTokenAttributes(token).control)
return true;
const normalText = this.detokenize([token], false);
if (normalText === "")
return this.detokenize([token], true) !== "";
return false;
}
*iterateAllTokens() {
if (this.vocabularyType === LlamaVocabularyType.none)
return;
const totalTokens = this.fileInfo.metadata?.tokenizer?.ggml?.tokens?.length;
if (typeof totalTokens !== "number")
return;
for (let i = 0; i < totalTokens; i++)
yield i;
}
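    // Usage sketch (illustrative only) combining iterateAllTokens(), isSpecialToken() and
    // getTokenAttributes() to list the model's special tokens; `model` is a placeholder.
    //
    //     const specialTokens = [];
    //     for (const token of model.iterateAllTokens()) {
    //         if (model.isSpecialToken(token))
    //             specialTokens.push({token, control: model.getTokenAttributes(token).control});
    //     }
    //     console.log(specialTokens.length, "special tokens found");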
/** Check whether the given token is an EOG (End Of Generation) token, like EOS or EOT. */
isEogToken(token) {
if (token == null)
return false;
return token === this.tokens.eos || token === this.tokens.eot || this._model.isEogToken(token);
}
async createContext(options = {}) {
if (this._vocabOnly)
throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
return await withLock([this._llama._memoryLock, LlamaLocks.loadToMemory], options.createSignal, async () => {
const preventDisposalHandle = this._backendModelDisposeGuard.createPreventDisposalHandle();
try {
return await LlamaContext._create(options, { _model: this });
}
finally {
preventDisposalHandle.dispose();
}
});
}
/**
* @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
*/
async createEmbeddingContext(options = {}) {
if (this._vocabOnly)
throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
return await LlamaEmbeddingContext._create({ _model: this }, options);
}
/**
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
*/
async createRankingContext(options = {}) {
if (this._vocabOnly)
throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
return await LlamaRankingContext._create({ _model: this }, options);
}
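    // Usage sketch (illustrative only) of the three context factory methods above, assuming a
    // loaded `model` instance; the option values are illustrative, and the embedding/ranking
    // contexts require a model that supports those modes.
    //
    //     const context = await model.createContext({contextSize: {max: 4096}});
    //     const sequence = context.getSequence();
    //
    //     const embeddingContext = await model.createEmbeddingContext();
    //     const rankingContext = await model.createRankingContext();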
/**
* Get warnings about the model file that would affect its usage.
*
* These warnings include all the warnings generated by `GgufInsights`, but are more comprehensive.
*/
getWarnings() {
this._ensureNotDisposed();
const warnings = this._fileInsights.getWarnings(this._modelPath);
const modelFilePathText = `("${getReadablePath(this._modelPath)}")`;
try {
const beforeTextNoSpecialTokens = "some test text here";
const afterTextNoSpecialTokens = this.detokenize(this.tokenize(beforeTextNoSpecialTokens, false, "trimLeadingSpace"), false);
if (beforeTextNoSpecialTokens !== afterTextNoSpecialTokens)
warnings.push(`Using this model ${modelFilePathText} to tokenize text and then detokenize it resulted in a different text. ` +
"There might be an issue with the model or the tokenizer implementation. " +
"Using this model may not work as intended");
}
catch (err) {
// do nothing
}
try {
if (this._defaultContextFlashAttentionOptionEnabled && !this._flashAttentionSupported) {
if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.grok)
warnings.push("Flash attention is incompatible with Grok and thus was turned off");
else if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2)
warnings.push("Flash attention is incompatible with Gemma2 and thus was turned off");
else {
const nHead = this.fileInfo.architectureMetadata?.attention?.head_count ?? 0;
const nEmbd = this.fileInfo.architectureMetadata?.embedding_length ?? 0;
const nEmbdHeadK = this.fileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
const nEmbdHeadV = this.fileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
if (nEmbdHeadK !== nEmbdHeadV)
warnings.push("Flash attention is incompatible with this model and thus was turned off");
}
}
}
catch (err) {
// do nothing
}
return warnings;
}
/** @hidden `ModelTypeDescription` type alias is too long in the documentation */
get typeDescription() {
this._ensureNotDisposed();
if (this._typeDescription == null)
this._typeDescription = this._model.getModelDescription();
return this._typeDescription;
}
/** The context size the model was trained on */
get trainContextSize() {
this._ensureNotDisposed();
if (this._trainContextSize == null)
this._trainContextSize = this._model.getTrainContextSize();
return this._trainContextSize;
}
/** The size of an embedding vector the model can produce */
get embeddingVectorSize() {
this._ensureNotDisposed();
if (this._embeddingVectorSize == null)
this._embeddingVectorSize = this._model.getEmbeddingVectorSize();
return this._embeddingVectorSize;
}
get vocabularyType() {
this._ensureNotDisposed();
if (this._vocabularyType == null) {
const vocabType = this._model.getVocabularyType();
this._vocabularyType = LlamaVocabularyTypeValues[vocabType];
if (this._vocabularyType == null) {
console.warn(getConsoleLogPrefix() + "Unknown vocabulary type:", vocabType);
this._vocabularyType = LlamaVocabularyType.none;
}
}
return this._vocabularyType;
}
/** @internal */
_ensureNotDisposed() {
if (this._disposedState.disposed)
throw new DisposedError();
}
/** @internal */
async _getOrLoadLora(filePath) {
const resolvedPath = path.resolve(process.cwd(), filePath);
if (this._loraAdapters.has(resolvedPath))
return this._loraAdapters.get(resolvedPath);
return await withLock([this._loraAdapters, "modify"], async () => {
if (this._loraAdapters.has(resolvedPath))
return this._loraAdapters.get(resolvedPath);
const lora = new this._llama._bindings.AddonModelLora(this._model, resolvedPath);
await this._model.loadLora(lora);
this._loraAdapters.set(resolvedPath, lora);
return lora;
});
}
/** @internal */
static async _create(modelOptions, { _llama }) {
const { loadSignal, defaultContextFlashAttention } = modelOptions;
const useMmap = _llama.supportsMmap && (modelOptions.useMmap ?? defaultUseMmap);
const useDirectIo = modelOptions.useDirectIo ?? defaultUseDirectIo;
const fileInfo = await readGgufFileInfo(modelOptions.modelPath, {
sourceType: "filesystem",
signal: loadSignal
});
applyGgufMetadataOverrides(fileInfo, modelOptions.metadataOverrides);
const ggufInsights = await GgufInsights.from(fileInfo, _llama);
const flashAttentionSupported = ggufInsights.flashAttentionSupported;
const resolvedDefaultContextFlashAttention = flashAttentionSupported
? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled)
: false;
const resolvedDefaultContextSwaFullCache = modelOptions.defaultContextSwaFullCache ?? defaultContextSwaFullCache;
const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, {
ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks,
defaultContextFlashAttention: resolvedDefaultContextFlashAttention,
defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache,
useMmap
});
const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({
gpuLayers: gpuLayers,
useMmap
});
const model = new LlamaModel({ ...modelOptions, gpuLayers, useMmap, useDirectIo }, {
_fileInfo: fileInfo,
_fileInsights: ggufInsights,
_llama,
_defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? false,
_flashAttentionSupported: flashAttentionSupported,
_defaultContextFlashAttention: resolvedDefaultContextFlashAttention,
_defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache
});
const modelCreationVramReservation = modelOptions.ignoreMemorySafetyChecks
? null
: _llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram);
const modelCreationRamReservation = modelOptions.ignoreMemorySafetyChecks
? null
: _llama._ramOrchestrator.reserveMemory(resourceRequirementsEstimation.cpuRam);
const loggedWarnings = new Set();
function onAbort() {
model._model.abortActiveModelLoad();
loadSignal?.removeEventListener("abort", onAbort);
}
function logWarnings(warnings) {
for (const warning of warnings) {
if (loggedWarnings.has(warning))
continue;
_llama._log(LlamaLogLevel.warn, warning);
loggedWarnings.add(warning);
}
}
if (loadSignal != null) {
if (loadSignal.aborted)
throw loadSignal.reason;
loadSignal.addEventListener("abort", onAbort);
}
logWarnings(ggufInsights.getWarnings(modelOptions.modelPath));
try {
const modelLoaded = await model._model.init();
if (loadSignal?.aborted) {
if (modelLoaded)
await model._model.dispose();
throw loadSignal.reason;
}
else if (!modelLoaded)
throw new Error("Failed to load model");
loadSignal?.removeEventListener("abort", onAbort);
logWarnings(model.getWarnings());
return model;
}
finally {
loadSignal?.removeEventListener("abort", onAbort);
modelCreationVramReservation?.dispose?.();
modelCreationRamReservation?.dispose?.();
}
}
}
export class LlamaModelTokens {
/** @internal */ _model;
/** @internal */ _disposedState;
/** @internal */ _infillTokens;
/** @internal */ _bosToken;
/** @internal */ _eosToken;
/** @internal */ _eotToken;
/** @internal */ _sepToken;
/** @internal */ _nlToken;
/** @internal */ _bosString;
/** @internal */ _eosString;
/** @internal */ _eotString;
/** @internal */ _sepString;
/** @internal */ _nlString;
/** @internal */ _shouldPrependBosToken;
/** @internal */ _shouldAppendEosToken;
constructor(model, disposedState) {
this._model = model;
this._disposedState = disposedState;
}
/**
* @returns The infill tokens.
*/
get infill() {
this._ensureNotDisposed();
if (this._infillTokens == null)
this._infillTokens = LlamaModelInfillTokens._create(this._model, this._disposedState);
return this._infillTokens;
}
/**
* @returns The BOS (Beginning Of Sequence) token.
*/
get bos() {
this._ensureNotDisposed();
if (this._bosToken == null)
this._bosToken = this._model.tokenBos();
if (this._bosToken === -1)
return null;
return this._bosToken;
}
/**
* @returns The EOS (End Of Sequence) token.
*/
get eos() {
this._ensureNotDisposed();
if (this._eosToken == null)
this._eosToken = this._model.tokenEos();
if (this._eosToken === -1)
return null;
return this._eosToken;
}
/**
* @returns The EOT (End Of Turn) token.
*/
get eot() {
this._ensureNotDisposed();
if (this._eotToken == null)
this._eotToken = this._model.eotToken();
if (this._eotToken === -1)
return null;
return this._eotToken;
}
/**
* @returns The SEP (Sentence Separator) token.
*/
get sep() {
this._ensureNotDisposed();
if (this._sepToken == null)
this._sepToken = this._model.sepToken();
if (this._sepToken === -1)
return null;
return this._sepToken;
}
/**
* @returns The NL (New Line) token.
*/
get nl() {
this._ensureNotDisposed();
if (this._nlToken == null)
this._nlToken = this._model.tokenNl();
if (this._nlToken === -1)
return null;
return this._nlToken;
}
/**
* @returns The BOS (Beginning Of Sequence) token text representation.
*/
get bosString() {
this._ensureNotDisposed();
const bosToken = this.bos;
if (bosToken == null)
return null;
if (this._bosString == null)
this._bosString = this._model.getTokenString(bosToken);
return this._bosString;
}
/**
* @returns The EOS (End Of Sequence) token text representation.
*/
get eosString() {
this._ensureNotDisposed();
const eosToken = this.eos;
if (eosToken == null)
return null;
if (this._eosString == null)
this._eosString = this._model.getTokenString(eosToken);
return this._eosString;
}
/**
* @returns The EOT (End Of Turn) token text representation.
*/
get eotString() {
this._ensureNotDisposed();
const eotToken = this.eot;
if (eotToken == null)
return null;
if (this._eotString == null)
this._eotString = this._model.getTokenString(eotToken);
return this._eotString;
}
/**
* @returns The SEP (Sentence Separator) token text representation.
*/
get sepString() {
this._ensureNotDisposed();
const sepToken = this.sep;
if (sepToken == null)
return null;
if (this._sepString == null)
this._sepString = this._model.getTokenString(sepToken);
return this._sepString;
}
/**
* @returns The NL (New Line) token text representation.
*/
get nlString() {
this._ensureNotDisposed();
const nlToken = this.nl;
if (nlToken == null)
return null;
if (this._nlString == null)
this._nlString = this._model.getTokenString(nlToken);
return this._nlString;
}
/**
* @returns Whether we should prepend a BOS (Beginning Of Sequence) token for evaluations with this model.
*/
get shouldPrependBosToken() {
this._ensureNotDisposed();
if (this._shouldPrependBosToken == null)
this._shouldPrependBosToken = this.bos != null && this._model.shouldPrependBosToken();
return this._shouldPrependBosToken;
}
/**
* @returns Whether we should append an EOS (End Of Sequence) token for evaluations with this model.
*/
get shouldAppendEosToken() {
this._ensureNotDisposed();
if (this._shouldAppendEosToken == null)
this._shouldAppendEosToken = this.eos != null && this._model.shouldAppendEosToken();
return this._shouldAppendEosToken;
}
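    // Usage sketch (illustrative only): how the two flags above are typically consulted when
    // building a raw evaluation input by hand; `model` and the text are placeholders.
    //
    //     const textTokens = model.tokenize("Some text");
    //     const input = [
    //         ...(model.tokens.shouldPrependBosToken && model.tokens.bos != null ? [model.tokens.bos] : []),
    //         ...textTokens,
    //         ...(model.tokens.shouldAppendEosToken && model.tokens.eos != null ? [model.tokens.eos] : [])
    //     ];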
/** @internal */
_ensureNotDisposed() {
if (this._disposedState.disposed)
throw new DisposedError();
}
/** @internal */
static _create(model, disposedState) {
return new LlamaModelTokens(model, disposedState);
}
}
export class LlamaModelInfillTokens {
/** @internal */ _model;
/** @internal */ _disposedState;
/** @internal */ _prefixToken;
/** @internal */ _middleToken;
/** @internal */ _suffixToken;
/** @internal */ _prefixString;
/** @internal */ _middleString;
/** @internal */ _suffixString;
constructor(model, disposedState) {
this._model = model;
this._disposedState = disposedState;
}
/**
* @returns The beginning of infill prefix token.
*/
get prefix() {
this._ensureNotDisposed();
if (this._prefixToken == null)
this._prefixToken = this._resolveSpecialToken(this._model.prefixToken(), ["<fim_prefix>"]);
if (this._prefixToken === -1)
return null;
return this._prefixToken;
}
/**
* @returns The beginning of infill middle token.
*/
get middle() {
this._ensureNotDisposed();
if (this._middleToken == null)
this._middleToken = this._resolveSpecialToken(this._model.middleToken(), ["<fim_middle>"]);
if (this._middleToken === -1)
return null;
return this._middleToken;
}
/**
* @returns The beginning of infill suffix token.
*/
get suffix() {
this._ensureNotDisposed();
if (this._suffixToken == null)
this._suffixToken = this._resolveSpecialToken(this._model.suffixToken(), ["<fim_suffix>"]);
if (this._suffixToken === -1)
return null;
return this._suffixToken;
}
/**
* @returns The beginning of infill prefix token as a string.
*/
get prefixString() {
this._ensureNotDisposed();
const prefixToken = this.prefix;
if (prefixToken == null)
return null;
if (this._prefixString == null)
this._prefixString = this._model.getTokenString(prefixToken);
return this._prefixString;
}
/**
* @returns The beginning of infill middle token as a string.
*/
get middleString() {
this._ensureNotDisposed();
const middleToken = this.middle;
if (middleToken == null)
return null;
if (this._middleString == null)
this._middleString = this._model.getTokenString(middleToken);
return this._middleString;
}
/**
* @returns The beginning of infill suffix token as a string.
*/
get suffixString() {
this._ensureNotDisposed();
const suffixToken = this.suffix;
if (suffixToken == null)
return null;
if (this._suffixString == null)
this._suffixString = this._model.getTokenString(suffixToken);
return this._suffixString;
}
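    // Usage sketch (illustrative only): the prefix/suffix/middle tokens above are the building
    // blocks of a fill-in-middle (infill) prompt. The "prefix, suffix, middle" ordering shown
    // here is an assumption - the exact layout a model expects is model-specific.
    //
    //     const {prefix, suffix, middle} = model.tokens.infill;
    //     if (prefix != null && suffix != null && middle != null) {
    //         const input = [
    //             prefix, ...model.tokenize("function add(a, b) {"),
    //             suffix, ...model.tokenize("}"),
    //             middle
    //         ];
    //         // evaluate `input` on a context sequence to generate the missing middle part
    //     }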
/** @internal */
_ensureNotDisposed() {
if (this._disposedState.disposed)
throw new DisposedError();
}
/** @internal */
_resolveSpecialToken(token, fallbackTexts) {
if (token != null && token !== -1)
return token;
for (const text of fallbackTexts) {
const tokens = this._model.tokenize(text, true);
if (tokens.length !== 1)
continue;
return tokens[0];
}
return -1;
}
/** @internal */
static _create(model, disposedState) {
return new LlamaModelInfillTokens(model, disposedState);
}
}
function applyGgufMetadataOverrides(ggufFileInfo, overrides) {
function applyOverride(object, override) {
if (override == null || object == null)
return;
if (object instanceof Array || typeof object !== "object" || typeof override !== "object")
return;
for (const [key, value] of Object.entries(override)) {
if (value instanceof Array || typeof value !== "object" || (typeof value === "object" && typeof object[key] !== "object"))
object[key] = value;
else
applyOverride(object[key], value);
}
}
applyOverride(ggufFileInfo.metadata, overrides);
}
function ggufMetadataOverridesToList(overrides) {
const maxStringLength = 127;
const maxKeyLength = 127;
const res = [];
function addItem(object, path) {
if (object == null || object instanceof Array)
return;
if (typeof object !== "object") {
if (typeof object === "string" && object.length > maxStringLength)
throw new Error(`Metadata key "${path.join(".")}" override string value (${JSON.stringify(object)}) is longer than ${maxStringLength} characters`);
const key = path.join(".");
if (key.length > maxKeyLength)
throw new Error(`Metadata key "${key}" override path is longer than ${maxKeyLength} characters`);
let type = undefined;
if (typeof object === "number") {
if (typeof object === "bigint" || Number.isInteger(object))
type = 0;
else
type = 1;
}
res.push([key, object, type]);
return;
}
for (const [key, value] of Object.entries(object))
addItem(value, [...path, key]);
}
addItem(overrides ?? {}, []);
return res;
}
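// Illustrative sketch of the metadata overrides shape handled by the two helpers above:
// applyGgufMetadataOverrides() deep-merges the object into the parsed file metadata, while
// ggufMetadataOverridesToList() flattens it into [key, value, type] entries. The override
// values below are illustrative only.
//
//     const overrides = {
//         general: {name: "My local build"},
//         tokenizer: {ggml: {add_bos_token: false}}
//     };
//     // ggufMetadataOverridesToList(overrides) ~>
//     //     [["general.name", "My local build", undefined],
//     //      ["tokenizer.ggml.add_bos_token", false, undefined]]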
function disposeModelIfReferenced(modelRef) {
const model = modelRef.deref();
if (model != null)
void model.dispose();
}
//# sourceMappingURL=LlamaModel.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,29 @@
import { Token } from "../../../types.js";
export declare const enum TokenAttribute {
undefined = 0,
unknown = 1,
unused = 2,
normal = 4,
control = 8, // SPECIAL
userDefined = 16,
byte = 32,
normalized = 64,
lstrip = 128,
rstrip = 256,
singleWord = 512
}
export declare class TokenAttributes {
readonly token: Token;
private constructor();
get undefined(): boolean;
get unknown(): boolean;
get unused(): boolean;
get normal(): boolean;
get control(): boolean;
get userDefined(): boolean;
get byte(): boolean;
get normalized(): boolean;
get lstrip(): boolean;
get rstrip(): boolean;
get singleWord(): boolean;
}

View File

@@ -0,0 +1,65 @@
// updated against `enum llama_token_attr` from `llama.h`
export var TokenAttribute;
(function (TokenAttribute) {
TokenAttribute[TokenAttribute["undefined"] = 0] = "undefined";
TokenAttribute[TokenAttribute["unknown"] = 1] = "unknown";
TokenAttribute[TokenAttribute["unused"] = 2] = "unused";
TokenAttribute[TokenAttribute["normal"] = 4] = "normal";
TokenAttribute[TokenAttribute["control"] = 8] = "control";
TokenAttribute[TokenAttribute["userDefined"] = 16] = "userDefined";
TokenAttribute[TokenAttribute["byte"] = 32] = "byte";
TokenAttribute[TokenAttribute["normalized"] = 64] = "normalized";
TokenAttribute[TokenAttribute["lstrip"] = 128] = "lstrip";
TokenAttribute[TokenAttribute["rstrip"] = 256] = "rstrip";
TokenAttribute[TokenAttribute["singleWord"] = 512] = "singleWord";
})(TokenAttribute || (TokenAttribute = {}));
export class TokenAttributes {
token;
/** @internal */ _attributes;
constructor(token, attributes) {
this.token = token;
this._attributes = attributes;
}
get undefined() {
return this._attributes === TokenAttribute.undefined;
}
get unknown() {
return this._hasAttribute(TokenAttribute.unknown);
}
get unused() {
return this._hasAttribute(TokenAttribute.unused);
}
get normal() {
return this._hasAttribute(TokenAttribute.normal);
}
get control() {
return this._hasAttribute(TokenAttribute.control);
}
get userDefined() {
return this._hasAttribute(TokenAttribute.userDefined);
}
get byte() {
return this._hasAttribute(TokenAttribute.byte);
}
get normalized() {
return this._hasAttribute(TokenAttribute.normalized);
}
get lstrip() {
return this._hasAttribute(TokenAttribute.lstrip);
}
get rstrip() {
return this._hasAttribute(TokenAttribute.rstrip);
}
get singleWord() {
return this._hasAttribute(TokenAttribute.singleWord);
}
/** @internal */
_hasAttribute(attribute) {
return (this._attributes & attribute) === attribute;
}
/** @internal */
static _create(token, attributes) {
return new TokenAttributes(token, attributes);
}
}
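// Worked example (illustrative only): the attribute check above is a plain bitmask test.
// With _attributes === 24 (control | userDefined, i.e. 8 | 16):
//     (24 & 8) === 8    -> control is true
//     (24 & 16) === 16  -> userDefined is true
//     (24 & 4) === 4 is false, so normal is false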
//# sourceMappingURL=TokenAttributes.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TokenAttributes.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaModel/utils/TokenAttributes.ts"],"names":[],"mappings":"AAEA,yDAAyD;AACzD,MAAM,CAAN,IAAkB,cAYjB;AAZD,WAAkB,cAAc;IAC5B,6DAAa,CAAA;IACb,yDAAgB,CAAA;IAChB,uDAAe,CAAA;IACf,uDAAe,CAAA;IACf,yDAAgB,CAAA;IAChB,kEAAoB,CAAA;IACpB,oDAAa,CAAA;IACb,gEAAmB,CAAA;IACnB,yDAAe,CAAA;IACf,yDAAe,CAAA;IACf,iEAAmB,CAAA;AACvB,CAAC,EAZiB,cAAc,KAAd,cAAc,QAY/B;AAED,MAAM,OAAO,eAAe;IACR,KAAK,CAAQ;IAC7B,gBAAgB,CAAkB,WAAW,CAAiB;IAE9D,YAAoB,KAAY,EAAE,UAA0B;QACxD,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,WAAW,GAAG,UAAU,CAAC;IAClC,CAAC;IAED,IAAW,SAAS;QAChB,OAAO,IAAI,CAAC,WAAW,KAAK,cAAc,CAAC,SAAS,CAAC;IACzD,CAAC;IAED,IAAW,OAAO;QACd,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC;IACtD,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC;IAED,IAAW,OAAO;QACd,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC;IACtD,CAAC;IAED,IAAW,WAAW;QAClB,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,WAAW,CAAC,CAAC;IAC1D,CAAC;IAED,IAAW,IAAI;QACX,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IACnD,CAAC;IAED,IAAW,UAAU;QACjB,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,UAAU,CAAC,CAAC;IACzD,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC;IAED,IAAW,UAAU;QACjB,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,UAAU,CAAC,CAAC;IACzD,CAAC;IAED,gBAAgB;IACR,aAAa,CAAC,SAAyB;QAC3C,OAAO,CAAC,IAAI,CAAC,WAAW,GAAG,SAAS,CAAC,KAAK,SAAS,CAAC;IACxD,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,OAAO,CAAC,KAAY,EAAE,UAA0B;QAC1D,OAAO,IAAI,eAAe,CAAC,KAAK,EAAE,UAAU,CAAC,CAAC;IAClD,CAAC;CACJ"}

View File

@@ -0,0 +1,91 @@
import { EventRelay } from "lifecycle-utils";
import { Token } from "../types.js";
import { LlamaText } from "../utils/LlamaText.js";
import type { LlamaModel } from "./LlamaModel/LlamaModel.js";
export type LlamaRankingContextOptions = {
/**
* The number of tokens the model can see at once.
* - **`"auto"`** - adapt to the current VRAM state and attempt to set the context size as high as possible up to the size
* the model was trained on.
* - **`number`** - set the context size to a specific number of tokens.
* If there's not enough VRAM, an error will be thrown.
* Use with caution.
* - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attempt to set the context size as high as possible
* up to the size the model was trained on, but at least `min` and at most `max`.
*
* Defaults to `"auto"`.
*/
contextSize?: "auto" | number | {
min?: number;
max?: number;
};
/** prompt processing batch size */
batchSize?: number;
/**
* Number of threads to use to evaluate tokens.
* Set to `0` to use the maximum number of threads supported by the current machine hardware
*/
threads?: number;
/** An abort signal to abort the context creation */
createSignal?: AbortSignal;
/**
* The template to use for the ranking evaluation.
* If not provided, the model's template will be used by default.
*
* The template is tokenized with special tokens enabled, but the provided query and document are not.
*
* **<span v-pre>`{{query}}`</span>** is replaced with the query content.
*
* **<span v-pre>`{{document}}`</span>** is replaced with the document content.
*
* It's recommended to not set this option unless you know what you're doing.
*
* Defaults to the model's template.
*/
template?: `${string}{{query}}${string}{{document}}${string}` | `${string}{{document}}${string}{{query}}${string}`;
/**
* Ignore insufficient memory errors and continue with the context creation.
* Can cause the process to crash if there's not enough VRAM for the new context.
*
* Defaults to `false`.
*/
ignoreMemorySafetyChecks?: boolean;
};
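// Illustrative example of the `template` option format described above; the wording of the
// template itself is a made-up placeholder and the right phrasing is model-dependent.
//
//     const template = "Query: {{query}}\nDocument: {{document}}\nRelevant:";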
/**
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
*/
export declare class LlamaRankingContext {
readonly onDispose: EventRelay<void>;
private constructor();
/**
* Get the ranking score for a document for a query.
*
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
* @returns a ranking score between 0 and 1 representing the probability that the document is relevant to the query.
*/
rank(query: Token[] | string | LlamaText, document: Token[] | string | LlamaText): Promise<number>;
/**
* Get the ranking scores for all the given documents for a query.
*
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
* @returns an array of ranking scores between 0 and 1 representing the probability that the document is relevant to the query.
*/
rankAll(query: Token[] | string | LlamaText, documents: Array<Token[] | string | LlamaText>): Promise<number[]>;
/**
* Get the ranking scores for all the given documents for a query and sort them by score from highest to lowest.
*
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
*/
rankAndSort<const T extends string>(query: Token[] | string | LlamaText, documents: T[]): Promise<Array<{
document: T;
/**
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
*/
score: number;
}>>;
dispose(): Promise<void>;
/** @hidden */
[Symbol.asyncDispose](): Promise<void>;
get disposed(): boolean;
get model(): LlamaModel;
}
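// Usage sketch (illustrative only) of a minimal reranking flow, assuming a loaded `model`
// instance that supports ranking; the query and documents are placeholders.
//
//     const rankingContext = await model.createRankingContext();
//     const ranked = await rankingContext.rankAndSort("Which planet is the largest?", [
//         "Jupiter is the largest planet in the Solar System.",
//         "A recipe for pancakes."
//     ]);
//     console.log(ranked[0]?.document, ranked[0]?.score);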

View File

@@ -0,0 +1,178 @@
import { AsyncDisposeAggregator, EventRelay, splitText, withLock } from "lifecycle-utils";
import { tokenizeInput } from "../utils/tokenizeInput.js";
import { resolveBeginningTokenToPrepend, resolveEndTokenToAppend } from "../utils/tokenizerUtils.js";
import { isRankingTemplateValid, parseRankingTemplate } from "../gguf/insights/GgufInsights.js";
/**
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
*/
export class LlamaRankingContext {
/** @internal */ _llamaContext;
/** @internal */ _template;
/** @internal */ _sequence;
/** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
onDispose = new EventRelay();
constructor({ _llamaContext, _template }) {
this._llamaContext = _llamaContext;
this._template = _template;
this._sequence = this._llamaContext.getSequence();
this._disposeAggregator.add(this._llamaContext.onDispose.createListener(() => {
void this._disposeAggregator.dispose();
}));
this._disposeAggregator.add(this.onDispose.dispatchEvent);
this._disposeAggregator.add(async () => {
await this._llamaContext.dispose();
});
}
/**
* Get the ranking score for a document for a query.
*
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
* @returns a ranking score between 0 and 1 representing the probability that the document is relevant to the query.
*/
async rank(query, document) {
const resolvedInput = this._getEvaluationInput(query, document);
if (resolvedInput.length > this._llamaContext.contextSize)
throw new Error("The input length exceed the context size. " +
`Try to increase the context size to at least ${resolvedInput.length + 1} ` +
"or use another model that supports longer contexts.");
return this._evaluateRankingForInput(resolvedInput);
}
/**
* Get the ranking scores for all the given documents for a query.
*
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
* @returns an array of ranking scores between 0 and 1 representing the probability that the document is relevant to the query.
*/
async rankAll(query, documents) {
const resolvedTokens = documents.map((document) => this._getEvaluationInput(query, document));
const maxInputTokensLength = resolvedTokens.reduce((max, tokens) => Math.max(max, tokens.length), 0);
if (maxInputTokensLength > this._llamaContext.contextSize)
throw new Error("The input lengths of some of the given documents exceed the context size. " +
`Try to increase the context size to at least ${maxInputTokensLength + 1} ` +
"or use another model that supports longer contexts.");
else if (resolvedTokens.length === 0)
return [];
return await Promise.all(resolvedTokens.map((tokens) => this._evaluateRankingForInput(tokens)));
}
/**
* Get the ranking scores for all the given documents for a query and sort them by score from highest to lowest.
*
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
*/
async rankAndSort(query, documents) {
const scores = await this.rankAll(query, documents);
return documents
.map((document, index) => ({ document: document, score: scores[index] }))
.sort((a, b) => b.score - a.score);
}
async dispose() {
await this._disposeAggregator.dispose();
}
/** @hidden */
[Symbol.asyncDispose]() {
return this.dispose();
}
get disposed() {
return this._llamaContext.disposed;
}
get model() {
return this._llamaContext.model;
}
/** @internal */
_getEvaluationInput(query, document) {
if (this._template != null) {
const resolvedInput = splitText(this._template, ["{{query}}", "{{document}}"])
.flatMap((item) => {
if (typeof item === "string")
return this._llamaContext.model.tokenize(item, true, "trimLeadingSpace");
else if (item.separator === "{{query}}")
return tokenizeInput(query, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
else if (item.separator === "{{document}}")
return tokenizeInput(document, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
else
void item;
return [];
});
const beginningTokens = resolveBeginningTokenToPrepend(this.model.vocabularyType, this.model.tokens);
const endToken = resolveEndTokenToAppend(this.model.vocabularyType, this.model.tokens);
if (beginningTokens != null && resolvedInput.at(0) !== beginningTokens)
resolvedInput.unshift(beginningTokens);
if (endToken != null && resolvedInput.at(-1) !== endToken)
resolvedInput.push(endToken);
return resolvedInput;
}
if (this.model.tokens.eos == null && this.model.tokens.sep == null)
throw new Error("Computing rankings is not supported for this model.");
const resolvedQuery = tokenizeInput(query, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
const resolvedDocument = tokenizeInput(document, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
if (resolvedQuery.length === 0 && resolvedDocument.length === 0)
return [];
const resolvedInput = [
...(this.model.tokens.bos == null ? [] : [this.model.tokens.bos]),
...resolvedQuery,
...(this.model.tokens.eos == null ? [] : [this.model.tokens.eos]),
...(this.model.tokens.sep == null ? [] : [this.model.tokens.sep]),
...resolvedDocument,
...(this.model.tokens.eos == null ? [] : [this.model.tokens.eos])
];
return resolvedInput;
}
/** @internal */
_evaluateRankingForInput(input) {
if (input.length === 0)
return Promise.resolve(0);
return withLock([this, "evaluate"], async () => {
await this._sequence.eraseContextTokenRanges([{
start: 0,
end: this._sequence.nextTokenIndex
}]);
const iterator = this._sequence.evaluate(input, { _noSampling: true });
// eslint-disable-next-line @typescript-eslint/no-unused-vars
for await (const token of iterator) {
break; // only generate one token to get embeddings
}
const embedding = this._llamaContext._ctx.getEmbedding(input.length, 1);
if (embedding.length === 0)
return 0;
const logit = embedding[0];
const probability = logitToSigmoid(logit);
return probability;
});
}
/** @internal */
static async _create({ _model }, { contextSize, batchSize, threads = 6, createSignal, template, ignoreMemorySafetyChecks }) {
const resolvedTemplate = template ?? parseRankingTemplate(_model.fileInfo.metadata?.tokenizer?.["chat_template.rerank"]);
if (_model.tokens.eos == null && _model.tokens.sep == null) {
if (!isRankingTemplateValid(resolvedTemplate)) {
if (resolvedTemplate === _model.fileInfo.metadata?.tokenizer?.["chat_template.rerank"])
throw new Error("The model's builtin template is invalid. It must contain both {query} and {document} placeholders.");
else
throw new Error("The provided template is invalid. It must contain both {{query}} and {{document}} placeholders.");
}
else if (resolvedTemplate == null)
throw new Error("Computing rankings is not supported for this model.");
}
if (_model.fileInsights.hasEncoder && _model.fileInsights.hasDecoder)
throw new Error("Computing rankings is not supported for encoder-decoder models.");
if (!_model.fileInsights.supportsRanking)
throw new Error("Computing rankings is not supported for this model.");
const llamaContext = await _model.createContext({
contextSize,
batchSize,
threads,
createSignal,
ignoreMemorySafetyChecks,
_embeddings: true,
_ranking: true
});
return new LlamaRankingContext({
_llamaContext: llamaContext,
_template: resolvedTemplate
});
}
}
function logitToSigmoid(logit) {
return 1 / (1 + Math.exp(-logit));
}
//# sourceMappingURL=LlamaRankingContext.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,37 @@
import { Token, Tokenizer } from "../types.js";
import { LlamaText } from "../utils/LlamaText.js";
import type { LlamaModel } from "./LlamaModel/LlamaModel.js";
/**
* @see [Using Token Bias](https://node-llama-cpp.withcat.ai/guide/token-bias) tutorial
*/
export declare class TokenBias {
constructor(tokenizer: Tokenizer);
/**
* Adjust the bias of the given token(s).
*
* If a text is provided, the bias will be applied to each individual token in the text.
*
* Setting a bias to `"never"` will prevent the token from being generated, unless it is required to comply with a grammar.
*
* Setting the bias of the EOS or EOT tokens to `"never"` has no effect and will be ignored.
* @param input - The token(s) to apply the bias to
* @param bias - The probability bias to apply to the token(s).
*
* Setting to a positive number increases the probability of the token(s) being generated.
*
* Setting to a negative number decreases the probability of the token(s) being generated.
*
* Setting to `0` has no effect.
*
* For example, setting to `0.5` will increase the probability of the token(s) being generated by 50%.
* Setting to `-0.5` will decrease the probability of the token(s) being generated by 50%.
*
* Setting to `"never"` will prevent the token from being generated, unless it is required to comply with a grammar.
*
* Try to play around with values between `0.9` and `-0.9` to see what works for your use case.
*/
set(input: Token | Token[] | string | LlamaText, bias: "never" | number | {
logit: number;
}): this;
static for(modelOrTokenizer: LlamaModel | Tokenizer): TokenBias;
}
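// Usage sketch (illustrative only) of building a bias with the API declared above; the texts
// and bias values are placeholders, and passing the bias via a chat session's `tokenBias`
// prompt option is an assumption about how it is typically consumed.
//
//     const bias = TokenBias.for(model)
//         .set("Hello", -0.9)      // make this text less likely
//         .set("World", "never");  // never generate this text (unless a grammar requires it)
//     // e.g. session.prompt(text, {tokenBias: bias})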

View File

@@ -0,0 +1,68 @@
import { tokenizeInput } from "../utils/tokenizeInput.js";
/**
* @see [Using Token Bias](https://node-llama-cpp.withcat.ai/guide/token-bias) tutorial
*/
export class TokenBias {
/** @internal */ _tokenizer;
/** @internal */ _biases = new Map();
constructor(tokenizer) {
this._tokenizer = tokenizer;
}
/**
* Adjust the bias of the given token(s).
*
* If a text is provided, the bias will be applied to each individual token in the text.
*
* Setting a bias to `"never"` will prevent the token from being generated, unless it is required to comply with a grammar.
*
* Setting the bias of the EOS or EOT tokens to `"never"` has no effect and will be ignored.
* @param input - The token(s) to apply the bias to
* @param bias - The probability bias to apply to the token(s).
*
* Setting to a positive number increases the probability of the token(s) being generated.
*
* Setting to a negative number decreases the probability of the token(s) being generated.
*
* Setting to `0` has no effect.
*
* For example, setting to `0.5` will increase the probability of the token(s) being generated by 50%.
* Setting to `-0.5` will decrease the probability of the token(s) being generated by 50%.
*
* Setting to `"never"` will prevent the token from being generated, unless it is required to comply with a grammar.
*
* Try to play around with values between `0.9` and `-0.9` to see what works for your use case.
*/
set(input, bias) {
const resolvedLogit = bias === "never"
? -Infinity
: typeof bias === "number"
? probabilityToLogit(bias)
: bias.logit;
for (const token of tokenizeInput(input, this._tokenizer)) {
if (this._tokenizer.isEogToken(token))
continue;
this._biases.set(token, resolvedLogit);
}
for (const token of tokenizeInput(input, this._tokenizer, "trimLeadingSpace")) {
if (this._tokenizer.isEogToken(token))
continue;
this._biases.set(token, resolvedLogit);
}
return this;
}
static for(modelOrTokenizer) {
if (modelOrTokenizer.tokenizer != null)
return new TokenBias(modelOrTokenizer.tokenizer);
return new TokenBias(modelOrTokenizer);
}
}
function probabilityToLogit(probability) {
if (probability <= -1)
return -Infinity;
else if (probability >= 1)
return Infinity;
else if (probability === 0)
return 0;
return Math.log(probability / (1 - probability));
}
//# sourceMappingURL=TokenBias.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TokenBias.js","sourceRoot":"","sources":["../../src/evaluator/TokenBias.ts"],"names":[],"mappings":"AAEA,OAAO,EAAC,aAAa,EAAC,MAAM,2BAA2B,CAAC;AAGxD;;GAEG;AACH,MAAM,OAAO,SAAS;IAClB,gBAAgB,CAAiB,UAAU,CAAY;IACvD,gBAAgB,CAAiB,OAAO,GAAG,IAAI,GAAG,EAAiB,CAAC;IAEpE,YAAmB,SAAoB;QACnC,IAAI,CAAC,UAAU,GAAG,SAAS,CAAC;IAChC,CAAC;IAED;;;;;;;;;;;;;;;;;;;;;;;OAuBG;IACI,GAAG,CAAC,KAA2C,EAAE,IAAwC;QAC5F,MAAM,aAAa,GAAG,IAAI,KAAK,OAAO;YAClC,CAAC,CAAC,CAAC,QAAQ;YACX,CAAC,CAAC,OAAO,IAAI,KAAK,QAAQ;gBACtB,CAAC,CAAC,kBAAkB,CAAC,IAAI,CAAC;gBAC1B,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC;QAErB,KAAK,MAAM,KAAK,IAAI,aAAa,CAAC,KAAK,EAAE,IAAI,CAAC,UAAU,CAAC,EAAE,CAAC;YACxD,IAAI,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,KAAK,CAAC;gBACjC,SAAS;YAEb,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAC3C,CAAC;QAED,KAAK,MAAM,KAAK,IAAI,aAAa,CAAC,KAAK,EAAE,IAAI,CAAC,UAAU,EAAE,kBAAkB,CAAC,EAAE,CAAC;YAC5E,IAAI,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,KAAK,CAAC;gBACjC,SAAS;YAEb,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAC3C,CAAC;QAED,OAAO,IAAI,CAAC;IAChB,CAAC;IAEM,MAAM,CAAC,GAAG,CAAC,gBAAwC;QACtD,IAAK,gBAA+B,CAAC,SAAS,IAAI,IAAI;YAClD,OAAO,IAAI,SAAS,CAAE,gBAA+B,CAAC,SAAS,CAAC,CAAC;QAErE,OAAO,IAAI,SAAS,CAAC,gBAA6B,CAAC,CAAC;IACxD,CAAC;CACJ;AAED,SAAS,kBAAkB,CAAC,WAAmB;IAC3C,IAAI,WAAW,IAAI,CAAC,CAAC;QACjB,OAAO,CAAC,QAAQ,CAAC;SAChB,IAAI,WAAW,IAAI,CAAC;QACrB,OAAO,QAAQ,CAAC;SACf,IAAI,WAAW,KAAK,CAAC;QACtB,OAAO,CAAC,CAAC;IAEb,OAAO,IAAI,CAAC,GAAG,CAAC,WAAW,GAAG,CAAC,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC;AACrD,CAAC"}

View File

@@ -0,0 +1,45 @@
/**
* Tracks the usage of tokens.
*/
export declare class TokenMeter {
private _inputTokens;
private _outputTokens;
/**
* The number of input tokens used
*/
get usedInputTokens(): number;
/**
* The number of tokens generated by a model
*/
get usedOutputTokens(): number;
/**
* Get the current state of the token meter
*/
getState(): TokenMeterState;
/**
* Log the usage of tokens
*/
useTokens(tokens: number, type: "input" | "output"): void;
/**
* Get the difference between the current meter and another meter
*/
diff(meter: TokenMeter | TokenMeterState): {
usedInputTokens: number;
usedOutputTokens: number;
};
/**
* Log the usage of tokens on multiple meters
*/
static useTokens(meters: null | undefined | TokenMeter | readonly TokenMeter[] | ReadonlySet<TokenMeter>, tokens: number, type: "input" | "output"): void;
/**
* Get the difference between two meters
*/
static diff(meter1: TokenMeter | TokenMeterState, meter2: TokenMeter | TokenMeterState): {
usedInputTokens: number;
usedOutputTokens: number;
};
}
export type TokenMeterState = {
usedInputTokens: number;
usedOutputTokens: number;
};
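// Usage sketch (illustrative only) of the diff pattern for measuring the tokens used by a
// single operation; `sequence.tokenMeter` and `session.prompt` are assumptions about the
// surrounding API, not shown in this file.
//
//     const before = sequence.tokenMeter.getState();
//     await session.prompt("Hi there");
//     const used = TokenMeter.diff(sequence.tokenMeter, before);
//     console.log(used.usedInputTokens, used.usedOutputTokens);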

View File

@@ -0,0 +1,74 @@
/**
* Tracks the usage of tokens.
*/
export class TokenMeter {
_inputTokens = 0;
_outputTokens = 0;
/**
* The number of input tokens used
*/
get usedInputTokens() {
return this._inputTokens;
}
/**
* The number of tokens generated by a model
*/
get usedOutputTokens() {
return this._outputTokens;
}
/**
* Get the current state of the token meter
*/
getState() {
return {
usedInputTokens: this.usedInputTokens,
usedOutputTokens: this.usedOutputTokens
};
}
/**
* Log the usage of tokens
*/
useTokens(tokens, type) {
if (tokens < 0)
throw new RangeError("Tokens cannot be negative");
else if (tokens === 0)
return;
if (type === "input")
this._inputTokens += tokens;
else if (type === "output")
this._outputTokens += tokens;
else {
void type;
throw new TypeError(`Unknown token type: ${type}`);
}
}
/**
* Get the difference between the current meter and another meter
*/
diff(meter) {
return TokenMeter.diff(this, meter);
}
/**
* Log the usage of tokens on multiple meters
*/
static useTokens(meters, tokens, type) {
if (meters == null)
return;
if (meters instanceof TokenMeter)
meters.useTokens(tokens, type);
else {
for (const meter of meters)
meter.useTokens(tokens, type);
}
}
/**
* Get the difference between two meters
*/
static diff(meter1, meter2) {
return {
usedInputTokens: meter1.usedInputTokens - meter2.usedInputTokens,
usedOutputTokens: meter1.usedOutputTokens - meter2.usedOutputTokens
};
}
}
//# sourceMappingURL=TokenMeter.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TokenMeter.js","sourceRoot":"","sources":["../../src/evaluator/TokenMeter.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,MAAM,OAAO,UAAU;IACX,YAAY,GAAW,CAAC,CAAC;IACzB,aAAa,GAAW,CAAC,CAAC;IAElC;;OAEG;IACH,IAAW,eAAe;QACtB,OAAO,IAAI,CAAC,YAAY,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,IAAW,gBAAgB;QACvB,OAAO,IAAI,CAAC,aAAa,CAAC;IAC9B,CAAC;IAED;;OAEG;IACI,QAAQ;QACX,OAAO;YACH,eAAe,EAAE,IAAI,CAAC,eAAe;YACrC,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;SAC1C,CAAC;IACN,CAAC;IAED;;OAEG;IACI,SAAS,CAAC,MAAc,EAAE,IAAwB;QACrD,IAAI,MAAM,GAAG,CAAC;YACV,MAAM,IAAI,UAAU,CAAC,2BAA2B,CAAC,CAAC;aACjD,IAAI,MAAM,KAAK,CAAC;YACjB,OAAO;QAEX,IAAI,IAAI,KAAK,OAAO;YAChB,IAAI,CAAC,YAAY,IAAI,MAAM,CAAC;aAC3B,IAAI,IAAI,KAAK,QAAQ;YACtB,IAAI,CAAC,aAAa,IAAI,MAAM,CAAC;aAC5B,CAAC;YACF,KAAM,IAAqB,CAAC;YAC5B,MAAM,IAAI,SAAS,CAAC,uBAAuB,IAAI,EAAE,CAAC,CAAC;QACvD,CAAC;IACL,CAAC;IAED;;OAEG;IACI,IAAI,CAAC,KAAmC;QAC3C,OAAO,UAAU,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;IACxC,CAAC;IAED;;OAEG;IACI,MAAM,CAAC,SAAS,CACnB,MAAuF,EACvF,MAAc,EACd,IAAwB;QAExB,IAAI,MAAM,IAAI,IAAI;YACd,OAAO;QAEX,IAAI,MAAM,YAAY,UAAU;YAC5B,MAAM,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;aAC9B,CAAC;YACF,KAAK,MAAM,KAAK,IAAI,MAAM;gBACtB,KAAK,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;QACtC,CAAC;IACL,CAAC;IAED;;OAEG;IACI,MAAM,CAAC,IAAI,CACd,MAAoC,EACpC,MAAoC;QAEpC,OAAO;YACH,eAAe,EAAE,MAAM,CAAC,eAAe,GAAG,MAAM,CAAC,eAAe;YAChE,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,GAAG,MAAM,CAAC,gBAAgB;SACtE,CAAC;IACN,CAAC;CACJ"}

View File

@@ -0,0 +1,86 @@
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
import { Token, Tokenizer } from "../../types.js";
import { LlamaText } from "../../utils/LlamaText.js";
/**
* Chunk the given document using a given context sequence to use the chunks for RAG (Retrieval Augmented Generation) embeddings.
*
* This chunking method is fast and efficient, and utilizes as much parallelization as your hardware allows.
*
* Based on https://github.com/ZeroEntropy-AI/llama-chunk
* @experimental - this API is experimental and may change or be removed in subsequent releases
* @hidden
*/
export declare function experimentalChunkDocument(options: {
contextSequence: LlamaContextSequence;
document: string;
/**
* The tokens to use as separators for chunking the document.
* Passed to the `getSystemPrompt` function to generate the prompt.
*/
separatorTokens?: Token[];
getSystemPrompt?(options: {
separatorTokens: Token[];
tokenizer: Tokenizer;
maxChunkSize?: number;
}): LlamaText | string;
/**
* Maximum number of tokens to allow in a chunk.
*
* The closer a chunk's size gets to this limit, the higher the probability of a separator token being inserted.
*
* Set to `0` to disable this mechanism.
*
* Defaults to `500`.
*/
maxChunkSize?: number;
/**
* The alignment curve for the maximum chunk size mechanism.
*
* Adjust the value based on the behavior of the model.
*
* Play around with values between `1` and `4` to see what works best for you.
*
* Set to `1` to disable this mechanism.
*
* Defaults to `4`.
*/
maxChunkSizeAlignmentCurve?: number;
/**
* Append the next few tokens (up to `maxTokens`) to the current chunk if their trimmed content
* matches any of the texts in `trimmedTexts`
*/
syntaxAlignment?: {
/**
* The maximum number of tokens to append to the current chunk if their trimmed content matches any of the texts in `trimmedTexts`.
*
* Default: `4`
*/
maxTokens?: number;
/**
* The trimmed texts to match for, to append the token to the current chunk.
*
* Default: `["", ".", ";"]`
*/
trimmedTexts?: string[];
};
/**
* The number of tokens to skip before starting to use the generated separator tokens to split the document.
*/
skipFirstTokens?: number;
/**
* The number of recent probabilities to keep in the trail for normalization.
*
* Adjust the value based on the behavior of the model.
*
* Defaults to `200`.
*/
normalizationTrailSize?: number;
/**
* Called when a chunk is generated with the tokens that make up the chunk and the separator token used to split the chunk.
*/
onChunkTokens?(chunkTokens: Token[], usedSeparatorToken: Token): void;
/**
* Called when a chunk is generated with the text that makes up the chunk and the separator token used to split the chunk.
*/
onChunkText?(chunkText: string, usedSeparatorToken: Token): void;
}): Promise<string[]>;
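// Usage sketch (illustrative only) of a minimal call to the experimental chunker declared
// above, assuming a dedicated context sequence and a plain-text `documentText`; both are
// placeholders, and the option values are illustrative.
//
//     const chunks = await experimentalChunkDocument({
//         contextSequence: context.getSequence(),
//         document: documentText,
//         maxChunkSize: 400,
//         onChunkText(chunkText) {
//             console.log("chunk:", chunkText.length, "characters");
//         }
//     });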

View File

@@ -0,0 +1,212 @@
import { LlamaText, SpecialTokensText } from "../../utils/LlamaText.js";
import { resolveChatWrapper } from "../../chatWrappers/utils/resolveChatWrapper.js";
import { safeEventCallback } from "../../utils/safeEventCallback.js";
import { maxRecentDetokenizerTokens } from "../../consts.js";
/**
* Chunk the given document using a given context sequence to use the chunks for RAG (Retrieval Augmented Generation) embeddings.
*
* This chunking method is fast and efficient, and utilizes as much parallelization as your hardware allows.
*
* Based on https://github.com/ZeroEntropy-AI/llama-chunk
* @experimental - this API is experimental and may change or be removed in subsequent releases
* @hidden
*/
export async function experimentalChunkDocument(options) {
const { contextSequence, document, separatorTokens = findAppropriateSeparatorTokens(contextSequence.model), getSystemPrompt = getDefaultPrompt, maxChunkSize = 500, maxChunkSizeAlignmentCurve = 4, syntaxAlignment: { maxTokens: maxSyntaxAlignment = 4, trimmedTexts: syntaxAlignmentTrimmedTexts = ["", ".", ";"] } = {}, skipFirstTokens = 3, normalizationTrailSize = 100 } = options;
const onChunkTokens = safeEventCallback(options.onChunkTokens);
const onChunkText = safeEventCallback(options.onChunkText);
if (separatorTokens.length === 0)
throw new Error("Separator tokens must be provided");
const chatHistory = [{
type: "system",
text: LlamaText(getSystemPrompt({
separatorTokens,
tokenizer: contextSequence.model.tokenizer,
maxChunkSize: maxChunkSize <= 0
? undefined
: maxChunkSize
})).toJSON()
}, {
type: "user",
text: document
}, {
type: "model",
response: [""]
}];
const chatWrapper = resolveChatWrapper(contextSequence.model);
const { contextText } = chatWrapper.generateContextState({ chatHistory });
const initialContextTokens = contextText.tokenize(contextSequence.model.tokenizer, "trimLeadingSpace");
const documentTokens = contextSequence.model.tokenize(document, false, "trimLeadingSpace");
const syntaxAlignmentTrimmedTextsSet = new Set(syntaxAlignmentTrimmedTexts);
if (initialContextTokens.length + documentTokens.length > contextSequence.context.contextSize)
throw new Error("The context size is too small to chunk the given document");
const evaluateInput = initialContextTokens.slice();
for (let i = 0; i < documentTokens.length - 1; i++) {
const token = documentTokens[i];
evaluateInput.push([token, {
generateNext: {
probabilities: true
}
}]);
}
let weight = 1;
const recentProbabilitiesTrail = [];
let chunkStartIndex = 0;
let lastPushedSeparatorIndex = 0;
const chunks = [];
const res = [];
function pushSeparatorIndex(separateIndex, separatorToken) {
lastPushedSeparatorIndex = separateIndex;
if (separateIndex <= chunkStartIndex)
return;
let endIndex = separateIndex;
for (let i = 0; i < maxSyntaxAlignment && documentTokens[endIndex + i] != null; i++) {
const text = contextSequence.model.detokenize([documentTokens[endIndex + i]]);
if (!syntaxAlignmentTrimmedTextsSet.has(text.trim()))
break;
endIndex++;
}
const chunk = documentTokens.slice(chunkStartIndex, endIndex);
const text = contextSequence.model.detokenize(chunk, false, documentTokens.slice(chunkStartIndex - maxRecentDetokenizerTokens, chunkStartIndex));
chunks.push(chunk);
chunkStartIndex = endIndex;
onChunkTokens?.(chunk, separatorToken);
onChunkText?.(text, separatorToken);
res.push(text);
}
await contextSequence.controlledEvaluate(evaluateInput, {
onTokenResult(inputTokenIndex, result) {
const i = inputTokenIndex - initialContextTokens.length;
const nextProbabilities = result?.next?.probabilities;
const nextDocumentToken = documentTokens[i + 1];
if (nextProbabilities == null)
throw new Error("received no result for token " + i);
const topProbabilityScore = nextProbabilities.entries()
.next().value?.[1];
const [usedSeparatorToken, separatorProbability] = separatorTokens
.filter((token) => token !== nextDocumentToken) // avoid splitting on document tokens
.map((token) => [token, nextProbabilities.get(token)])
.filter((pair) => pair[1] != null)
.reduce(([tokenA, probabilityA], [tokenB, probabilityB]) => {
if (probabilityA >= probabilityB)
return [tokenA, probabilityA];
return [tokenB, probabilityB];
}, [separatorTokens[0], 0]);
if (topProbabilityScore == null || separatorProbability == null || separatorProbability === 0)
return;
// console.log(
// i, contextSequence.model.detokenize([documentTokens[i]!]),
// Array.from(nextProbabilities.entries()).slice(0, 5)
// .map(([token, probability]) => [contextSequence.model.detokenize([token], true), probability])
// );
if (separatorProbability >= topProbabilityScore)
pushSeparatorIndex(i + 1, usedSeparatorToken);
else if (i > skipFirstTokens) {
const adjustedProbability = separatorProbability + (weight * (1 - separatorProbability));
let maxChunkSizeAlignment = 0;
if (maxChunkSize !== 0 && adjustedProbability < topProbabilityScore) {
const leftProbability = 1 - adjustedProbability;
const currentChunkSize = Math.max(0, 1 + i - chunkStartIndex);
maxChunkSizeAlignment = currentChunkSize === 0
? 0
: adjustExponential(leftProbability * Math.min(1, currentChunkSize / maxChunkSize), maxChunkSizeAlignmentCurve <= 0
? 1
: maxChunkSizeAlignmentCurve, 0.8);
if (currentChunkSize === maxChunkSize)
maxChunkSizeAlignment = 1;
}
if (adjustedProbability + maxChunkSizeAlignment >= topProbabilityScore && adjustedProbability > 0) {
pushSeparatorIndex(i + 1, usedSeparatorToken);
// update the weight of the current token with the adjusted probability in the trail
if (recentProbabilitiesTrail.length > 1) {
weight /= recentProbabilitiesTrail.pop();
recentProbabilitiesTrail.push(adjustedProbability);
weight *= adjustedProbability;
}
}
}
const nextDocumentTokenProbability = nextDocumentToken == null
? undefined
: nextProbabilities.get(nextDocumentToken);
if (nextDocumentTokenProbability != null && nextDocumentTokenProbability > 0) {
recentProbabilitiesTrail.push(nextDocumentTokenProbability);
weight *= nextDocumentTokenProbability;
if (recentProbabilitiesTrail.length > normalizationTrailSize)
weight /= recentProbabilitiesTrail.shift();
}
}
});
if (lastPushedSeparatorIndex !== documentTokens.length)
pushSeparatorIndex(documentTokens.length, separatorTokens[0]);
return res;
}
const idealTokenTexts = [
"\u6bb5", // means "section" in Chinese (according to https://github.com/ZeroEntropy-AI/llama-chunk)
"\u987f", // means "pause" in Chinese (according to Llama 3.1 8B and Qwen 2.5 3B)
"\u00a1", // inverted exclamation mark
"|",
"_"
];
function findAppropriateSeparatorTokens(model, maxTokens = 2) {
const idealTextsSet = new Set(idealTokenTexts);
const foundTokens = [];
for (const token of model.iterateAllTokens()) {
if (model.isSpecialToken(token))
continue;
const text = model.detokenize([token]);
const trimmedText = text.trim();
if (idealTextsSet.has(trimmedText)) {
const textIndex = idealTokenTexts.findIndex((idealText) => idealText === trimmedText);
if (foundTokens[textIndex] == null || text === trimmedText)
foundTokens[textIndex] = token;
}
}
const res = [];
for (let i = 0; i < idealTokenTexts.length; i++) {
const token = foundTokens[i];
if (token != null)
res.push(token);
}
return res.slice(0, maxTokens);
}
function getDefaultPrompt({ separatorTokens, tokenizer, maxChunkSize = 500 }) {
if (separatorTokens.length === 0)
throw new Error("No separator tokens provided");
else if (separatorTokens.length > 2)
throw new Error("Maximum of 2 separator tokens are supported");
return LlamaText.joinValues("\n", [
'Your job is to act as a "Chunker", for usage in RAG pipelines. The user will provide a long document.',
"",
"You should repeat the exact same message verbatim. EXCEPT, you should insert split tokens throughout the document.",
"",
"# Instructions",
LlamaText([
"- For splits, use `",
new SpecialTokensText(tokenizer.detokenize([separatorTokens[0]])),
'` as the "big split token" separator.'
]),
separatorTokens.length > 1 && (LlamaText([
"- For small splits, use `",
new SpecialTokensText(tokenizer.detokenize([separatorTokens[1]])),
'` as the "small split token" separator.'
])),
"- For example, in text document, small splits will be per-sentence, and big splits will be per-section. Do a big split BEFORE the header that defines a section.",
LlamaText([
"- You may get a user message that is unstructured or not structured cleanly. " +
"Still try to split that input as best as you can, even if it means doing a small split every ", Math.ceil(maxChunkSize / 5),
" characters, and a big split every ", Math.floor(maxChunkSize), " characters."
]),
"- You should prefer to wait until the end of a newline or period to break, instead of breaking one or two tokens before that. If there are no newlines or periods, pick some other reasonable breakpoints instead.",
"- Your input could be anything - code, HTML, markdown, etc. You MUST try to output SOME split regardless of the input. Pick something reasonable! E.g. for nodejs, do a small split after every line or code block, and a big split after every function or class definitions.",
'- For HTML, add a small split token after every closing tag and sentence. Add a big split token after every closing tag of an "important" tag.',
"- Please note that you will sometimes not see your own splits in your previous output, that's OK, you MUST continue to try to output split tokens"
].filter((x) => x !== false));
}
function adjustExponential(value, exponent, weight) {
if (value < 0)
return 0;
else if (value > 1)
return 1;
return (value * (1 - weight)) + (weight * Math.pow(value, exponent));
}
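// Worked example (illustrative only):
//     adjustExponential(0.5, 4, 0.8) = (0.5 * 0.2) + (0.8 * 0.5^4) = 0.1 + 0.05 = 0.15
// so mid-range values are pushed down sharply, while values near 1 stay close to 1
// (adjustExponential(1, 4, 0.8) = 1).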
//# sourceMappingURL=chunkDocument.js.map

File diff suppressed because one or more lines are too long