First upload version 0.0.1
This commit is contained in:
459
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/LlamaChat.d.ts
generated
vendored
Normal file
459
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/LlamaChat.d.ts
generated
vendored
Normal file
@@ -0,0 +1,459 @@
|
||||
import { EventRelay } from "lifecycle-utils";
|
||||
import { ChatWrapper } from "../../ChatWrapper.js";
|
||||
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
|
||||
import { ChatHistoryItem, ChatModelFunctions, ChatModelSegmentType, LLamaContextualRepeatPenalty, Token, Tokenizer } from "../../types.js";
|
||||
import { GbnfJsonSchemaToType } from "../../utils/gbnfJson/types.js";
|
||||
import { LlamaGrammar } from "../LlamaGrammar.js";
|
||||
import { LlamaText, LlamaTextJSON } from "../../utils/LlamaText.js";
|
||||
import { EvaluationPriority } from "../LlamaContext/types.js";
|
||||
import { TokenBias } from "../TokenBias.js";
|
||||
import { LlamaModel } from "../LlamaModel/LlamaModel.js";
|
||||
export type LlamaChatOptions = {
|
||||
contextSequence: LlamaContextSequence;
|
||||
/** `"auto"` is used by default */
|
||||
chatWrapper?: "auto" | ChatWrapper;
|
||||
/**
|
||||
* Automatically dispose the sequence when the session is disposed
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
autoDisposeSequence?: boolean;
|
||||
};
|
||||
export type LlamaChatResponseChunk = LlamaChatResponseTextChunk | LlamaChatResponseSegmentChunk;
|
||||
export type LlamaChatResponseTextChunk = {
|
||||
/** When `type` is `undefined`, the chunk is part of the main response and is not a segment */
|
||||
type: undefined;
|
||||
/**
|
||||
* `segmentType` has no purpose when `type` is `undefined` (meaning that this chunk is part of the main response and is not a segment).
|
||||
*/
|
||||
segmentType: undefined;
|
||||
/**
|
||||
* The generated text chunk.
|
||||
*
|
||||
* Detokenized from the `tokens` property,
|
||||
* but with the context of the previous generation (for better spacing of the text with some models).
|
||||
*
|
||||
* Prefer using this property over `tokens` when streaming the generated response as text.
|
||||
*/
|
||||
text: string;
|
||||
/** The generated tokens */
|
||||
tokens: Token[];
|
||||
};
|
||||
export type LlamaChatResponseSegmentChunk = {
|
||||
type: "segment";
|
||||
/** Segment type */
|
||||
segmentType: ChatModelSegmentType;
|
||||
/**
|
||||
* The generated text chunk.
|
||||
*
|
||||
* Detokenized from the `tokens` property,
|
||||
* but with the context of the previous generation (for better spacing of the text with some models).
|
||||
*
|
||||
* Prefer using this property over `tokens` when streaming the generated response as text.
|
||||
*/
|
||||
text: string;
|
||||
/** The generated tokens */
|
||||
tokens: Token[];
|
||||
/**
|
||||
* When the current chunk is the start of a segment, this field will be set.
|
||||
*
|
||||
* It's possible that a chunk with no tokens and empty text will be emitted just to set this field
|
||||
* to signify that the segment has started.
|
||||
*/
|
||||
segmentStartTime?: Date;
|
||||
/**
|
||||
* When the current chunk is the last one of a segment (meaning the current segment has ended), this field will be set.
|
||||
*
|
||||
* It's possible that a chunk with no tokens and empty text will be emitted just to set this field
|
||||
* to signify that the segment has ended.
|
||||
*/
|
||||
segmentEndTime?: Date;
|
||||
};
|
||||
export type LlamaChatResponseFunctionCallParamsChunk = {
|
||||
/**
|
||||
* Each different function call has a different `callIndex`.
|
||||
*
|
||||
* When the previous function call has finished being generated, the `callIndex` of the next one will increment.
|
||||
*
|
||||
* Use this value to distinguish between different function calls.
|
||||
*/
|
||||
callIndex: number;
|
||||
/**
|
||||
* The name of the function being called
|
||||
*/
|
||||
functionName: string;
|
||||
/**
|
||||
* A chunk of the generated text used for the function call parameters.
|
||||
*
|
||||
* Collect all the chunks together to construct the full function call parameters.
|
||||
*
|
||||
* After the function call is finished, the entire constructed params text can be parsed as a JSON object,
|
||||
* according to the function parameters schema.
|
||||
*/
|
||||
paramsChunk: string;
|
||||
/**
|
||||
* When this is `true`, the current chunk is the last chunk in the generation of the current function call parameters.
|
||||
*/
|
||||
done: boolean;
|
||||
};
|
||||
export type LLamaChatGenerateResponseOptions<Functions extends ChatModelFunctions | undefined = undefined> = {
|
||||
/**
|
||||
* Called as the model generates the main response with the generated text chunk.
|
||||
*
|
||||
* Useful for streaming the generated response as it's being generated.
|
||||
*
|
||||
* Includes only the main response without any text segments (like thoughts).
|
||||
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
|
||||
*/
|
||||
onTextChunk?: (text: string) => void;
|
||||
/**
|
||||
* Called as the model generates the main response with the generated tokens.
|
||||
*
|
||||
* Preferably, you'd want to use {@link onTextChunk `onTextChunk`} instead of this.
|
||||
*
|
||||
* Includes only the main response without any segments (like thoughts).
|
||||
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
|
||||
*/
|
||||
onToken?: (tokens: Token[]) => void;
|
||||
/**
|
||||
* Called as the model generates a response with the generated text and tokens,
|
||||
* including segment information (when the generated output is part of a segment).
|
||||
*
|
||||
* Useful for streaming the generated response as it's being generated, including the main response and all segments.
|
||||
*
|
||||
* Only use this function when you need the segmented texts, like thought segments (chain of thought text).
|
||||
*/
|
||||
onResponseChunk?: (chunk: LlamaChatResponseChunk) => void;
|
||||
/**
|
||||
* An AbortSignal to later abort the generation.
|
||||
*
|
||||
* When the signal is aborted, the generation will stop and throw `signal.reason` as the error.
|
||||
*
|
||||
* > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`.
|
||||
*/
|
||||
signal?: AbortSignal;
|
||||
/**
|
||||
* When a response already started being generated and then the signal is aborted,
|
||||
* the generation will stop and the response will be returned as is instead of throwing an error.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
stopOnAbortSignal?: boolean;
|
||||
/** Maximum number of tokens to generate */
|
||||
maxTokens?: number;
|
||||
/**
|
||||
* Temperature is a hyperparameter that controls the randomness of the generated text.
|
||||
* It affects the probability distribution of the model's output tokens.
|
||||
*
|
||||
* A higher temperature (e.g., 1.5) makes the output more random and creative,
|
||||
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
|
||||
*
|
||||
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
|
||||
*
|
||||
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
|
||||
*
|
||||
* Set to `0` to disable.
|
||||
* Disabled by default (set to `0`).
|
||||
*/
|
||||
temperature?: number;
|
||||
/**
|
||||
* From the next token candidates, discard the percentage of tokens with the lowest probability.
|
||||
* For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
|
||||
* This is useful for generating more high-quality results when using a high temperature.
|
||||
* Set to a value between `0` and `1` to enable.
|
||||
*
|
||||
* Only relevant when `temperature` is set to a value greater than `0`.
|
||||
* Disabled by default.
|
||||
*/
|
||||
minP?: number;
|
||||
/**
|
||||
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
|
||||
* An integer number between `1` and the size of the vocabulary.
|
||||
* Set to `0` to disable (which uses the full vocabulary).
|
||||
*
|
||||
* Only relevant when `temperature` is set to a value greater than 0.
|
||||
*/
|
||||
topK?: number;
|
||||
/**
|
||||
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
|
||||
* and samples the next token only from this set.
|
||||
* A float number between `0` and `1`.
|
||||
* Set to `1` to disable.
|
||||
*
|
||||
* Only relevant when `temperature` is set to a value greater than `0`.
|
||||
*/
|
||||
topP?: number;
|
||||
/**
|
||||
* Used to control the randomness of the generated text.
|
||||
*
|
||||
* Change the seed to get different results.
|
||||
*
|
||||
* Only relevant when using `temperature`.
|
||||
*/
|
||||
seed?: number;
|
||||
/**
|
||||
* Trim whitespace from the end of the generated text
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
trimWhitespaceSuffix?: boolean;
|
||||
repeatPenalty?: false | LLamaContextualRepeatPenalty;
|
||||
/**
|
||||
* Adjust the probability of tokens being generated.
|
||||
* Can be used to bias the model to generate tokens that you want it to lean towards,
|
||||
* or to avoid generating tokens that you want it to avoid.
|
||||
*/
|
||||
tokenBias?: TokenBias | (() => TokenBias);
|
||||
/**
|
||||
* See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
|
||||
*/
|
||||
evaluationPriority?: EvaluationPriority;
|
||||
contextShift?: LLamaChatContextShiftOptions;
|
||||
/**
|
||||
* Custom stop triggers to stop the generation of the response when any of the provided triggers are found.
|
||||
*/
|
||||
customStopTriggers?: readonly (LlamaText | string | readonly (string | Token)[])[];
|
||||
/**
|
||||
* The evaluation context window returned from the last evaluation.
|
||||
* This is an optimization to utilize existing context sequence state better when possible.
|
||||
*/
|
||||
lastEvaluationContextWindow?: {
|
||||
/** The history of the last evaluation. */
|
||||
history?: ChatHistoryItem[];
|
||||
/**
|
||||
* Minimum overlap percentage with existing context sequence state to use the last evaluation context window.
|
||||
* If the last evaluation context window is not used, a new context will be generated based on the full history,
|
||||
* which will decrease the likelihood of another context shift happening so soon.
|
||||
*
|
||||
* A number between `0` (exclusive) and `1` (inclusive).
|
||||
*/
|
||||
minimumOverlapPercentageToPreventContextShift?: number;
|
||||
};
|
||||
/**
|
||||
* Called as the model generates function calls with the generated parameters chunk for each function call.
|
||||
*
|
||||
* Useful for streaming the generated function call parameters as they're being generated.
|
||||
* Only useful in specific use cases,
|
||||
* such as showing the generated textual file content as it's being generated (note that doing this requires parsing incomplete JSON).
|
||||
*
|
||||
* The constructed text from all the params chunks of a given function call can be parsed as a JSON object,
|
||||
* according to the function parameters schema.
|
||||
*
|
||||
* Each function call has its own `callIndex` you can use to distinguish between them.
|
||||
*
|
||||
* Only relevant when using function calling (via passing the `functions` option).
|
||||
*/
|
||||
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
|
||||
/**
|
||||
* Set the maximum number of tokens the model is allowed to spend on various segmented responses.
|
||||
*/
|
||||
budgets?: {
|
||||
/**
|
||||
* Whether to include the tokens already consumed by the current model response being completed in the budget.
|
||||
*
|
||||
* Defaults to `true`.
|
||||
*/
|
||||
includeCurrentResponse?: boolean;
|
||||
/**
|
||||
* Budget for thought tokens.
|
||||
*
|
||||
* Defaults to `Infinity`.
|
||||
*/
|
||||
thoughtTokens?: number;
|
||||
/**
|
||||
* Budget for comment tokens.
|
||||
*
|
||||
* Defaults to `Infinity`.
|
||||
*/
|
||||
commentTokens?: number;
|
||||
};
|
||||
/**
|
||||
* Stop the generation when the model tries to generate a non-textual segment or call a function.
|
||||
*
|
||||
* Useful for generating completions in a form of a model response.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
abortOnNonText?: boolean;
|
||||
} & ({
|
||||
grammar?: LlamaGrammar;
|
||||
functions?: never;
|
||||
documentFunctionParams?: never;
|
||||
maxParallelFunctionCalls?: never;
|
||||
onFunctionCall?: never;
|
||||
onFunctionCallParamsChunk?: never;
|
||||
} | {
|
||||
grammar?: never;
|
||||
functions?: Functions | ChatModelFunctions;
|
||||
documentFunctionParams?: boolean;
|
||||
maxParallelFunctionCalls?: number;
|
||||
onFunctionCall?: (functionCall: LlamaChatResponseFunctionCall<Functions extends ChatModelFunctions ? Functions : ChatModelFunctions>) => void;
|
||||
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
|
||||
});
|
||||
export type LLamaChatLoadAndCompleteUserMessageOptions<Functions extends ChatModelFunctions | undefined = undefined> = {
|
||||
/**
|
||||
* Complete the given user prompt without adding it or the completion to the returned context window.
|
||||
*/
|
||||
initialUserPrompt?: string;
|
||||
/**
|
||||
* When a completion already started being generated and then the signal is aborted,
|
||||
* the generation will stop and the completion will be returned as is instead of throwing an error.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
stopOnAbortSignal?: boolean;
|
||||
/**
|
||||
* Called as the model generates a completion with the generated text chunk.
|
||||
*
|
||||
* Useful for streaming the generated completion as it's being generated.
|
||||
*/
|
||||
onTextChunk?: LLamaChatGenerateResponseOptions<Functions>["onTextChunk"];
|
||||
/**
|
||||
* Called as the model generates a completion with the generated tokens.
|
||||
*
|
||||
* Preferably, you'd want to use `onTextChunk` instead of this.
|
||||
*/
|
||||
onToken?: LLamaChatGenerateResponseOptions<Functions>["onToken"];
|
||||
signal?: LLamaChatGenerateResponseOptions<Functions>["signal"];
|
||||
maxTokens?: LLamaChatGenerateResponseOptions<Functions>["maxTokens"];
|
||||
temperature?: LLamaChatGenerateResponseOptions<Functions>["temperature"];
|
||||
minP?: LLamaChatGenerateResponseOptions<Functions>["minP"];
|
||||
topK?: LLamaChatGenerateResponseOptions<Functions>["topK"];
|
||||
topP?: LLamaChatGenerateResponseOptions<Functions>["topP"];
|
||||
seed?: LLamaChatGenerateResponseOptions<Functions>["seed"];
|
||||
trimWhitespaceSuffix?: LLamaChatGenerateResponseOptions<Functions>["trimWhitespaceSuffix"];
|
||||
repeatPenalty?: LLamaChatGenerateResponseOptions<Functions>["repeatPenalty"];
|
||||
tokenBias?: LLamaChatGenerateResponseOptions<Functions>["tokenBias"];
|
||||
evaluationPriority?: LLamaChatGenerateResponseOptions<Functions>["evaluationPriority"];
|
||||
contextShift?: LLamaChatGenerateResponseOptions<Functions>["contextShift"];
|
||||
customStopTriggers?: LLamaChatGenerateResponseOptions<Functions>["customStopTriggers"];
|
||||
lastEvaluationContextWindow?: LLamaChatGenerateResponseOptions<Functions>["lastEvaluationContextWindow"];
|
||||
grammar?: LlamaGrammar;
|
||||
/**
|
||||
* Functions are not used by the model here,
|
||||
* but are used for keeping the instructions given to the model about the functions in the current context state,
|
||||
* to avoid context shifts.
|
||||
*
|
||||
* It's best to provide the same functions that were used for the previous prompt here.
|
||||
*/
|
||||
functions?: Functions | ChatModelFunctions;
|
||||
/**
|
||||
* Functions are not used by the model here,
|
||||
* but are used for keeping the instructions given to the model about the functions in the current context state,
|
||||
* to avoid context shifts.
|
||||
*
|
||||
* It's best to provide the same value that was used for the previous prompt here.
|
||||
*/
|
||||
documentFunctionParams?: boolean;
|
||||
};
|
||||
export type LLamaChatContextShiftOptions = {
|
||||
/**
|
||||
* The number of tokens to delete from the context window to make space for new ones.
|
||||
* Defaults to 10% of the context size.
|
||||
*/
|
||||
size?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
|
||||
/**
|
||||
* The strategy to use when deleting tokens from the context window.
|
||||
*
|
||||
* Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
|
||||
*/
|
||||
strategy?: "eraseFirstResponseAndKeepFirstSystem" | ((options: {
|
||||
/** Full chat history */
|
||||
chatHistory: readonly ChatHistoryItem[];
|
||||
/** Maximum number of tokens that the new chat history should fit under when tokenized */
|
||||
maxTokensCount: number;
|
||||
/** Tokenizer used to tokenize the chat history */
|
||||
tokenizer: Tokenizer;
|
||||
/** Chat wrapper used to generate the context state */
|
||||
chatWrapper: ChatWrapper;
|
||||
/**
|
||||
* The metadata returned from the last context shift strategy call.
|
||||
* Will be `null` on the first call.
|
||||
*/
|
||||
lastShiftMetadata?: object | null;
|
||||
}) => {
|
||||
chatHistory: ChatHistoryItem[];
|
||||
metadata?: object | null;
|
||||
} | Promise<{
|
||||
chatHistory: ChatHistoryItem[];
|
||||
metadata?: object | null;
|
||||
}>);
|
||||
/**
|
||||
* The `contextShiftMetadata` returned from the last evaluation.
|
||||
* This is an optimization to utilize the existing context state better when possible.
|
||||
*/
|
||||
lastEvaluationMetadata?: object | undefined | null;
|
||||
};
|
||||
export declare class LlamaChat {
|
||||
readonly onDispose: EventRelay<void>;
|
||||
constructor({ contextSequence, chatWrapper, autoDisposeSequence }: LlamaChatOptions);
|
||||
dispose({ disposeSequence }?: {
|
||||
disposeSequence?: boolean;
|
||||
}): void;
|
||||
/** @hidden */
|
||||
[Symbol.dispose](): void;
|
||||
get disposed(): boolean;
|
||||
get chatWrapper(): ChatWrapper;
|
||||
get sequence(): LlamaContextSequence;
|
||||
get context(): import("../LlamaContext/LlamaContext.js").LlamaContext;
|
||||
get model(): LlamaModel;
|
||||
generateResponse<const Functions extends ChatModelFunctions | undefined = undefined>(history: ChatHistoryItem[], options?: LLamaChatGenerateResponseOptions<Functions>): Promise<LlamaChatResponse<Functions>>;
|
||||
loadChatAndCompleteUserMessage<const Functions extends ChatModelFunctions | undefined = undefined>(history: ChatHistoryItem[], options?: LLamaChatLoadAndCompleteUserMessageOptions<Functions>): Promise<LlamaChatLoadAndCompleteUserResponse>;
|
||||
}
|
||||
export type LlamaChatResponse<Functions extends ChatModelFunctions | undefined = undefined> = {
|
||||
/**
|
||||
* The response text only, _without_ any text segments (like thoughts).
|
||||
*/
|
||||
response: string;
|
||||
/**
|
||||
* The full response, including all text and text segments (like thoughts).
|
||||
*/
|
||||
fullResponse: Array<string | LlamaChatResponseSegment>;
|
||||
functionCalls?: Functions extends ChatModelFunctions ? LlamaChatResponseFunctionCall<Functions>[] : never;
|
||||
lastEvaluation: {
|
||||
cleanHistory: ChatHistoryItem[];
|
||||
contextWindow: ChatHistoryItem[];
|
||||
contextShiftMetadata: any;
|
||||
};
|
||||
metadata: {
|
||||
remainingGenerationAfterStop?: string | Token[];
|
||||
stopReason: "eogToken" | "stopGenerationTrigger" | "functionCalls" | "maxTokens" | "abort";
|
||||
} | {
|
||||
remainingGenerationAfterStop?: string | Token[];
|
||||
stopReason: "customStopTrigger";
|
||||
customStopTrigger: (string | Token)[];
|
||||
};
|
||||
};
|
||||
export type LlamaChatResponseFunctionCall<Functions extends ChatModelFunctions, FunctionCallName extends keyof Functions & string = string & keyof Functions, Params = Functions[FunctionCallName]["params"] extends undefined | null | void ? undefined : GbnfJsonSchemaToType<Functions[FunctionCallName]["params"]>> = {
|
||||
functionName: FunctionCallName;
|
||||
params: Params;
|
||||
raw: LlamaTextJSON;
|
||||
};
|
||||
export type LlamaChatResponseSegment = {
|
||||
type: "segment";
|
||||
segmentType: ChatModelSegmentType;
|
||||
text: string;
|
||||
ended: boolean;
|
||||
raw: LlamaTextJSON;
|
||||
startTime?: string;
|
||||
endTime?: string;
|
||||
};
|
||||
export type LlamaChatLoadAndCompleteUserResponse = {
|
||||
completion: string;
|
||||
lastEvaluation: {
|
||||
/**
|
||||
* The completion and initial user prompt are not added to this context window result,
|
||||
* but are loaded to the current context sequence state as tokens
|
||||
*/
|
||||
contextWindow: ChatHistoryItem[];
|
||||
contextShiftMetadata: any;
|
||||
};
|
||||
metadata: {
|
||||
remainingGenerationAfterStop?: string | Token[];
|
||||
stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" | "abort";
|
||||
} | {
|
||||
remainingGenerationAfterStop?: string | Token[];
|
||||
stopReason: "customStopTrigger";
|
||||
customStopTrigger: (string | Token)[];
|
||||
};
|
||||
};
|
||||
2584
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/LlamaChat.js
generated
vendored
Normal file
2584
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/LlamaChat.js
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/LlamaChat.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/LlamaChat.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
11
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/FunctionCallNameGrammar.d.ts
generated
vendored
Normal file
11
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/FunctionCallNameGrammar.d.ts
generated
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
import { LlamaGrammar } from "../../LlamaGrammar.js";
|
||||
import { ChatModelFunctions } from "../../../types.js";
|
||||
import { ChatWrapper } from "../../../ChatWrapper.js";
|
||||
import { Llama } from "../../../bindings/Llama.js";
|
||||
export declare class FunctionCallNameGrammar<const Functions extends ChatModelFunctions> extends LlamaGrammar {
|
||||
private readonly _functions;
|
||||
private readonly _chatWrapper;
|
||||
constructor(llama: Llama, functions: Functions, chatWrapper: ChatWrapper);
|
||||
parseFunctionName(generatedFunctionName: string): keyof Functions & string;
|
||||
private _validateFunctions;
|
||||
}
|
||||
55
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/FunctionCallNameGrammar.js
generated
vendored
Normal file
55
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/FunctionCallNameGrammar.js
generated
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
import { LlamaGrammar } from "../../LlamaGrammar.js";
|
||||
import { LlamaText } from "../../../utils/LlamaText.js";
|
||||
import { GbnfGrammarGenerator } from "../../../utils/gbnfJson/GbnfGrammarGenerator.js";
|
||||
import { GbnfGrammar } from "../../../utils/gbnfJson/terminals/GbnfGrammar.js";
|
||||
import { GbnfOr } from "../../../utils/gbnfJson/terminals/GbnfOr.js";
|
||||
import { GbnfVerbatimText } from "../../../utils/gbnfJson/terminals/GbnfVerbatimText.js";
|
||||
import { LlamaFunctionCallValidationError } from "./LlamaFunctionCallValidationError.js";
|
||||
export class FunctionCallNameGrammar extends LlamaGrammar {
|
||||
_functions;
|
||||
_chatWrapper;
|
||||
constructor(llama, functions, chatWrapper) {
|
||||
const grammar = getGbnfGrammarForFunctionName(functions, chatWrapper);
|
||||
super(llama, {
|
||||
grammar,
|
||||
stopGenerationTriggers: [LlamaText("\n")],
|
||||
trimWhitespaceSuffix: true
|
||||
});
|
||||
this._functions = functions;
|
||||
this._chatWrapper = chatWrapper;
|
||||
this._validateFunctions();
|
||||
}
|
||||
parseFunctionName(generatedFunctionName) {
|
||||
if (this._chatWrapper.settings.functions.call.optionalPrefixSpace && generatedFunctionName[0] === " ")
|
||||
generatedFunctionName = generatedFunctionName.slice(1);
|
||||
const newlineIndex = generatedFunctionName.indexOf("\n");
|
||||
const functionName = generatedFunctionName.slice(0, newlineIndex < 0
|
||||
? generatedFunctionName.length
|
||||
: newlineIndex);
|
||||
if (!Object.hasOwn(this._functions, functionName))
|
||||
throw new LlamaFunctionCallValidationError(`Function name "${functionName}" is not in the supplied functions object`, this._functions, this._chatWrapper, generatedFunctionName);
|
||||
return functionName;
|
||||
}
|
||||
_validateFunctions() {
|
||||
for (const functionsName of Object.keys(this._functions)) {
|
||||
if (functionsName.includes(" ") || functionsName.includes("\n") || functionsName.includes("\t"))
|
||||
throw new Error(`Function name "${functionsName}" contains spaces, new lines or tabs`);
|
||||
else if (functionsName === "")
|
||||
throw new Error("Function name cannot be an empty string");
|
||||
}
|
||||
}
|
||||
}
|
||||
function getGbnfGrammarForFunctionName(functions, chatWrapper) {
|
||||
const grammarGenerator = new GbnfGrammarGenerator();
|
||||
const functionNameGrammars = [];
|
||||
for (const functionName of Object.keys(functions))
|
||||
functionNameGrammars.push(new GbnfVerbatimText(functionName));
|
||||
const callGrammar = new GbnfOr(functionNameGrammars);
|
||||
const rootTerminal = new GbnfGrammar([
|
||||
...(chatWrapper.settings.functions.call.optionalPrefixSpace ? ["[ ]?"] : []),
|
||||
callGrammar.resolve(grammarGenerator)
|
||||
]);
|
||||
const rootGrammar = rootTerminal.getGrammar();
|
||||
return grammarGenerator.generateGbnfFile(rootGrammar + " [\\n]");
|
||||
}
|
||||
//# sourceMappingURL=FunctionCallNameGrammar.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/FunctionCallNameGrammar.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/FunctionCallNameGrammar.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"FunctionCallNameGrammar.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaChat/utils/FunctionCallNameGrammar.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,YAAY,EAAC,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAC,SAAS,EAAC,MAAM,6BAA6B,CAAC;AAEtD,OAAO,EAAC,oBAAoB,EAAC,MAAM,iDAAiD,CAAC;AAErF,OAAO,EAAC,WAAW,EAAC,MAAM,kDAAkD,CAAC;AAE7E,OAAO,EAAC,MAAM,EAAC,MAAM,6CAA6C,CAAC;AACnE,OAAO,EAAC,gBAAgB,EAAC,MAAM,uDAAuD,CAAC;AAEvF,OAAO,EAAC,gCAAgC,EAAC,MAAM,uCAAuC,CAAC;AAGvF,MAAM,OAAO,uBAAoE,SAAQ,YAAY;IAChF,UAAU,CAAY;IACtB,YAAY,CAAc;IAE3C,YAAmB,KAAY,EAAE,SAAoB,EAAE,WAAwB;QAC3E,MAAM,OAAO,GAAG,6BAA6B,CAAC,SAAS,EAAE,WAAW,CAAC,CAAC;QAEtE,KAAK,CAAC,KAAK,EAAE;YACT,OAAO;YACP,sBAAsB,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YACzC,oBAAoB,EAAE,IAAI;SAC7B,CAAC,CAAC;QAEH,IAAI,CAAC,UAAU,GAAG,SAAS,CAAC;QAC5B,IAAI,CAAC,YAAY,GAAG,WAAW,CAAC;QAEhC,IAAI,CAAC,kBAAkB,EAAE,CAAC;IAC9B,CAAC;IAEM,iBAAiB,CAAC,qBAA6B;QAClD,IAAI,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,IAAI,qBAAqB,CAAC,CAAC,CAAC,KAAK,GAAG;YACjG,qBAAqB,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAE3D,MAAM,YAAY,GAAG,qBAAqB,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAEzD,MAAM,YAAY,GAAG,qBAAqB,CAAC,KAAK,CAC5C,CAAC,EACD,YAAY,GAAG,CAAC;YACZ,CAAC,CAAC,qBAAqB,CAAC,MAAM;YAC9B,CAAC,CAAC,YAAY,CACO,CAAC;QAE9B,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,UAAU,EAAE,YAAY,CAAC;YAC7C,MAAM,IAAI,gCAAgC,CACtC,kBAAkB,YAAY,2CAA2C,EACzE,IAAI,CAAC,UAAU,EACf,IAAI,CAAC,YAAY,EACjB,qBAAqB,CACxB,CAAC;QAEN,OAAO,YAAY,CAAC;IACxB,CAAC;IAEO,kBAAkB;QACtB,KAAK,MAAM,aAAa,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,EAAE,CAAC;YACvD,IAAI,aAAa,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,aAAa,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,aAAa,CAAC,QAAQ,CAAC,IAAI,CAAC;gBAC3F,MAAM,IAAI,KAAK,CAAC,kBAAkB,aAAa,sCAAsC,CAAC,CAAC;iBACtF,IAAI,aAAa,KAAK,EAAE;gBACzB,MAAM,IAAI,KAAK,CAAC,yCAAyC,CAAC,CAAC;QACnE,CAAC;IACL,CAAC;CACJ;AAED,SAAS,6BAA6B,CAClC,SAAoB,EAAE,WAAwB;IAE9C,MAAM,gBAAgB,GAAG,IAAI,oBAAoB,EAAE,CAAC;IAEpD,MAAM,oBAAoB,GAAmB,EAAE,CAAC;IAEhD,KAAK,MAAM,YAAY,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC;QAC7C,oBAAoB,CAAC,IAAI,CAAC,IAAI,gBAAgB,CAAC,YAAY,CAAC,CAAC,CAAC;IAElE,MAAM,WAAW,GAAG,IAAI,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAErD,MAAM,YAAY,GAAG,IAAI,WAAW,CAAC;QACjC,GAAG,CAAC,WAAW,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC5E,WAAW,CAAC,OAAO,CAAC,gBAAgB,CAAC;KACxC,CAAC,CAAC;IAEH,MAAM,WAAW,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC;IAE9C,OAAO,gBAAgB,CAAC,gBAAgB,CAAC,WAAW,GAAG,QAAQ,CAAC,CAAC;AACrE,CAAC"}
|
||||
16
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/FunctionCallParamsGrammar.d.ts
generated
vendored
Normal file
16
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/FunctionCallParamsGrammar.d.ts
generated
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
import { LlamaGrammar } from "../../LlamaGrammar.js";
|
||||
import { ChatModelFunctions } from "../../../types.js";
|
||||
import { ChatWrapper } from "../../../ChatWrapper.js";
|
||||
import { Llama } from "../../../bindings/Llama.js";
|
||||
import { GbnfJsonSchema } from "../../../utils/gbnfJson/types.js";
|
||||
export declare class FunctionCallParamsGrammar<const Functions extends ChatModelFunctions> extends LlamaGrammar {
|
||||
private readonly _functions;
|
||||
private readonly _chatWrapper;
|
||||
private readonly _functionName;
|
||||
private readonly _paramsSchema;
|
||||
constructor(llama: Llama, functions: Functions, chatWrapper: ChatWrapper, functionName: string, paramsSchema: GbnfJsonSchema);
|
||||
parseParams(callText: string): {
|
||||
params: any;
|
||||
raw: string;
|
||||
};
|
||||
}
|
||||
45
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/FunctionCallParamsGrammar.js
generated
vendored
Normal file
45
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/FunctionCallParamsGrammar.js
generated
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
import { LlamaGrammar } from "../../LlamaGrammar.js";
|
||||
import { LlamaText } from "../../../utils/LlamaText.js";
|
||||
import { validateObjectAgainstGbnfSchema } from "../../../utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js";
|
||||
import { GbnfGrammarGenerator } from "../../../utils/gbnfJson/GbnfGrammarGenerator.js";
|
||||
import { getGbnfJsonTerminalForGbnfJsonSchema } from "../../../utils/gbnfJson/utils/getGbnfJsonTerminalForGbnfJsonSchema.js";
|
||||
import { LlamaFunctionCallValidationError } from "./LlamaFunctionCallValidationError.js";
|
||||
export class FunctionCallParamsGrammar extends LlamaGrammar {
|
||||
_functions;
|
||||
_chatWrapper;
|
||||
_functionName;
|
||||
_paramsSchema;
|
||||
constructor(llama, functions, chatWrapper, functionName, paramsSchema) {
|
||||
const grammar = getGbnfGrammarForFunctionParams(paramsSchema);
|
||||
super(llama, {
|
||||
grammar,
|
||||
stopGenerationTriggers: [LlamaText("\n".repeat(4))],
|
||||
trimWhitespaceSuffix: true
|
||||
});
|
||||
this._functions = functions;
|
||||
this._chatWrapper = chatWrapper;
|
||||
this._functionName = functionName;
|
||||
this._paramsSchema = paramsSchema;
|
||||
}
|
||||
parseParams(callText) {
|
||||
const endIndex = callText.lastIndexOf("\n".repeat(4));
|
||||
if (endIndex < 0)
|
||||
throw new LlamaFunctionCallValidationError(`Expected function call params for function "${this._functionName}" to end with stop generation trigger`, this._functions, this._chatWrapper, callText);
|
||||
const paramsString = callText.slice(0, endIndex);
|
||||
if (paramsString.trim().length === 0)
|
||||
throw new LlamaFunctionCallValidationError(`Expected function call params for function "${this._functionName}" to not be empty`, this._functions, this._chatWrapper, callText);
|
||||
const params = JSON.parse(paramsString);
|
||||
validateObjectAgainstGbnfSchema(params, this._paramsSchema);
|
||||
return {
|
||||
params: params, // prevent infinite TS type instantiation
|
||||
raw: paramsString
|
||||
};
|
||||
}
|
||||
}
|
||||
function getGbnfGrammarForFunctionParams(paramsSchema) {
|
||||
const grammarGenerator = new GbnfGrammarGenerator();
|
||||
const rootTerminal = getGbnfJsonTerminalForGbnfJsonSchema(paramsSchema, grammarGenerator);
|
||||
const rootGrammar = rootTerminal.resolve(grammarGenerator, true);
|
||||
return grammarGenerator.generateGbnfFile(rootGrammar + ` "${"\\n".repeat(4)}"`);
|
||||
}
|
||||
//# sourceMappingURL=FunctionCallParamsGrammar.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/FunctionCallParamsGrammar.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/FunctionCallParamsGrammar.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"FunctionCallParamsGrammar.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaChat/utils/FunctionCallParamsGrammar.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,YAAY,EAAC,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAC,SAAS,EAAC,MAAM,6BAA6B,CAAC;AACtD,OAAO,EAAC,+BAA+B,EAAC,MAAM,kEAAkE,CAAC;AAEjH,OAAO,EAAC,oBAAoB,EAAC,MAAM,iDAAiD,CAAC;AACrF,OAAO,EAAC,oCAAoC,EAAC,MAAM,uEAAuE,CAAC;AAI3H,OAAO,EAAC,gCAAgC,EAAC,MAAM,uCAAuC,CAAC;AAGvF,MAAM,OAAO,yBAAsE,SAAQ,YAAY;IAClF,UAAU,CAAY;IACtB,YAAY,CAAc;IAC1B,aAAa,CAAS;IACtB,aAAa,CAAiB;IAE/C,YAAmB,KAAY,EAAE,SAAoB,EAAE,WAAwB,EAAE,YAAoB,EAAE,YAA4B;QAC/H,MAAM,OAAO,GAAG,+BAA+B,CAAC,YAAY,CAAC,CAAC;QAE9D,KAAK,CAAC,KAAK,EAAE;YACT,OAAO;YACP,sBAAsB,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;YACnD,oBAAoB,EAAE,IAAI;SAC7B,CAAC,CAAC;QAEH,IAAI,CAAC,UAAU,GAAG,SAAS,CAAC;QAC5B,IAAI,CAAC,YAAY,GAAG,WAAW,CAAC;QAChC,IAAI,CAAC,aAAa,GAAG,YAAY,CAAC;QAClC,IAAI,CAAC,aAAa,GAAG,YAAY,CAAC;IACtC,CAAC;IAEM,WAAW,CAAC,QAAgB;QAC/B,MAAM,QAAQ,GAAG,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QAEtD,IAAI,QAAQ,GAAG,CAAC;YACZ,MAAM,IAAI,gCAAgC,CACtC,+CAA+C,IAAI,CAAC,aAAa,uCAAuC,EACxG,IAAI,CAAC,UAAU,EACf,IAAI,CAAC,YAAY,EACjB,QAAQ,CACX,CAAC;QAEN,MAAM,YAAY,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;QAEjD,IAAI,YAAY,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAChC,MAAM,IAAI,gCAAgC,CACtC,+CAA+C,IAAI,CAAC,aAAa,mBAAmB,EACpF,IAAI,CAAC,UAAU,EACf,IAAI,CAAC,YAAY,EACjB,QAAQ,CACX,CAAC;QAEN,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QAExC,+BAA+B,CAAC,MAAM,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC;QAE5D,OAAO;YACH,MAAM,EAAE,MAAa,EAAE,yCAAyC;YAChE,GAAG,EAAE,YAAY;SACpB,CAAC;IACN,CAAC;CACJ;AAED,SAAS,+BAA+B,CAAC,YAA4B;IACjE,MAAM,gBAAgB,GAAG,IAAI,oBAAoB,EAAE,CAAC;IACpD,MAAM,YAAY,GAAG,oCAAoC,CAAC,YAAY,EAAE,gBAAgB,CAAC,CAAC;IAC1F,MAAM,WAAW,GAAG,YAAY,CAAC,OAAO,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC;IAEjE,OAAO,gBAAgB,CAAC,gBAAgB,CAAC,WAAW,GAAG,KAAK,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;AACpF,CAAC"}
|
||||
8
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/LlamaFunctionCallValidationError.d.ts
generated
vendored
Normal file
8
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/LlamaFunctionCallValidationError.d.ts
generated
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
import { ChatModelFunctions } from "../../../types.js";
|
||||
import { ChatWrapper } from "../../../ChatWrapper.js";
|
||||
export declare class LlamaFunctionCallValidationError<const Functions extends ChatModelFunctions> extends Error {
|
||||
readonly functions: Functions;
|
||||
readonly chatWrapper: ChatWrapper;
|
||||
readonly callText: string;
|
||||
constructor(message: string, functions: Functions, chatWrapper: ChatWrapper, callText: string);
|
||||
}
|
||||
12
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/LlamaFunctionCallValidationError.js
generated
vendored
Normal file
12
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/LlamaFunctionCallValidationError.js
generated
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
export class LlamaFunctionCallValidationError extends Error {
|
||||
functions;
|
||||
chatWrapper;
|
||||
callText;
|
||||
constructor(message, functions, chatWrapper, callText) {
|
||||
super(message);
|
||||
this.functions = functions;
|
||||
this.chatWrapper = chatWrapper;
|
||||
this.callText = callText;
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=LlamaFunctionCallValidationError.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/LlamaFunctionCallValidationError.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChat/utils/LlamaFunctionCallValidationError.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"LlamaFunctionCallValidationError.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaChat/utils/LlamaFunctionCallValidationError.ts"],"names":[],"mappings":"AAIA,MAAM,OAAO,gCAA6E,SAAQ,KAAK;IACnF,SAAS,CAAY;IACrB,WAAW,CAAc;IACzB,QAAQ,CAAS;IAEjC,YAAmB,OAAe,EAAE,SAAoB,EAAE,WAAwB,EAAE,QAAgB;QAChG,KAAK,CAAC,OAAO,CAAC,CAAC;QAEf,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;QAC3B,IAAI,CAAC,WAAW,GAAG,WAAW,CAAC;QAC/B,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC7B,CAAC;CACJ"}
|
||||
@@ -0,0 +1,16 @@
|
||||
import { ChatHistoryItem, Tokenizer } from "../../../../types.js";
|
||||
import { ChatWrapper } from "../../../../ChatWrapper.js";
|
||||
export declare function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy({ chatHistory, maxTokensCount, tokenizer, chatWrapper, lastShiftMetadata }: {
|
||||
chatHistory: ChatHistoryItem[];
|
||||
maxTokensCount: number;
|
||||
tokenizer: Tokenizer;
|
||||
chatWrapper: ChatWrapper;
|
||||
lastShiftMetadata?: object | null;
|
||||
}): Promise<{
|
||||
chatHistory: ChatHistoryItem[];
|
||||
metadata: CalculationMetadata;
|
||||
}>;
|
||||
type CalculationMetadata = {
|
||||
removedCharactersNumber: number;
|
||||
};
|
||||
export {};
|
||||
@@ -0,0 +1,254 @@
|
||||
import { isChatModelResponseFunctionCall, isChatModelResponseSegment } from "../../../../types.js";
|
||||
import { findCharacterRemovalCountToFitChatHistoryInContext } from "../../../../utils/findCharacterRemovalCountToFitChatHistoryInContext.js";
|
||||
import { truncateLlamaTextAndRoundToWords, truncateTextAndRoundToWords } from "../../../../utils/truncateTextAndRoundToWords.js";
|
||||
import { LlamaText } from "../../../../utils/LlamaText.js";
|
||||
export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy({ chatHistory, maxTokensCount, tokenizer, chatWrapper, lastShiftMetadata }) {
|
||||
let initialCharactersRemovalCount = 0;
|
||||
if (isCalculationMetadata(lastShiftMetadata))
|
||||
initialCharactersRemovalCount = lastShiftMetadata.removedCharactersNumber;
|
||||
const { removedCharactersCount, compressedChatHistory } = await findCharacterRemovalCountToFitChatHistoryInContext({
|
||||
chatHistory,
|
||||
tokensCountToFit: maxTokensCount,
|
||||
initialCharactersRemovalCount,
|
||||
tokenizer,
|
||||
chatWrapper,
|
||||
failedCompressionErrorMessage: "Failed to compress chat history for context shift due to a too long prompt or system message that cannot be compressed without affecting the generation quality. " +
|
||||
"Consider increasing the context size or shortening the long prompt or system message.",
|
||||
compressChatHistory({ chatHistory, charactersToRemove, estimatedCharactersPerToken }) {
|
||||
const res = chatHistory.map((item) => structuredClone(item));
|
||||
let charactersLeftToRemove = charactersToRemove;
|
||||
function compressFunctionCalls() {
|
||||
for (let i = res.length - 1; i >= 0 && charactersLeftToRemove > 0; i--) {
|
||||
const historyItem = res[i];
|
||||
if (historyItem.type !== "model")
|
||||
continue;
|
||||
for (let t = historyItem.response.length - 1; t >= 0 && charactersLeftToRemove > 0; t--) {
|
||||
const item = historyItem.response[t];
|
||||
if (typeof item === "string" || item.type !== "functionCall")
|
||||
continue;
|
||||
if (item.rawCall == null)
|
||||
continue;
|
||||
const originalRawCallTokensLength = LlamaText.fromJSON(item.rawCall).tokenize(tokenizer, "trimLeadingSpace").length;
|
||||
const newRawCallText = chatWrapper.generateFunctionCall(item.name, item.params);
|
||||
const newRawCallTextTokensLength = newRawCallText.tokenize(tokenizer, "trimLeadingSpace").length;
|
||||
if (newRawCallTextTokensLength < originalRawCallTokensLength) {
|
||||
item.rawCall = newRawCallText.toJSON();
|
||||
charactersLeftToRemove -= ((originalRawCallTokensLength - newRawCallTextTokensLength) * estimatedCharactersPerToken);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
function removeHistoryThatLedToModelResponseAtIndex(index) {
|
||||
let removedItems = 0;
|
||||
for (let i = index - 1; i >= 0; i--) {
|
||||
const historyItem = res[i];
|
||||
if (historyItem == null)
|
||||
continue;
|
||||
if (historyItem.type === "model")
|
||||
break; // stop removing history items if we reach another model response
|
||||
if (i === 0 && historyItem.type === "system")
|
||||
break; // keep the first system message
|
||||
if (historyItem.type === "user" || historyItem.type === "system") {
|
||||
const newText = truncateLlamaTextAndRoundToWords(LlamaText.fromJSON(historyItem.text), charactersLeftToRemove, undefined, false);
|
||||
const newTextString = newText.toString();
|
||||
const historyItemString = LlamaText.fromJSON(historyItem.text).toString();
|
||||
if (newText.values.length === 0) {
|
||||
res.splice(i, 1);
|
||||
i++;
|
||||
removedItems++;
|
||||
charactersLeftToRemove -= historyItemString.length;
|
||||
}
|
||||
else if (newTextString.length < historyItemString.length) {
|
||||
charactersLeftToRemove -= historyItemString.length - newTextString.length;
|
||||
if (historyItem.type === "user")
|
||||
historyItem.text = newText.toString();
|
||||
else
|
||||
historyItem.text = newText.toJSON();
|
||||
}
|
||||
}
|
||||
else {
|
||||
void historyItem;
|
||||
}
|
||||
}
|
||||
return removedItems;
|
||||
}
|
||||
function compressHistoryThatLedToModelResponseAtIndex(index, keepTokensCount = 0) {
|
||||
let removedItems = 0;
|
||||
let promptStartIndex = undefined;
|
||||
for (let i = index - 1; i >= 0; i--) {
|
||||
const historyItem = res[i];
|
||||
if (historyItem == null)
|
||||
continue;
|
||||
if (historyItem.type === "model") {
|
||||
promptStartIndex = i + 1;
|
||||
break;
|
||||
}
|
||||
if (i === 0 && historyItem.type === "system") {
|
||||
promptStartIndex = i + 1;
|
||||
break; // keep the first system message
|
||||
}
|
||||
}
|
||||
if (promptStartIndex == null || promptStartIndex >= index)
|
||||
return 0;
|
||||
for (let i = promptStartIndex; i < index && charactersLeftToRemove > 0; i++) {
|
||||
const historyItem = res[i];
|
||||
if (historyItem == null || historyItem.type !== "user")
|
||||
continue;
|
||||
let removeChars = Math.min(charactersLeftToRemove, historyItem.text.length);
|
||||
if (keepTokensCount > 0) {
|
||||
removeChars -= Math.floor(keepTokensCount * estimatedCharactersPerToken);
|
||||
if (removeChars < 0)
|
||||
removeChars = 0;
|
||||
keepTokensCount -= Math.min(keepTokensCount, Math.max(0, historyItem.text.length - removeChars) / estimatedCharactersPerToken);
|
||||
}
|
||||
const newText = truncateTextAndRoundToWords(historyItem.text, removeChars, undefined, false);
|
||||
if (newText.length === 0) {
|
||||
res.splice(i, 1);
|
||||
i--;
|
||||
index--;
|
||||
removedItems++;
|
||||
charactersLeftToRemove -= historyItem.text.length;
|
||||
}
|
||||
else {
|
||||
charactersLeftToRemove -= historyItem.text.length - newText.length;
|
||||
historyItem.text = newText;
|
||||
}
|
||||
}
|
||||
return removedItems;
|
||||
}
|
||||
function removeEmptySegmentsFromModelResponse(modelResponse) {
|
||||
const stack = [];
|
||||
for (let t = 0; t < modelResponse.length && charactersLeftToRemove > 0; t++) {
|
||||
const item = modelResponse[t];
|
||||
const isLastItem = t === modelResponse.length - 1;
|
||||
if (!isChatModelResponseSegment(item))
|
||||
continue;
|
||||
const type = item.segmentType;
|
||||
const topStack = stack.at(-1);
|
||||
if (topStack?.type === type) {
|
||||
if (item.ended && item.text === "" && topStack.canRemove) {
|
||||
modelResponse.splice(t, 1);
|
||||
t--;
|
||||
modelResponse.splice(topStack.startIndex, 1);
|
||||
t--;
|
||||
stack.pop();
|
||||
}
|
||||
else if (!item.ended && item.text === "" && !isLastItem) {
|
||||
modelResponse.splice(t, 1);
|
||||
t--;
|
||||
}
|
||||
else if (!item.ended && item.text !== "")
|
||||
topStack.canRemove = false;
|
||||
else if (item.ended)
|
||||
stack.pop();
|
||||
}
|
||||
else if (!item.ended)
|
||||
stack.push({
|
||||
type,
|
||||
startIndex: t,
|
||||
canRemove: item.text === ""
|
||||
});
|
||||
}
|
||||
}
|
||||
function compressFirstModelResponse() {
|
||||
for (let i = 0; i < res.length && charactersLeftToRemove > 0; i++) {
|
||||
const historyItem = res[i];
|
||||
const isLastHistoryItem = i === res.length - 1;
|
||||
if (historyItem.type !== "model")
|
||||
continue;
|
||||
for (let t = 0; t < historyItem.response.length && charactersLeftToRemove > 0; t++) {
|
||||
const item = historyItem.response[t];
|
||||
const isLastText = t === historyItem.response.length - 1;
|
||||
if (isLastHistoryItem && isLastText)
|
||||
continue;
|
||||
if (typeof item === "string") {
|
||||
const newText = truncateTextAndRoundToWords(item, charactersLeftToRemove, undefined, true);
|
||||
if (newText === "") {
|
||||
historyItem.response.splice(t, 1);
|
||||
t--;
|
||||
charactersLeftToRemove -= item.length;
|
||||
}
|
||||
else if (newText.length < item.length) {
|
||||
historyItem.response[t] = newText;
|
||||
charactersLeftToRemove -= item.length - newText.length;
|
||||
}
|
||||
}
|
||||
else if (isChatModelResponseFunctionCall(item)) {
|
||||
historyItem.response.splice(t, 1);
|
||||
t--;
|
||||
const functionCallAndResultTokenUsage = chatWrapper.generateFunctionCallsAndResults([item], true)
|
||||
.tokenize(tokenizer, "trimLeadingSpace").length;
|
||||
charactersLeftToRemove -= functionCallAndResultTokenUsage * estimatedCharactersPerToken;
|
||||
}
|
||||
else if (isChatModelResponseSegment(item)) {
|
||||
if (item.text !== "") {
|
||||
const newText = truncateTextAndRoundToWords(item.text, charactersLeftToRemove, undefined, true);
|
||||
if (newText === "" && item.ended) {
|
||||
const emptySegmentTokenUsage = chatWrapper.generateModelResponseText([{ ...item, text: "" }], true)
|
||||
.tokenize(tokenizer, "trimLeadingSpace").length;
|
||||
historyItem.response.splice(t, 1);
|
||||
t--;
|
||||
charactersLeftToRemove -= item.text.length + emptySegmentTokenUsage * estimatedCharactersPerToken;
|
||||
}
|
||||
else {
|
||||
charactersLeftToRemove -= item.text.length - newText.length;
|
||||
item.text = newText;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
void item;
|
||||
}
|
||||
removeEmptySegmentsFromModelResponse(historyItem.response);
|
||||
if (historyItem.response.length === 0) {
|
||||
// if the model response is removed from the history,
|
||||
// the things that led to it are not important anymore
|
||||
i -= removeHistoryThatLedToModelResponseAtIndex(i);
|
||||
res.splice(i, 1);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
}
|
||||
function compressLastModelResponse(minCharactersToKeep = 60) {
|
||||
const lastHistoryItem = res[res.length - 1];
|
||||
if (lastHistoryItem == null || lastHistoryItem.type !== "model")
|
||||
return;
|
||||
const lastResponseItem = lastHistoryItem.response[lastHistoryItem.response.length - 1];
|
||||
if (lastResponseItem == null || typeof lastResponseItem !== "string")
|
||||
return;
|
||||
compressHistoryThatLedToModelResponseAtIndex(res.length - 1, maxTokensCount / 4);
|
||||
if (charactersLeftToRemove <= 0)
|
||||
return;
|
||||
const nextTextLength = Math.max(Math.min(lastResponseItem.length, minCharactersToKeep), lastResponseItem.length - charactersLeftToRemove);
|
||||
const charactersToRemoveFromText = lastResponseItem.length - nextTextLength;
|
||||
const newText = truncateTextAndRoundToWords(lastResponseItem, charactersToRemoveFromText, undefined, true);
|
||||
if (newText.length < lastResponseItem.length) {
|
||||
lastHistoryItem.response[lastHistoryItem.response.length - 1] = newText;
|
||||
charactersLeftToRemove -= lastResponseItem.length - newText.length;
|
||||
}
|
||||
if (charactersLeftToRemove <= 0)
|
||||
return;
|
||||
compressHistoryThatLedToModelResponseAtIndex(res.length - 1);
|
||||
}
|
||||
compressFunctionCalls();
|
||||
if (charactersLeftToRemove <= 0)
|
||||
return res;
|
||||
compressFirstModelResponse();
|
||||
if (charactersLeftToRemove <= 0)
|
||||
return res;
|
||||
compressLastModelResponse();
|
||||
return res;
|
||||
}
|
||||
});
|
||||
const newMetadata = {
|
||||
removedCharactersNumber: removedCharactersCount
|
||||
};
|
||||
return {
|
||||
chatHistory: compressedChatHistory,
|
||||
metadata: newMetadata
|
||||
};
|
||||
}
|
||||
function isCalculationMetadata(metadata) {
|
||||
return metadata != null && typeof metadata === "object" && typeof metadata.removedCharactersNumber === "number";
|
||||
}
|
||||
//# sourceMappingURL=eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js.map
|
||||
File diff suppressed because one or more lines are too long
433
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts
generated
vendored
Normal file
433
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts
generated
vendored
Normal file
@@ -0,0 +1,433 @@
|
||||
import { EventRelay } from "lifecycle-utils";
|
||||
import { ChatWrapper } from "../../ChatWrapper.js";
|
||||
import { ChatHistoryItem, ChatModelFunctionCall, ChatSessionModelFunctions, Token } from "../../types.js";
|
||||
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
|
||||
import { LlamaGrammar } from "../LlamaGrammar.js";
|
||||
import { LLamaChatContextShiftOptions, LlamaChatResponseChunk, LlamaChatResponseFunctionCallParamsChunk } from "../LlamaChat/LlamaChat.js";
|
||||
import { EvaluationPriority } from "../LlamaContext/types.js";
|
||||
import { TokenBias } from "../TokenBias.js";
|
||||
import { LlamaText } from "../../utils/LlamaText.js";
|
||||
import { LLamaChatPromptCompletionEngineOptions, LlamaChatSessionPromptCompletionEngine } from "./utils/LlamaChatSessionPromptCompletionEngine.js";
|
||||
export type LlamaChatSessionOptions = {
|
||||
contextSequence: LlamaContextSequence;
|
||||
/** `"auto"` is used by default */
|
||||
chatWrapper?: "auto" | ChatWrapper;
|
||||
systemPrompt?: string;
|
||||
/**
|
||||
* Add the system prompt even on models that don't support a system prompt.
|
||||
*
|
||||
* Each chat wrapper has its own workaround for adding a system prompt to a model that doesn't support it,
|
||||
* but forcing the system prompt on unsupported models may not always work as expected.
|
||||
*
|
||||
* Use with caution.
|
||||
*/
|
||||
forceAddSystemPrompt?: boolean;
|
||||
/**
|
||||
* Automatically dispose the sequence when the session is disposed.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
autoDisposeSequence?: boolean;
|
||||
contextShift?: LlamaChatSessionContextShiftOptions;
|
||||
};
|
||||
export type LlamaChatSessionContextShiftOptions = {
|
||||
/**
|
||||
* The number of tokens to delete from the context window to make space for new ones.
|
||||
* Defaults to 10% of the context size.
|
||||
*/
|
||||
size?: LLamaChatContextShiftOptions["size"];
|
||||
/**
|
||||
* The strategy to use when deleting tokens from the context window.
|
||||
*
|
||||
* Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
|
||||
*/
|
||||
strategy?: LLamaChatContextShiftOptions["strategy"];
|
||||
};
|
||||
export type LLamaChatPromptOptions<Functions extends ChatSessionModelFunctions | undefined = ChatSessionModelFunctions | undefined> = {
|
||||
/**
|
||||
* Called as the model generates the main response with the generated text chunk.
|
||||
*
|
||||
* Useful for streaming the generated response as it's being generated.
|
||||
*
|
||||
* Includes only the main response without any text segments (like thoughts).
|
||||
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
|
||||
*/
|
||||
onTextChunk?: (text: string) => void;
|
||||
/**
|
||||
* Called as the model generates the main response with the generated tokens.
|
||||
*
|
||||
* Preferably, you'd want to use {@link onTextChunk `onTextChunk`} instead of this.
|
||||
*
|
||||
* Includes only the main response without any segments (like thoughts).
|
||||
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
|
||||
*/
|
||||
onToken?: (tokens: Token[]) => void;
|
||||
/**
|
||||
* Called as the model generates a response with the generated text and tokens,
|
||||
* including segment information (when the generated output is part of a segment).
|
||||
*
|
||||
* Useful for streaming the generated response as it's being generated, including the main response and all segments.
|
||||
*
|
||||
* Only use this function when you need the segmented texts, like thought segments (chain of thought text).
|
||||
*/
|
||||
onResponseChunk?: (chunk: LlamaChatResponseChunk) => void;
|
||||
/**
|
||||
* An AbortSignal to later abort the generation.
|
||||
*
|
||||
* When the signal is aborted, the generation will stop and throw `signal.reason` as the error.
|
||||
*
|
||||
* > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`.
|
||||
*/
|
||||
signal?: AbortSignal;
|
||||
/**
|
||||
* When a response already started being generated and then the signal is aborted,
|
||||
* the generation will stop and the response will be returned as is instead of throwing an error.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
stopOnAbortSignal?: boolean;
|
||||
/** Maximum number of tokens to generate */
|
||||
maxTokens?: number;
|
||||
/**
|
||||
* Temperature is a hyperparameter that controls the randomness of the generated text.
|
||||
* It affects the probability distribution of the model's output tokens.
|
||||
*
|
||||
* A higher temperature (e.g., 1.5) makes the output more random and creative,
|
||||
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
|
||||
*
|
||||
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
|
||||
*
|
||||
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
|
||||
*
|
||||
* Set to `0` to disable.
|
||||
* Disabled by default (set to `0`).
|
||||
*/
|
||||
temperature?: number;
|
||||
/**
|
||||
* From the next token candidates, discard the percentage of tokens with the lowest probability.
|
||||
* For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
|
||||
* This is useful for generating more high-quality results when using a high temperature.
|
||||
* Set to a value between `0` and `1` to enable.
|
||||
*
|
||||
* Only relevant when `temperature` is set to a value greater than `0`.
|
||||
* Disabled by default.
|
||||
*/
|
||||
minP?: number;
|
||||
/**
|
||||
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
|
||||
* An integer number between `1` and the size of the vocabulary.
|
||||
* Set to `0` to disable (which uses the full vocabulary).
|
||||
*
|
||||
* Only relevant when `temperature` is set to a value greater than 0.
|
||||
*/
|
||||
topK?: number;
|
||||
/**
|
||||
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
|
||||
* and samples the next token only from this set.
|
||||
* A float number between `0` and `1`.
|
||||
* Set to `1` to disable.
|
||||
*
|
||||
* Only relevant when `temperature` is set to a value greater than `0`.
|
||||
*/
|
||||
topP?: number;
|
||||
/**
|
||||
* Used to control the randomness of the generated text.
|
||||
*
|
||||
* Change the seed to get different results.
|
||||
*
|
||||
* Only relevant when using `temperature`.
|
||||
*/
|
||||
seed?: number;
|
||||
/**
|
||||
* Trim whitespace from the end of the generated text
|
||||
* Disabled by default.
|
||||
*/
|
||||
trimWhitespaceSuffix?: boolean;
|
||||
/**
|
||||
* Force a given text prefix to be the start of the model response, to make the model follow a certain direction.
|
||||
*
|
||||
* May cause some models to not use the given functions in some scenarios where they would have been used otherwise,
|
||||
* so avoid using it together with function calling if you notice unexpected behavior.
|
||||
*/
|
||||
responsePrefix?: string;
|
||||
/**
|
||||
* See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
|
||||
*/
|
||||
evaluationPriority?: EvaluationPriority;
|
||||
repeatPenalty?: false | LlamaChatSessionRepeatPenalty;
|
||||
/**
|
||||
* Adjust the probability of tokens being generated.
|
||||
* Can be used to bias the model to generate tokens that you want it to lean towards,
|
||||
* or to avoid generating tokens that you want it to avoid.
|
||||
*/
|
||||
tokenBias?: TokenBias | (() => TokenBias);
|
||||
/**
|
||||
* Custom stop triggers to stop the generation of the response when any of the provided triggers are found.
|
||||
*/
|
||||
customStopTriggers?: (LlamaText | string | (string | Token)[])[];
|
||||
/**
|
||||
* Called as the model generates function calls with the generated parameters chunk for each function call.
|
||||
*
|
||||
* Useful for streaming the generated function call parameters as they're being generated.
|
||||
* Only useful in specific use cases,
|
||||
* such as showing the generated textual file content as it's being generated (note that doing this requires parsing incomplete JSON).
|
||||
*
|
||||
* The constructed text from all the params chunks of a given function call can be parsed as a JSON object,
|
||||
* according to the function parameters schema.
|
||||
*
|
||||
* Each function call has its own `callIndex` you can use to distinguish between them.
|
||||
*
|
||||
* Only relevant when using function calling (via passing the `functions` option).
|
||||
*/
|
||||
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
|
||||
/**
|
||||
* Set the maximum number of tokens that the model is allowed to spend on various segmented responses.
|
||||
*/
|
||||
budgets?: {
|
||||
/**
|
||||
* Budget for thought tokens.
|
||||
*
|
||||
* Defaults to `Infinity`.
|
||||
*/
|
||||
thoughtTokens?: number;
|
||||
/**
|
||||
* Budget for comment tokens.
|
||||
*
|
||||
* Defaults to `Infinity`.
|
||||
*/
|
||||
commentTokens?: number;
|
||||
};
|
||||
} & ({
|
||||
grammar?: LlamaGrammar;
|
||||
functions?: never;
|
||||
documentFunctionParams?: never;
|
||||
maxParallelFunctionCalls?: never;
|
||||
onFunctionCallParamsChunk?: never;
|
||||
} | {
|
||||
grammar?: never;
|
||||
functions?: Functions | ChatSessionModelFunctions;
|
||||
documentFunctionParams?: boolean;
|
||||
maxParallelFunctionCalls?: number;
|
||||
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void;
|
||||
});
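/**
* An illustrative sketch of passing the sampling options declared above to `prompt()`,
* assuming an existing `session` (a `LlamaChatSession`); the prompt text and concrete values are examples only:
* ```ts
* const answer = await session.prompt("Summarize this repository in one sentence.", {
*     temperature: 0.8, // enable sampling; `0` keeps generation deterministic
*     minP: 0.05,       // discard the 5% least likely candidate tokens
*     topK: 40,
*     topP: 0.9,
*     seed: 1234,       // only relevant when `temperature` is greater than `0`
*     maxTokens: 256
* });
* ```
*/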
|
||||
export type LLamaChatCompletePromptOptions = {
|
||||
/**
|
||||
* Generate a completion for the given user prompt up to the given number of tokens.
|
||||
*
|
||||
* Defaults to `256` or half the context size, whichever is smaller.
|
||||
*/
|
||||
maxTokens?: LLamaChatPromptOptions["maxTokens"];
|
||||
/**
|
||||
* When a completion already started being generated and then the given `signal` is aborted,
|
||||
* the generation will stop and the completion will be returned as-is instead of throwing an error.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
stopOnAbortSignal?: LLamaChatPromptOptions["stopOnAbortSignal"];
|
||||
/**
|
||||
* Called as the model generates a completion with the generated text chunk.
|
||||
*
|
||||
* Useful for streaming the generated completion as it's being generated.
|
||||
*/
|
||||
onTextChunk?: LLamaChatPromptOptions["onTextChunk"];
|
||||
/**
|
||||
* Called as the model generates a completion with the generated tokens.
|
||||
*
|
||||
* Preferably, you'd want to use `onTextChunk` instead of this.
|
||||
*/
|
||||
onToken?: LLamaChatPromptOptions["onToken"];
|
||||
signal?: LLamaChatPromptOptions["signal"];
|
||||
temperature?: LLamaChatPromptOptions["temperature"];
|
||||
minP?: LLamaChatPromptOptions["minP"];
|
||||
topK?: LLamaChatPromptOptions["topK"];
|
||||
topP?: LLamaChatPromptOptions["topP"];
|
||||
seed?: LLamaChatPromptOptions["seed"];
|
||||
trimWhitespaceSuffix?: LLamaChatPromptOptions["trimWhitespaceSuffix"];
|
||||
evaluationPriority?: LLamaChatPromptOptions["evaluationPriority"];
|
||||
repeatPenalty?: LLamaChatPromptOptions["repeatPenalty"];
|
||||
tokenBias?: LLamaChatPromptOptions["tokenBias"];
|
||||
customStopTriggers?: LLamaChatPromptOptions["customStopTriggers"];
|
||||
grammar?: LlamaGrammar;
|
||||
/**
|
||||
* Functions are not used by the model here,
|
||||
* but are used for keeping the instructions given to the model about the functions in the current context state,
|
||||
* to avoid context shifts.
|
||||
*
|
||||
* It's best to provide the same functions that were used for the previous prompt here.
|
||||
*/
|
||||
functions?: ChatSessionModelFunctions;
|
||||
/**
|
||||
* Functions are not used by the model here,
|
||||
* but are used for keeping the instructions given to the model about the functions in the current context state,
|
||||
* to avoid context shifts.
|
||||
*
|
||||
* It's best to provide the same value that was used for the previous prompt here.
|
||||
*/
|
||||
documentFunctionParams?: boolean;
|
||||
/**
|
||||
* Whether to complete the prompt as a model response.
|
||||
*
|
||||
* - **`"auto"`**: Automatically determine whether to complete as a model response based on the model used.
|
||||
* This is a good option for working around models that don't support user prompt completions.
|
||||
* - **`true`**: Always complete as a model response
|
||||
* - **`false`**: Never complete as a model response
|
||||
*
|
||||
* Defaults to `"auto"`.
|
||||
*/
|
||||
completeAsModel?: "auto" | boolean | {
|
||||
/**
|
||||
* Whether to complete the prompt as a model response.
|
||||
*
|
||||
* - **`"auto"`**: Automatically determine whether to complete as a model response based on the model used.
|
||||
* This is a good option for working around models that don't support user prompt completions.
|
||||
* - **`true`**: Always complete as a model response
|
||||
* - **`false`**: Never complete as a model response
|
||||
*
|
||||
* Defaults to `"auto"`.
|
||||
*/
|
||||
enabled?: "auto" | boolean;
|
||||
/**
|
||||
* The messages to append to the chat history to generate a completion as a model response.
|
||||
*
|
||||
* If the last message is a model message, the prompt will be pushed to it for the completion,
|
||||
* otherwise a new model message will be added with the prompt.
|
||||
*
|
||||
* It must contain a user message or a system message before the model message.
|
||||
*
|
||||
* Defaults to:
|
||||
* ```ts
|
||||
* [
|
||||
* {
|
||||
* type: "system",
|
||||
* text: "For your next response predict what the user may send next. " +
|
||||
* "No yapping, no whitespace. Match the user's language and tone."
|
||||
* },
|
||||
* {type: "user", text: ""},
|
||||
* {type: "model", response: [""]}
|
||||
* ]
|
||||
* ```
|
||||
*/
|
||||
appendedMessages?: ChatHistoryItem[];
|
||||
};
|
||||
};
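/**
* An illustrative sketch of using these options with `completePrompt()`, assuming an existing
* `session`; the prompt text and values are examples only:
* ```ts
* const completion = await session.completePrompt("The three most common HTTP methods are", {
*     maxTokens: 64,
*     // force completing as a model response instead of as a user message;
*     // `"auto"` (the default) decides based on the model used
*     completeAsModel: true
* });
* ```
*/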
|
||||
export type LLamaChatPreloadPromptOptions = {
|
||||
signal?: LLamaChatCompletePromptOptions["signal"];
|
||||
evaluationPriority?: LLamaChatCompletePromptOptions["evaluationPriority"];
|
||||
functions?: LLamaChatCompletePromptOptions["functions"];
|
||||
documentFunctionParams?: LLamaChatCompletePromptOptions["documentFunctionParams"];
|
||||
};
|
||||
export type LlamaChatSessionRepeatPenalty = {
|
||||
/**
|
||||
* The number of recent tokens generated by the model to apply repetition penalties to.
|
||||
* Defaults to `64`.
|
||||
*/
|
||||
lastTokens?: number;
|
||||
punishTokensFilter?: (tokens: Token[]) => Token[];
|
||||
/**
|
||||
* Penalize new line tokens.
|
||||
* Enabled by default.
|
||||
*/
|
||||
penalizeNewLine?: boolean;
|
||||
/**
|
||||
* The relative amount to lower the probability of the tokens in `punishTokens` by
|
||||
* Defaults to `1.1`.
|
||||
* Set to `1` to disable.
|
||||
*/
|
||||
penalty?: number;
|
||||
/**
|
||||
* For each time (n) a token appears in the `punishTokens` array, lower its probability by `n * frequencyPenalty`
|
||||
* Disabled by default (`0`).
|
||||
* Set to a value between `0` and `1` to enable.
|
||||
*/
|
||||
frequencyPenalty?: number;
|
||||
/**
|
||||
* Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`
|
||||
* Disabled by default (`0`).
|
||||
* Set to a value between `0` and `1` to enable.
|
||||
*/
|
||||
presencePenalty?: number;
|
||||
};
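/**
* An illustrative sketch of configuring the repeat penalty on a prompt call, assuming an
* existing `session`; the values shown are examples, not recommended defaults:
* ```ts
* const answer = await session.prompt("Write a short haiku about the sea.", {
*     repeatPenalty: {
*         lastTokens: 64,        // penalize repetition of the last 64 generated tokens
*         penalty: 1.1,          // relative probability reduction; `1` disables it
*         frequencyPenalty: 0.2, // scales with how often a token was repeated
*         presencePenalty: 0.2,  // flat reduction for any repeated token
*         penalizeNewLine: false // don't penalize new line tokens
*     }
* });
* ```
*/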
|
||||
/**
|
||||
* @see [Using `LlamaChatSession`](https://node-llama-cpp.withcat.ai/guide/chat-session) tutorial
|
||||
*/
|
||||
export declare class LlamaChatSession {
|
||||
readonly onDispose: EventRelay<void>;
|
||||
constructor(options: LlamaChatSessionOptions);
|
||||
dispose({ disposeSequence }?: {
|
||||
disposeSequence?: boolean;
|
||||
}): void;
|
||||
/** @hidden */
|
||||
[Symbol.dispose](): void;
|
||||
get disposed(): boolean;
|
||||
get chatWrapper(): ChatWrapper;
|
||||
get sequence(): LlamaContextSequence;
|
||||
get context(): import("../LlamaContext/LlamaContext.js").LlamaContext;
|
||||
get model(): import("../LlamaModel/LlamaModel.js").LlamaModel;
|
||||
prompt<const Functions extends ChatSessionModelFunctions | undefined = undefined>(prompt: string, options?: LLamaChatPromptOptions<Functions>): Promise<string>;
|
||||
/**
|
||||
* @param prompt
|
||||
* @param [options]
|
||||
*/
|
||||
promptWithMeta<const Functions extends ChatSessionModelFunctions | undefined = undefined>(prompt: string, { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority }?: LLamaChatPromptOptions<Functions>): Promise<{
|
||||
response: (string | ChatModelFunctionCall | import("../../types.js").ChatModelSegment)[];
|
||||
responseText: string;
|
||||
stopReason: "customStopTrigger";
|
||||
customStopTrigger: (string | Token)[];
|
||||
remainingGenerationAfterStop: string | Token[] | undefined;
|
||||
} | {
|
||||
response: (string | ChatModelFunctionCall | import("../../types.js").ChatModelSegment)[];
|
||||
responseText: string;
|
||||
stopReason: "abort" | "maxTokens" | "eogToken" | "stopGenerationTrigger" | "functionCalls";
|
||||
remainingGenerationAfterStop: string | Token[] | undefined;
|
||||
customStopTrigger?: undefined;
|
||||
}>;
|
||||
/**
|
||||
* Preload a user prompt into the current context sequence state to make later inference of the model response begin sooner
|
||||
* and feel faster.
|
||||
*
|
||||
* > **Note:** Preloading a long user prompt can incur context shifts, so consider limiting the length of prompts you preload
|
||||
* @param prompt - the prompt to preload
|
||||
* @param [options]
|
||||
*/
|
||||
preloadPrompt(prompt: string, options?: LLamaChatPreloadPromptOptions): Promise<void>;
|
||||
/**
|
||||
* Preload a user prompt into the current context sequence state and generate a completion for it.
|
||||
*
|
||||
* > **Note:** Preloading a long user prompt and completing a user prompt with a high number of `maxTokens` can incur context shifts,
|
||||
* > so consider limiting the length of prompts you preload.
|
||||
* >
|
||||
* > Also, it's recommended to limit the number of tokens generated to a reasonable amount by configuring `maxTokens`.
|
||||
* @param prompt - the prompt to preload
|
||||
* @param [options]
|
||||
*/
|
||||
completePrompt(prompt: string, options?: LLamaChatCompletePromptOptions): Promise<string>;
|
||||
/**
|
||||
* Create a smart completion engine that caches the prompt completions
|
||||
* and reuses them when the user prompt matches the beginning of the cached prompt or completion.
|
||||
*
|
||||
* All completions are generated and cached only for the current chat session state.
|
||||
* You can create a single completion engine for an entire chat session.
|
||||
*/
|
||||
createPromptCompletionEngine(options?: LLamaChatPromptCompletionEngineOptions): LlamaChatSessionPromptCompletionEngine;
|
||||
/**
|
||||
* See `completePrompt` for more information.
|
||||
* @param prompt
|
||||
* @param [options]
|
||||
*/
|
||||
completePromptWithMeta(prompt: string, { maxTokens, stopOnAbortSignal, functions, documentFunctionParams, onTextChunk, onToken, signal, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority, completeAsModel }?: LLamaChatCompletePromptOptions): Promise<{
|
||||
completion: string;
|
||||
stopReason: "customStopTrigger";
|
||||
customStopTrigger: (string | Token)[];
|
||||
remainingGenerationAfterStop: string | Token[] | undefined;
|
||||
} | {
|
||||
completion: string;
|
||||
stopReason: "abort" | "maxTokens" | "eogToken" | "stopGenerationTrigger" | "functionCalls";
|
||||
remainingGenerationAfterStop: string | Token[] | undefined;
|
||||
customStopTrigger?: undefined;
|
||||
}>;
|
||||
getChatHistory(): ChatHistoryItem[];
|
||||
getLastEvaluationContextWindow(): ChatHistoryItem[] | null;
|
||||
setChatHistory(chatHistory: ChatHistoryItem[]): void;
|
||||
/** Clear the chat history and reset it to the initial state. */
|
||||
resetChatHistory(): void;
|
||||
}
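/**
* An end-to-end usage sketch of `LlamaChatSession`, assuming `node-llama-cpp` is installed
* and a local GGUF model exists at the (hypothetical) path below:
* ```ts
* import {getLlama, LlamaChatSession} from "node-llama-cpp";
*
* const llama = await getLlama();
* const model = await llama.loadModel({modelPath: "models/my-model.gguf"});
* const context = await model.createContext();
* const session = new LlamaChatSession({contextSequence: context.getSequence()});
*
* // optionally warm up the context with the user's prompt before generating
* await session.preloadPrompt("What is the capital of France?");
*
* const answer = await session.prompt("What is the capital of France?");
* console.log(answer);
*
* // the accumulated history can be saved and restored later
* const history = session.getChatHistory();
* session.setChatHistory(history);
* ```
*/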
|
||||
622
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/LlamaChatSession.js
generated
vendored
Normal file
622
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/LlamaChatSession.js
generated
vendored
Normal file
@@ -0,0 +1,622 @@
|
||||
import { DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
|
||||
import { appendUserMessageToChatHistory } from "../../utils/appendUserMessageToChatHistory.js";
|
||||
import { LlamaChat } from "../LlamaChat/LlamaChat.js";
|
||||
import { wrapAbortSignal } from "../../utils/wrapAbortSignal.js";
|
||||
import { safeEventCallback } from "../../utils/safeEventCallback.js";
|
||||
import { GgufArchitectureType } from "../../gguf/types/GgufMetadataTypes.js";
|
||||
import { LlamaChatSessionPromptCompletionEngine } from "./utils/LlamaChatSessionPromptCompletionEngine.js";
|
||||
const defaultCompleteAsModel = {
|
||||
enabled: "auto",
|
||||
appendedMessages: [
|
||||
{
|
||||
type: "system",
|
||||
text: "For your next response predict what the user may send next. No yapping, no whitespace. Match the user's language and tone."
|
||||
},
|
||||
{ type: "user", text: "" },
|
||||
{ type: "model", response: [""] }
|
||||
]
|
||||
};
|
||||
/**
|
||||
* @see [Using `LlamaChatSession`](https://node-llama-cpp.withcat.ai/guide/chat-session) tutorial
|
||||
*/
|
||||
export class LlamaChatSession {
|
||||
/** @internal */ _disposeAggregator = new DisposeAggregator();
|
||||
/** @internal */ _autoDisposeSequence;
|
||||
/** @internal */ _contextShift;
|
||||
/** @internal */ _forceAddSystemPrompt;
|
||||
/** @internal */ _systemPrompt;
|
||||
/** @internal */ _chatLock = {};
|
||||
/** @internal */ _chatHistory;
|
||||
/** @internal */ _lastEvaluation;
|
||||
/** @internal */ _canUseContextWindowForCompletion = true;
|
||||
/** @internal */ _chat;
|
||||
/** @internal */ _chatHistoryStateRef = {};
|
||||
/** @internal */ _preloadAndCompleteAbortControllers = new Set();
|
||||
onDispose = new EventRelay();
|
||||
constructor(options) {
|
||||
const { contextSequence, chatWrapper = "auto", systemPrompt, forceAddSystemPrompt = false, autoDisposeSequence = false, contextShift } = options;
|
||||
if (contextSequence == null)
|
||||
throw new Error("contextSequence cannot be null");
|
||||
if (contextSequence.disposed)
|
||||
throw new DisposedError();
|
||||
this._contextShift = contextShift;
|
||||
this._forceAddSystemPrompt = forceAddSystemPrompt;
|
||||
this._systemPrompt = systemPrompt;
|
||||
this._chat = new LlamaChat({
|
||||
autoDisposeSequence,
|
||||
chatWrapper,
|
||||
contextSequence
|
||||
});
|
||||
const chatWrapperSupportsSystemMessages = this._chat.chatWrapper.settings.supportsSystemMessages;
|
||||
if (chatWrapperSupportsSystemMessages == null || chatWrapperSupportsSystemMessages || this._forceAddSystemPrompt)
|
||||
this._chatHistory = this._chat.chatWrapper.generateInitialChatHistory({ systemPrompt: this._systemPrompt });
|
||||
else
|
||||
this._chatHistory = [];
|
||||
this._autoDisposeSequence = autoDisposeSequence;
|
||||
this._disposeAggregator.add(this._chat.onDispose.createListener(() => {
|
||||
this.dispose();
|
||||
}));
|
||||
this._disposeAggregator.add(this.onDispose.dispatchEvent);
|
||||
}
|
||||
dispose({ disposeSequence = this._autoDisposeSequence } = {}) {
|
||||
if (this._chat == null)
|
||||
return;
|
||||
this._chat.dispose({ disposeSequence });
|
||||
this._chat = null;
|
||||
this._disposeAggregator.dispose();
|
||||
}
|
||||
/** @hidden */
|
||||
[Symbol.dispose]() {
|
||||
return this.dispose();
|
||||
}
|
||||
get disposed() {
|
||||
return this._chat == null || this._chat.disposed;
|
||||
}
|
||||
get chatWrapper() {
|
||||
if (this._chat == null)
|
||||
throw new DisposedError();
|
||||
return this._chat.chatWrapper;
|
||||
}
|
||||
get sequence() {
|
||||
if (this._chat == null)
|
||||
throw new DisposedError();
|
||||
return this._chat.sequence;
|
||||
}
|
||||
get context() {
|
||||
return this.sequence.context;
|
||||
}
|
||||
get model() {
|
||||
return this.sequence.model;
|
||||
}
|
||||
async prompt(prompt, options = {}) {
|
||||
const { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = false, responsePrefix, repeatPenalty, tokenBias, customStopTriggers } = options;
|
||||
const { responseText } = await this.promptWithMeta(prompt, {
|
||||
// this is a workaround to allow passing both `functions` and `grammar`
|
||||
functions: functions,
|
||||
grammar: grammar,
|
||||
documentFunctionParams: documentFunctionParams,
|
||||
maxParallelFunctionCalls: maxParallelFunctionCalls,
|
||||
onFunctionCallParamsChunk: onFunctionCallParamsChunk,
|
||||
onTextChunk, onToken, onResponseChunk, budgets, signal, stopOnAbortSignal, maxTokens,
|
||||
temperature, minP, topK, topP, seed,
|
||||
trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers
|
||||
});
|
||||
return responseText;
|
||||
}
|
||||
/**
|
||||
* @param prompt
|
||||
* @param [options]
|
||||
*/
|
||||
async promptWithMeta(prompt, { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, onFunctionCallParamsChunk, budgets, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = false, responsePrefix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority } = {}) {
|
||||
this._ensureNotDisposed();
|
||||
if (grammar != null && grammar._llama !== this.model._llama)
|
||||
throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
|
||||
this._stopAllPreloadAndPromptCompletions();
|
||||
return await withLock([this._chatLock, "evaluation"], signal, async () => {
|
||||
this._ensureNotDisposed();
|
||||
this._stopAllPreloadAndPromptCompletions();
|
||||
if (this._chat == null)
|
||||
throw new DisposedError();
|
||||
const supportsParallelFunctionCalling = this._chat.chatWrapper.settings.functions.parallelism != null;
|
||||
const [abortController, disposeAbortController] = wrapAbortSignal(signal);
|
||||
let lastEvaluation = this._canUseContextWindowForCompletion
|
||||
? this._lastEvaluation
|
||||
: undefined;
|
||||
let newChatHistory = appendUserMessageToChatHistory(this._chatHistory, prompt);
|
||||
let newContextWindowChatHistory = lastEvaluation?.contextWindow == null
|
||||
? undefined
|
||||
: appendUserMessageToChatHistory(lastEvaluation?.contextWindow, prompt);
|
||||
let previousFunctionCalls = 0;
|
||||
const resolvedResponsePrefix = (responsePrefix != null && responsePrefix !== "")
|
||||
? responsePrefix
|
||||
: undefined;
|
||||
newChatHistory.push({
|
||||
type: "model",
|
||||
response: resolvedResponsePrefix != null
|
||||
? [resolvedResponsePrefix]
|
||||
: []
|
||||
});
|
||||
if (newContextWindowChatHistory != null)
|
||||
newContextWindowChatHistory.push({
|
||||
type: "model",
|
||||
response: resolvedResponsePrefix != null
|
||||
? [resolvedResponsePrefix]
|
||||
: []
|
||||
});
|
||||
if (resolvedResponsePrefix != null) {
|
||||
safeEventCallback(onToken)?.(this.model.tokenize(resolvedResponsePrefix));
|
||||
safeEventCallback(onTextChunk)?.(resolvedResponsePrefix);
|
||||
safeEventCallback(onResponseChunk)?.({
|
||||
type: undefined,
|
||||
segmentType: undefined,
|
||||
text: resolvedResponsePrefix,
|
||||
tokens: this.model.tokenize(resolvedResponsePrefix)
|
||||
});
|
||||
}
|
||||
try {
|
||||
while (true) {
|
||||
const functionCallsAndResults = [];
|
||||
let canThrowFunctionCallingErrors = false;
|
||||
let abortedOnFunctionCallError = false;
|
||||
const initialOutputTokens = this._chat.sequence.tokenMeter.usedOutputTokens;
|
||||
const { lastEvaluation: currentLastEvaluation, metadata } = await this._chat.generateResponse(newChatHistory, {
|
||||
functions,
|
||||
documentFunctionParams,
|
||||
maxParallelFunctionCalls,
|
||||
grammar: grammar, // this is a workaround to allow passing both `functions` and `grammar`
|
||||
onTextChunk: safeEventCallback(onTextChunk),
|
||||
onToken: safeEventCallback(onToken),
|
||||
onResponseChunk: safeEventCallback(onResponseChunk),
|
||||
onFunctionCallParamsChunk: onFunctionCallParamsChunk == null
|
||||
? undefined
|
||||
: safeEventCallback((chunk) => onFunctionCallParamsChunk?.({
|
||||
callIndex: previousFunctionCalls + chunk.callIndex,
|
||||
functionName: chunk.functionName,
|
||||
paramsChunk: chunk.paramsChunk,
|
||||
done: chunk.done
|
||||
})),
|
||||
budgets: {
|
||||
includeCurrentResponse: true,
|
||||
thoughtTokens: budgets?.thoughtTokens,
|
||||
commentTokens: budgets?.commentTokens
|
||||
},
|
||||
signal: abortController.signal,
|
||||
stopOnAbortSignal,
|
||||
repeatPenalty,
|
||||
minP,
|
||||
topK,
|
||||
topP,
|
||||
seed,
|
||||
tokenBias,
|
||||
customStopTriggers,
|
||||
maxTokens,
|
||||
temperature,
|
||||
trimWhitespaceSuffix,
|
||||
contextShift: {
|
||||
...this._contextShift,
|
||||
lastEvaluationMetadata: lastEvaluation?.contextShiftMetadata
|
||||
},
|
||||
evaluationPriority,
|
||||
lastEvaluationContextWindow: {
|
||||
history: newContextWindowChatHistory,
|
||||
minimumOverlapPercentageToPreventContextShift: 0.5
|
||||
},
|
||||
onFunctionCall: async (functionCall) => {
|
||||
functionCallsAndResults.push((async () => {
|
||||
try {
|
||||
const functionDefinition = functions?.[functionCall.functionName];
|
||||
if (functionDefinition == null)
|
||||
throw new Error(`The model tried to call function "${functionCall.functionName}" which is not defined`);
|
||||
const functionCallResult = await functionDefinition.handler(functionCall.params);
|
||||
return {
|
||||
functionCall,
|
||||
functionDefinition,
|
||||
functionCallResult
|
||||
};
|
||||
}
|
||||
catch (err) {
|
||||
if (!abortController.signal.aborted) {
|
||||
abortedOnFunctionCallError = true;
|
||||
abortController.abort(err);
|
||||
}
|
||||
if (canThrowFunctionCallingErrors)
|
||||
throw err;
|
||||
return null;
|
||||
}
|
||||
})());
|
||||
}
|
||||
});
|
||||
this._ensureNotDisposed();
|
||||
if (abortController.signal.aborted && (abortedOnFunctionCallError || !stopOnAbortSignal))
|
||||
throw abortController.signal.reason;
|
||||
if (maxTokens != null)
|
||||
maxTokens = Math.max(0, maxTokens - (this._chat.sequence.tokenMeter.usedOutputTokens - initialOutputTokens));
|
||||
lastEvaluation = currentLastEvaluation;
|
||||
newChatHistory = lastEvaluation.cleanHistory;
|
||||
if (functionCallsAndResults.length > 0) {
|
||||
canThrowFunctionCallingErrors = true;
|
||||
const functionCallResultsPromise = Promise.all(functionCallsAndResults);
|
||||
const raceEventAbortController = new AbortController();
|
||||
await Promise.race([
|
||||
functionCallResultsPromise,
|
||||
new Promise((accept, reject) => {
|
||||
abortController.signal.addEventListener("abort", () => {
|
||||
if (abortedOnFunctionCallError || !stopOnAbortSignal)
|
||||
reject(abortController.signal.reason);
|
||||
else
|
||||
accept();
|
||||
}, { signal: raceEventAbortController.signal });
|
||||
if (abortController.signal.aborted) {
|
||||
if (abortedOnFunctionCallError || !stopOnAbortSignal)
|
||||
reject(abortController.signal.reason);
|
||||
else
|
||||
accept();
|
||||
}
|
||||
})
|
||||
]);
|
||||
raceEventAbortController.abort();
|
||||
this._ensureNotDisposed();
|
||||
if (!abortController.signal.aborted) {
|
||||
const functionCallResults = (await functionCallResultsPromise)
|
||||
.filter((result) => result != null);
|
||||
this._ensureNotDisposed();
|
||||
if (abortController.signal.aborted && (abortedOnFunctionCallError || !stopOnAbortSignal))
|
||||
throw abortController.signal.reason;
|
||||
newContextWindowChatHistory = lastEvaluation.contextWindow;
|
||||
let startNewChunk = supportsParallelFunctionCalling;
|
||||
for (const { functionCall, functionDefinition, functionCallResult } of functionCallResults) {
|
||||
newChatHistory = addFunctionCallToChatHistory({
|
||||
chatHistory: newChatHistory,
|
||||
functionName: functionCall.functionName,
|
||||
functionDescription: functionDefinition.description,
|
||||
callParams: functionCall.params,
|
||||
callResult: functionCallResult,
|
||||
rawCall: functionCall.raw,
|
||||
startsNewChunk: startNewChunk
|
||||
});
|
||||
newContextWindowChatHistory = addFunctionCallToChatHistory({
|
||||
chatHistory: newContextWindowChatHistory,
|
||||
functionName: functionCall.functionName,
|
||||
functionDescription: functionDefinition.description,
|
||||
callParams: functionCall.params,
|
||||
callResult: functionCallResult,
|
||||
rawCall: functionCall.raw,
|
||||
startsNewChunk: startNewChunk
|
||||
});
|
||||
startNewChunk = false;
|
||||
previousFunctionCalls++;
|
||||
}
|
||||
lastEvaluation.cleanHistory = newChatHistory;
|
||||
lastEvaluation.contextWindow = newContextWindowChatHistory;
|
||||
if (abortController.signal.aborted && !abortedOnFunctionCallError && stopOnAbortSignal) {
|
||||
metadata.stopReason = "abort";
|
||||
metadata.remainingGenerationAfterStop = undefined;
|
||||
}
|
||||
else
|
||||
continue;
|
||||
}
|
||||
}
|
||||
this._lastEvaluation = lastEvaluation;
|
||||
this._canUseContextWindowForCompletion = true;
|
||||
this._chatHistory = newChatHistory;
|
||||
this._chatHistoryStateRef = {};
|
||||
const lastModelResponseItem = getLastModelResponseItem(newChatHistory);
|
||||
const responseText = lastModelResponseItem.response
|
||||
.filter((item) => typeof item === "string")
|
||||
.join("");
|
||||
if (metadata.stopReason === "customStopTrigger")
|
||||
return {
|
||||
response: lastModelResponseItem.response,
|
||||
responseText,
|
||||
stopReason: metadata.stopReason,
|
||||
customStopTrigger: metadata.customStopTrigger,
|
||||
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
|
||||
};
|
||||
return {
|
||||
response: lastModelResponseItem.response,
|
||||
responseText,
|
||||
stopReason: metadata.stopReason,
|
||||
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
|
||||
};
|
||||
}
|
||||
}
|
||||
finally {
|
||||
disposeAbortController();
|
||||
}
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Preload a user prompt into the current context sequence state to make later inference of the model response begin sooner
|
||||
* and feel faster.
|
||||
*
|
||||
* > **Note:** Preloading a long user prompt can incur context shifts, so consider limiting the length of prompts you preload
|
||||
* @param prompt - the prompt to preload
|
||||
* @param [options]
|
||||
*/
|
||||
async preloadPrompt(prompt, options = {}) {
|
||||
await this.completePromptWithMeta(prompt, {
|
||||
...options,
|
||||
completeAsModel: false,
|
||||
maxTokens: 0
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Preload a user prompt into the current context sequence state and generate a completion for it.
|
||||
*
|
||||
* > **Note:** Preloading a long user prompt and completing a user prompt with a high number of `maxTokens` can incur context shifts,
|
||||
* > so consider limiting the length of prompts you preload.
|
||||
* >
|
||||
* > Also, it's recommended to limit the number of tokens generated to a reasonable amount by configuring `maxTokens`.
|
||||
* @param prompt - the prompt to preload
|
||||
* @param [options]
|
||||
*/
|
||||
async completePrompt(prompt, options = {}) {
|
||||
const { completion } = await this.completePromptWithMeta(prompt, options);
|
||||
return completion;
|
||||
}
|
||||
/**
|
||||
* Create a smart completion engine that caches the prompt completions
|
||||
* and reuses them when the user prompt matches the beginning of the cached prompt or completion.
|
||||
*
|
||||
* All completions are generated and cached only for the current chat session state.
|
||||
* You can create a single completion engine for an entire chat session.
|
||||
*/
|
||||
createPromptCompletionEngine(options) {
|
||||
return LlamaChatSessionPromptCompletionEngine._create(this, options);
|
||||
}
|
||||
/**
|
||||
* See `completePrompt` for more information.
|
||||
* @param prompt
|
||||
* @param [options]
|
||||
*/
|
||||
async completePromptWithMeta(prompt, { maxTokens, stopOnAbortSignal = false, functions, documentFunctionParams, onTextChunk, onToken, signal, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = false, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority, completeAsModel } = {}) {
|
||||
this._ensureNotDisposed();
|
||||
if (grammar != null) {
|
||||
if (grammar._llama == null)
|
||||
throw new Error("The grammar passed to this function is not a LlamaGrammar instance.");
|
||||
else if (grammar._llama !== this.model._llama)
|
||||
throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
|
||||
}
|
||||
const [abortController, disposeAbortController] = wrapAbortSignal(signal);
|
||||
this._preloadAndCompleteAbortControllers.add(abortController);
|
||||
const completeAsModelEnabled = typeof completeAsModel == "boolean"
|
||||
? completeAsModel
|
||||
: completeAsModel === "auto"
|
||||
? "auto"
|
||||
: completeAsModel?.enabled ?? defaultCompleteAsModel.enabled;
|
||||
const modelArchitecture = this.model.fileInfo.metadata?.general?.architecture;
|
||||
const shouldCompleteAsModel = completeAsModelEnabled === "auto"
|
||||
? modelArchitecture === GgufArchitectureType.gptOss
|
||||
: completeAsModelEnabled;
|
||||
try {
|
||||
return await withLock([this._chatLock, "evaluation"], abortController.signal, async () => {
|
||||
this._ensureNotDisposed();
|
||||
if (this._chat == null)
|
||||
throw new DisposedError();
|
||||
if (shouldCompleteAsModel) {
|
||||
const messagesToAppendOption = (typeof completeAsModel == "boolean" || completeAsModel === "auto")
|
||||
? defaultCompleteAsModel.appendedMessages
|
||||
: completeAsModel?.appendedMessages ?? defaultCompleteAsModel.appendedMessages;
|
||||
const messagesToAppend = messagesToAppendOption.length === 0
|
||||
? defaultCompleteAsModel.appendedMessages
|
||||
: messagesToAppendOption;
|
||||
const addMessageToChatHistory = (chatHistory) => {
|
||||
const newHistory = chatHistory.slice();
|
||||
if (messagesToAppend.at(0)?.type === "model")
|
||||
newHistory.push({ type: "user", text: "" });
|
||||
for (let i = 0; i < messagesToAppend.length; i++) {
|
||||
const item = messagesToAppend[i];
|
||||
const isLastItem = i === messagesToAppend.length - 1;
|
||||
if (item == null)
|
||||
continue;
|
||||
if (isLastItem && item.type === "model") {
|
||||
const newResponse = item.response.slice();
|
||||
if (typeof newResponse.at(-1) === "string")
|
||||
newResponse.push(newResponse.pop() + prompt);
|
||||
else
|
||||
newResponse.push(prompt);
|
||||
newHistory.push({
|
||||
type: "model",
|
||||
response: newResponse
|
||||
});
|
||||
}
|
||||
else
|
||||
newHistory.push(item);
|
||||
}
|
||||
if (messagesToAppend.at(-1)?.type !== "model")
|
||||
newHistory.push({ type: "model", response: [prompt] });
|
||||
return {
|
||||
history: newHistory,
|
||||
addedCount: newHistory.length - chatHistory.length
|
||||
};
|
||||
};
|
||||
const { history: messagesWithPrompt, addedCount } = addMessageToChatHistory(this._chatHistory);
|
||||
const { response, lastEvaluation, metadata } = await this._chat.generateResponse(messagesWithPrompt, {
|
||||
abortOnNonText: true,
|
||||
functions,
|
||||
documentFunctionParams,
|
||||
grammar: grammar, // this is allowed only because `abortOnNonText` is enabled
|
||||
onTextChunk,
|
||||
onToken,
|
||||
signal: abortController.signal,
|
||||
stopOnAbortSignal: true,
|
||||
repeatPenalty,
|
||||
minP,
|
||||
topK,
|
||||
topP,
|
||||
seed,
|
||||
tokenBias,
|
||||
customStopTriggers,
|
||||
maxTokens: maxTokens == null
|
||||
? undefined
|
||||
: Math.max(1, maxTokens), // regular prompting ignores `maxTokens: 0`
|
||||
temperature,
|
||||
trimWhitespaceSuffix,
|
||||
contextShift: {
|
||||
...this._contextShift,
|
||||
lastEvaluationMetadata: this._lastEvaluation?.contextShiftMetadata
|
||||
},
|
||||
evaluationPriority,
|
||||
lastEvaluationContextWindow: {
|
||||
history: this._lastEvaluation?.contextWindow == null
|
||||
? undefined
|
||||
: addMessageToChatHistory(this._lastEvaluation?.contextWindow).history,
|
||||
minimumOverlapPercentageToPreventContextShift: 0.8
|
||||
}
|
||||
});
|
||||
this._ensureNotDisposed();
|
||||
this._lastEvaluation = {
|
||||
cleanHistory: this._chatHistory,
|
||||
contextWindow: lastEvaluation.contextWindow.slice(0, -addedCount),
|
||||
contextShiftMetadata: lastEvaluation.contextShiftMetadata
|
||||
};
|
||||
this._canUseContextWindowForCompletion = this._chatHistory.at(-1)?.type === "user";
|
||||
if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted)
|
||||
throw abortController.signal.reason;
|
||||
if (metadata.stopReason === "customStopTrigger")
|
||||
return {
|
||||
completion: response,
|
||||
stopReason: metadata.stopReason,
|
||||
customStopTrigger: metadata.customStopTrigger,
|
||||
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
|
||||
};
|
||||
return {
|
||||
completion: response,
|
||||
stopReason: metadata.stopReason,
|
||||
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
|
||||
};
|
||||
}
|
||||
else {
|
||||
const { completion, lastEvaluation, metadata } = await this._chat.loadChatAndCompleteUserMessage(asWithLastUserMessageRemoved(this._chatHistory), {
|
||||
initialUserPrompt: prompt,
|
||||
functions,
|
||||
documentFunctionParams,
|
||||
grammar,
|
||||
onTextChunk,
|
||||
onToken,
|
||||
signal: abortController.signal,
|
||||
stopOnAbortSignal: true,
|
||||
repeatPenalty,
|
||||
minP,
|
||||
topK,
|
||||
topP,
|
||||
seed,
|
||||
tokenBias,
|
||||
customStopTriggers,
|
||||
maxTokens,
|
||||
temperature,
|
||||
trimWhitespaceSuffix,
|
||||
contextShift: {
|
||||
...this._contextShift,
|
||||
lastEvaluationMetadata: this._lastEvaluation?.contextShiftMetadata
|
||||
},
|
||||
evaluationPriority,
|
||||
lastEvaluationContextWindow: {
|
||||
history: asWithLastUserMessageRemoved(this._lastEvaluation?.contextWindow),
|
||||
minimumOverlapPercentageToPreventContextShift: 0.8
|
||||
}
|
||||
});
|
||||
this._ensureNotDisposed();
|
||||
this._lastEvaluation = {
|
||||
cleanHistory: this._chatHistory,
|
||||
contextWindow: asWithLastUserMessageRemoved(lastEvaluation.contextWindow),
|
||||
contextShiftMetadata: lastEvaluation.contextShiftMetadata
|
||||
};
|
||||
this._canUseContextWindowForCompletion = this._chatHistory.at(-1)?.type === "user";
|
||||
if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted)
|
||||
throw abortController.signal.reason;
|
||||
if (metadata.stopReason === "customStopTrigger")
|
||||
return {
|
||||
completion: completion,
|
||||
stopReason: metadata.stopReason,
|
||||
customStopTrigger: metadata.customStopTrigger,
|
||||
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
|
||||
};
|
||||
return {
|
||||
completion: completion,
|
||||
stopReason: metadata.stopReason,
|
||||
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
|
||||
};
|
||||
}
|
||||
});
|
||||
}
|
||||
finally {
|
||||
this._preloadAndCompleteAbortControllers.delete(abortController);
|
||||
disposeAbortController();
|
||||
}
|
||||
}
|
||||
getChatHistory() {
|
||||
return structuredClone(this._chatHistory);
|
||||
}
|
||||
getLastEvaluationContextWindow() {
|
||||
if (this._lastEvaluation == null)
|
||||
return null;
|
||||
return structuredClone(this._lastEvaluation?.contextWindow);
|
||||
}
|
||||
setChatHistory(chatHistory) {
|
||||
this._chatHistory = structuredClone(chatHistory);
|
||||
this._chatHistoryStateRef = {};
|
||||
this._lastEvaluation = undefined;
|
||||
this._canUseContextWindowForCompletion = false;
|
||||
}
|
||||
/** Clear the chat history and reset it to the initial state. */
|
||||
resetChatHistory() {
|
||||
if (this._chat == null || this.disposed)
|
||||
throw new DisposedError();
|
||||
const chatWrapperSupportsSystemMessages = this._chat.chatWrapper.settings.supportsSystemMessages;
|
||||
if (chatWrapperSupportsSystemMessages == null || chatWrapperSupportsSystemMessages || this._forceAddSystemPrompt)
|
||||
this.setChatHistory(this._chat.chatWrapper.generateInitialChatHistory({ systemPrompt: this._systemPrompt }));
|
||||
else
|
||||
this.setChatHistory([]);
|
||||
}
|
||||
/** @internal */
|
||||
_stopAllPreloadAndPromptCompletions() {
|
||||
for (const abortController of this._preloadAndCompleteAbortControllers)
|
||||
abortController.abort();
|
||||
this._preloadAndCompleteAbortControllers.clear();
|
||||
}
|
||||
/** @internal */
|
||||
_ensureNotDisposed() {
|
||||
if (this.disposed)
|
||||
throw new DisposedError();
|
||||
}
|
||||
}
|
||||
function addFunctionCallToChatHistory({ chatHistory, functionName, functionDescription, callParams, callResult, rawCall, startsNewChunk }) {
|
||||
const newChatHistory = chatHistory.slice();
|
||||
if (newChatHistory.length === 0 || newChatHistory[newChatHistory.length - 1].type !== "model")
|
||||
newChatHistory.push({
|
||||
type: "model",
|
||||
response: []
|
||||
});
|
||||
const lastModelResponseItem = newChatHistory[newChatHistory.length - 1];
|
||||
const newLastModelResponseItem = { ...lastModelResponseItem };
|
||||
newChatHistory[newChatHistory.length - 1] = newLastModelResponseItem;
|
||||
const modelResponse = newLastModelResponseItem.response.slice();
|
||||
newLastModelResponseItem.response = modelResponse;
|
||||
const functionCall = {
|
||||
type: "functionCall",
|
||||
name: functionName,
|
||||
description: functionDescription,
|
||||
params: callParams,
|
||||
result: callResult,
|
||||
rawCall
|
||||
};
|
||||
if (startsNewChunk)
|
||||
functionCall.startsNewChunk = true;
|
||||
modelResponse.push(functionCall);
|
||||
return newChatHistory;
|
||||
}
|
||||
function getLastModelResponseItem(chatHistory) {
|
||||
if (chatHistory.length === 0 || chatHistory[chatHistory.length - 1].type !== "model")
|
||||
throw new Error("Expected chat history to end with a model response");
|
||||
return chatHistory[chatHistory.length - 1];
|
||||
}
|
||||
function asWithLastUserMessageRemoved(chatHistory) {
|
||||
if (chatHistory == null)
|
||||
return chatHistory;
|
||||
const newChatHistory = chatHistory.slice();
|
||||
while (newChatHistory.at(-1)?.type === "user")
|
||||
newChatHistory.pop();
|
||||
return newChatHistory;
|
||||
}
|
||||
//# sourceMappingURL=LlamaChatSession.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
43
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.d.ts
generated
vendored
Normal file
43
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.d.ts
generated
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
import type { LlamaContextSequence } from "../../LlamaContext/LlamaContext.js";
|
||||
import type { LLamaChatCompletePromptOptions } from "../LlamaChatSession.js";
|
||||
export type LLamaChatPromptCompletionEngineOptions = {
|
||||
/**
|
||||
* Max tokens to allow for preloading a prompt and generating a completion for it.
|
||||
*
|
||||
* Defaults to `256` or half of the context size, whichever is smaller.
|
||||
*/
|
||||
maxPreloadTokens?: number;
|
||||
onGeneration?(prompt: string, completion: string): void;
|
||||
/**
|
||||
* Max number of completions to cache.
|
||||
*
|
||||
* Defaults to `100`.
|
||||
*/
|
||||
maxCachedCompletions?: number;
|
||||
temperature?: LLamaChatCompletePromptOptions["temperature"];
|
||||
minP?: LLamaChatCompletePromptOptions["minP"];
|
||||
topK?: LLamaChatCompletePromptOptions["topK"];
|
||||
topP?: LLamaChatCompletePromptOptions["topP"];
|
||||
seed?: LLamaChatCompletePromptOptions["seed"];
|
||||
trimWhitespaceSuffix?: LLamaChatCompletePromptOptions["trimWhitespaceSuffix"];
|
||||
evaluationPriority?: LLamaChatCompletePromptOptions["evaluationPriority"];
|
||||
repeatPenalty?: LLamaChatCompletePromptOptions["repeatPenalty"];
|
||||
tokenBias?: LLamaChatCompletePromptOptions["tokenBias"];
|
||||
customStopTriggers?: LLamaChatCompletePromptOptions["customStopTriggers"];
|
||||
grammar?: LLamaChatCompletePromptOptions["grammar"];
|
||||
functions?: LLamaChatCompletePromptOptions["functions"];
|
||||
documentFunctionParams?: LLamaChatCompletePromptOptions["documentFunctionParams"];
|
||||
completeAsModel?: LLamaChatCompletePromptOptions["completeAsModel"];
|
||||
};
|
||||
export declare const defaultMaxPreloadTokens: (sequence: LlamaContextSequence) => number;
|
||||
export declare class LlamaChatSessionPromptCompletionEngine {
|
||||
private constructor();
|
||||
dispose(): void;
|
||||
/**
|
||||
* Get completion for the prompt from the cache,
|
||||
* and begin preloading this prompt into the context sequence and completing it.
|
||||
*
|
||||
* On completion progress, `onGeneration` (configured for this engine instance) will be called.
|
||||
*/
|
||||
complete(prompt: string): string;
|
||||
}
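/**
* An illustrative sketch of using the prompt completion engine declared above, assuming an
* existing `session`; it is typically wired to a text input to suggest completions:
* ```ts
* const engine = session.createPromptCompletionEngine({
*     maxPreloadTokens: 128,
*     onGeneration(prompt, completion) {
*         // called as a completion is generated for the latest prompt
*         console.log(prompt + completion);
*     }
* });
*
* // returns whatever is already cached (possibly an empty string)
* // and starts preloading and completing the prompt in the background
* const cached = engine.complete("Write a function that ");
* ```
*/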
|
||||
191
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js
generated
vendored
Normal file
191
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js
generated
vendored
Normal file
@@ -0,0 +1,191 @@
|
||||
import { DisposeAggregator, DisposedError } from "lifecycle-utils";
|
||||
import { getConsoleLogPrefix } from "../../../utils/getConsoleLogPrefix.js";
|
||||
import { LruCache } from "../../../utils/LruCache.js";
|
||||
import { safeEventCallback } from "../../../utils/safeEventCallback.js";
|
||||
export const defaultMaxPreloadTokens = (sequence) => {
|
||||
const defaultValue = 256;
|
||||
return sequence.model.fileInsights.swaSize != null
|
||||
? Math.min(Math.ceil(sequence.model.fileInsights.swaSize / 2), defaultValue, Math.ceil(sequence.contextSize / 2))
|
||||
: Math.min(defaultValue, Math.ceil(sequence.contextSize / 2));
|
||||
};
|
||||
const defaultMaxCachedCompletions = 100;
|
||||
export class LlamaChatSessionPromptCompletionEngine {
|
||||
/** @internal */ _chatSession;
|
||||
/** @internal */ _maxPreloadTokens;
|
||||
/** @internal */ _maxCachedCompletions;
|
||||
/** @internal */ _onGeneration;
|
||||
/** @internal */ _completionOptions;
|
||||
/** @internal */ _completionCaches = new WeakMap();
|
||||
/** @internal */ _disposeAggregator = new DisposeAggregator();
|
||||
/** @internal */ _currentCompletionAbortController = new AbortController();
|
||||
/** @internal */ _lastPrompt;
|
||||
/** @internal */ _disposed = false;
|
||||
constructor(chatSession, { maxPreloadTokens = defaultMaxPreloadTokens(chatSession.sequence), onGeneration, maxCachedCompletions = defaultMaxCachedCompletions, ...options }) {
|
||||
this._chatSession = chatSession;
|
||||
this._maxPreloadTokens = Math.max(1, maxPreloadTokens);
|
||||
this._maxCachedCompletions = Math.max(1, maxCachedCompletions);
|
||||
this._onGeneration = safeEventCallback(onGeneration);
|
||||
this._completionOptions = options;
|
||||
this.dispose = this.dispose.bind(this);
|
||||
this._disposeAggregator.add(this._chatSession.onDispose.createListener(this.dispose));
|
||||
this._disposeAggregator.add(() => {
|
||||
this._disposed = true;
|
||||
this._currentCompletionAbortController.abort();
|
||||
});
|
||||
}
|
||||
dispose() {
|
||||
if (this._disposed)
|
||||
return;
|
||||
this._disposeAggregator.dispose();
|
||||
}
|
||||
/**
|
||||
* Get completion for the prompt from the cache,
|
||||
* and begin preloading this prompt into the context sequence and completing it.
|
||||
*
|
||||
* On completion progress, `onGeneration` (configured for this engine instance) will be called.
|
||||
*/
|
||||
complete(prompt) {
|
||||
if (this._disposed)
|
||||
throw new DisposedError();
|
||||
const completionCache = this._getCurrentCompletionCache();
|
||||
const completion = completionCache.getCompletion(prompt);
|
||||
if (this._lastPrompt == null || !(this._lastPrompt + (completion ?? "")).startsWith(prompt)) {
|
||||
this._lastPrompt = prompt;
|
||||
this._restartCompletion(completionCache);
|
||||
}
|
||||
this._lastPrompt = prompt;
|
||||
return completion ?? "";
|
||||
}
|
||||
/** @internal */
|
||||
_getCurrentCompletionCache() {
|
||||
const completionCache = this._completionCaches.get(this._chatSession._chatHistoryStateRef);
|
||||
if (completionCache != null)
|
||||
return completionCache;
|
||||
const newCompletionCache = new CompletionCache(this._maxCachedCompletions);
|
||||
this._completionCaches.set(this._chatSession._chatHistoryStateRef, newCompletionCache);
|
||||
return newCompletionCache;
|
||||
}
|
||||
/** @internal */
|
||||
_restartCompletion(completionCache) {
|
||||
if (this._disposed)
|
||||
return;
|
||||
this._currentCompletionAbortController.abort();
|
||||
this._currentCompletionAbortController = new AbortController();
|
||||
const prompt = this._lastPrompt;
|
||||
if (prompt == null)
|
||||
return;
|
||||
const existingCompletion = completionCache.getCompletion(prompt);
|
||||
const promptToComplete = prompt + (existingCompletion ?? "");
|
||||
const currentPromptTokens = this._chatSession.model.tokenize(promptToComplete, false, "trimLeadingSpace").length;
|
||||
const leftTokens = Math.max(0, this._maxPreloadTokens - currentPromptTokens);
|
||||
if (leftTokens === 0)
|
||||
return;
|
||||
const currentAbortController = this._currentCompletionAbortController;
|
||||
const currentAbortSignal = this._currentCompletionAbortController.signal;
|
||||
let currentCompletion = "";
|
||||
void this._chatSession.completePrompt(promptToComplete, {
|
||||
...this._completionOptions,
|
||||
stopOnAbortSignal: false,
|
||||
maxTokens: leftTokens,
|
||||
signal: currentAbortSignal,
|
||||
onTextChunk: (chunk) => {
|
||||
currentCompletion += chunk;
|
||||
const completion = (existingCompletion ?? "") + currentCompletion;
|
||||
completionCache.putCompletion(prompt, completion);
|
||||
if (this._getCurrentCompletionCache() !== completionCache) {
|
||||
currentAbortController.abort();
|
||||
return;
|
||||
}
|
||||
if (this._lastPrompt === prompt)
|
||||
this._onGeneration?.(prompt, completion);
|
||||
}
|
||||
})
|
||||
.then(() => {
|
||||
if (this._lastPrompt !== prompt && this._getCurrentCompletionCache() === completionCache)
|
||||
return this._restartCompletion(completionCache);
|
||||
})
|
||||
.catch((err) => {
|
||||
if ((currentAbortSignal.aborted && err === currentAbortSignal.reason) || err instanceof DOMException)
|
||||
return;
|
||||
console.error(getConsoleLogPrefix(false, false), err);
|
||||
});
|
||||
}
|
||||
/** @internal */
|
||||
static _create(chatSession, options = {}) {
|
||||
return new LlamaChatSessionPromptCompletionEngine(chatSession, options);
|
||||
}
|
||||
}
|
||||
class CompletionCache {
|
||||
/** @internal */ _cache;
|
||||
/** @internal */ _rootNode = [new Map()];
|
||||
constructor(maxInputs) {
|
||||
this._cache = new LruCache(maxInputs, {
|
||||
onDelete: (key) => {
|
||||
this._deleteInput(key);
|
||||
}
|
||||
});
|
||||
}
|
||||
get maxInputs() {
|
||||
return this._cache.maxSize;
|
||||
}
|
||||
getCompletion(input) {
|
||||
let node = this._rootNode;
|
||||
for (let i = 0; i < input.length; i++) {
|
||||
if (node == null)
|
||||
return null;
|
||||
const [next, completion] = node;
|
||||
const char = input[i];
|
||||
if (!next.has(char)) {
|
||||
if (completion != null && completion.startsWith(input.slice(i))) {
|
||||
this._cache.get(input.slice(0, i));
|
||||
return completion.slice(input.length - i);
|
||||
}
|
||||
}
|
||||
node = next.get(char);
|
||||
}
|
||||
if (node == null)
|
||||
return null;
|
||||
const [, possibleCompletion] = node;
|
||||
if (possibleCompletion != null) {
|
||||
this._cache.get(input);
|
||||
return possibleCompletion;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
putCompletion(input, completion) {
|
||||
this._cache.set(input, null);
|
||||
let node = this._rootNode;
|
||||
for (let i = 0; i < input.length; i++) {
|
||||
const [next] = node;
|
||||
const char = input[i];
|
||||
if (!next.has(char))
|
||||
next.set(char, [new Map()]);
|
||||
node = next.get(char);
|
||||
}
|
||||
const currentCompletion = node[1];
|
||||
if (currentCompletion != null && currentCompletion.startsWith(completion))
|
||||
return currentCompletion;
|
||||
node[1] = completion;
|
||||
return completion;
|
||||
}
|
||||
/** @internal */
|
||||
_deleteInput(input) {
|
||||
let lastNodeWithMultipleChildren = this._rootNode;
|
||||
let lastNodeWithMultipleChildrenDeleteChar = input[0];
|
||||
let node = this._rootNode;
|
||||
for (let i = 0; i < input.length; i++) {
|
||||
const [next] = node;
|
||||
const char = input[i];
|
||||
if (next.size > 1) {
|
||||
lastNodeWithMultipleChildren = node;
|
||||
lastNodeWithMultipleChildrenDeleteChar = char;
|
||||
}
|
||||
if (!next.has(char))
|
||||
return;
|
||||
node = next.get(char);
|
||||
}
|
||||
if (lastNodeWithMultipleChildrenDeleteChar !== "")
|
||||
lastNodeWithMultipleChildren[0].delete(lastNodeWithMultipleChildrenDeleteChar);
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=LlamaChatSessionPromptCompletionEngine.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
15
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.d.ts
generated
vendored
Normal file
15
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.d.ts
generated
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
import { GbnfJsonDefList, GbnfJsonSchema, GbnfJsonSchemaToType } from "../../../utils/gbnfJson/types.js";
|
||||
import { ChatSessionModelFunction } from "../../../types.js";
|
||||
/**
|
||||
* Define a function that can be used by the model in a chat session, and return it.
|
||||
*
|
||||
* This is a helper function to facilitate defining functions with full TypeScript type information.
|
||||
*
|
||||
* The handler function can return a Promise, and the return value will be awaited before being returned to the model.
|
||||
* @param functionDefinition
|
||||
*/
|
||||
export declare function defineChatSessionFunction<const Params extends GbnfJsonSchema<Defs>, const Defs extends GbnfJsonDefList<Defs>>({ description, params, handler }: {
|
||||
description?: string;
|
||||
params?: Readonly<Params> & GbnfJsonSchema<Defs>;
|
||||
handler: (params: GbnfJsonSchemaToType<NoInfer<Params>>) => Promise<any> | any;
|
||||
}): ChatSessionModelFunction<NoInfer<Params>>;
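/**
* An illustrative sketch of defining a function with full type information and letting the
* model call it, assuming an existing `session`; the function itself is hypothetical:
* ```ts
* import {defineChatSessionFunction} from "node-llama-cpp";
*
* const functions = {
*     getCurrentTime: defineChatSessionFunction({
*         description: "Get the current time as an ISO string",
*         handler() {
*             return new Date().toISOString();
*         }
*     })
* };
*
* const answer = await session.prompt("What time is it right now?", {functions});
* ```
*/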
|
||||
16
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js
generated
vendored
Normal file
16
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js
generated
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
/**
|
||||
* Define a function that can be used by the model in a chat session, and return it.
|
||||
*
|
||||
* This is a helper function to facilitate defining functions with full TypeScript type information.
|
||||
*
|
||||
* The handler function can return a Promise, and the return value will be awaited before being returned to the model.
|
||||
* @param functionDefinition
|
||||
*/
|
||||
export function defineChatSessionFunction({ description, params, handler }) {
|
||||
return {
|
||||
description,
|
||||
params,
|
||||
handler
|
||||
};
|
||||
}
|
||||
//# sourceMappingURL=defineChatSessionFunction.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"defineChatSessionFunction.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaChatSession/utils/defineChatSessionFunction.ts"],"names":[],"mappings":"AAGA;;;;;;;GAOG;AACH,MAAM,UAAU,yBAAyB,CAGvC,EACE,WAAW,EACX,MAAM,EACN,OAAO,EAKV;IACG,OAAO;QACH,WAAW;QACX,MAAM;QACN,OAAO;KACV,CAAC;AACN,CAAC"}
|
||||
186
node_modules/node-llama-cpp/dist/evaluator/LlamaCompletion.d.ts
generated
vendored
Normal file
186
node_modules/node-llama-cpp/dist/evaluator/LlamaCompletion.d.ts
generated
vendored
Normal file
@@ -0,0 +1,186 @@
|
||||
import { EventRelay } from "lifecycle-utils";
|
||||
import { LLamaContextualRepeatPenalty, Token } from "../types.js";
|
||||
import { LlamaText } from "../utils/LlamaText.js";
|
||||
import { LlamaGrammar } from "./LlamaGrammar.js";
|
||||
import { EvaluationPriority } from "./LlamaContext/types.js";
|
||||
import { LlamaContextSequence } from "./LlamaContext/LlamaContext.js";
|
||||
import { TokenBias } from "./TokenBias.js";
|
||||
export type LlamaCompletionOptions = {
|
||||
contextSequence: LlamaContextSequence;
|
||||
/**
|
||||
* Automatically dispose the sequence when the object is disposed.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
autoDisposeSequence?: boolean;
|
||||
};
|
||||
export type LlamaCompletionGenerationOptions = {
|
||||
/**
|
||||
* Called as the model generates a completion with the generated text chunk.
|
||||
*
|
||||
* Useful for streaming the generated completion as it's being generated.
|
||||
*/
|
||||
onTextChunk?: (text: string) => void;
|
||||
/**
|
||||
* Called as the model generates a completion with the generated tokens.
|
||||
*
|
||||
* Preferably, you'd want to use `onTextChunk` instead of this.
|
||||
*/
|
||||
onToken?: (tokens: Token[]) => void;
|
||||
/**
|
||||
* An AbortSignal to later abort the generation.
|
||||
*
|
||||
* When the signal is aborted, the generation will stop and throw `signal.reason` as the error.
|
||||
*
|
||||
* > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`.
|
||||
*/
|
||||
signal?: AbortSignal;
|
||||
/**
|
||||
* When a completion already started being generated and then the signal is aborted,
|
||||
* the generation will stop and the completion will be returned as is instead of throwing an error.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
stopOnAbortSignal?: boolean;
|
||||
/** Maximum number of tokens to generate */
|
||||
maxTokens?: number;
|
||||
/**
|
||||
* Temperature is a hyperparameter that controls the randomness of the generated text.
|
||||
* It affects the probability distribution of the model's output tokens.
|
||||
*
|
||||
* A higher temperature (e.g., 1.5) makes the output more random and creative,
|
||||
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
|
||||
*
|
||||
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
|
||||
*
|
||||
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
|
||||
*
|
||||
* Set to `0` to disable.
|
||||
* Disabled by default (set to `0`).
|
||||
*/
|
||||
temperature?: number;
|
||||
/**
|
||||
* From the next token candidates, discard the percentage of tokens with the lowest probability.
|
||||
* For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
|
||||
* This is useful for generating higher-quality results when using a high temperature.
|
||||
* Set to a value between `0` and `1` to enable.
|
||||
*
|
||||
* Only relevant when `temperature` is set to a value greater than `0`.
|
||||
* Disabled by default.
|
||||
*/
|
||||
minP?: number;
|
||||
/**
|
||||
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
|
||||
* An integer number between `1` and the size of the vocabulary.
|
||||
* Set to `0` to disable (which uses the full vocabulary).
|
||||
*
|
||||
* Only relevant when `temperature` is set to a value greater than 0.
|
||||
*/
|
||||
topK?: number;
|
||||
/**
|
||||
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
|
||||
* and samples the next token only from this set.
|
||||
* A float number between `0` and `1`.
|
||||
* Set to `1` to disable.
|
||||
*
|
||||
* Only relevant when `temperature` is set to a value greater than `0`.
|
||||
*/
|
||||
topP?: number;
|
||||
/**
|
||||
* Used to control the randomness of the generated text.
|
||||
*
|
||||
* Change the seed to get different results.
|
||||
*
|
||||
* Only relevant when using `temperature`.
|
||||
*/
|
||||
seed?: number;
|
||||
/**
|
||||
* Trim whitespace from the end of the generated text.
|
||||
* Disabled by default.
|
||||
*/
|
||||
trimWhitespaceSuffix?: boolean;
|
||||
repeatPenalty?: false | LLamaContextualRepeatPenalty;
|
||||
/**
|
||||
* Adjust the probability of tokens being generated.
|
||||
* Can be used to bias the model to generate tokens that you want it to lean towards,
|
||||
* or to avoid generating tokens that you want it to avoid.
|
||||
*/
|
||||
tokenBias?: TokenBias | (() => TokenBias);
|
||||
/**
|
||||
* See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
|
||||
*/
|
||||
evaluationPriority?: EvaluationPriority;
|
||||
grammar?: LlamaGrammar;
|
||||
/**
|
||||
* Custom stop triggers to stop the completion when any of the provided triggers are found.
|
||||
*/
|
||||
customStopTriggers?: readonly (LlamaText | string | readonly (string | Token)[])[];
|
||||
/**
|
||||
* The number of tokens to delete from the context window to make space for new ones.
|
||||
* Defaults to 10% of the context size.
|
||||
*/
|
||||
contextShiftSize?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
|
||||
/**
|
||||
* Context shift reconstructs the context with partial relevant data to continue generation when the context fills up.
* Setting this flag disables that behavior.
* Instead, the generation will stop when the context fills up, by setting an appropriate `maxTokens` value
* or lowering the given `maxTokens` value when needed,
* and it will fail if there's no space for generating new tokens at all with the given inputs.
*
* Disabled by default. Not recommended unless you know what you're doing.
|
||||
*/
|
||||
disableContextShift?: boolean;
|
||||
};
|
||||
export type LlamaInfillGenerationOptions = LlamaCompletionGenerationOptions & {
|
||||
/**
|
||||
* The minimum number of tokens to keep from the prefix input when making a context shift.
|
||||
* Defaults to 10% of the context size.
|
||||
*/
|
||||
minPrefixKeepTokens?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
|
||||
};
|
||||
export type LlamaCompletionResponse = {
|
||||
response: string;
|
||||
metadata: {
|
||||
remainingGenerationAfterStop?: string | Token[];
|
||||
stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" | "abort";
|
||||
} | {
|
||||
remainingGenerationAfterStop?: string | Token[];
|
||||
stopReason: "customStopTrigger";
|
||||
customStopTrigger: (string | Token)[];
|
||||
};
|
||||
};
|
||||
/**
|
||||
* @see [Text Completion](https://node-llama-cpp.withcat.ai/guide/text-completion) tutorial
|
||||
*/
|
||||
export declare class LlamaCompletion {
|
||||
readonly onDispose: EventRelay<void>;
|
||||
constructor({ contextSequence, autoDisposeSequence }: LlamaCompletionOptions);
|
||||
dispose({ disposeSequence }?: {
|
||||
disposeSequence?: boolean;
|
||||
}): void;
|
||||
/** @hidden */
|
||||
[Symbol.dispose](): void;
|
||||
get disposed(): boolean;
|
||||
get infillSupported(): boolean;
|
||||
/**
|
||||
* Generate a completion for an input.
|
||||
*/
|
||||
generateCompletion(input: Token[] | string | LlamaText, options?: LlamaCompletionGenerationOptions): Promise<string>;
|
||||
/**
|
||||
* Same as `generateCompletion`, but returns additional metadata about the generation.
|
||||
* See `generateCompletion` for more information.
|
||||
*/
|
||||
generateCompletionWithMeta(input: Token[] | string | LlamaText, { onTextChunk, onToken, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix, repeatPenalty, tokenBias, evaluationPriority, grammar, customStopTriggers, contextShiftSize, disableContextShift }?: LlamaCompletionGenerationOptions): Promise<LlamaCompletionResponse>;
|
||||
/**
|
||||
* Infill (also known as Fill-In-Middle) generates a completion for an input (`prefixInput`) that
|
||||
* should connect to a given continuation (`suffixInput`).
|
||||
* For example, for `prefixInput: "123"` and `suffixInput: "789"`, the model is expected to generate `456`
|
||||
* to make the final text be `123456789`.
|
||||
*/
|
||||
generateInfillCompletion(prefixInput: Token[] | string | LlamaText, suffixInput: Token[] | string | LlamaText, options?: LlamaInfillGenerationOptions): Promise<string>;
|
||||
/**
|
||||
* Same as `generateInfillCompletion`, but returns additional metadata about the generation.
|
||||
* See `generateInfillCompletion` for more information.
|
||||
*/
|
||||
generateInfillCompletionWithMeta(prefixInput: Token[] | string | LlamaText, suffixInput: Token[] | string | LlamaText, { onTextChunk, onToken, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix, repeatPenalty, tokenBias, evaluationPriority, grammar, contextShiftSize, customStopTriggers, minPrefixKeepTokens, disableContextShift }?: LlamaInfillGenerationOptions): Promise<LlamaCompletionResponse>;
|
||||
}
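Taken together, the declaration above is the whole public surface of `LlamaCompletion`, including the sampling options (`maxTokens`, `temperature`, `topK`, `topP`, and so on). As a rough usage sketch, not part of the vendored file (the model path is a placeholder and the sampling values are arbitrary), a plain text completion could be generated like this:

```ts
import {getLlama, LlamaCompletion} from "node-llama-cpp";

// Load a model and create a context sequence to drive the completion.
// "model.gguf" is a placeholder path - use any local GGUF model file.
const llama = await getLlama();
const model = await llama.loadModel({modelPath: "model.gguf"});
const context = await model.createContext();
const completion = new LlamaCompletion({contextSequence: context.getSequence()});

// Generate a continuation of the given text using a few of the options
// declared in LlamaCompletionGenerationOptions.
const response = await completion.generateCompletion("Here is a list of sweet fruits:\n* ", {
    maxTokens: 128,
    temperature: 0.8,
    topP: 0.9,
    onTextChunk(chunk) {
        process.stdout.write(chunk); // stream the text as it is generated
    }
});
console.log("\n\nFull response:", response);
```

`generateCompletionWithMeta` returns the same response together with a `stopReason` (`"eogToken"`, `"stopGenerationTrigger"`, `"maxTokens"`, or `"abort"`), which is useful for telling a natural end of generation apart from a `maxTokens` cutoff.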
495
node_modules/node-llama-cpp/dist/evaluator/LlamaCompletion.js
generated
vendored
Normal file
495
node_modules/node-llama-cpp/dist/evaluator/LlamaCompletion.js
generated
vendored
Normal file
@@ -0,0 +1,495 @@
|
||||
import { DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
|
||||
import { tokenizeInput } from "../utils/tokenizeInput.js";
|
||||
import { UnsupportedError } from "../utils/UnsupportedError.js";
|
||||
import { removeNullFields } from "../utils/removeNullFields.js";
|
||||
import { TokenStreamRegulator } from "../utils/TokenStreamRegulator.js";
|
||||
import { StopGenerationDetector } from "../utils/StopGenerationDetector.js";
|
||||
import { UNKNOWN_UNICODE_CHAR } from "../consts.js";
|
||||
import { getQueuedTokensBeforeStopTrigger } from "../utils/getQueuedTokensBeforeStopTrigger.js";
|
||||
import { safeEventCallback } from "../utils/safeEventCallback.js";
|
||||
import { pushAll } from "../utils/pushAll.js";
|
||||
import { GgufArchitectureType } from "../gguf/types/GgufMetadataTypes.js";
|
||||
import { resolveBeginningTokenToPrepend } from "../utils/tokenizerUtils.js";
|
||||
import { LlamaGrammarEvaluationState } from "./LlamaGrammarEvaluationState.js";
|
||||
const defaultContextShiftSize = ((sequence) => Math.max(1, Math.floor(sequence.context.contextSize / 10)));
|
||||
const defaultMinPrefixKeepTokens = ((sequence) => Math.max(1, Math.floor(sequence.context.contextSize / 10)));
|
||||
/**
|
||||
* @see [Text Completion](https://node-llama-cpp.withcat.ai/guide/text-completion) tutorial
|
||||
*/
|
||||
export class LlamaCompletion {
|
||||
/** @internal */ _disposeAggregator = new DisposeAggregator();
|
||||
/** @internal */ _autoDisposeSequence;
|
||||
/** @internal */ _sequence;
|
||||
onDispose = new EventRelay();
|
||||
constructor({ contextSequence, autoDisposeSequence = false }) {
|
||||
this._sequence = contextSequence;
|
||||
this._autoDisposeSequence = autoDisposeSequence;
|
||||
this._disposeAggregator.add(this._sequence.onDispose.createListener(() => {
|
||||
this.dispose();
|
||||
}));
|
||||
this._disposeAggregator.add(this.onDispose.dispatchEvent);
|
||||
}
|
||||
dispose({ disposeSequence = this._autoDisposeSequence } = {}) {
|
||||
if (this._sequence == null || this.disposed)
|
||||
return;
|
||||
if (disposeSequence)
|
||||
this._sequence.dispose();
|
||||
this._sequence = null;
|
||||
this._disposeAggregator.dispose();
|
||||
}
|
||||
/** @hidden */
|
||||
[Symbol.dispose]() {
|
||||
return this.dispose();
|
||||
}
|
||||
get disposed() {
|
||||
return this._sequence == null || this._sequence.disposed;
|
||||
}
|
||||
get infillSupported() {
|
||||
if (this._sequence == null)
|
||||
throw new DisposedError();
|
||||
return this._sequence.model.tokens.infill.prefix != null &&
|
||||
this._sequence.model.tokens.infill.suffix != null;
|
||||
}
|
||||
/**
|
||||
* Generate a completion for an input.
|
||||
*/
|
||||
async generateCompletion(input, options = {}) {
|
||||
const { response } = await this.generateCompletionWithMeta(input, options);
|
||||
return response;
|
||||
}
|
||||
/**
|
||||
* Same as `generateCompletion`, but returns additional metadata about the generation.
|
||||
* See `generateCompletion` for more information.
|
||||
*/
|
||||
async generateCompletionWithMeta(input, { onTextChunk, onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, grammar, customStopTriggers, contextShiftSize = defaultContextShiftSize, disableContextShift } = {}) {
|
||||
if (this._sequence == null || this.disposed)
|
||||
throw new DisposedError();
|
||||
const beginningTokenToPrepend = resolveBeginningTokenToPrepend(this._sequence.model.vocabularyType, this._sequence.model.tokens);
|
||||
const extraEosTokens = getExtraCompletionEosTokens(this._sequence.model);
|
||||
async function fitInputIntoContext({ maxTokens, tokens }) {
|
||||
const res = [];
|
||||
if (beginningTokenToPrepend != null)
|
||||
res.push(beginningTokenToPrepend);
|
||||
const inputTokensSize = Math.max(0, Math.min(maxTokens - res.length, tokens.length));
|
||||
if (inputTokensSize === 0 && tokens.length > 0)
|
||||
throw new Error("The context size is too small to generate a response for the given input");
|
||||
const slicedTokens = tokens.slice(-inputTokensSize);
|
||||
pushAll(res, slicedTokens);
|
||||
return res;
|
||||
}
|
||||
const ensureNotAborted = () => {
|
||||
if (signal?.aborted && !stopOnAbortSignal)
|
||||
throw signal.reason;
|
||||
if (this.disposed)
|
||||
throw new DisposedError();
|
||||
};
|
||||
return await withLock([this, "generateCompletion"], signal, async () => {
|
||||
ensureNotAborted();
|
||||
if (this._sequence == null || this.disposed)
|
||||
throw new DisposedError();
|
||||
const resolvedInput = tokenizeInput(input, this._sequence.model.tokenizer, beginningTokenToPrepend != null
|
||||
? "trimLeadingSpace"
|
||||
: undefined);
|
||||
const resolvedContextShiftSize = await resolveContextShiftSize(contextShiftSize, this._sequence);
|
||||
ensureNotAborted();
|
||||
const inputTokens = await fitInputIntoContext({
|
||||
maxTokens: this._sequence.context.contextSize - resolvedContextShiftSize,
|
||||
tokens: resolvedInput
|
||||
});
|
||||
ensureNotAborted();
|
||||
const resolvedMaxTokens = !disableContextShift
|
||||
? maxTokens
|
||||
: (maxTokens != null && maxTokens > 0)
|
||||
? Math.min(maxTokens, this._sequence.context.contextSize - inputTokens.length)
|
||||
: this._sequence.context.contextSize - inputTokens.length;
|
||||
this._sequence.tokenPredictor?.updateInputTokens?.(inputTokens.slice());
|
||||
return await this._generateResponse(inputTokens, {
|
||||
onTextChunk: safeEventCallback(onTextChunk),
|
||||
onToken: safeEventCallback(onToken),
|
||||
signal,
|
||||
stopOnAbortSignal,
|
||||
maxTokens: resolvedMaxTokens,
|
||||
temperature,
|
||||
minP,
|
||||
topK,
|
||||
topP,
|
||||
seed,
|
||||
trimWhitespaceSuffix,
|
||||
repeatPenalty,
|
||||
tokenBias,
|
||||
evaluationPriority,
|
||||
grammar,
|
||||
contextShiftSize,
|
||||
customStopTriggers
|
||||
}, {
|
||||
async contextShift({ shiftSize, res, pendingTokens, sequence }) {
|
||||
return {
|
||||
newContextState: await fitInputIntoContext({
|
||||
maxTokens: sequence.context.contextSize - shiftSize,
|
||||
tokens: [...resolvedInput, ...res, ...pendingTokens]
|
||||
})
|
||||
};
|
||||
},
|
||||
extraEosTokens
|
||||
});
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Infill (also known as Fill-In-Middle) generates a completion for an input (`prefixInput`) that
|
||||
* should connect to a given continuation (`suffixInput`).
|
||||
* For example, for `prefixInput: "123"` and `suffixInput: "789"`, the model is expected to generate `456`
|
||||
* to make the final text be `123456789`.
|
||||
*/
|
||||
async generateInfillCompletion(prefixInput, suffixInput, options = {}) {
|
||||
const { response } = await this.generateInfillCompletionWithMeta(prefixInput, suffixInput, options);
|
||||
return response;
|
||||
}
|
||||
/**
|
||||
* Same as `generateInfillCompletion`, but returns additional metadata about the generation.
|
||||
* See `generateInfillCompletion` for more information.
|
||||
*/
|
||||
async generateInfillCompletionWithMeta(prefixInput, suffixInput, { onTextChunk, onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, grammar, contextShiftSize = defaultContextShiftSize, customStopTriggers, minPrefixKeepTokens = defaultMinPrefixKeepTokens, disableContextShift = false } = {}) {
|
||||
if (this._sequence == null || this.disposed)
|
||||
throw new DisposedError();
|
||||
const prefixToken = this._sequence.model.tokens.infill.prefix;
|
||||
const suffixToken = this._sequence.model.tokens.infill.suffix;
|
||||
const middleToken = this._sequence.model.tokens.infill.middle;
|
||||
const beginningTokenToPrepend = resolveBeginningTokenToPrepend(this._sequence.model.vocabularyType, this._sequence.model.tokens);
|
||||
if (prefixToken == null || suffixToken == null)
|
||||
throw new UnsupportedError("Infill completions are not supported by this model");
|
||||
const extraEosTokens = getExtraInfillEosTokens(this._sequence.model);
|
||||
async function fitInputIntoContext({ maxTokens, prefixTokens, suffixTokens, sequence }) {
|
||||
if (prefixToken == null || suffixToken == null)
|
||||
throw new UnsupportedError("Infill completions are not supported by this model");
|
||||
// 2 - InfillPrefix token, InfillSuffix token
|
||||
const specialTokensInContext = 2 +
|
||||
(middleToken != null ? 1 : 0) +
|
||||
(beginningTokenToPrepend != null ? 1 : 0);
|
||||
const resolvedMaxTokens = maxTokens - specialTokensInContext;
|
||||
let sizeLeftToFill = resolvedMaxTokens;
|
||||
let suffixTokensSize = Math.min(sizeLeftToFill, suffixTokens.length);
|
||||
sizeLeftToFill -= suffixTokensSize;
|
||||
let prefixTokensSize = Math.min(sizeLeftToFill, prefixTokens.length);
|
||||
sizeLeftToFill -= prefixTokensSize;
|
||||
if (sizeLeftToFill <= 0 && disableContextShift)
|
||||
throw new Error("The context size is too small to generate a response for the given input, and context shift is disabled. " +
|
||||
"Consider removing `disableContextShift` or reducing the input size.");
|
||||
const resolvedMinPrefixKeepTokens = Math.min(Math.min(resolvedMaxTokens, prefixTokens.length), Math.max(1, Math.floor(minPrefixKeepTokens instanceof Function
|
||||
? await minPrefixKeepTokens(sequence)
|
||||
: minPrefixKeepTokens)));
|
||||
if (prefixTokensSize < resolvedMinPrefixKeepTokens) {
|
||||
const diffToFill = Math.min(suffixTokensSize, resolvedMinPrefixKeepTokens - prefixTokensSize);
|
||||
prefixTokensSize += diffToFill;
|
||||
suffixTokensSize -= diffToFill;
|
||||
}
|
||||
const resolvedPrefixTokens = prefixTokens.slice(-prefixTokensSize);
|
||||
const resolvedSuffixTokens = suffixTokens.slice(0, suffixTokensSize);
|
||||
const newContextState = [];
|
||||
if (beginningTokenToPrepend != null)
|
||||
newContextState.push(beginningTokenToPrepend);
|
||||
if (middleToken != null) {
|
||||
newContextState.push(prefixToken);
|
||||
pushAll(newContextState, resolvedPrefixTokens);
|
||||
newContextState.push(suffixToken);
|
||||
pushAll(newContextState, resolvedSuffixTokens);
|
||||
newContextState.push(middleToken);
|
||||
}
|
||||
else {
|
||||
newContextState.push(suffixToken);
|
||||
pushAll(newContextState, resolvedSuffixTokens);
|
||||
newContextState.push(prefixToken);
|
||||
pushAll(newContextState, resolvedPrefixTokens);
|
||||
}
|
||||
return newContextState;
|
||||
}
|
||||
const ensureNotAborted = () => {
|
||||
if (signal?.aborted && !stopOnAbortSignal)
|
||||
throw signal.reason;
|
||||
if (this.disposed)
|
||||
throw new DisposedError();
|
||||
};
|
||||
return await withLock([this, "generateCompletion"], signal, async () => {
|
||||
ensureNotAborted();
|
||||
if (this._sequence == null || this.disposed)
|
||||
throw new DisposedError();
|
||||
const resolvedPrefixInputTokens = tokenizeInput(prefixInput, this._sequence.model.tokenizer, "trimLeadingSpace");
|
||||
const resolvedSuffixInputTokens = tokenizeInput(suffixInput, this._sequence.model.tokenizer, "trimLeadingSpace");
|
||||
const resolvedContextShiftSize = await resolveContextShiftSize(contextShiftSize, this._sequence);
|
||||
ensureNotAborted();
|
||||
const inputTokens = await fitInputIntoContext({
|
||||
maxTokens: this._sequence.context.contextSize - resolvedContextShiftSize,
|
||||
prefixTokens: resolvedPrefixInputTokens,
|
||||
suffixTokens: resolvedSuffixInputTokens,
|
||||
sequence: this._sequence
|
||||
});
|
||||
ensureNotAborted();
|
||||
const resolvedMaxTokens = !disableContextShift
|
||||
? maxTokens
|
||||
: (maxTokens != null && maxTokens > 0)
|
||||
? Math.min(maxTokens, this._sequence.context.contextSize - inputTokens.length)
|
||||
: this._sequence.context.contextSize - inputTokens.length;
|
||||
this._sequence.tokenPredictor?.updateInputTokens?.(inputTokens.slice());
|
||||
return await this._generateResponse(inputTokens, {
|
||||
onTextChunk: safeEventCallback(onTextChunk),
|
||||
onToken: safeEventCallback(onToken),
|
||||
signal,
|
||||
stopOnAbortSignal,
|
||||
maxTokens: resolvedMaxTokens,
|
||||
temperature,
|
||||
minP,
|
||||
topK,
|
||||
topP,
|
||||
seed,
|
||||
trimWhitespaceSuffix,
|
||||
repeatPenalty,
|
||||
tokenBias,
|
||||
evaluationPriority,
|
||||
grammar,
|
||||
contextShiftSize,
|
||||
customStopTriggers
|
||||
}, {
|
||||
async contextShift({ shiftSize, res, pendingTokens, sequence }) {
|
||||
return {
|
||||
newContextState: await fitInputIntoContext({
|
||||
maxTokens: sequence.context.contextSize - shiftSize,
|
||||
prefixTokens: [...resolvedPrefixInputTokens, ...res, ...pendingTokens],
|
||||
suffixTokens: resolvedSuffixInputTokens,
|
||||
sequence
|
||||
})
|
||||
};
|
||||
},
|
||||
extraEosTokens
|
||||
});
|
||||
});
|
||||
}
|
||||
/** @internal */
|
||||
async _generateResponse(tokens, { onTextChunk, onToken, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix = false, repeatPenalty = {}, tokenBias, evaluationPriority = 5, grammar, contextShiftSize = defaultContextShiftSize, customStopTriggers }, { contextShift, extraEosTokens = new Set() }) {
|
||||
if (this._sequence == null)
|
||||
throw new DisposedError();
|
||||
const sequence = this._sequence;
|
||||
const model = sequence.model;
|
||||
const context = sequence.context;
|
||||
const res = [];
|
||||
const pendingTokens = [];
|
||||
const grammarEvaluationState = grammar != null
|
||||
? new LlamaGrammarEvaluationState({ model, grammar })
|
||||
: undefined;
|
||||
const { lastTokens: repeatPenaltyLastTokens = 64, punishTokensFilter, penalizeNewLine, penalty, frequencyPenalty, presencePenalty } = repeatPenalty === false
|
||||
? { lastTokens: 0 }
|
||||
: repeatPenalty;
|
||||
const streamRegulator = new TokenStreamRegulator();
|
||||
const stopGenerationDetector = new StopGenerationDetector();
|
||||
const customStopGenerationTriggersDetector = new StopGenerationDetector();
|
||||
const locksToReleaseOnValidGeneration = [];
|
||||
const repeatPenaltyEnabled = repeatPenaltyLastTokens > 0;
|
||||
let inputTokens = tokens;
|
||||
let generatedTokens = 0;
|
||||
if (grammar != null)
|
||||
StopGenerationDetector.resolveStopTriggers(grammar.stopGenerationTriggers, model.tokenizer)
|
||||
.map((stopTrigger) => stopGenerationDetector.addStopTrigger(stopTrigger));
|
||||
if (customStopTriggers != null)
|
||||
StopGenerationDetector.resolveStopTriggers(customStopTriggers, model.tokenizer)
|
||||
.map((stopTrigger) => customStopGenerationTriggersDetector.addStopTrigger(stopTrigger));
|
||||
const ensureNotAborted = () => {
|
||||
if (signal?.aborted && !stopOnAbortSignal)
|
||||
throw signal.reason;
|
||||
if (this.disposed)
|
||||
throw new DisposedError();
|
||||
};
|
||||
const getPenaltyTokens = () => {
|
||||
if (this._sequence == null)
|
||||
throw new DisposedError();
|
||||
let punishTokens = res.slice(-repeatPenaltyLastTokens);
|
||||
if (punishTokensFilter != null)
|
||||
punishTokens = punishTokensFilter(punishTokens);
|
||||
if (penalizeNewLine == null || !penalizeNewLine) {
|
||||
const nlToken = model.tokens.nl;
|
||||
if (nlToken != null)
|
||||
punishTokens = punishTokens.filter((token) => token !== nlToken);
|
||||
}
|
||||
return punishTokens;
|
||||
};
|
||||
while (true) {
|
||||
ensureNotAborted();
|
||||
let shouldContextShift = false;
|
||||
if (inputTokens.length === 1 && sequence.nextTokenIndex !== 0)
|
||||
await sequence.eraseContextTokenRanges([{
|
||||
start: 0,
|
||||
end: sequence.nextTokenIndex
|
||||
}]);
|
||||
else {
|
||||
const lastToken = inputTokens[inputTokens.length - 1];
|
||||
// we need to decode at least one token to generate a response
|
||||
inputTokens.pop();
|
||||
await sequence.adaptStateToTokens(inputTokens, false);
|
||||
inputTokens.push(lastToken);
|
||||
ensureNotAborted();
|
||||
const firstDifferentIndex = sequence.nextTokenIndex;
|
||||
inputTokens.splice(0, firstDifferentIndex);
|
||||
}
|
||||
const evaluationIterator = sequence.evaluate(inputTokens, removeNullFields({
|
||||
temperature, minP, topK, topP, seed,
|
||||
grammarEvaluationState,
|
||||
repeatPenalty: !repeatPenaltyEnabled ? undefined : {
|
||||
punishTokens: getPenaltyTokens,
|
||||
maxPunishTokens: repeatPenaltyLastTokens,
|
||||
penalty,
|
||||
frequencyPenalty,
|
||||
presencePenalty
|
||||
},
|
||||
tokenBias,
|
||||
evaluationPriority,
|
||||
yieldEogToken: true
|
||||
}));
|
||||
const pendingPartialTokens = [];
|
||||
for await (const token of evaluationIterator) {
|
||||
ensureNotAborted();
|
||||
generatedTokens++;
|
||||
const tokens = pendingPartialTokens.length === 0
|
||||
? [token]
|
||||
: [...pendingPartialTokens, token];
|
||||
const text = model.detokenize([token]);
|
||||
if (pendingPartialTokens.length === 0 &&
|
||||
text.endsWith(UNKNOWN_UNICODE_CHAR) &&
|
||||
!model.isSpecialToken(token) &&
|
||||
!model.isEogToken(token)) {
|
||||
pendingPartialTokens.push(token);
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
pendingPartialTokens.length = 0;
|
||||
const queuedTokenRelease = streamRegulator.addChunk({ tokens, text });
|
||||
if (text.endsWith(UNKNOWN_UNICODE_CHAR) || ((grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && text.trim() === "") || (text === "" && locksToReleaseOnValidGeneration.length > 0 && !model.isSpecialToken(token))) {
|
||||
locksToReleaseOnValidGeneration.push(queuedTokenRelease.createTextIndexLock(0));
|
||||
}
|
||||
else {
|
||||
while (locksToReleaseOnValidGeneration.length > 0)
|
||||
locksToReleaseOnValidGeneration.shift().dispose();
|
||||
}
|
||||
stopGenerationDetector.recordGeneration({ text, tokens, queuedTokenRelease });
|
||||
customStopGenerationTriggersDetector.recordGeneration({ text, tokens, queuedTokenRelease });
|
||||
if (model.isEogToken(token) || extraEosTokens.has(token))
|
||||
queuedTokenRelease.createTokenIndexLock(0);
|
||||
pushAll(pendingTokens, streamRegulator.popFreeChunkTokens());
|
||||
if (stopGenerationDetector.hasTriggeredStops || customStopGenerationTriggersDetector.hasTriggeredStops ||
|
||||
model.isEogToken(token) || extraEosTokens.has(token)) {
|
||||
const triggeredStops = stopGenerationDetector.hasTriggeredStops
|
||||
? stopGenerationDetector.getTriggeredStops()
|
||||
: customStopGenerationTriggersDetector.getTriggeredStops();
|
||||
const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk(model.tokenizer);
|
||||
const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger(triggeredStops, partiallyFreeTokens, model.tokenizer);
|
||||
pushAll(pendingTokens, queuedTokensBeforeStopTrigger);
|
||||
const { firstRemainingGenerationAfterStop } = StopGenerationDetector.getFirstRemainingGenerationAfterStop(triggeredStops);
|
||||
if (pendingTokens.length > 0) {
|
||||
onToken?.(pendingTokens.slice());
|
||||
onTextChunk?.(model.detokenize(pendingTokens, false, res));
|
||||
}
|
||||
pushAll(res, pendingTokens);
|
||||
pendingTokens.length = 0;
|
||||
let modelResponse = model.detokenize(res);
|
||||
if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix)
|
||||
modelResponse = modelResponse.trimEnd();
|
||||
const isEogToken = model.isEogToken(token) || extraEosTokens.has(token);
|
||||
if (isEogToken || stopGenerationDetector.hasTriggeredStops)
|
||||
return {
|
||||
response: modelResponse,
|
||||
metadata: {
|
||||
remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
|
||||
stopReason: isEogToken
|
||||
? "eogToken"
|
||||
: "stopGenerationTrigger"
|
||||
}
|
||||
};
|
||||
return {
|
||||
response: modelResponse,
|
||||
metadata: {
|
||||
remainingGenerationAfterStop: firstRemainingGenerationAfterStop,
|
||||
stopReason: "customStopTrigger",
|
||||
customStopTrigger: triggeredStops[0].stopTrigger
|
||||
}
|
||||
};
|
||||
}
|
||||
if (pendingTokens.length > 0) {
|
||||
onToken?.(pendingTokens.slice());
|
||||
onTextChunk?.(model.detokenize(pendingTokens, false, res));
|
||||
pushAll(res, pendingTokens);
|
||||
pendingTokens.length = 0;
|
||||
}
|
||||
}
|
||||
const aborted = (signal?.aborted ?? false) && stopOnAbortSignal;
|
||||
const maxTokensReached = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens;
|
||||
if (aborted || maxTokensReached) {
|
||||
let modelResponse = model.detokenize(res);
|
||||
if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix)
|
||||
modelResponse = modelResponse.trimEnd();
|
||||
return {
|
||||
response: modelResponse,
|
||||
metadata: {
|
||||
stopReason: aborted
|
||||
? "abort"
|
||||
: "maxTokens"
|
||||
}
|
||||
};
|
||||
}
|
||||
if (sequence.nextTokenIndex >= context.contextSize - 1) {
|
||||
shouldContextShift = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (shouldContextShift) {
|
||||
const resolvedContextShiftSize = await resolveContextShiftSize(contextShiftSize, sequence);
|
||||
ensureNotAborted();
|
||||
const { newContextState } = await contextShift({
|
||||
shiftSize: resolvedContextShiftSize,
|
||||
res,
|
||||
pendingTokens,
|
||||
sequence
|
||||
});
|
||||
ensureNotAborted();
|
||||
inputTokens = newContextState;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
throw new Error("The context size is too small to generate a response");
|
||||
}
|
||||
}
|
||||
async function resolveContextShiftSize(contextShiftSize, sequence) {
|
||||
if (typeof contextShiftSize === "number")
|
||||
return contextShiftSize;
|
||||
else if (contextShiftSize instanceof Function)
|
||||
return Math.min(sequence.context.contextSize, Math.max(1, Math.floor(contextShiftSize instanceof Function
|
||||
? await contextShiftSize(sequence)
|
||||
: contextShiftSize)));
|
||||
return defaultContextShiftSize(sequence);
|
||||
}
|
||||
function getExtraCompletionEosTokens(model) {
|
||||
const extraEosTokens = new Set();
|
||||
if (model.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma ||
|
||||
model.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2) {
|
||||
for (const token of model.iterateAllTokens()) {
|
||||
const tokenText = model.detokenize([token], true);
|
||||
if (tokenText === "<|file_separator|>" || tokenText === "<|fim_prefix|>") {
|
||||
extraEosTokens.add(token);
|
||||
if (extraEosTokens.size === 2)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return extraEosTokens;
|
||||
}
|
||||
function getExtraInfillEosTokens(model) {
|
||||
const extraEosTokens = new Set();
|
||||
if (model.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma ||
|
||||
model.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2) {
|
||||
for (const token of model.iterateAllTokens()) {
|
||||
const tokenText = model.detokenize([token], true);
|
||||
if (tokenText === "<|file_separator|>") {
|
||||
extraEosTokens.add(token);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return extraEosTokens;
|
||||
}
|
||||
//# sourceMappingURL=LlamaCompletion.js.map
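The implementation above only allows infill when the model exposes infill prefix/suffix tokens (see the `infillSupported` getter and the `UnsupportedError` throws). A hedged sketch of fill-in-middle usage, assuming a code-oriented GGUF model that defines those tokens and using a placeholder model path, could look like this:

```ts
import {getLlama, LlamaCompletion} from "node-llama-cpp";

// "code-model.gguf" is a placeholder - infill requires a model that defines
// infill prefix/suffix tokens (otherwise `infillSupported` is false).
const llama = await getLlama();
const model = await llama.loadModel({modelPath: "code-model.gguf"});
const context = await model.createContext();
const completion = new LlamaCompletion({contextSequence: context.getSequence()});

if (!completion.infillSupported)
    throw new Error("This model does not expose infill (fill-in-middle) tokens");

// Generate the middle part that connects the prefix to the suffix,
// e.g. the body of a function whose signature and closing brace are given.
const prefix = "function sum(a: number, b: number) {\n    ";
const suffix = "\n}";
const middle = await completion.generateInfillCompletion(prefix, suffix, {
    maxTokens: 64
});
console.log(prefix + middle + suffix);
```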
1
node_modules/node-llama-cpp/dist/evaluator/LlamaCompletion.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaCompletion.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
245
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.d.ts
generated
vendored
Normal file
245
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.d.ts
generated
vendored
Normal file
@@ -0,0 +1,245 @@
|
||||
import { EventRelay } from "lifecycle-utils";
|
||||
import { Token } from "../../types.js";
|
||||
import { TokenMeter } from "../TokenMeter.js";
|
||||
import { LlamaModel } from "../LlamaModel/LlamaModel.js";
|
||||
import { ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem, EvaluationPriority, SequenceEvaluateMetadataOptions, SequenceEvaluateOptions, SequenceEvaluateOutput } from "./types.js";
|
||||
import { TokenPredictor } from "./TokenPredictor.js";
|
||||
export declare class LlamaContext {
|
||||
readonly onDispose: EventRelay<void>;
|
||||
private constructor();
|
||||
dispose(): Promise<void>;
|
||||
/** @hidden */
|
||||
[Symbol.asyncDispose](): Promise<void>;
|
||||
get disposed(): boolean;
|
||||
get model(): LlamaModel;
|
||||
get contextSize(): number;
|
||||
get batchSize(): number;
|
||||
get flashAttention(): boolean;
|
||||
/**
|
||||
* The actual size of the state in the memory in bytes.
|
||||
* This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
|
||||
*/
|
||||
get stateSize(): number;
|
||||
/** The number of threads currently used to evaluate tokens */
|
||||
get currentThreads(): number;
|
||||
/**
|
||||
* The number of threads that are preferred to be used to evaluate tokens.
|
||||
*
|
||||
* The actual number of threads used may be lower when other evaluations are running in parallel.
|
||||
*/
|
||||
get idealThreads(): number;
|
||||
getAllocatedContextSize(): number;
|
||||
get totalSequences(): number;
|
||||
get sequencesLeft(): number;
|
||||
/**
|
||||
* Before calling this method, make sure to call `sequencesLeft` to check if there are any sequences left.
|
||||
* When there are no sequences left, this method will throw an error.
|
||||
*/
|
||||
getSequence(options?: {
|
||||
contextShift?: ContextShiftOptions;
|
||||
/**
|
||||
* Token predictor to use for the sequence.
|
||||
* Don't share the same token predictor between multiple sequences.
|
||||
*
|
||||
* Using a token predictor doesn't affect the generation output itself -
|
||||
* it only allows for greater parallelization of the token evaluation to speed up the generation.
|
||||
*
|
||||
* > **Note:** if a token predictor is too resource intensive,
|
||||
* > it can slow down the generation process due to the overhead of running the predictor.
|
||||
* >
|
||||
* > Testing the effectiveness of a token predictor on the target machine is recommended before using it in production.
|
||||
*
|
||||
* Automatically disposed when disposing the sequence.
|
||||
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction)
|
||||
*/
|
||||
tokenPredictor?: TokenPredictor;
|
||||
}): LlamaContextSequence;
|
||||
dispatchPendingBatch(): void;
|
||||
/**
|
||||
* Print the timings of token evaluation since that last print for this context.
|
||||
*
|
||||
* Requires the `performanceTracking` option to be enabled.
|
||||
*
|
||||
* > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
|
||||
* it won't print anything.
|
||||
*/
|
||||
printTimings(): Promise<void>;
|
||||
}
|
||||
export declare class LlamaContextSequence {
|
||||
readonly onDispose: EventRelay<void>;
|
||||
private constructor();
|
||||
dispose(): void;
|
||||
/** @hidden */
|
||||
[Symbol.dispose](): void;
|
||||
get disposed(): boolean;
|
||||
get context(): LlamaContext;
|
||||
get model(): LlamaModel;
|
||||
/** The maximum number of tokens that the sequence state can hold */
|
||||
get contextSize(): number;
|
||||
/** The index where the next evaluated token will be placed in the context */
|
||||
get nextTokenIndex(): number;
|
||||
/** The current context state tokens */
|
||||
get contextTokens(): Token[];
|
||||
get tokenMeter(): TokenMeter;
|
||||
/**
|
||||
* The token predictor used when creating this sequence.
|
||||
*/
|
||||
get tokenPredictor(): TokenPredictor | undefined;
|
||||
/**
|
||||
* Get the index of the first token in the KV cache.
|
||||
*
|
||||
* If you remove any tokens from the state that come before this index,
|
||||
* no cached prefix tokens evaluation state will be used for the next evaluation.
|
||||
*
|
||||
* For example, if `stateCellsStartIndex` is `10` and you remove the range `{start: 11, end: 16}`
|
||||
* then the cached state for range `0-10` will be used in the next evaluation,
|
||||
* but if you remove the range `{start: 10, end: 16}` (or `{start: 9, end: 16}`) then the cached state will not be used at all
|
||||
* and will be re-evaluated in the next evaluation.
|
||||
*
|
||||
* This index can be greater than `0` only when SWA (Sliding Window Attention) is used (only on supported models).
|
||||
*
|
||||
* When SWA is used, this index will usually be `Math.max(-1, .nextTokenIndex - .model.fileInsights.swaSize)` or larger.
|
||||
*
|
||||
* When the KV cache is empty, this index will be `-1`.
|
||||
*
|
||||
* You can disable SWA by setting the `swaFullCache` option to `true` when creating a context.
|
||||
*/
|
||||
get stateCellsStartIndex(): number;
|
||||
/**
|
||||
* Statistics of token predictions using the sequence's `tokenPredictor`.
|
||||
*
|
||||
* The statistics change only when token prediction is used in this sequence.
|
||||
*
|
||||
* `validated` + `refuted` = total number of evaluated predictions.
|
||||
*
|
||||
* Prefer using `validated` and `refuted` to evaluate the effectiveness of token prediction.
|
||||
*/
|
||||
get tokenPredictions(): {
|
||||
/** Number of token predictions that were actually used (tokens that were validated and then consumed) */
|
||||
used: number;
|
||||
/** Number of token predictions that were not used (tokens that were validated and were not consumed) */
|
||||
unused: number;
|
||||
/** Number of token predictions that were validated successfully */
|
||||
validated: number;
|
||||
/** Number of token predictions that were refuted */
|
||||
refuted: number;
|
||||
};
|
||||
get isLoadedToMemory(): boolean;
|
||||
compareContextTokens(tokens: Token[]): {
|
||||
firstDifferentIndex: number;
|
||||
};
|
||||
/**
|
||||
* Erase parts of the context state to align it with the given tokens.
|
||||
*
|
||||
* If the given tokens do not align with the current context state, the context state will be erased to align with the given tokens.
|
||||
*
|
||||
* To find the first different token index between the context state and the given tokens, access the `nextTokenIndex` property.
|
||||
*
|
||||
* If `allowShift` is `true` (the default), shifting tokens may happen to align the context state with the given tokens,
|
||||
* which incurs token evaluation of the shifted tokens.
|
||||
*/
|
||||
adaptStateToTokens(tokens: Token[], allowShift?: boolean): Promise<void>;
|
||||
/**
|
||||
* Clear the history of the sequence.
|
||||
*/
|
||||
clearHistory(): Promise<void>;
|
||||
/**
|
||||
* Erase context tokens in the provided ranges to free up space for new tokens to be generated.
|
||||
* The start of each range is inclusive, and the end of each range is exclusive.
|
||||
* For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
|
||||
*/
|
||||
eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]): Promise<void>;
|
||||
/**
|
||||
* Evaluate the provided tokens into the context sequence, and continue generating new tokens on iterator iterations.
|
||||
*
|
||||
* This method uses the token predictor (when provided) to generate new tokens faster.
|
||||
*/
|
||||
evaluate(tokens: Token[], options?: SequenceEvaluateOptions): AsyncGenerator<Token, void, void | Token | Token[]>;
|
||||
/**
|
||||
* Like {@link evaluate `.evaluate(...)`}, but with additional metadata for each generated token.
|
||||
*
|
||||
* Configure the additional metadata options to choose which metadata to include.
|
||||
*/
|
||||
evaluateWithMetadata<const Metadata extends SequenceEvaluateMetadataOptions>(tokens: Token[], metadata: Metadata, options?: SequenceEvaluateOptions): AsyncGenerator<SequenceEvaluateOutput<Metadata>, void, void | Token | Token[]>;
|
||||
/**
|
||||
* Evaluate the provided tokens into the context sequence without generating new tokens.
|
||||
*/
|
||||
evaluateWithoutGeneratingNewTokens(tokens: Token[], options?: {
|
||||
/**
|
||||
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
|
||||
* evaluated based on the strategy chosen for the context.
|
||||
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
|
||||
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
|
||||
* highest evaluation priority.
|
||||
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
|
||||
* is, the more likely it is that more tokens will be evaluated for that sequence in the next queued batch.
|
||||
*/
|
||||
evaluationPriority?: EvaluationPriority;
|
||||
/** Override the sequence context shift options for this evaluation */
|
||||
contextShift?: ContextShiftOptions;
|
||||
}): Promise<void>;
|
||||
/**
|
||||
* Evaluate the provided tokens into the context sequence with custom options for each token.
|
||||
*
|
||||
* This method allows for more precise control of the generation process.
|
||||
*
|
||||
* A next token will be generated for a given token only if any of the `generateNext` options for it are used.
|
||||
*
|
||||
* To generate more tokens after this method finishes,
|
||||
* use it again with token(s) you selected to add to the context from the previous evaluation.
|
||||
*
|
||||
* This method doesn't use the token predictor (when provided) since it cannot predict which tokens are actually needed.
|
||||
* Use the `evaluate` method when you need to use token prediction.
|
||||
* @returns An array where for each token in the input array, there can be an output item at the same index in the output array.
|
||||
* For indexes that have no output, there won't be any value at the corresponding index in the output array.
|
||||
*
|
||||
* It's recommended to iterate from `0` up to the length of the input array to check the results in the output array.
|
||||
*/
|
||||
controlledEvaluate(input: ControlledEvaluateInputItem[], options?: {
|
||||
/**
|
||||
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
|
||||
* evaluated based on the strategy chosen for the context.
|
||||
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
|
||||
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
|
||||
* highest evaluation priority.
|
||||
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
|
||||
* is, the more likely it is that more tokens will be evaluated for that sequence in the next queued batch.
|
||||
*/
|
||||
evaluationPriority?: EvaluationPriority;
|
||||
/** Override the sequence context shift options for this evaluation */
|
||||
contextShift?: ContextShiftOptions;
|
||||
/** Called on each token result after it's generated */
|
||||
onTokenResult?(inputTokenIndex: number, result: ControlledEvaluateIndexOutput): void;
|
||||
}): Promise<Array<undefined | ControlledEvaluateIndexOutput>>;
|
||||
/**
|
||||
* Save the current context sequence evaluation state to a file.
|
||||
* @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state)
|
||||
*/
|
||||
saveStateToFile(filePath: string): Promise<{
|
||||
fileSize: number;
|
||||
}>;
|
||||
/**
|
||||
* Load a context sequence evaluation state from a file.
|
||||
*
|
||||
* Trying to load a state file with a longer context size than the current sequence's context size will fail and throw an error.
|
||||
*
|
||||
* You must ensure that the file was created from the exact same model, otherwise, using this function may crash the process.
|
||||
* @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state)
|
||||
*/
|
||||
loadStateFromFile(filePath: string, acceptRisk: {
|
||||
/**
|
||||
* Loading a state file created using a different model may crash the process.
|
||||
*
|
||||
* You must accept this risk to use this feature.
|
||||
*/
|
||||
acceptRisk: true;
|
||||
}): Promise<void>;
|
||||
}
|
||||
export declare function getDefaultContextBatchSize({ contextSize, sequences }: {
|
||||
contextSize: number;
|
||||
sequences: number;
|
||||
}): number;
|
||||
export declare function getDefaultContextSequences(): number;
|
||||
export declare function getDefaultModelContextSize({ trainContextSize }: {
|
||||
trainContextSize?: number;
|
||||
}): number;
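To make the sequence API declared above concrete, here is a rough sketch, not part of the vendored file, that combines low-level token evaluation with saving and restoring the evaluation state; the model and state file paths are placeholders:

```ts
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "model.gguf"}); // placeholder path
const context = await model.createContext();

// `getSequence()` throws when no sequences are left, so check first.
if (context.sequencesLeft === 0)
    throw new Error("No free sequences left in this context");
const sequence = context.getSequence();

// Low-level generation: evaluate the prompt tokens and read generated tokens
// from the async iterator until a handful of them have been gathered.
const promptTokens = model.tokenize("The quick brown fox");
const generated: typeof promptTokens = [];
for await (const token of sequence.evaluate(promptTokens, {temperature: 0.8})) {
    generated.push(token);
    if (generated.length >= 16)
        break;
}
console.log(model.detokenize(generated));

// Persist the evaluated state and load it back later into a compatible sequence.
await sequence.saveStateToFile("state.bin"); // placeholder path
await sequence.clearHistory();
await sequence.loadStateFromFile("state.bin", {acceptRisk: true}); // same model required
```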
1691
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.js
generated
vendored
Normal file
1691
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.js
generated
vendored
Normal file
File diff suppressed because it is too large
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.d.ts
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.d.ts
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
export {};
|
||||
31
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.js
generated
vendored
Normal file
31
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.js
generated
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
/** @internal */
|
||||
export class LlamaSampler {
|
||||
/** @internal */ _llama;
|
||||
/** @internal */ _sampler;
|
||||
/** @internal */ disposed = false;
|
||||
constructor(model) {
|
||||
this._llama = model._llama;
|
||||
this._sampler = new this._llama._bindings.AddonSampler(model._model);
|
||||
this.asyncDispose = this.asyncDispose.bind(this);
|
||||
}
|
||||
dispose() {
|
||||
this.disposed = true;
|
||||
this._sampler.dispose();
|
||||
}
|
||||
async asyncDispose() {
|
||||
this.disposed = true;
|
||||
this._sampler.dispose();
|
||||
}
|
||||
applyConfig(config) {
|
||||
return this._sampler.applyConfig(config);
|
||||
}
|
||||
/** @internal */
|
||||
static _canBeNextTokenForGrammarEvaluationState(llama, grammarEvaluationState, token) {
|
||||
return llama._bindings.AddonSampler.canBeNextTokenForGrammarEvaluationState(grammarEvaluationState._state, token);
|
||||
}
|
||||
/** @internal */
|
||||
static _acceptTokenOnGrammarEvaluationState(llama, grammarEvaluationState, token) {
|
||||
llama._bindings.AddonSampler.acceptGrammarEvaluationStateToken(grammarEvaluationState._state, token);
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=LlamaSampler.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"LlamaSampler.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/LlamaSampler.ts"],"names":[],"mappings":"AAMA,gBAAgB;AAChB,MAAM,OAAO,YAAY;IACrB,gBAAgB,CAAiB,MAAM,CAAQ;IAC/C,gBAAgB,CAAiB,QAAQ,CAAe;IACxD,gBAAgB,CAAQ,QAAQ,GAAY,KAAK,CAAC;IAElD,YAAmB,KAAiB;QAChC,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;QAC3B,IAAI,CAAC,QAAQ,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAErE,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACrD,CAAC;IAEM,OAAO;QACV,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;QACrB,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;IAC5B,CAAC;IAEM,KAAK,CAAC,YAAY;QACrB,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;QACrB,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;IAC5B,CAAC;IAEM,WAAW,CAAC,MAAkD;QACjE,OAAO,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;IAC7C,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,wCAAwC,CAClD,KAAY,EACZ,sBAAmD,EACnD,KAAY;QAEZ,OAAO,KAAK,CAAC,SAAS,CAAC,YAAY,CAAC,uCAAuC,CACvE,sBAAsB,CAAC,MAAM,EAC7B,KAAK,CACR,CAAC;IACN,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,oCAAoC,CAC9C,KAAY,EACZ,sBAAmD,EACnD,KAAY;QAEZ,KAAK,CAAC,SAAS,CAAC,YAAY,CAAC,iCAAiC,CAAC,sBAAsB,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;IACzG,CAAC;CACJ"}
|
||||
55
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.d.ts
generated
vendored
Normal file
55
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.d.ts
generated
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
import { Token } from "../../types.js";
|
||||
import { SequenceEvaluateOptions } from "./types.js";
|
||||
import { LlamaContextSequence } from "./LlamaContext.js";
|
||||
/**
|
||||
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction#custom)
|
||||
*/
|
||||
export declare abstract class TokenPredictor {
|
||||
/**
|
||||
* Resets the state of the predictor.
|
||||
*
|
||||
* Called before the generation starts.
|
||||
*/
|
||||
abstract reset(params: {
|
||||
/** The target sequence that this token predictor is generating tokens for */
|
||||
targetSequence: LlamaContextSequence;
|
||||
/**
|
||||
* The tokens that are or will be loaded into the state.
|
||||
*
|
||||
* The initial predictions should be based on these tokens.
|
||||
*
|
||||
* When additional tokens are pushed into the state, the `pushTokens` method will be called with those tokens.
|
||||
*/
|
||||
stateTokens: Token[];
|
||||
/**
|
||||
* Options used for the evaluation on the target sequence.
|
||||
*
|
||||
* The `grammarEvaluationState` is cloned before being passed to the token predictor,
|
||||
* so it can be modified without affecting the original state.
|
||||
*/
|
||||
evaluateOptions: Readonly<SequenceEvaluateOptions>;
|
||||
}): Promise<void> | void;
|
||||
abstract pushTokens(tokens: Token[]): void;
|
||||
/**
|
||||
* Predicts the next tokens based on the current state.
|
||||
*
|
||||
* If the generation should wait until the minimum predictions are ready,
|
||||
* this method should return a promise that resolves when the minimum predictions are ready.
|
||||
*
|
||||
* A background prediction process can be started when this function is called,
|
||||
* so that the next predictions will be ready when this function is called again.
|
||||
*/
|
||||
abstract predictTokens(): Promise<Token[]> | Token[];
|
||||
/**
|
||||
* Stops the prediction process when it runs in the background.
|
||||
* @param untilPredictionsExhausted - If true, the prediction process should not resume until the current predictions are exhausted.
|
||||
*/
|
||||
stop(untilPredictionsExhausted?: boolean): Promise<void> | void;
|
||||
/**
|
||||
* Called with the input tokens before the generation starts when using `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
|
||||
*/
|
||||
updateInputTokens(tokens: Token[]): void;
|
||||
dispose(): Promise<void> | void;
|
||||
/** @hidden */
|
||||
[Symbol.dispose](): void | Promise<void>;
|
||||
}
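The abstract class above defines the contract for custom token predictors. As a minimal sketch, not part of the vendored file (it assumes `TokenPredictor`, `Token`, `LlamaContextSequence`, and `SequenceEvaluateOptions` are exported from the package root), a naive predictor that always guesses the last seen token will repeat could look like this:

```ts
import {
    TokenPredictor, type Token, type LlamaContextSequence, type SequenceEvaluateOptions
} from "node-llama-cpp";

// A toy predictor: it always predicts that the last pushed token repeats.
// Useless for real speedups, but it exercises the full TokenPredictor contract.
class RepeatLastTokenPredictor extends TokenPredictor {
    private lastToken?: Token;

    public reset({stateTokens}: {
        targetSequence: LlamaContextSequence,
        stateTokens: Token[],
        evaluateOptions: Readonly<SequenceEvaluateOptions>
    }) {
        // Start predicting from the tokens already loaded into the state.
        this.lastToken = stateTokens.length > 0
            ? stateTokens[stateTokens.length - 1]
            : undefined;
    }

    public pushTokens(tokens: Token[]) {
        // Remember the newest confirmed token from the target sequence.
        if (tokens.length > 0)
            this.lastToken = tokens[tokens.length - 1];
    }

    public predictTokens(): Token[] {
        // Predict a single token: the last one we saw, repeated.
        return this.lastToken != null ? [this.lastToken] : [];
    }
}
```

A predictor like this would be passed to `context.getSequence({tokenPredictor: new RepeatLastTokenPredictor()})`; the sequence validates each prediction against the real model output, so a wrong guess only costs the wasted draft evaluation.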
20
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.js
generated
vendored
Normal file
20
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.js
generated
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
/**
|
||||
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction#custom)
|
||||
*/
|
||||
export class TokenPredictor {
|
||||
/**
|
||||
* Stops the prediction process when it runs in the background.
|
||||
* @param untilPredictionsExhausted - If true, the prediction process should not resume until the current predictions are exhausted.
|
||||
*/
|
||||
stop(untilPredictionsExhausted) { }
|
||||
/**
|
||||
* Called with the input tokens before the generation starts when using `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
|
||||
*/
|
||||
updateInputTokens(tokens) { }
|
||||
dispose() { }
|
||||
/** @hidden */
|
||||
[Symbol.dispose]() {
|
||||
return this.dispose();
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=TokenPredictor.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"TokenPredictor.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/TokenPredictor.ts"],"names":[],"mappings":"AAIA;;GAEG;AACH,MAAM,OAAgB,cAAc;IAwChC;;;OAGG;IACI,IAAI,CAAC,yBAAmC,IAAyB,CAAC;IAEzE;;OAEG;IACI,iBAAiB,CAAC,MAAe,IAAS,CAAC;IAE3C,OAAO,KAA0B,CAAC;IAEzC,cAAc;IACP,CAAC,MAAM,CAAC,OAAO,CAAC;QACnB,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;IAC1B,CAAC;CACJ"}
|
||||
56
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts
generated
vendored
Normal file
56
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts
generated
vendored
Normal file
@@ -0,0 +1,56 @@
|
||||
import { Token } from "../../../types.js";
|
||||
import { SequenceEvaluateOptions } from "../types.js";
|
||||
import { LlamaContextSequence } from "../LlamaContext.js";
|
||||
import { TokenPredictor } from "../TokenPredictor.js";
|
||||
/**
|
||||
* Predicts the next tokens by evaluating the current state of the target sequence
|
||||
* on a draft sequence from a smaller and faster draft model.
|
||||
* @see [Using Token Predictors: Draft Model Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#draft-model)
|
||||
*/
|
||||
export declare class DraftSequenceTokenPredictor extends TokenPredictor {
|
||||
constructor(draftSequence: LlamaContextSequence, options?: {
|
||||
/**
|
||||
* The minimum number of tokens to draft.
|
||||
*
|
||||
* Defaults to `0`.
|
||||
*/
|
||||
minTokens?: number;
|
||||
/**
|
||||
* Maximum number of tokens to draft.
|
||||
*
|
||||
* Defaults to `16`.
|
||||
*/
|
||||
maxTokens?: number;
|
||||
/**
|
||||
* Evaluate options default to the values of the target sequence.
|
||||
*
|
||||
* You can override any of the options for the prediction here.
|
||||
*/
|
||||
evaluateOptions?: Pick<SequenceEvaluateOptions, "temperature" | "minP" | "topK" | "topP" | "seed" | "repeatPenalty" | "tokenBias" | "evaluationPriority" | "contextShift">;
|
||||
/**
|
||||
* Minimum token confidence (probability of the token to be generated, assigned by the model) to consider the token as a prediction.
|
||||
* When the generated token confidence is lower than this value, the prediction process will stop until all the predicted tokens
|
||||
* are exhausted (either by a token that was not predicted being pushed, or all the generated predictions are consumed).
|
||||
*
|
||||
* A number between `0` and `1` representing the minimum probability of the token to be generated.
|
||||
*
|
||||
* Set to `0` to disable.
|
||||
*
|
||||
* Defaults to `0.6`.
|
||||
*/
|
||||
minConfidence?: number;
|
||||
});
|
||||
get draftSequence(): LlamaContextSequence;
|
||||
get minTokens(): number;
|
||||
get maxTokens(): number;
|
||||
get minConfidence(): number | undefined;
|
||||
reset({ targetSequence, stateTokens, evaluateOptions }: {
|
||||
targetSequence: LlamaContextSequence;
|
||||
stateTokens: Token[];
|
||||
evaluateOptions: Readonly<SequenceEvaluateOptions>;
|
||||
}): Promise<void>;
|
||||
pushTokens(tokens: Token[]): void;
|
||||
predictTokens(): Token[] | Promise<Token[]>;
|
||||
stop(untilPredictionsExhausted?: boolean): void;
|
||||
dispose(): void;
|
||||
}
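To wire the predictor declared above into a target sequence, a rough sketch (not part of the vendored file; the model paths are placeholders, and it assumes the draft and target models are compatible for speculative decoding) might look like this:

```ts
import {getLlama, LlamaCompletion, DraftSequenceTokenPredictor} from "node-llama-cpp";

const llama = await getLlama();

// The large target model and a small, fast draft model (placeholder paths).
const model = await llama.loadModel({modelPath: "big-model.gguf"});
const draftModel = await llama.loadModel({modelPath: "small-draft-model.gguf"});

const draftContext = await draftModel.createContext();
const context = await model.createContext();

// The draft sequence feeds predictions into the target sequence.
const sequence = context.getSequence({
    tokenPredictor: new DraftSequenceTokenPredictor(draftContext.getSequence(), {
        minTokens: 0, // don't block generation waiting for drafts
        maxTokens: 16 // draft at most 16 tokens ahead
    })
});

const completion = new LlamaCompletion({contextSequence: sequence});
console.log(await completion.generateCompletion("Once upon a time", {maxTokens: 128}));

// Inspect how effective the drafting was for this sequence.
console.log(sequence.tokenPredictions);
```

The `tokenPredictions` counters (`used`, `unused`, `validated`, `refuted`) are the quickest way to judge whether the draft model actually pays for its overhead on the target machine.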
266
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js
generated
vendored
Normal file
266
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js
generated
vendored
Normal file
@@ -0,0 +1,266 @@
|
||||
import { withLock } from "lifecycle-utils";
|
||||
import { pushAll } from "../../../utils/pushAll.js";
|
||||
import { getConsoleLogPrefix } from "../../../utils/getConsoleLogPrefix.js";
|
||||
import { LlamaSampler } from "../LlamaSampler.js";
|
||||
import { TokenPredictor } from "../TokenPredictor.js";
|
||||
const defaultPredictionMinTokens = 0;
|
||||
const defaultPredictionMaxTokens = 16;
|
||||
const defaultPredictionMinConfidence = 0.6;
|
||||
/**
|
||||
* Predicts the next tokens by evaluating the current state of the target sequence
|
||||
* on a draft sequence from a smaller and faster draft model.
|
||||
* @see [Using Token Predictors: Draft Model Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#draft-model)
|
||||
*/
|
||||
export class DraftSequenceTokenPredictor extends TokenPredictor {
|
||||
/** @internal */ _draftSequence;
|
||||
/** @internal */ _minTokens;
|
||||
/** @internal */ _maxTokens;
|
||||
/** @internal */ _minConfidence;
|
||||
/** @internal */ _stateTokens = [];
|
||||
/** @internal */ _pendingEvalTokens = [];
|
||||
/** @internal */ _predictedTokens = [];
|
||||
/** @internal */ _evaluateOptions = {};
|
||||
/** @internal */ _overrideEvaluateOptions = {};
|
||||
/** @internal */ _grammarEvaluationStateOption;
|
||||
/** @internal */ _currentEvaluationAbortController = new AbortController();
|
||||
/** @internal */ _resetAbortController = new AbortController();
|
||||
/** @internal */ _stopped = true;
|
||||
/** @internal */ _waitForPredictionExhaustion = false;
|
||||
/** @internal */ _minTokensCallbacks = [];
|
||||
/** @internal */ _resetPredictions = false;
|
||||
/** @internal */ _iterator;
|
||||
/** @internal */ _active = false;
|
||||
/** @internal */ _disposed = false;
|
||||
constructor(draftSequence, options = {}) {
|
||||
super();
|
||||
this._draftSequence = draftSequence;
|
||||
this._minTokens = Math.floor(Math.max(0, options?.minTokens ?? defaultPredictionMinTokens));
|
||||
this._maxTokens = Math.floor(Math.max(this._minTokens, options?.maxTokens ?? defaultPredictionMaxTokens));
|
||||
this._overrideEvaluateOptions = options.evaluateOptions ?? {};
|
||||
this._minConfidence = Math.min(1, Math.max(0, options?.minConfidence ?? defaultPredictionMinConfidence));
|
||||
if (draftSequence.disposed)
|
||||
throw new Error("The draft sequence is disposed");
|
||||
}
|
||||
get draftSequence() {
|
||||
return this._draftSequence;
|
||||
}
|
||||
get minTokens() {
|
||||
return this._minTokens;
|
||||
}
|
||||
get maxTokens() {
|
||||
return this._maxTokens;
|
||||
}
|
||||
get minConfidence() {
|
||||
return this._minConfidence;
|
||||
}
|
||||
async reset({ targetSequence, stateTokens, evaluateOptions }) {
|
||||
this._currentEvaluationAbortController.abort();
|
||||
this._resetAbortController.abort();
|
||||
this._currentEvaluationAbortController = new AbortController();
|
||||
this._resetAbortController = new AbortController();
|
||||
this._stopped = true;
|
||||
this._waitForPredictionExhaustion = false;
|
||||
this._iterator?.return();
|
||||
this._iterator = undefined;
|
||||
const currentAbortSignal = this._resetAbortController.signal;
|
||||
targetSequence.context._ctx.ensureDraftContextIsCompatibleForSpeculative(this._draftSequence.context._ctx);
|
||||
try {
|
||||
await withLock([this, "evaluate"], currentAbortSignal, async () => {
|
||||
this._stateTokens = stateTokens.slice();
|
||||
this._pendingEvalTokens = [];
|
||||
this._predictedTokens = [];
|
||||
this._resetPredictions = false;
|
||||
while (this._minTokensCallbacks.length > 0)
|
||||
this._minTokensCallbacks.shift()?.();
|
||||
const lastToken = this._stateTokens.pop();
|
||||
if (lastToken != null)
|
||||
this._pendingEvalTokens.push(lastToken);
|
||||
this._evaluateOptions = evaluateOptions;
|
||||
this._grammarEvaluationStateOption = this._evaluateOptions.grammarEvaluationState instanceof Function
|
||||
? this._evaluateOptions.grammarEvaluationState()?.clone()
|
||||
: this._evaluateOptions.grammarEvaluationState?.clone();
|
||||
const newStateTokens = this._stateTokens.slice(-this._draftSequence.context.contextSize + 1);
|
||||
await this._draftSequence.adaptStateToTokens(newStateTokens, true);
|
||||
newStateTokens.splice(0, this._draftSequence.nextTokenIndex);
|
||||
await this._draftSequence.evaluateWithoutGeneratingNewTokens(newStateTokens, {
|
||||
contextShift: this._evaluateOptions.contextShift,
|
||||
evaluationPriority: this._evaluateOptions.evaluationPriority
|
||||
});
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (err !== currentAbortSignal.reason)
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
pushTokens(tokens) {
|
||||
const grammarEvaluationStateOption = this._evaluateOptions.grammarEvaluationState instanceof Function
|
||||
? this._evaluateOptions.grammarEvaluationState()?.clone()
|
||||
: this._evaluateOptions.grammarEvaluationState?.clone();
|
||||
void withLock([this, "pushTokens"], async () => {
|
||||
this._grammarEvaluationStateOption = grammarEvaluationStateOption;
|
||||
const tokensToPush = tokens.slice();
|
||||
while (!this._resetPredictions && tokensToPush.length > 0) {
|
||||
const token = tokensToPush.shift();
|
||||
if (this._predictedTokens.length > 0 && this._predictedTokens[0] === token) {
|
||||
this._predictedTokens.shift();
|
||||
}
|
||||
else {
|
||||
tokensToPush.unshift(token);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (tokensToPush.length === 0) {
|
||||
if (!this._waitForPredictionExhaustion || this._predictedTokens.length === 0)
|
||||
this._resume();
|
||||
return;
|
||||
}
|
||||
this._currentEvaluationAbortController.abort();
|
||||
this._currentEvaluationAbortController = new AbortController();
|
||||
pushAll(this._pendingEvalTokens, tokensToPush);
|
||||
this._resetPredictions = true;
|
||||
this._resume();
|
||||
});
|
||||
}
|
||||
predictTokens() {
|
||||
if (this._stopped && this._pendingEvalTokens.length === 0 && !this._resetPredictions)
|
||||
return this._predictedTokens;
|
||||
this._stopped = false;
|
||||
if (!this._waitForPredictionExhaustion || this._predictedTokens.length === 0) {
|
||||
this._waitForPredictionExhaustion = false;
|
||||
this._resume();
|
||||
}
|
||||
if (this._predictedTokens.length >= this._minTokens && !this._resetPredictions)
|
||||
return this._predictedTokens;
|
||||
if (!this._active || (this._waitForPredictionExhaustion && this._predictedTokens.length > 0)) {
|
||||
if (this._resetPredictions)
|
||||
return [];
|
||||
return this._predictedTokens;
|
||||
}
|
||||
return new Promise((accept) => void this._minTokensCallbacks.push(accept))
|
||||
.then(() => {
|
||||
if (this._resetPredictions)
|
||||
return [];
|
||||
return this._predictedTokens;
|
||||
});
|
||||
}
|
||||
stop(untilPredictionsExhausted = false) {
|
||||
this._stopped = true;
|
||||
this._currentEvaluationAbortController.abort();
|
||||
this._currentEvaluationAbortController = new AbortController();
|
||||
if (untilPredictionsExhausted)
|
||||
this._waitForPredictionExhaustion = true;
|
||||
void withLock([this, "evaluate"], async () => {
|
||||
this._iterator?.return();
|
||||
this._iterator = undefined;
|
||||
});
|
||||
}
|
||||
dispose() {
|
||||
this._disposed = true;
|
||||
this._stopped = true;
|
||||
this._resetAbortController.abort();
|
||||
this._currentEvaluationAbortController.abort();
|
||||
void withLock([this, "evaluate"], async () => {
|
||||
this._iterator?.return();
|
||||
this._iterator = undefined;
|
||||
});
|
||||
}
|
||||
/** @internal */
|
||||
_canIterate() {
|
||||
return !this._disposed && !this._stopped && (this._predictedTokens.length < this._maxTokens || this._resetPredictions);
|
||||
}
|
||||
/** @internal */
|
||||
_resume() {
|
||||
if (this._active || !this._canIterate())
|
||||
return;
|
||||
this._active = true;
|
||||
void withLock([this, "evaluate"], async () => {
|
||||
try {
|
||||
const abortSignal = this._currentEvaluationAbortController.signal;
|
||||
if (!this._canIterate() || abortSignal.aborted)
|
||||
return;
|
||||
const resetPredications = async () => {
|
||||
this._iterator?.return();
|
||||
this._iterator = undefined;
|
||||
this._waitForPredictionExhaustion = false;
|
||||
this._resetPredictions = false;
|
||||
const tokenToDelete = Math.max(0, Math.min(this._predictedTokens.length - 1, this._draftSequence.context.contextSize));
|
||||
this._predictedTokens = [];
|
||||
await this._draftSequence.eraseContextTokenRanges([{
|
||||
start: this._draftSequence.nextTokenIndex - tokenToDelete,
|
||||
end: this._draftSequence.nextTokenIndex
|
||||
}]);
|
||||
};
|
||||
const createIterator = () => {
|
||||
const tokens = this._pendingEvalTokens;
|
||||
this._pendingEvalTokens = [];
|
||||
return this.draftSequence.evaluateWithMetadata(tokens, { confidence: true }, {
|
||||
...this._evaluateOptions,
|
||||
...this._overrideEvaluateOptions,
|
||||
grammarEvaluationState: this._getGrammarEvaluationStateWithTokens(tokens)
|
||||
});
|
||||
};
|
||||
if (this._resetPredictions)
|
||||
await resetPredications();
|
||||
if (!this._canIterate() || abortSignal.aborted)
|
||||
return;
|
||||
let iterator = createIterator();
|
||||
this._iterator = iterator;
|
||||
while (this._canIterate() && !abortSignal.aborted) {
|
||||
const { value, done } = await iterator.next();
|
||||
let shouldBreak = done;
|
||||
if (value != null) {
|
||||
const { token, confidence } = value;
|
||||
if (this._minConfidence != null && this._minConfidence !== 0 && this._minConfidence !== 1 &&
|
||||
confidence < this._minConfidence) {
|
||||
this._iterator = undefined;
|
||||
await iterator.return();
|
||||
this._waitForPredictionExhaustion = true;
|
||||
shouldBreak = true;
|
||||
}
|
||||
else
|
||||
this._predictedTokens.push(token);
|
||||
}
|
||||
if (this._resetPredictions && !abortSignal.aborted) {
|
||||
await resetPredications();
|
||||
iterator = createIterator();
|
||||
this._iterator = iterator;
|
||||
continue;
|
||||
}
|
||||
if (this._predictedTokens.length >= this._minTokens) {
|
||||
while (this._minTokensCallbacks.length > 0)
|
||||
this._minTokensCallbacks.shift()?.();
|
||||
}
|
||||
if (shouldBreak) {
|
||||
this._iterator = undefined;
|
||||
await iterator.return();
|
||||
this._waitForPredictionExhaustion = true;
|
||||
while (this._minTokensCallbacks.length > 0)
|
||||
this._minTokensCallbacks.shift()?.();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
finally {
|
||||
this._active = false;
|
||||
}
|
||||
});
|
||||
}
|
||||
/** @internal */
|
||||
_getGrammarEvaluationStateWithTokens(tokens) {
|
||||
if (this._grammarEvaluationStateOption == null)
|
||||
return undefined;
|
||||
const clone = this._grammarEvaluationStateOption.clone();
|
||||
for (const token of tokens) {
|
||||
const canAddToken = LlamaSampler._canBeNextTokenForGrammarEvaluationState(this._draftSequence.model._llama, clone, token);
|
||||
if (!canAddToken) {
|
||||
console.warn(getConsoleLogPrefix(false, false), "The pushed tokens are incompatible with the grammar evaluation state. The grammar will be ignored.");
|
||||
this._grammarEvaluationStateOption = undefined;
|
||||
return undefined;
|
||||
}
|
||||
LlamaSampler._acceptTokenOnGrammarEvaluationState(this._draftSequence.model._llama, clone, token);
|
||||
}
|
||||
return clone;
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=DraftSequenceTokenPredictor.js.map
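// --- Illustrative usage sketch (editor addition, not part of the vendored file above) ---
// A minimal sketch of wiring the draft-model predictor into a target sequence, assuming the
// package-level `getLlama` export and the `getSequence({ tokenPredictor })` option described in
// the linked token-prediction guide; the model paths are placeholders.
import { getLlama, DraftSequenceTokenPredictor } from "node-llama-cpp";

const llama = await getLlama();
const mainModel = await llama.loadModel({ modelPath: "path/to/main-model.gguf" });
const draftModel = await llama.loadModel({ modelPath: "path/to/smaller-draft-model.gguf" });

const draftContext = await draftModel.createContext();
const mainContext = await mainModel.createContext();

// The predictor mirrors the target sequence's state on the draft sequence and only keeps
// speculative tokens whose confidence reaches `minConfidence` (0.6 by default, as above).
const sequence = mainContext.getSequence({
    tokenPredictor: new DraftSequenceTokenPredictor(draftContext.getSequence(), {
        minTokens: 0,
        maxTokens: 16,
        minConfidence: 0.6
    })
});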
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
58
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.d.ts
generated
vendored
Normal file
@@ -0,0 +1,58 @@
import { Token } from "../../../types.js";
|
||||
import { TokenPredictor } from "../TokenPredictor.js";
|
||||
/**
|
||||
* Attempts to find the last few generated tokens in the input (prompt) tokens to predict the next tokens.
|
||||
*
|
||||
* This is useful in input-grounded tasks (when the model frequently repeats some of the input tokens in the output,
|
||||
* such as in text summarization or modifying code).
|
||||
*
|
||||
* This works in all completion classes, including `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
|
||||
*
|
||||
* Based on https://github.com/apoorvumang/prompt-lookup-decoding.
|
||||
* @see [Using Token Predictors: Input Lookup Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#input-lookup)
|
||||
*/
|
||||
export declare class InputLookupTokenPredictor extends TokenPredictor {
|
||||
constructor(options?: {
|
||||
patternLength?: {
|
||||
/**
|
||||
* Min pattern length to look for in the input tokens.
|
||||
*
|
||||
* Defaults to `1`.
|
||||
*/
|
||||
min?: number;
|
||||
/**
|
||||
* Max pattern length to look for in the input tokens.
|
||||
*
|
||||
* Set to `0` to disable the max pattern size.
|
||||
*
|
||||
* Defaults to `0`.
|
||||
*/
|
||||
max?: number;
|
||||
};
|
||||
predictionLength?: {
|
||||
/**
|
||||
* Minimum number of tokens to predict.
|
||||
*
|
||||
* Defaults to `1`.
|
||||
*/
|
||||
min?: number;
|
||||
/**
|
||||
* Maximum number of tokens to predict.
|
||||
*
|
||||
* Defaults to `3`.
|
||||
*/
|
||||
max?: number;
|
||||
};
|
||||
});
|
||||
get patternMinLength(): number;
|
||||
get patternMaxLength(): number;
|
||||
get predictionMinLength(): number;
|
||||
get predictionMaxLength(): number;
|
||||
reset({ stateTokens }: {
|
||||
stateTokens: Token[];
|
||||
}): void;
|
||||
updateInputTokens(tokens: Token[]): void;
|
||||
pushTokens(tokens: Token[]): void;
|
||||
predictTokens(): Token[];
|
||||
dispose(): void;
|
||||
}
138
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js
generated
vendored
Normal file
@@ -0,0 +1,138 @@
import { DisposedError } from "lifecycle-utils";
|
||||
import { pushAll } from "../../../utils/pushAll.js";
|
||||
import { TokenPredictor } from "../TokenPredictor.js";
|
||||
const defaultPatternMinLength = 1;
|
||||
const defaultPatternMaxLength = 0;
|
||||
const defaultPredictionMinLength = 1;
|
||||
const defaultPredictionMaxLength = 3;
|
||||
/**
|
||||
* Attempts to find the last few generated tokens in the input (prompt) tokens to predict the next tokens.
|
||||
*
|
||||
* This is useful in input-grounded tasks (when the model frequently repeats some of the input tokens in the output,
|
||||
* such as in text summarization or modifying code).
|
||||
*
|
||||
* This works in all completion classes, including `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
|
||||
*
|
||||
* Based on https://github.com/apoorvumang/prompt-lookup-decoding.
|
||||
* @see [Using Token Predictors: Input Lookup Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#input-lookup)
|
||||
*/
|
||||
export class InputLookupTokenPredictor extends TokenPredictor {
|
||||
/** @internal */ _patternMinLength;
|
||||
/** @internal */ _patternMaxLength;
|
||||
/** @internal */ _predictionMinLength;
|
||||
/** @internal */ _predictionMaxLength;
|
||||
/** @internal */ _lastPredictionMatchStartIndex = undefined;
|
||||
/** @internal */ _lastPredictionMatchLength = undefined;
|
||||
/** @internal */ _stateTokens = [];
|
||||
/** @internal */ _inputTokens = [];
|
||||
/** @internal */ _disposed = false;
|
||||
constructor(options = {}) {
|
||||
super();
|
||||
this._patternMinLength = Math.floor(Math.max(1, options?.patternLength?.min ?? defaultPatternMinLength));
|
||||
this._patternMaxLength = Math.floor(Math.max(0, Math.max(this._patternMinLength, options?.patternLength?.max ?? defaultPatternMaxLength)));
|
||||
this._predictionMinLength = Math.floor(Math.max(1, options.predictionLength?.min ?? defaultPredictionMinLength));
|
||||
this._predictionMaxLength = Math.floor(Math.max(this._patternMinLength, options.predictionLength?.max ?? defaultPredictionMaxLength));
|
||||
}
|
||||
get patternMinLength() {
|
||||
return this._patternMinLength;
|
||||
}
|
||||
get patternMaxLength() {
|
||||
return this._patternMaxLength;
|
||||
}
|
||||
get predictionMinLength() {
|
||||
return this._predictionMinLength;
|
||||
}
|
||||
get predictionMaxLength() {
|
||||
return this._predictionMaxLength;
|
||||
}
|
||||
reset({ stateTokens }) {
|
||||
this._stateTokens = stateTokens.slice();
|
||||
delete this._lastPredictionMatchStartIndex;
|
||||
delete this._lastPredictionMatchLength;
|
||||
}
|
||||
updateInputTokens(tokens) {
|
||||
this._inputTokens = tokens.slice();
|
||||
delete this._lastPredictionMatchStartIndex;
|
||||
delete this._lastPredictionMatchLength;
|
||||
}
|
||||
pushTokens(tokens) {
|
||||
pushAll(this._stateTokens, tokens);
|
||||
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
|
||||
this._lastPredictionMatchLength += tokens.length;
|
||||
}
|
||||
}
|
||||
predictTokens() {
|
||||
if (this._disposed)
|
||||
throw new DisposedError();
|
||||
if (this._inputTokens.length === 0 || this._stateTokens.length === 0)
|
||||
return [];
|
||||
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
|
||||
for (let p = this._lastPredictionMatchStartIndex + this._lastPredictionMatchLength - 1, s = this._stateTokens.length - 1; p >= this._lastPredictionMatchStartIndex && s >= 0; p--, s--) {
|
||||
if (this._inputTokens[p] !== this._stateTokens[s]) {
|
||||
delete this._lastPredictionMatchStartIndex;
|
||||
delete this._lastPredictionMatchLength;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
|
||||
const predictionEndIndex = this._lastPredictionMatchStartIndex + this._lastPredictionMatchLength;
|
||||
if (predictionEndIndex < this._inputTokens.length) {
|
||||
return this._inputTokens.slice(predictionEndIndex, predictionEndIndex + this._predictionMaxLength);
|
||||
}
|
||||
}
|
||||
}
|
||||
const [matchStartIndex, matchLength] = this._findLongestPatternIndex(this._inputTokens, this._stateTokens);
|
||||
if (matchStartIndex == null || matchLength == null)
|
||||
return [];
|
||||
const predictionEndIndex = matchStartIndex + matchLength;
|
||||
const res = this._inputTokens.slice(predictionEndIndex, predictionEndIndex + this._predictionMaxLength);
|
||||
if (res.length >= this._predictionMinLength) {
|
||||
this._lastPredictionMatchStartIndex = matchStartIndex;
|
||||
this._lastPredictionMatchLength = matchLength;
|
||||
return res;
|
||||
}
|
||||
return [];
|
||||
}
|
||||
dispose() {
|
||||
this._disposed = true;
|
||||
this._stateTokens = [];
|
||||
this._inputTokens = [];
|
||||
delete this._lastPredictionMatchStartIndex;
|
||||
delete this._lastPredictionMatchLength;
|
||||
}
|
||||
/** @internal */
|
||||
_findLongestPatternIndex(findIn, lookupPattern) {
|
||||
const checkIndexes = [];
|
||||
let bestIndex = -1;
|
||||
let bestIndexDiff = -1;
|
||||
for (let i = findIn.length - this._predictionMinLength; i >= 0; i--) {
|
||||
const token = findIn[i];
|
||||
for (let j = checkIndexes.length - 1; j >= 0; j--) {
|
||||
const startIndex = checkIndexes[j];
|
||||
const indexDiff = startIndex - i;
|
||||
if (lookupPattern[lookupPattern.length - 1 - indexDiff] !== token || (this._patternMaxLength > 0 && indexDiff >= this._patternMaxLength)) {
|
||||
checkIndexes.splice(j, 1);
|
||||
if (indexDiff >= this._patternMinLength && indexDiff >= bestIndexDiff) {
|
||||
bestIndex = startIndex;
|
||||
bestIndexDiff = indexDiff;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (token === lookupPattern[lookupPattern.length - 1])
|
||||
checkIndexes.unshift(i);
|
||||
}
|
||||
for (let j = checkIndexes.length - 1; j >= 0; j--) {
|
||||
const startIndex = checkIndexes[j];
|
||||
const indexDiff = startIndex + 1;
|
||||
checkIndexes.splice(j, 1);
|
||||
if (indexDiff >= this._patternMinLength && indexDiff >= bestIndexDiff) {
|
||||
bestIndex = startIndex;
|
||||
bestIndexDiff = indexDiff;
|
||||
}
|
||||
}
|
||||
if (bestIndex >= 0)
|
||||
return [bestIndex - (bestIndexDiff - 1), bestIndexDiff];
|
||||
return [];
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=InputLookupTokenPredictor.js.map
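// --- Illustrative usage sketch (editor addition, not part of the vendored file above) ---
// A minimal sketch of enabling prompt-lookup decoding with the predictor above, assuming the same
// `getSequence({ tokenPredictor })` option; the model path and option values are placeholders.
import { getLlama, InputLookupTokenPredictor } from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({ modelPath: "path/to/model.gguf" });
const context = await model.createContext();

// Most useful for input-grounded tasks (summarization, code editing) where the output tends to
// repeat spans of the prompt, which is exactly what the lookup above exploits.
const sequence = context.getSequence({
    tokenPredictor: new InputLookupTokenPredictor({
        patternLength: { min: 1 },
        predictionLength: { min: 1, max: 3 }
    })
});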
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
458
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.d.ts
generated
vendored
Normal file
@@ -0,0 +1,458 @@
import { PickOptions } from "../../utils/utilTypes.js";
|
||||
import type { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
|
||||
import type { TokenBias } from "../TokenBias.js";
|
||||
import type { Token } from "../../types.js";
|
||||
import type { LlamaContextSequence } from "./LlamaContext.js";
|
||||
export type LlamaContextOptions = {
|
||||
/**
|
||||
* number of sequences for the context.
|
||||
* Each sequence is a different "text generation process" that can run in parallel to other sequences in the same context.
|
||||
* Although a single context has multiple sequences, the sequences are separate from each other and do not share data with each other.
|
||||
* This is beneficial for performance, as multiple sequences can be evaluated in parallel (on the same batch).
|
||||
*
|
||||
* Each sequence increases the memory usage of the context.
|
||||
*
|
||||
* Defaults to `1`.
|
||||
*/
|
||||
sequences?: number;
|
||||
/**
|
||||
* The number of tokens the model can see at once.
|
||||
* - **`"auto"`** - adapt to the current VRAM state and attempt to set the context size as high as possible up to the size
|
||||
* the model was trained on.
|
||||
* - **`number`** - set the context size to a specific number of tokens.
|
||||
* If there's not enough VRAM, an error will be thrown.
|
||||
* Use with caution.
|
||||
* - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attempt to set the context size as high as possible
|
||||
* up to the size the model was trained on, but at least `min` and at most `max`.
|
||||
*
|
||||
* The actual context size may be slightly larger than your request (by up to 256) due to the implementation in `llama.cpp` that
|
||||
* aligns the context size to multiples of 256 for performance reasons.
|
||||
* To check the actual context size that gets created, use the `.contextSize` property
|
||||
* of the created context instance or any of its sequences.
|
||||
*
|
||||
* Defaults to `"auto"`.
|
||||
*/
|
||||
contextSize?: "auto" | number | {
|
||||
min?: number;
|
||||
max?: number;
|
||||
};
|
||||
/**
|
||||
* The number of tokens that can be processed at once by the GPU.
|
||||
*
|
||||
* Defaults to `512` or `contextSize` if `contextSize` is less than `512`.
|
||||
*/
|
||||
batchSize?: number;
|
||||
/**
|
||||
* Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory.
|
||||
*
|
||||
* The support for flash attention is currently experimental and may not always work as expected.
|
||||
* Use with caution.
|
||||
*
|
||||
* This option will be ignored if flash attention is not supported by the model.
|
||||
*
|
||||
* Defaults to `false` (inherited from the model option `defaultContextFlashAttention`).
|
||||
*
|
||||
* Upon flash attention exiting the experimental status, the default value will become `true`
|
||||
* (the inherited value from the model option `defaultContextFlashAttention` will become `true`).
|
||||
*/
|
||||
flashAttention?: boolean;
|
||||
/**
|
||||
* number of threads to use to evaluate tokens.
|
||||
* set to 0 to use the maximum threads supported by the current machine hardware.
|
||||
*
|
||||
* This value is considered as a hint, and the actual number of threads used may be lower when other evaluations are running.
|
||||
* To ensure the minimum number of threads you want to use are always used,
|
||||
* set this to an object with a `min` property (see the `min` property description for more details).
|
||||
*
|
||||
* If `maxThreads` from the Llama instance is set to `0`, this value will always be the actual number of threads used.
|
||||
*
|
||||
* If `maxThreads` from the Llama instance is set to `0`, defaults to the `.cpuMathCores` value from the Llama instance,
|
||||
* otherwise defaults to `maxThreads` from the Llama instance (see the `maxThreads` option of `getLlama` method for more details).
|
||||
*/
|
||||
threads?: number | {
|
||||
/**
|
||||
* The ideal number of threads to use for evaluations.
|
||||
*
|
||||
* If other evaluations are running, the actual number of threads may be lower than this value.
|
||||
*
|
||||
* If `maxThreads` from the Llama instance is set to `0`, this value will always be the actual number of threads used.
|
||||
*
|
||||
* If `maxThreads` from the Llama instance is set to `0`, defaults to the `.cpuMathCores` value from the Llama instance,
|
||||
* otherwise defaults to `maxThreads` from the Llama instance (see the `maxThreads` option of `getLlama` method for more details).
|
||||
*/
|
||||
ideal?: number;
|
||||
/**
|
||||
* Ensure evaluations always use at least this number of threads.
|
||||
*
|
||||
* Use with caution, since setting this value too high can lead to the context waiting too much time
|
||||
* to reserve this number of threads before the evaluation can start.
|
||||
*/
|
||||
min?: number;
|
||||
};
|
||||
/**
|
||||
* Control the parallel sequences processing behavior.
|
||||
*
|
||||
* See {@link BatchingOptions} for more information.
|
||||
*/
|
||||
batching?: BatchingOptions;
|
||||
/**
|
||||
* When using SWA (Sliding Window Attention) on a supported model,
|
||||
* extend the sliding window size to the current context size (meaning practically disabling SWA).
|
||||
*
|
||||
* Enabling this option will consume more memory on models that support SWA (Sliding Window Attention),
|
||||
* but will allow reusing the evaluation cache of any prefix length of the context sequence state
|
||||
* (instead of just the size of the sliding window when SWA is used).
|
||||
*
|
||||
* This option has no effect on models that do not support SWA (Sliding Window Attention).
|
||||
*
|
||||
* > **Note:** you can check the SWA size using `model.fileInsights.swaSize`.
|
||||
*
|
||||
* Defaults to `false` (inherited from the model option `defaultContextSwaFullCache`);
|
||||
*/
|
||||
swaFullCache?: boolean;
|
||||
/**
|
||||
* Load the provided LoRA adapters onto the context.
|
||||
* LoRA adapters are used to modify the weights of a pretrained model to adapt to new tasks or domains
|
||||
* without the need for extensive retraining from scratch.
|
||||
*
|
||||
* If a string is provided, it will be treated as a path to a single LoRA adapter file.
|
||||
*
|
||||
* The adapters will be released from memory once the model (not just the context) is disposed.
|
||||
*/
|
||||
lora?: string | {
|
||||
adapters: Array<{
|
||||
filePath: string;
|
||||
/**
|
||||
* Defaults to `1`
|
||||
*/
|
||||
scale?: number;
|
||||
}>;
|
||||
/**
|
||||
* Called with the LoRA adapters load percentage when the LoRA adapters are being loaded.
|
||||
* @param loadProgress - a number between 0 (exclusive) and 1 (inclusive).
|
||||
*/
|
||||
onLoadProgress?(loadProgress: number): void;
|
||||
};
|
||||
/** An abort signal to abort the context creation */
|
||||
createSignal?: AbortSignal;
|
||||
/**
|
||||
* Ignore insufficient memory errors and continue with the context creation.
|
||||
* Can cause the process to crash if there's not enough VRAM for the new context.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
ignoreMemorySafetyChecks?: boolean;
|
||||
/**
|
||||
* On failed context creation, retry the creation with a smaller context size.
|
||||
*
|
||||
* Only works if `contextSize` is set to `"auto"`, left as default or set to an object with `min` and/or `max` properties.
|
||||
*
|
||||
* Set `retries` to `false` to disable.
|
||||
*/
|
||||
failedCreationRemedy?: false | {
|
||||
/**
|
||||
* Retries to attempt to create the context.
|
||||
*
|
||||
* Defaults to `6`.
|
||||
*/
|
||||
retries?: number;
|
||||
/**
|
||||
* The percentage to decrease the context size by on each retry.
|
||||
* Should be a number between `0` and `1`.
|
||||
*
|
||||
* If a function is provided, it will be called with the current context size and should return the new context size.
|
||||
*
|
||||
* Defaults to `0.16`.
|
||||
*/
|
||||
autoContextSizeShrink?: number | ((contextSize: number) => number);
|
||||
};
|
||||
/**
|
||||
* Track the inference performance of the context, so using `.printTimings()` will work.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
performanceTracking?: boolean;
|
||||
};
|
||||
export type LlamaContextSequenceRepeatPenalty = {
|
||||
/** Tokens whose probability of being the next predicted token should be lowered */
|
||||
punishTokens: Token[] | (() => Token[]);
|
||||
/**
|
||||
* The maximum number of tokens that will be provided in the `punishTokens` array.
|
||||
*
|
||||
* This is used as a hint for a performance optimization for avoiding frequent memory deallocation and reallocation.
|
||||
*
|
||||
* Don't set this value too high, as it can allocate too much memory.
|
||||
*
|
||||
* Defaults to `64`.
|
||||
*/
|
||||
maxPunishTokens?: number;
|
||||
/**
|
||||
* The relative amount to lower the probability of the tokens in `punishTokens` by.
|
||||
*
|
||||
* Defaults to `1.1`.
|
||||
* Set to `1` to disable.
|
||||
*/
|
||||
penalty?: number;
|
||||
/**
|
||||
* For n time a token is in the `punishTokens` array, lower its probability by `n * frequencyPenalty`.
|
||||
*
|
||||
* Disabled by default (`0`).
|
||||
* Set to a value between `0` and `1` to enable.
|
||||
*/
|
||||
frequencyPenalty?: number;
|
||||
/**
|
||||
* Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`.
|
||||
*
|
||||
* Disabled by default (`0`).
|
||||
* Set to a value between `0` and `1` to enable.
|
||||
*/
|
||||
presencePenalty?: number;
|
||||
};
|
||||
export type BatchingOptions = {
|
||||
/**
|
||||
* The strategy used to dispatch items to be processed when there are items pending to be processed.
|
||||
* - **`"nextCycle"`** - dispatch the items on the next event loop cycle.
|
||||
* You can provide a custom function to define a custom dispatch schedule.
|
||||
*
|
||||
* Defaults to `"nextCycle"`.
|
||||
*/
|
||||
dispatchSchedule?: "nextCycle" | CustomBatchingDispatchSchedule;
|
||||
/**
|
||||
* The strategy used to prioritize pending items to be processed.
|
||||
* - **`"maximumParallelism"`** - process as many different sequences in parallel as possible.
|
||||
* - **`"firstInFirstOut"`** - process items in the order they were added.
|
||||
* - **Custom prioritization function** - a custom function that prioritizes the items to be processed.
|
||||
* See the {@link CustomBatchingPrioritizationStrategy} type for more information.
|
||||
*
|
||||
* Defaults to `"maximumParallelism"`.
|
||||
*/
|
||||
itemPrioritizationStrategy?: "maximumParallelism" | "firstInFirstOut" | CustomBatchingPrioritizationStrategy;
|
||||
};
|
||||
/**
|
||||
* A function that schedules the dispatch of the batch items.
|
||||
* Call the `dispatch` function to dispatch the items.
|
||||
*/
|
||||
export type CustomBatchingDispatchSchedule = (dispatch: () => void) => void;
|
||||
/**
|
||||
* A function that prioritizes the batch items to be processed.
|
||||
* The function receives an array of `items` and the `size` of how many tokens can be processed in this batch.
|
||||
*
|
||||
* The function should return an array of prioritized items,
|
||||
* where the sum of `processAmount` of all the items is less or equal to the given `size` that the function received,
|
||||
* and where the `item` of each prioritized item is the same reference to an original item in the `items` array.
|
||||
*/
|
||||
export type CustomBatchingPrioritizationStrategy = (options: {
|
||||
items: readonly BatchItem[];
|
||||
size: number;
|
||||
}) => PrioritizedBatchItem[];
|
||||
export type ContextShiftOptions = {
|
||||
size?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
|
||||
strategy?: "eraseBeginning" | ((options: {
|
||||
sequence: LlamaContextSequence;
|
||||
size: number;
|
||||
}) => ContextTokensDeleteRange[] | Promise<ContextTokensDeleteRange[]>);
|
||||
};
|
||||
export type ContextTokensDeleteRange = {
|
||||
start: number;
|
||||
end: number;
|
||||
};
|
||||
export type SequenceEvaluateOptions = {
|
||||
temperature?: number;
|
||||
minP?: number;
|
||||
topK?: number;
|
||||
topP?: number;
|
||||
/**
|
||||
* Used to control the randomness of the generated text.
|
||||
*
|
||||
* Change the seed to get different results.
|
||||
*
|
||||
* Defaults to the current epoch time.
|
||||
*
|
||||
* Only relevant when using `temperature`.
|
||||
*/
|
||||
seed?: number;
|
||||
grammarEvaluationState?: LlamaGrammarEvaluationState | (() => LlamaGrammarEvaluationState | undefined);
|
||||
repeatPenalty?: LlamaContextSequenceRepeatPenalty;
|
||||
/**
|
||||
* Adjust the probability of tokens being generated.
|
||||
* Can be used to bias the model to generate tokens that you want it to lean towards,
|
||||
* or to avoid generating tokens that you want it to avoid.
|
||||
*/
|
||||
tokenBias?: TokenBias | (() => TokenBias);
|
||||
/**
|
||||
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
|
||||
* evaluated based on the strategy chosen for the context.
|
||||
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
|
||||
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
|
||||
* highest evaluation priority.
|
||||
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
|
||||
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
|
||||
*/
|
||||
evaluationPriority?: EvaluationPriority;
|
||||
/**
|
||||
* Override the sequence context shift options for this evaluation
|
||||
*
|
||||
* See {@link ContextShiftOptions} for more information.
|
||||
*/
|
||||
contextShift?: ContextShiftOptions;
|
||||
/**
|
||||
* Yield an EOG (End Of Generation) token (like EOS and EOT) when it's generated.
|
||||
* When `false` the generation will stop when an EOG token is generated and the token won't be yielded.
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
yieldEogToken?: boolean;
|
||||
};
|
||||
export type SequenceEvaluateMetadataOptions = {
|
||||
/**
|
||||
* Get the confidence (probability) of the selected token.
|
||||
*
|
||||
* Same as `probabilities.get(token)` from the output.
|
||||
*
|
||||
* If you need only this value, you can skip getting the full probabilities list to improve performance.
|
||||
*
|
||||
* This value might be slightly different when evaluated on different GPUs and configurations.
|
||||
*/
|
||||
readonly confidence?: boolean;
|
||||
/**
|
||||
* Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
|
||||
*
|
||||
* Only enable when needed, as it impacts the performance.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
readonly probabilities?: boolean;
|
||||
};
|
||||
export type SequenceEvaluateOutput<Options extends {
|
||||
readonly confidence?: boolean;
|
||||
readonly probabilities?: boolean;
|
||||
} = {
|
||||
readonly confidence: true;
|
||||
readonly probabilities: true;
|
||||
}> = PickOptions<{
|
||||
/**
|
||||
* The next token generated by the model and selected using the given options (such as temperature).
|
||||
*/
|
||||
token: Token;
|
||||
/**
|
||||
* The confidence (probability) of the selected token.
|
||||
*
|
||||
* Same as `probabilities.get(token)`.
|
||||
*
|
||||
* If you need only this value, you can skip getting the full probabilities list to improve performance.
|
||||
*
|
||||
* This value might be slightly different when evaluated on different GPUs and configurations.
|
||||
*/
|
||||
confidence: number;
|
||||
/**
|
||||
* The probabilities of the tokens from the vocabulary to be the next token.
|
||||
*
|
||||
* A probability is a number from `0` to `1`.
|
||||
*
|
||||
* The probabilities might be slightly different when evaluated on different GPUs and configurations.
|
||||
*
|
||||
* The map is sorted by the probability of the tokens from the highest to the lowest,
|
||||
* and is reflected in the order of the entries when iterating over the map.
|
||||
* Use `.entries().next().value` to get the top probability pair
|
||||
* ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
|
||||
*/
|
||||
probabilities: Map<Token, number>;
|
||||
}, Options & {
|
||||
token: true;
|
||||
}>;
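// --- Illustrative usage sketch (editor addition, not part of the vendored declarations) ---
// A minimal sketch of how SequenceEvaluateMetadataOptions and SequenceEvaluateOutput fit together,
// assuming an existing `model` (LlamaModel) and `sequence` (LlamaContextSequence) and the
// `evaluateWithMetadata` method used by DraftSequenceTokenPredictor above; the prompt text and the
// 0.3 cutoff are placeholders.
const tokens = model.tokenize("The quick brown fox");
const iterator = sequence.evaluateWithMetadata(tokens, { confidence: true }, { temperature: 0.8 });

for await (const { token, confidence } of iterator) {
    // `confidence` is the probability (0..1) of the sampled token under the applied options
    console.log(model.detokenize([token]), confidence);

    if (confidence < 0.3)
        break;
}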
|
||||
export type ControlledEvaluateInputItem = Token | [
|
||||
token: Token,
|
||||
options: {
|
||||
generateNext?: {
|
||||
/**
|
||||
* Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
|
||||
*
|
||||
* Only enable when needed, as it impacts the performance.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
probabilities?: boolean;
|
||||
/**
|
||||
* Get the confidence (probability) of the selected token.
|
||||
*
|
||||
* Same as `next.probabilities.get(next.token)` from the output.
|
||||
*
|
||||
* If you need only this value, you can skip getting the full probabilities list to improve performance.
|
||||
*
|
||||
* This value might be slightly different when evaluated on different GPUs and configurations.
|
||||
*/
|
||||
confidence?: boolean;
|
||||
/**
|
||||
* Generate the next token with the provided options using sampling.
|
||||
*
|
||||
* Setting this to `true` will generate probabilities for the next token and sample it.
|
||||
*/
|
||||
token?: boolean;
|
||||
options?: {
|
||||
temperature?: number;
|
||||
minP?: number;
|
||||
topK?: number;
|
||||
topP?: number;
|
||||
/**
|
||||
* Used to control the randomness of the generated text.
|
||||
*
|
||||
* Change the seed to get different results.
|
||||
*
|
||||
* Defaults to the current epoch time.
|
||||
*
|
||||
* Only relevant when using `temperature`.
|
||||
*/
|
||||
seed?: number;
|
||||
repeatPenalty?: LlamaContextSequenceRepeatPenalty;
|
||||
/**
|
||||
* Adjust the probability of tokens being generated.
|
||||
* Can be used to bias the model to generate tokens that you want it to lean towards,
|
||||
* or to avoid generating tokens that you want it to avoid.
|
||||
*/
|
||||
tokenBias?: TokenBias | (() => TokenBias);
|
||||
};
|
||||
};
|
||||
}
|
||||
];
|
||||
export type ControlledEvaluateIndexOutput = {
|
||||
next: {
|
||||
token?: Token | null;
|
||||
/**
|
||||
* The confidence (probability) of the selected token (the `token` field in this object).
|
||||
*
|
||||
* Same as `next.probabilities.get(next.token)`.
|
||||
*
|
||||
* If you need only this value, you can skip getting the full probabilities list to improve performance.
|
||||
*
|
||||
* This value might be slightly different when evaluated on different GPUs and configurations.
|
||||
*/
|
||||
confidence?: number;
|
||||
/**
|
||||
* The probabilities of the tokens from the vocabulary to be the next token.
|
||||
*
|
||||
* A probability is a number from `0` to `1`.
|
||||
*
|
||||
* The probabilities might be slightly different when evaluated on different GPUs and configurations.
|
||||
*
|
||||
* The map is sorted by the probability of the tokens from the highest to the lowest,
|
||||
* and is reflected in the order of the entries when iterating over the map.
|
||||
* Use `.entries().next().value` to get the top probability pair
|
||||
* ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
|
||||
*/
|
||||
probabilities?: Map<Token, number>;
|
||||
};
|
||||
};
|
||||
/**
|
||||
* 1 - low
|
||||
*
|
||||
* 5 - high
|
||||
*/
|
||||
export type EvaluationPriority = 1 | 2 | 3 | 4 | 5;
|
||||
export type BatchItem = {
|
||||
readonly tokens: readonly Token[];
|
||||
readonly logits: readonly (true | undefined)[];
|
||||
readonly evaluationPriority: EvaluationPriority;
|
||||
};
|
||||
export type PrioritizedBatchItem = {
|
||||
item: BatchItem;
|
||||
processAmount: number;
|
||||
};
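// --- Illustrative usage sketch (editor addition, not part of the vendored declarations) ---
// A minimal custom CustomBatchingPrioritizationStrategy honoring the contract above: the returned
// `processAmount`s sum to at most `size` and each `item` references an input item. The strategy
// name and the assumption that BatchItem/PrioritizedBatchItem are re-exported from the package
// root are the editor's; plugging it in through `batching.itemPrioritizationStrategy` follows the
// LlamaContextOptions documented above.
import type { BatchItem, PrioritizedBatchItem } from "node-llama-cpp";

function highestPriorityFirstStrategy({ items, size }: {
    items: readonly BatchItem[], size: number
}): PrioritizedBatchItem[] {
    const res: PrioritizedBatchItem[] = [];
    let leftFreeTokens = size;

    // serve whole items strictly by evaluation priority until the batch budget runs out
    const sortedItems = items.slice().sort((a, b) => b.evaluationPriority - a.evaluationPriority);
    for (const item of sortedItems) {
        if (leftFreeTokens === 0)
            break;

        const processAmount = Math.min(item.tokens.length, leftFreeTokens);
        res.push({ item, processAmount });
        leftFreeTokens -= processAmount;
    }

    return res;
}

// const context = await model.createContext({
//     batching: { itemPrioritizationStrategy: highestPriorityFirstStrategy }
// });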
2
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.js
generated
vendored
Normal file
@@ -0,0 +1,2 @@
export {};
//# sourceMappingURL=types.js.map
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/types.ts"],"names":[],"mappings":""}
5
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.d.ts
generated
vendored
Normal file
@@ -0,0 +1,5 @@
import { BatchItem, PrioritizedBatchItem } from "../../types.js";
export declare function firstInFirstOutStrategy({ items, size }: {
    items: readonly BatchItem[];
    size: number;
}): PrioritizedBatchItem[];
16
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js
generated
vendored
Normal file
@@ -0,0 +1,16 @@
export function firstInFirstOutStrategy({ items, size }) {
    const res = [];
    const sortedItems = items
        .slice()
        .sort((a, b) => b.evaluationPriority - a.evaluationPriority);
    let leftFreeTokens = size;
    for (const item of sortedItems) {
        const processAmount = Math.min(item.tokens.length, leftFreeTokens);
        res.push({ item, processAmount });
        leftFreeTokens -= processAmount;
        if (leftFreeTokens === 0)
            break;
    }
    return res;
}
//# sourceMappingURL=firstInFirstOutStrategy.js.map
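// --- Worked example (editor addition, not part of the vendored file above) ---
// With a batch budget of 6 tokens and three pending items of 4 tokens each (equal priority), the
// stable sort keeps insertion order, so earlier items are served in full first:
const fifoExample = firstInFirstOutStrategy({
    items: [
        { tokens: new Array(4), logits: [], evaluationPriority: 5 },
        { tokens: new Array(4), logits: [], evaluationPriority: 5 },
        { tokens: new Array(4), logits: [], evaluationPriority: 5 }
    ],
    size: 6
});
// fifoExample.map((entry) => entry.processAmount) → [4, 2]  (the third item waits for the next batch)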
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
{"version":3,"file":"firstInFirstOutStrategy.js","sourceRoot":"","sources":["../../../../../src/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,uBAAuB,CAAC,EAAC,KAAK,EAAE,IAAI,EAA8C;IAC9F,MAAM,GAAG,GAA2B,EAAE,CAAC;IAEvC,MAAM,WAAW,GAAG,KAAK;SACpB,KAAK,EAAE;SACP,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,kBAAkB,GAAG,CAAC,CAAC,kBAAkB,CAAC,CAAC;IAEjE,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;QAC7B,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;QACnE,GAAG,CAAC,IAAI,CAAC,EAAC,IAAI,EAAE,aAAa,EAAC,CAAC,CAAC;QAChC,cAAc,IAAI,aAAa,CAAC;QAEhC,IAAI,cAAc,KAAK,CAAC;YACpB,MAAM;IACd,CAAC;IAED,OAAO,GAAG,CAAC;AACf,CAAC"}
5
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.d.ts
generated
vendored
Normal file
@@ -0,0 +1,5 @@
import { BatchItem, PrioritizedBatchItem } from "../../types.js";
export declare function maximumParallelismStrategy({ items, size }: {
    items: readonly BatchItem[];
    size: number;
}): PrioritizedBatchItem[];
42
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js
generated
vendored
Normal file
@@ -0,0 +1,42 @@
export function maximumParallelismStrategy({ items, size }) {
    let leftFreeTokens = size;
    const minTokensForEachItem = Math.floor(leftFreeTokens / items.length);
    const res = [];
    const clippedItems = [];
    for (const item of items) {
        const processAmount = Math.min(item.tokens.length, leftFreeTokens, minTokensForEachItem);
        const prioritizeItem = { item, processAmount };
        res.push(prioritizeItem);
        leftFreeTokens -= processAmount;
        if (processAmount < item.tokens.length)
            clippedItems.push(prioritizeItem);
        if (leftFreeTokens === 0)
            break;
    }
    for (let passesLeft = 3; leftFreeTokens > 0 && clippedItems.length > 0 && passesLeft > 0; passesLeft--) {
        const minIncreaseAmount = Math.ceil(leftFreeTokens / clippedItems.length);
        for (let i = 0; i < clippedItems.length && leftFreeTokens > 0; i++) {
            const prioritizeItem = clippedItems[i];
            const unprocessedAmount = prioritizeItem.item.tokens.length - prioritizeItem.processAmount;
            const increaseAmount = Math.min(unprocessedAmount, leftFreeTokens, minIncreaseAmount);
            prioritizeItem.processAmount += increaseAmount;
            if (increaseAmount === unprocessedAmount) {
                clippedItems.splice(i, 1);
                i--;
            }
        }
    }
    clippedItems.sort((a, b) => b.item.evaluationPriority - a.item.evaluationPriority);
    for (let i = 0; i < clippedItems.length && leftFreeTokens > 0; i++) {
        const prioritizeItem = clippedItems[i];
        const unprocessedAmount = prioritizeItem.item.tokens.length - prioritizeItem.processAmount;
        const increaseAmount = Math.min(unprocessedAmount, leftFreeTokens);
        prioritizeItem.processAmount += increaseAmount;
        if (increaseAmount === unprocessedAmount) {
            clippedItems.splice(i, 1);
            i--;
        }
    }
    return res;
}
//# sourceMappingURL=maximumParallelismStrategy.js.map
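// --- Worked example (editor addition, not part of the vendored file above) ---
// With the same batch budget of 6 tokens and three pending items of 4 tokens each, every item is
// first given floor(6 / 3) = 2 tokens, so all three sequences advance in the same batch
// (contrast with firstInFirstOutStrategy above, which yields [4, 2] for the same input):
const parallelExample = maximumParallelismStrategy({
    items: [
        { tokens: new Array(4), logits: [], evaluationPriority: 5 },
        { tokens: new Array(4), logits: [], evaluationPriority: 5 },
        { tokens: new Array(4), logits: [], evaluationPriority: 5 }
    ],
    size: 6
});
// parallelExample.map((entry) => entry.processAmount) → [2, 2, 2]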
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
{"version":3,"file":"maximumParallelismStrategy.js","sourceRoot":"","sources":["../../../../../src/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,0BAA0B,CAAC,EAAC,KAAK,EAAE,IAAI,EAA8C;IACjG,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,MAAM,oBAAoB,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;IAEvE,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,MAAM,YAAY,GAA2B,EAAE,CAAC;IAEhD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACvB,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,cAAc,EAAE,oBAAoB,CAAC,CAAC;QACzF,MAAM,cAAc,GAAG,EAAC,IAAI,EAAE,aAAa,EAAC,CAAC;QAE7C,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QACzB,cAAc,IAAI,aAAa,CAAC;QAEhC,IAAI,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM;YAClC,YAAY,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAEtC,IAAI,cAAc,KAAK,CAAC;YACpB,MAAM;IACd,CAAC;IAED,KAAK,IAAI,UAAU,GAAG,CAAC,EAAE,cAAc,GAAG,CAAC,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,UAAU,GAAG,CAAC,EAAE,UAAU,EAAE,EAAE,CAAC;QACrG,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC;QAE1E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACjE,MAAM,cAAc,GAAG,YAAY,CAAC,CAAC,CAAE,CAAC;YACxC,MAAM,iBAAiB,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,cAAc,CAAC,aAAa,CAAC;YAC3F,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,iBAAiB,EAAE,cAAc,EAAE,iBAAiB,CAAC,CAAC;YACtF,cAAc,CAAC,aAAa,IAAI,cAAc,CAAC;YAE/C,IAAI,cAAc,KAAK,iBAAiB,EAAE,CAAC;gBACvC,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;gBAC1B,CAAC,EAAE,CAAC;YACR,CAAC;QACL,CAAC;IACL,CAAC;IAED,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,kBAAkB,GAAG,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;IAEnF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACjE,MAAM,cAAc,GAAG,YAAY,CAAC,CAAC,CAAE,CAAC;QACxC,MAAM,iBAAiB,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,cAAc,CAAC,aAAa,CAAC;QAC3F,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,iBAAiB,EAAE,cAAc,CAAC,CAAC;QACnE,cAAc,CAAC,aAAa,IAAI,cAAc,CAAC;QAE/C,IAAI,cAAc,KAAK,iBAAiB,EAAE,CAAC;YACvC,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAC1B,CAAC,EAAE,CAAC;QACR,CAAC;IACL,CAAC;IAED,OAAO,GAAG,CAAC;AACf,CAAC"}
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.d.ts
generated
vendored
Normal file
@@ -0,0 +1 @@
export declare function padSafeContextSize(value: number, padDirection: "up" | "down", padding?: number): number;
18
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.js
generated
vendored
Normal file
@@ -0,0 +1,18 @@
import { contextSizePad } from "../../../config.js";
export function padSafeContextSize(value, padDirection, padding = contextSizePad) {
    const paddedSize = ggmlPad(value, padding);
    if (paddedSize === value)
        return value;
    else if (padDirection === "up")
        return paddedSize;
    else if (padDirection === "down") {
        const smallerPaddedSize = ggmlPad(value - padding, padding);
        if (smallerPaddedSize >= padding)
            return smallerPaddedSize;
    }
    return paddedSize;
}
function ggmlPad(value, padding) {
    return ((value + padding - 1) & ~(padding - 1));
}
//# sourceMappingURL=padSafeContextSize.js.map
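// --- Worked example (editor addition, not part of the vendored file above) ---
// ggmlPad rounds up to the next multiple of the padding, which must be a power of two for the bit
// mask to work. Assuming a padding of 256 (the context-size docs above mention alignment to
// multiples of 256):
//   padSafeContextSize(1000, "up", 256)   → 1024   ((1000 + 255) & ~255)
//   padSafeContextSize(1000, "down", 256) → 768    (ggmlPad(1000 - 256, 256), still ≥ the padding)
//   padSafeContextSize(1024, "down", 256) → 1024   (already aligned, returned unchanged)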
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
{"version":3,"file":"padSafeContextSize.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaContext/utils/padSafeContextSize.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,cAAc,EAAC,MAAM,oBAAoB,CAAC;AAElD,MAAM,UAAU,kBAAkB,CAAC,KAAa,EAAE,YAA2B,EAAE,UAAkB,cAAc;IAC3G,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAE3C,IAAI,UAAU,KAAK,KAAK;QACpB,OAAO,KAAK,CAAC;SACZ,IAAI,YAAY,KAAK,IAAI;QAC1B,OAAO,UAAU,CAAC;SACjB,IAAI,YAAY,KAAK,MAAM,EAAE,CAAC;QAC/B,MAAM,iBAAiB,GAAG,OAAO,CAAC,KAAK,GAAG,OAAO,EAAE,OAAO,CAAC,CAAC;QAC5D,IAAI,iBAAiB,IAAI,OAAO;YAC5B,OAAO,iBAAiB,CAAC;IACjC,CAAC;IAED,OAAO,UAAU,CAAC;AACtB,CAAC;AACD,SAAS,OAAO,CAAC,KAAa,EAAE,OAAe;IAC3C,OAAO,CAAC,CAAC,KAAK,GAAG,OAAO,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC;AACpD,CAAC"}
2
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.d.ts
generated
vendored
Normal file
@@ -0,0 +1,2 @@
import { BatchingOptions } from "../types.js";
export declare function resolveBatchItemsPrioritizationStrategy(strategy: Required<BatchingOptions>["itemPrioritizationStrategy"]): import("../types.js").CustomBatchingPrioritizationStrategy;
13
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js
generated
vendored
Normal file
@@ -0,0 +1,13 @@
import { maximumParallelismStrategy } from "./batchItemsPrioritizationStrategies/maximumParallelismStrategy.js";
import { firstInFirstOutStrategy } from "./batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js";
export function resolveBatchItemsPrioritizationStrategy(strategy) {
    if (strategy instanceof Function)
        return strategy;
    else if (strategy === "maximumParallelism")
        return maximumParallelismStrategy;
    else if (strategy === "firstInFirstOut")
        return firstInFirstOutStrategy;
    void strategy;
    throw new Error(`Unknown batch items prioritize strategy: ${strategy}`);
}
//# sourceMappingURL=resolveBatchItemsPrioritizationStrategy.js.map
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
{"version":3,"file":"resolveBatchItemsPrioritizationStrategy.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,0BAA0B,EAAC,MAAM,oEAAoE,CAAC;AAC9G,OAAO,EAAC,uBAAuB,EAAC,MAAM,iEAAiE,CAAC;AAExG,MAAM,UAAU,uCAAuC,CAAC,QAAiE;IACrH,IAAI,QAAQ,YAAY,QAAQ;QAC5B,OAAO,QAAQ,CAAC;SACf,IAAI,QAAQ,KAAK,oBAAoB;QACtC,OAAO,0BAA0B,CAAC;SACjC,IAAI,QAAQ,KAAK,iBAAiB;QACnC,OAAO,uBAAuB,CAAC;IAEnC,KAAM,QAAyB,CAAC;IAEhC,MAAM,IAAI,KAAK,CAAC,4CAA4C,QAAQ,EAAE,CAAC,CAAC;AAC5E,CAAC"}
21
node_modules/node-llama-cpp/dist/evaluator/LlamaEmbedding.d.ts
generated
vendored
Normal file
@@ -0,0 +1,21 @@
export type LlamaEmbeddingOptions = {
    vector: readonly number[];
};
export type LlamaEmbeddingJSON = {
    type: "embedding";
    vector: readonly number[];
};
export declare class LlamaEmbedding {
    readonly vector: readonly number[];
    constructor(options: LlamaEmbeddingOptions);
    toJSON(): LlamaEmbeddingJSON;
    /**
     * Calculates the cosine similarity between this embedding and another embedding.
     *
     * Note that you should only compare embeddings created by the exact same model file.
     * @returns A value between 0 and 1 representing the similarity between the embedding vectors,
     * where 1 means the embeddings are identical.
     */
    calculateCosineSimilarity(other: LlamaEmbedding | LlamaEmbeddingJSON | readonly number[]): number;
    static fromJSON(json: LlamaEmbeddingJSON): LlamaEmbedding;
}
53
node_modules/node-llama-cpp/dist/evaluator/LlamaEmbedding.js
generated
vendored
Normal file
@@ -0,0 +1,53 @@
export class LlamaEmbedding {
    vector;
    constructor(options) {
        this.vector = Object.freeze(options.vector.slice());
    }
    toJSON() {
        return {
            type: "embedding",
            vector: this.vector
        };
    }
    /**
     * Calculates the cosine similarity between this embedding and another embedding.
     *
     * Note that you should only compare embeddings created by the exact same model file.
     * @returns A value between 0 and 1 representing the similarity between the embedding vectors,
     * where 1 means the embeddings are identical.
     */
    calculateCosineSimilarity(other) {
        const otherVector = other instanceof Array
            ? other
            : other.vector;
        if (otherVector == null)
            throw new Error("Other vector is null");
        else if (otherVector.length !== this.vector.length) {
            if (otherVector.length === 0 || this.vector.length === 0)
                return 0;
            else
                throw new Error("Vectors have different lengths");
        }
        let dotProduct = 0;
        let thisMagnitude = 0;
        let otherMagnitude = 0;
        for (let i = 0; i < this.vector.length; i++) {
            dotProduct += this.vector[i] * otherVector[i];
            thisMagnitude += Math.pow(this.vector[i], 2);
            otherMagnitude += Math.pow(otherVector[i], 2);
        }
        if (thisMagnitude === 0 && otherMagnitude === 0)
            return 1;
        else if (thisMagnitude === 0 || otherMagnitude === 0)
            return 0;
        const thisNorm = Math.sqrt(thisMagnitude);
        const otherNorm = Math.sqrt(otherMagnitude);
        return dotProduct / (thisNorm * otherNorm);
    }
    static fromJSON(json) {
        return new LlamaEmbedding({
            vector: json.vector
        });
    }
}
//# sourceMappingURL=LlamaEmbedding.js.map
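// --- Illustrative usage sketch (editor addition, not part of the vendored file above) ---
// A minimal sketch of comparing two embeddings with calculateCosineSimilarity, assuming the
// `model.createEmbeddingContext()` API from the linked embedding guide; the model path and the
// example texts are placeholders.
import { getLlama } from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({ modelPath: "path/to/embedding-model.gguf" });
const embeddingContext = await model.createEmbeddingContext();

const first = await embeddingContext.getEmbeddingFor("The sky was clear and blue today");
const second = await embeddingContext.getEmbeddingFor("The weather was sunny with a clear sky");

// Only compare embeddings produced by the exact same model file; 1 means identical vectors.
console.log("similarity:", first.calculateCosineSimilarity(second));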
1
node_modules/node-llama-cpp/dist/evaluator/LlamaEmbedding.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
{"version":3,"file":"LlamaEmbedding.js","sourceRoot":"","sources":["../../src/evaluator/LlamaEmbedding.ts"],"names":[],"mappings":"AASA,MAAM,OAAO,cAAc;IACP,MAAM,CAAoB;IAE1C,YAAmB,OAA8B;QAC7C,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;IACxD,CAAC;IAEM,MAAM;QACT,OAAO;YACH,IAAI,EAAE,WAAW;YACjB,MAAM,EAAE,IAAI,CAAC,MAAM;SACtB,CAAC;IACN,CAAC;IAED;;;;;;OAMG;IACI,yBAAyB,CAAC,KAA8D;QAC3F,MAAM,WAAW,GAAG,KAAK,YAAY,KAAK;YACtC,CAAC,CAAC,KAAK;YACP,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC;QAEnB,IAAI,WAAW,IAAI,IAAI;YACnB,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;aACvC,IAAI,WAAW,CAAC,MAAM,KAAK,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YACjD,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,MAAM,KAAK,CAAC;gBACpD,OAAO,CAAC,CAAC;;gBAET,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;QAC1D,CAAC;QAED,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,cAAc,GAAG,CAAC,CAAC;QACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,UAAU,IAAI,IAAI,CAAC,MAAM,CAAC,CAAC,CAAE,GAAG,WAAW,CAAC,CAAC,CAAE,CAAC;YAChD,aAAa,IAAI,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC,CAAC;YAC9C,cAAc,IAAI,IAAI,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC,CAAC;QACnD,CAAC;QAED,IAAI,aAAa,KAAK,CAAC,IAAI,cAAc,KAAK,CAAC;YAC3C,OAAO,CAAC,CAAC;aACR,IAAI,aAAa,KAAK,CAAC,IAAI,cAAc,KAAK,CAAC;YAChD,OAAO,CAAC,CAAC;QAEb,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAE5C,OAAO,UAAU,GAAG,CAAC,QAAQ,GAAG,SAAS,CAAC,CAAC;IAC/C,CAAC;IAEM,MAAM,CAAC,QAAQ,CAAC,IAAwB;QAC3C,OAAO,IAAI,cAAc,CAAC;YACtB,MAAM,EAAE,IAAI,CAAC,MAAM;SACtB,CAAC,CAAC;IACP,CAAC;CACJ"}
|
||||
52
node_modules/node-llama-cpp/dist/evaluator/LlamaEmbeddingContext.d.ts
generated
vendored
Normal file
@@ -0,0 +1,52 @@
import { EventRelay } from "lifecycle-utils";
import { Token } from "../types.js";
import { LlamaText } from "../utils/LlamaText.js";
import { LlamaEmbedding } from "./LlamaEmbedding.js";
import type { LlamaModel } from "./LlamaModel/LlamaModel.js";
export type LlamaEmbeddingContextOptions = {
    /**
     * The number of tokens the model can see at once.
     * - **`"auto"`** - adapt to the current VRAM state and attempt to set the context size as high as possible up to the size
     * the model was trained on.
     * - **`number`** - set the context size to a specific number of tokens.
     * If there's not enough VRAM, an error will be thrown.
     * Use with caution.
     * - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attempt to set the context size as high as possible
     * up to the size the model was trained on, but at least `min` and at most `max`.
     *
     * Defaults to `"auto"`.
     */
    contextSize?: "auto" | number | {
        min?: number;
        max?: number;
    };
    /** prompt processing batch size */
    batchSize?: number;
    /**
     * Number of threads to use to evaluate tokens.
     * Set to 0 to use the maximum threads supported by the current machine hardware.
     */
    threads?: number;
    /** An abort signal to abort the context creation */
    createSignal?: AbortSignal;
    /**
     * Ignore insufficient memory errors and continue with the context creation.
     * Can cause the process to crash if there's not enough VRAM for the new context.
     *
     * Defaults to `false`.
     */
    ignoreMemorySafetyChecks?: boolean;
};
/**
 * @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
 */
export declare class LlamaEmbeddingContext {
    readonly onDispose: EventRelay<void>;
    private constructor();
    getEmbeddingFor(input: Token[] | string | LlamaText): Promise<LlamaEmbedding>;
    dispose(): Promise<void>;
    /** @hidden */
    [Symbol.asyncDispose](): Promise<void>;
    get disposed(): boolean;
    get model(): LlamaModel;
}
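For reference, this is how the options above might be passed when creating an embedding context. A sketch under the same assumptions as before (`getLlama`, `loadModel`, and a placeholder model path are not part of this file):

import {getLlama} from "node-llama-cpp";

const llama = await getLlama(); // assumed entry point
const model = await llama.loadModel({modelPath: "model.gguf"}); // placeholder path

const embeddingContext = await model.createEmbeddingContext({
    contextSize: {min: 512, max: 2048}, // "auto" and a plain number are also accepted
    batchSize: 512,
    threads: 0, // 0 = use the maximum threads supported by the hardware
    ignoreMemorySafetyChecks: false
});

const embedding = await embeddingContext.getEmbeddingFor("Hello world");
console.log(embedding.vector.length); // equals the model's embedding vector size
await embeddingContext.dispose();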
86
node_modules/node-llama-cpp/dist/evaluator/LlamaEmbeddingContext.js
generated
vendored
Normal file
@@ -0,0 +1,86 @@
import { AsyncDisposeAggregator, EventRelay, withLock } from "lifecycle-utils";
import { tokenizeInput } from "../utils/tokenizeInput.js";
import { resolveBeginningTokenToPrepend, resolveEndTokenToAppend } from "../utils/tokenizerUtils.js";
import { LlamaEmbedding } from "./LlamaEmbedding.js";
/**
 * @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
 */
export class LlamaEmbeddingContext {
    /** @internal */ _llamaContext;
    /** @internal */ _sequence;
    /** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
    onDispose = new EventRelay();
    constructor({ _llamaContext }) {
        this._llamaContext = _llamaContext;
        this._sequence = this._llamaContext.getSequence();
        this._disposeAggregator.add(this._llamaContext.onDispose.createListener(() => {
            void this._disposeAggregator.dispose();
        }));
        this._disposeAggregator.add(this.onDispose.dispatchEvent);
        this._disposeAggregator.add(async () => {
            await this._llamaContext.dispose();
        });
    }
    async getEmbeddingFor(input) {
        const resolvedInput = tokenizeInput(input, this._llamaContext.model.tokenizer, undefined, true);
        if (resolvedInput.length > this._llamaContext.contextSize)
            throw new Error("Input is longer than the context size. " +
                "Try to increase the context size or use another model that supports longer contexts.");
        else if (resolvedInput.length === 0)
            return new LlamaEmbedding({
                vector: []
            });
        const beginningToken = resolveBeginningTokenToPrepend(this.model.vocabularyType, this.model.tokens);
        if (beginningToken != null && resolvedInput[0] !== beginningToken)
            resolvedInput.unshift(beginningToken);
        const endToken = resolveEndTokenToAppend(this.model.vocabularyType, this.model.tokens);
        if (endToken != null && resolvedInput.at(-1) !== endToken)
            resolvedInput.push(endToken);
        return await withLock([this, "evaluate"], async () => {
            await this._sequence.eraseContextTokenRanges([{
                    start: 0,
                    end: this._sequence.nextTokenIndex
                }]);
            const iterator = this._sequence.evaluate(resolvedInput, { _noSampling: true });
            // eslint-disable-next-line @typescript-eslint/no-unused-vars
            for await (const token of iterator) {
                break; // only generate one token to get embeddings
            }
            const embedding = this._llamaContext._ctx.getEmbedding(resolvedInput.length);
            const embeddingVector = Array.from(embedding);
            return new LlamaEmbedding({
                vector: embeddingVector
            });
        });
    }
    async dispose() {
        await this._disposeAggregator.dispose();
    }
    /** @hidden */
    [Symbol.asyncDispose]() {
        return this.dispose();
    }
    get disposed() {
        return this._llamaContext.disposed;
    }
    get model() {
        return this._llamaContext.model;
    }
    /** @internal */
    static async _create({ _model }, { contextSize, batchSize, threads = 6, createSignal, ignoreMemorySafetyChecks }) {
        if (_model.fileInsights.hasEncoder && _model.fileInsights.hasDecoder)
            throw new Error("Computing embeddings is not supported for encoder-decoder models.");
        const llamaContext = await _model.createContext({
            contextSize,
            batchSize,
            threads,
            createSignal,
            ignoreMemorySafetyChecks,
            _embeddings: true
        });
        return new LlamaEmbeddingContext({
            _llamaContext: llamaContext
        });
    }
}
//# sourceMappingURL=LlamaEmbeddingContext.js.map
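Note that `getEmbeddingFor` above throws when the tokenized input is longer than the context size, so long documents must be split by the caller. A rough sketch of character-based chunking; the chunk size, overlap, and loading helpers are illustrative assumptions:

import {getLlama} from "node-llama-cpp";

// naive character-based chunking; a real implementation would count tokens instead
function chunkText(text: string, chunkSize = 1000, overlap = 100): string[] {
    const chunks: string[] = [];
    for (let start = 0; start < text.length; start += chunkSize - overlap)
        chunks.push(text.slice(start, start + chunkSize));
    return chunks;
}

const llama = await getLlama(); // assumed entry point
const model = await llama.loadModel({modelPath: "model.gguf"}); // placeholder path
const embeddingContext = await model.createEmbeddingContext();

const longDocument = "a very long document...";
const chunkEmbeddings = [];
for (const chunk of chunkText(longDocument))
    chunkEmbeddings.push(await embeddingContext.getEmbeddingFor(chunk));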
1
node_modules/node-llama-cpp/dist/evaluator/LlamaEmbeddingContext.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"LlamaEmbeddingContext.js","sourceRoot":"","sources":["../../src/evaluator/LlamaEmbeddingContext.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,sBAAsB,EAAE,UAAU,EAAE,QAAQ,EAAC,MAAM,iBAAiB,CAAC;AAG7E,OAAO,EAAC,aAAa,EAAC,MAAM,2BAA2B,CAAC;AACxD,OAAO,EAAC,8BAA8B,EAAE,uBAAuB,EAAC,MAAM,4BAA4B,CAAC;AACnG,OAAO,EAAC,cAAc,EAAC,MAAM,qBAAqB,CAAC;AA2CnD;;GAEG;AACH,MAAM,OAAO,qBAAqB;IAC9B,gBAAgB,CAAkB,aAAa,CAAe;IAC9D,gBAAgB,CAAkB,SAAS,CAAuB;IAClE,gBAAgB,CAAkB,kBAAkB,GAAG,IAAI,sBAAsB,EAAE,CAAC;IAEpE,SAAS,GAAG,IAAI,UAAU,EAAQ,CAAC;IAEnD,YAAoB,EAChB,aAAa,EAGhB;QACG,IAAI,CAAC,aAAa,GAAG,aAAa,CAAC;QACnC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,WAAW,EAAE,CAAC;QAElD,IAAI,CAAC,kBAAkB,CAAC,GAAG,CACvB,IAAI,CAAC,aAAa,CAAC,SAAS,CAAC,cAAc,CAAC,GAAG,EAAE;YAC7C,KAAK,IAAI,CAAC,kBAAkB,CAAC,OAAO,EAAE,CAAC;QAC3C,CAAC,CAAC,CACL,CAAC;QACF,IAAI,CAAC,kBAAkB,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC,CAAC;QAC1D,IAAI,CAAC,kBAAkB,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE;YACnC,MAAM,IAAI,CAAC,aAAa,CAAC,OAAO,EAAE,CAAC;QACvC,CAAC,CAAC,CAAC;IACP,CAAC;IAEM,KAAK,CAAC,eAAe,CAAC,KAAmC;QAC5D,MAAM,aAAa,GAAG,aAAa,CAAC,KAAK,EAAE,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,SAAS,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;QAEhG,IAAI,aAAa,CAAC,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,WAAW;YACrD,MAAM,IAAI,KAAK,CACX,yCAAyC;gBACzC,sFAAsF,CACzF,CAAC;aACD,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC;YAC/B,OAAO,IAAI,cAAc,CAAC;gBACtB,MAAM,EAAE,EAAE;aACb,CAAC,CAAC;QAEP,MAAM,cAAc,GAAG,8BAA8B,CAAC,IAAI,CAAC,KAAK,CAAC,cAAc,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACpG,IAAI,cAAc,IAAI,IAAI,IAAI,aAAa,CAAC,CAAC,CAAC,KAAK,cAAc;YAC7D,aAAa,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;QAE1C,MAAM,QAAQ,GAAG,uBAAuB,CAAC,IAAI,CAAC,KAAK,CAAC,cAAc,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACvF,IAAI,QAAQ,IAAI,IAAI,IAAI,aAAa,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,QAAQ;YACrD,aAAa,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAEjC,OAAO,MAAM,QAAQ,CAAC,CAAC,IAA6B,EAAE,UAAU,CAAC,EAAE,KAAK,IAAI,EAAE;YAC1E,MAAM,IAAI,CAAC,SAAS,CAAC,uBAAuB,CAAC,CAAC;oBAC1C,KAAK,EAAE,CAAC;oBACR,GAAG,EAAE,IAAI,CAAC,SAAS,CAAC,cAAc;iBACrC,CAAC,CAAC,CAAC;YAEJ,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,aAAa,EAAE,EAAC,WAAW,EAAE,IAAI,EAAC,CAAC,CAAC;YAC7E,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,QAAQ,EAAE,CAAC;gBACjC,MAAM,CAAC,4CAA4C;YACvD,CAAC;YAED,MAAM,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAC7E,MAAM,eAAe,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAE9C,OAAO,IAAI,cAAc,CAAC;gBACtB,MAAM,EAAE,eAAe;aAC1B,CAAC,CAAC;QACP,CAAC,CAAC,CAAC;IACP,CAAC;IAEM,KAAK,CAAC,OAAO;QAChB,MAAM,IAAI,CAAC,kBAAkB,CAAC,OAAO,EAAE,CAAC;IAC5C,CAAC;IAED,cAAc;IACP,CAAC,MAAM,CAAC,YAAY,CAAC;QACxB,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;IAC1B,CAAC;IAED,IAAW,QAAQ;QACf,OAAO,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC;IACvC,CAAC;IAED,IAAW,KAAK;QACZ,OAAO,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC;IACpC,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,EACxB,MAAM,EAGT,EAAE,EACC,WAAW,EACX,SAAS,EACT,OAAO,GAAG,CAAC,EACX,YAAY,EACZ,wBAAwB,EACG;QAC3B,IAAI,MAAM,CAAC,YAAY,CAAC,UAAU,IAAI,MAAM,CAAC,YAAY,CAAC,UAAU;YAChE,MAAM,IAAI,KAAK,CAAC,mEAAmE,CAAC,CAAC;QAEzF,MAAM,YAAY,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC;YAC5C,WAAW;YACX,SAAS;YACT,OAAO;YACP,YAAY;YACZ,wBAAwB;YACxB,WAAW,EAAE,IAAI;SACpB,CAAC,CAAC;QAEH,OAAO,IAAI,qBAAqB,CAAC;YAC7B,aAAa,EAAE,YAAY;SAC9B,CAAC,CAAC;IACP,CAAC;CACJ"}
|
||||
39
node_modules/node-llama-cpp/dist/evaluator/LlamaGrammar.d.ts
generated
vendored
Normal file
@@ -0,0 +1,39 @@
import { LlamaText } from "../utils/LlamaText.js";
import { Llama } from "../bindings/Llama.js";
import { Token } from "../types.js";
export type LlamaGrammarOptions = {
    /** GBNF grammar */
    grammar: string;
    /** Consider any of these as EOS for the generated text. Only supported by `LlamaChat` and `LlamaChatSession` */
    stopGenerationTriggers?: readonly (LlamaText | string | readonly (string | Token)[])[];
    /** Trim whitespace from the end of the generated text. Only supported by `LlamaChat` and `LlamaChatSession` */
    trimWhitespaceSuffix?: boolean;
    /**
     * Root rule name.
     *
     * Defaults to `"root"`.
     */
    rootRuleName?: string;
};
/**
 * @see [Using Grammar](https://node-llama-cpp.withcat.ai/guide/grammar) tutorial
 */
export declare class LlamaGrammar {
    /**
     * > GBNF files are supported.
     * > More info here: [
     * github:ggml-org/llama.cpp:grammars/README.md
     * ](https://github.com/ggml-org/llama.cpp/blob/f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26/grammars/README.md)
     *
     * Prefer to create a new instance of this class by using `llama.createGrammar(...)`.
     * @deprecated Use `llama.createGrammar(...)` instead.
     * @param llama
     * @param options
     */
    constructor(llama: Llama, { grammar, stopGenerationTriggers, trimWhitespaceSuffix, rootRuleName }: LlamaGrammarOptions);
    get grammar(): string;
    get rootRuleName(): string;
    get stopGenerationTriggers(): readonly (string | import("../utils/LlamaText.js")._LlamaText | readonly (string | Token)[])[];
    get trimWhitespaceSuffix(): boolean;
    static getFor(llama: Llama, type: "json" | "json_arr" | "english" | "list" | "c" | "arithmetic" | "japanese" | "chess"): Promise<LlamaGrammar>;
}
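The deprecation notes above point to `llama.createGrammar(...)` as the preferred way to construct a grammar. A minimal sketch of passing `LlamaGrammarOptions` through it; the GBNF rule itself is just an illustrative yes/no grammar and `getLlama` is an assumption:

import {getLlama} from "node-llama-cpp";

const llama = await getLlama(); // assumed entry point

// a tiny GBNF grammar that restricts the output to "yes" or "no"
const grammar = await llama.createGrammar({
    grammar: 'root ::= "yes" | "no"',
    stopGenerationTriggers: ["\n"],
    trimWhitespaceSuffix: true
});

console.log(grammar.rootRuleName); // "root", since rootRuleName was not overridden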
72
node_modules/node-llama-cpp/dist/evaluator/LlamaGrammar.js
generated
vendored
Normal file
@@ -0,0 +1,72 @@
import path from "path";
import fs from "fs-extra";
import { getGrammarsFolder } from "../utils/getGrammarsFolder.js";
import { LlamaText } from "../utils/LlamaText.js";
/**
 * @see [Using Grammar](https://node-llama-cpp.withcat.ai/guide/grammar) tutorial
 */
export class LlamaGrammar {
    /** @internal */ _llama;
    /** @internal */ _grammar;
    /** @internal */ _stopGenerationTriggers;
    /** @internal */ _trimWhitespaceSuffix;
    /** @internal */ _grammarText;
    /** @internal */ _rootRuleName;
    /**
     * > GBNF files are supported.
     * > More info here: [
     * github:ggml-org/llama.cpp:grammars/README.md
     * ](https://github.com/ggml-org/llama.cpp/blob/f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26/grammars/README.md)
     *
     * Prefer to create a new instance of this class by using `llama.createGrammar(...)`.
     * @deprecated Use `llama.createGrammar(...)` instead.
     * @param llama
     * @param options
     */
    constructor(llama, { grammar, stopGenerationTriggers = [], trimWhitespaceSuffix = false, rootRuleName = "root" }) {
        this._llama = llama;
        this._grammar = new this._llama._bindings.AddonGrammar(grammar, {
            addonExports: this._llama._bindings,
            rootRuleName
        });
        this._stopGenerationTriggers = stopGenerationTriggers ?? [];
        this._trimWhitespaceSuffix = trimWhitespaceSuffix;
        this._grammarText = grammar;
        this._rootRuleName = rootRuleName;
    }
    get grammar() {
        return this._grammarText;
    }
    get rootRuleName() {
        return this._rootRuleName;
    }
    get stopGenerationTriggers() {
        return this._stopGenerationTriggers;
    }
    get trimWhitespaceSuffix() {
        return this._trimWhitespaceSuffix;
    }
    /**
     * Test if the given text is compatible with the grammar.
     * @internal
     */
    _testText(text) {
        return this._grammar.isTextCompatible(String(text));
    }
    static async getFor(llama, type) {
        const grammarsFolder = await getGrammarsFolder(llama.buildType);
        const grammarFile = path.join(grammarsFolder, type + ".gbnf");
        if (await fs.pathExists(grammarFile)) {
            const grammar = await fs.readFile(grammarFile, "utf8");
            return new LlamaGrammar(llama, {
                grammar,
                stopGenerationTriggers: [LlamaText(["\n".repeat((type === "json" || type === "json_arr")
                        ? 4
                        : 10)])], // a workaround for the model not stopping text generation
                trimWhitespaceSuffix: true
            });
        }
        throw new Error(`Grammar file for type "${type}" was not found in "${grammarsFolder}"`);
    }
}
//# sourceMappingURL=LlamaGrammar.js.map
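`getFor` above resolves a bundled `.gbnf` file by name, so the built-in grammars can be used without writing any GBNF. A sketch, again assuming `getLlama` from the package entry point:

import {getLlama, LlamaGrammar} from "node-llama-cpp";

const llama = await getLlama(); // assumed entry point

// loads json.gbnf from the bundled grammars folder
const jsonGrammar = await LlamaGrammar.getFor(llama, "json");
console.log(jsonGrammar.trimWhitespaceSuffix); // true - set by getFor, as shown above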
1
node_modules/node-llama-cpp/dist/evaluator/LlamaGrammar.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"LlamaGrammar.js","sourceRoot":"","sources":["../../src/evaluator/LlamaGrammar.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,MAAM,UAAU,CAAC;AAC1B,OAAO,EAAC,iBAAiB,EAAC,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAC,SAAS,EAAC,MAAM,uBAAuB,CAAC;AAwBhD;;GAEG;AACH,MAAM,OAAO,YAAY;IACrB,gBAAgB,CAAiB,MAAM,CAAQ;IAC/C,gBAAgB,CAAiB,QAAQ,CAAe;IACxD,gBAAgB,CAAkB,uBAAuB,CAAgE;IACzH,gBAAgB,CAAkB,qBAAqB,CAAU;IACjE,gBAAgB,CAAkB,YAAY,CAAS;IACvD,gBAAgB,CAAkB,aAAa,CAAS;IAExD;;;;;;;;;;OAUG;IACH,YAAmB,KAAY,EAAE,EAC7B,OAAO,EAAE,sBAAsB,GAAG,EAAE,EAAE,oBAAoB,GAAG,KAAK,EAAE,YAAY,GAAG,MAAM,EACvE;QAClB,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC;QACpB,IAAI,CAAC,QAAQ,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,OAAO,EAAE;YAC5D,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;YACnC,YAAY;SACf,CAAC,CAAC;QACH,IAAI,CAAC,uBAAuB,GAAG,sBAAsB,IAAI,EAAE,CAAC;QAC5D,IAAI,CAAC,qBAAqB,GAAG,oBAAoB,CAAC;QAClD,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC;QAC5B,IAAI,CAAC,aAAa,GAAG,YAAY,CAAC;IACtC,CAAC;IAED,IAAW,OAAO;QACd,OAAO,IAAI,CAAC,YAAY,CAAC;IAC7B,CAAC;IAED,IAAW,YAAY;QACnB,OAAO,IAAI,CAAC,aAAa,CAAC;IAC9B,CAAC;IAED,IAAW,sBAAsB;QAC7B,OAAO,IAAI,CAAC,uBAAuB,CAAC;IACxC,CAAC;IAED,IAAW,oBAAoB;QAC3B,OAAO,IAAI,CAAC,qBAAqB,CAAC;IACtC,CAAC;IAED;;;OAGG;IACI,SAAS,CAAC,IAAY;QACzB,OAAO,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;IACxD,CAAC;IAEM,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,KAAY,EAAE,IAA0F;QAC/H,MAAM,cAAc,GAAG,MAAM,iBAAiB,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;QAEhE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,EAAE,IAAI,GAAG,OAAO,CAAC,CAAC;QAE9D,IAAI,MAAM,EAAE,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;YACnC,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;YACvD,OAAO,IAAI,YAAY,CAAC,KAAK,EAAE;gBAC3B,OAAO;gBACP,sBAAsB,EAAE,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,MAAM,CAC3C,CAAC,IAAI,KAAK,MAAM,IAAI,IAAI,KAAK,UAAU,CAAC;4BACpC,CAAC,CAAC,CAAC;4BACH,CAAC,CAAC,EAAE,CACX,CAAC,CAAC,CAAC,EAAE,oEAAoE;gBAC1E,oBAAoB,EAAE,IAAI;aAC7B,CAAC,CAAC;QACP,CAAC;QAED,MAAM,IAAI,KAAK,CAAC,0BAA0B,IAAI,uBAAuB,cAAc,GAAG,CAAC,CAAC;IAC5F,CAAC;CACJ"}
|
||||
19
node_modules/node-llama-cpp/dist/evaluator/LlamaGrammarEvaluationState.d.ts
generated
vendored
Normal file
@@ -0,0 +1,19 @@
import type { LlamaGrammar } from "./LlamaGrammar.js";
import type { LlamaModel } from "./LlamaModel/LlamaModel.js";
export type LlamaGrammarEvaluationStateOptions = {
    model: LlamaModel;
    grammar: LlamaGrammar;
};
/**
 * Grammar evaluation state is used to track the model response to determine the next allowed characters for the model to generate.
 *
 * Create a new grammar evaluation state for every response you generate with the model.
 *
 * This is only needed when using the `LlamaContext` class directly, since `LlamaChatSession` already handles this for you.
 */
export declare class LlamaGrammarEvaluationState {
    constructor(options: LlamaGrammarEvaluationStateOptions);
    constructor(existingState: LlamaGrammarEvaluationState);
    /** Clone the grammar evaluation state */
    clone(): LlamaGrammarEvaluationState;
}
29
node_modules/node-llama-cpp/dist/evaluator/LlamaGrammarEvaluationState.js
generated
vendored
Normal file
@@ -0,0 +1,29 @@
/**
 * Grammar evaluation state is used to track the model response to determine the next allowed characters for the model to generate.
 *
 * Create a new grammar evaluation state for every response you generate with the model.
 *
 * This is only needed when using the `LlamaContext` class directly, since `LlamaChatSession` already handles this for you.
 */
export class LlamaGrammarEvaluationState {
    /** @internal */ _llama;
    /** @internal */ _state;
    constructor(existingStateOrOptions) {
        if (existingStateOrOptions instanceof LlamaGrammarEvaluationState) {
            this._llama = existingStateOrOptions._llama;
            this._state = new this._llama._bindings.AddonGrammarEvaluationState(existingStateOrOptions._state);
        }
        else {
            const { model, grammar } = existingStateOrOptions;
            this._llama = model._llama;
            if (model._llama !== grammar._llama)
                throw new Error("The given LlamaModel and LlamaGrammar must be from the same Llama instance");
            this._state = new model._llama._bindings.AddonGrammarEvaluationState(model._model, grammar._grammar);
        }
    }
    /** Clone the grammar evaluation state */
    clone() {
        return new LlamaGrammarEvaluationState(this);
    }
}
//# sourceMappingURL=LlamaGrammarEvaluationState.js.map
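Since the class above is only needed when driving `LlamaContext` directly, the sketch below stops at constructing and cloning a state; wiring the state into low-level token evaluation is not shown. `getLlama`, `loadModel`, and the model path are assumptions:

import {getLlama, LlamaGrammarEvaluationState} from "node-llama-cpp";

const llama = await getLlama(); // assumed entry point
const model = await llama.loadModel({modelPath: "model.gguf"}); // placeholder path
const grammar = await llama.createGrammar({grammar: 'root ::= "yes" | "no"'});

// create one fresh state per generated response;
// the model and grammar must come from the same Llama instance
const state = new LlamaGrammarEvaluationState({model, grammar});

// clone() snapshots the current position in the grammar,
// e.g. before evaluating tokens speculatively
const snapshot = state.clone();
void snapshot;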
1
node_modules/node-llama-cpp/dist/evaluator/LlamaGrammarEvaluationState.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"LlamaGrammarEvaluationState.js","sourceRoot":"","sources":["../../src/evaluator/LlamaGrammarEvaluationState.ts"],"names":[],"mappings":"AAWA;;;;;;GAMG;AACH,MAAM,OAAO,2BAA2B;IACpC,gBAAgB,CAAiB,MAAM,CAAQ;IAC/C,gBAAgB,CAAiB,MAAM,CAA8B;IAIrE,YAAmB,sBAAwF;QACvG,IAAI,sBAAsB,YAAY,2BAA2B,EAAE,CAAC;YAChE,IAAI,CAAC,MAAM,GAAG,sBAAsB,CAAC,MAAM,CAAC;YAC5C,IAAI,CAAC,MAAM,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,2BAA2B,CAAC,sBAAsB,CAAC,MAAM,CAAC,CAAC;QACvG,CAAC;aAAM,CAAC;YACJ,MAAM,EAAC,KAAK,EAAE,OAAO,EAAC,GAAG,sBAAsB,CAAC;YAChD,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;YAE3B,IAAI,KAAK,CAAC,MAAM,KAAK,OAAO,CAAC,MAAM;gBAC/B,MAAM,IAAI,KAAK,CAAC,4EAA4E,CAAC,CAAC;YAElG,IAAI,CAAC,MAAM,GAAG,IAAI,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,2BAA2B,CAAC,KAAK,CAAC,MAAM,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzG,CAAC;IACL,CAAC;IAED,yCAAyC;IAClC,KAAK;QACR,OAAO,IAAI,2BAA2B,CAAC,IAAI,CAAC,CAAC;IACjD,CAAC;CACJ"}
|
||||
17
node_modules/node-llama-cpp/dist/evaluator/LlamaJsonSchemaGrammar.d.ts
generated
vendored
Normal file
@@ -0,0 +1,17 @@
import { GbnfJsonDefList, GbnfJsonSchema, GbnfJsonSchemaToType } from "../utils/gbnfJson/types.js";
import { Llama } from "../bindings/Llama.js";
import { LlamaGrammar } from "./LlamaGrammar.js";
/**
 * @see [Using a JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#json-schema) tutorial
 * @see [Reducing Hallucinations When Using JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#reducing-json-schema-hallucinations) tutorial
 */
export declare class LlamaJsonSchemaGrammar<const T extends GbnfJsonSchema<Defs>, const Defs extends GbnfJsonDefList<Defs> = Record<any, any>> extends LlamaGrammar {
    private readonly _schema;
    /**
     * Prefer to create a new instance of this class by using `llama.createGrammarForJsonSchema(...)`.
     * @deprecated Use `llama.createGrammarForJsonSchema(...)` instead.
     */
    constructor(llama: Llama, schema: Readonly<T> & GbnfJsonSchema<Defs>);
    get schema(): Readonly<T>;
    parse(json: string): GbnfJsonSchemaToType<T>;
}
35
node_modules/node-llama-cpp/dist/evaluator/LlamaJsonSchemaGrammar.js
generated
vendored
Normal file
@@ -0,0 +1,35 @@
import { getGbnfGrammarForGbnfJsonSchema } from "../utils/gbnfJson/getGbnfGrammarForGbnfJsonSchema.js";
import { validateObjectAgainstGbnfSchema } from "../utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js";
import { LlamaText } from "../utils/LlamaText.js";
import { LlamaGrammar } from "./LlamaGrammar.js";
/* eslint-disable @stylistic/max-len */
/**
 * @see [Using a JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#json-schema) tutorial
 * @see [Reducing Hallucinations When Using JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#reducing-json-schema-hallucinations) tutorial
 */
export class LlamaJsonSchemaGrammar extends LlamaGrammar {
    _schema;
    /**
     * Prefer to create a new instance of this class by using `llama.createGrammarForJsonSchema(...)`.
     * @deprecated Use `llama.createGrammarForJsonSchema(...)` instead.
     */
    constructor(llama, schema) {
        const grammar = getGbnfGrammarForGbnfJsonSchema(schema);
        super(llama, {
            grammar,
            stopGenerationTriggers: [LlamaText(["\n".repeat(4)])],
            trimWhitespaceSuffix: true
        });
        this._schema = schema;
    }
    get schema() {
        return this._schema;
    }
    parse(json) {
        const parsedJson = JSON.parse(json);
        validateObjectAgainstGbnfSchema(parsedJson, this._schema);
        return parsedJson;
    }
}
/* eslint-enable @stylistic/max-len */
//# sourceMappingURL=LlamaJsonSchemaGrammar.js.map
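Putting the class above to use: `llama.createGrammarForJsonSchema(...)` (named in the deprecation note) builds the GBNF from a schema, and `parse()` re-validates the generated JSON against that same schema. A sketch with a made-up two-field schema; the generation step itself is omitted and the model output is hard-coded here:

import {getLlama} from "node-llama-cpp";

const llama = await getLlama(); // assumed entry point

// the schema type flows through GbnfJsonSchemaToType, so `parsed` below is typed
const grammar = await llama.createGrammarForJsonSchema({
    type: "object",
    properties: {
        title: {type: "string"},
        rating: {type: "number"}
    }
});

// in real use this string would come from a generation constrained by `grammar`
const modelOutput = '{"title": "Example", "rating": 4}';
const parsed = grammar.parse(modelOutput); // throws if the JSON doesn't match the schema
console.log(parsed.title, parsed.rating);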
1
node_modules/node-llama-cpp/dist/evaluator/LlamaJsonSchemaGrammar.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"LlamaJsonSchemaGrammar.js","sourceRoot":"","sources":["../../src/evaluator/LlamaJsonSchemaGrammar.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,+BAA+B,EAAC,MAAM,sDAAsD,CAAC;AACrG,OAAO,EAAC,+BAA+B,EAAC,MAAM,4DAA4D,CAAC;AAC3G,OAAO,EAAC,SAAS,EAAC,MAAM,uBAAuB,CAAC;AAEhD,OAAO,EAAC,YAAY,EAAC,MAAM,mBAAmB,CAAC;AAE/C,uCAAuC;AACvC;;;GAGG;AACH,MAAM,OAAO,sBAGX,SAAQ,YAAY;IACD,OAAO,CAAI;IAE5B;;;OAGG;IACH,YAAmB,KAAY,EAAE,MAA0C;QACvE,MAAM,OAAO,GAAG,+BAA+B,CAAC,MAAM,CAAC,CAAC;QAExD,KAAK,CAAC,KAAK,EAAE;YACT,OAAO;YACP,sBAAsB,EAAE,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACrD,oBAAoB,EAAE,IAAI;SAC7B,CAAC,CAAC;QAEH,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC;IAC1B,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,OAAO,CAAC;IACxB,CAAC;IAEM,KAAK,CAAC,IAAY;QACrB,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAEpC,+BAA+B,CAAC,UAAU,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;QAE1D,OAAO,UAAU,CAAC;IACtB,CAAC;CACJ;AACD,sCAAsC"}
|
||||
311
node_modules/node-llama-cpp/dist/evaluator/LlamaModel/LlamaModel.d.ts
generated
vendored
Normal file
@@ -0,0 +1,311 @@
|
||||
import { EventRelay } from "lifecycle-utils";
|
||||
import { Token, Tokenizer } from "../../types.js";
|
||||
import { ModelTypeDescription } from "../../bindings/AddonTypes.js";
|
||||
import { LlamaVocabularyType } from "../../bindings/types.js";
|
||||
import { GgufFileInfo } from "../../gguf/types/GgufFileInfoTypes.js";
|
||||
import { GgufInsights } from "../../gguf/insights/GgufInsights.js";
|
||||
import { LlamaContextOptions } from "../LlamaContext/types.js";
|
||||
import { LlamaContext } from "../LlamaContext/LlamaContext.js";
|
||||
import { LlamaEmbeddingContext, LlamaEmbeddingContextOptions } from "../LlamaEmbeddingContext.js";
|
||||
import { GgufMetadata } from "../../gguf/types/GgufMetadataTypes.js";
|
||||
import { OverridesObject } from "../../utils/OverridesObject.js";
|
||||
import { LlamaRankingContext, LlamaRankingContextOptions } from "../LlamaRankingContext.js";
|
||||
import { TokenAttributes } from "./utils/TokenAttributes.js";
|
||||
import type { Llama } from "../../bindings/Llama.js";
|
||||
import type { BuiltinSpecialTokenValue } from "../../utils/LlamaText.js";
|
||||
export type LlamaModelOptions = {
|
||||
/** path to the model on the filesystem */
|
||||
modelPath: string;
|
||||
/**
|
||||
* Number of layers to store in VRAM.
|
||||
* - **`"auto"`** - adapt to the current VRAM state and try to fit as many layers as possible in it.
|
||||
* Takes into account the VRAM required to create a context with a `contextSize` set to `"auto"`.
|
||||
* - **`"max"`** - store all layers in VRAM. If there's not enough VRAM, an error will be thrown. Use with caution.
|
||||
* - **`number`** - store the specified number of layers in VRAM. If there's not enough VRAM, an error will be thrown. Use with caution.
|
||||
* - **`{min?: number, max?: number, fitContext?: {contextSize: number}}`** - adapt to the current VRAM state and try to fit as
|
||||
* many layers as possible in it, but at least `min` and at most `max` layers. Set `fitContext` to the parameters of a context you
|
||||
* intend to create with the model, so it'll take it into account in the calculations and leave enough memory for such a context.
|
||||
*
|
||||
* If GPU support is disabled, this will be set to `0` automatically.
|
||||
*
|
||||
* Defaults to `"auto"`.
|
||||
*/
|
||||
gpuLayers?: "auto" | "max" | number | {
|
||||
min?: number;
|
||||
max?: number;
|
||||
fitContext?: {
|
||||
contextSize?: number;
|
||||
/**
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
embeddingContext?: boolean;
|
||||
};
|
||||
};
|
||||
/**
|
||||
* Only load the vocabulary, not weight tensors.
|
||||
*
|
||||
* Useful when you only want to use the model to use its tokenizer but not for evaluation.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
vocabOnly?: boolean;
|
||||
/**
|
||||
* Use mmap (memory-mapped file) to load the model.
|
||||
*
|
||||
* Using mmap allows the OS to load the model tensors directly from the file on the filesystem,
|
||||
* and makes it easier for the system to manage memory.
|
||||
*
|
||||
* When using mmap, you might notice a delay the first time you actually use the model,
|
||||
* which is caused by the OS itself loading the model into memory.
|
||||
*
|
||||
* Defaults to `true` if the current system supports it.
|
||||
*/
|
||||
useMmap?: boolean;
|
||||
/**
|
||||
* Direct I/O is a method of reading and writing data to and from the storage device directly to the application memory,
|
||||
* bypassing OS in-memory caches.
|
||||
*
|
||||
* It leads to improved model loading times and reduced RAM usage,
|
||||
* at the expense of higher loading times when the model is unloaded and loaded again repeatedly in a short period of time.
|
||||
*
|
||||
* When this option is enabled, if Direct I/O is supported by the system (and for the given file)
|
||||
* it will be used and mmap will be disabled.
|
||||
*
|
||||
* Unsupported on macOS.
|
||||
*
|
||||
* Defaults to `true`.
|
||||
*/
|
||||
useDirectIo?: boolean;
|
||||
/**
|
||||
* Force the system to keep the model in the RAM/VRAM.
|
||||
* Use with caution as this can crash your system if the available resources are insufficient.
|
||||
*/
|
||||
useMlock?: boolean;
|
||||
/**
|
||||
* Check for tensor validity before actually loading the model.
|
||||
* Using it increases the time it takes to load the model.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
checkTensors?: boolean;
|
||||
/**
|
||||
* Enable flash attention by default for contexts created with this model.
|
||||
* Only works with models that support flash attention.
|
||||
*
|
||||
* Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory.
|
||||
*
|
||||
* The support for flash attention is currently experimental and may not always work as expected.
|
||||
* Use with caution.
|
||||
*
|
||||
* This option will be ignored if flash attention is not supported by the model.
|
||||
*
|
||||
* Enabling this affects the calculations of default values for the model and contexts created with it
|
||||
* as flash attention reduces the amount of memory required,
|
||||
* which allows for more layers to be offloaded to the GPU and for context sizes to be bigger.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*
|
||||
* Upon flash attention exiting the experimental status, the default value will become `true`.
|
||||
*/
|
||||
defaultContextFlashAttention?: boolean;
|
||||
/**
|
||||
* When using SWA (Sliding Window Attention) on a supported model,
|
||||
* extend the sliding window size to the current context size (meaning practically disabling SWA)
|
||||
* by default for contexts created with this model.
|
||||
*
|
||||
* See the `swaFullCache` option of the `.createContext()` method for more information.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
defaultContextSwaFullCache?: boolean;
|
||||
/**
|
||||
* Called with the load percentage when the model is being loaded.
|
||||
* @param loadProgress - a number between 0 (exclusive) and 1 (inclusive).
|
||||
*/
|
||||
onLoadProgress?(loadProgress: number): void;
|
||||
/** An abort signal to abort the model load */
|
||||
loadSignal?: AbortSignal;
|
||||
/**
|
||||
* Ignore insufficient memory errors and continue with the model load.
|
||||
* Can cause the process to crash if there's not enough VRAM to fit the model.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
ignoreMemorySafetyChecks?: boolean;
|
||||
/**
|
||||
* Metadata overrides to load the model with.
|
||||
*
|
||||
* > **Note:** Most metadata value overrides aren't supported and overriding them will have no effect on `llama.cpp`.
|
||||
* > Only use this for metadata values that are explicitly documented to be supported by `llama.cpp` to be overridden,
|
||||
* > and only in cases when this is crucial, as this is not guaranteed to always work as expected.
|
||||
*/
|
||||
metadataOverrides?: OverridesObject<GgufMetadata, number | bigint | boolean | string>;
|
||||
};
|
||||
export declare class LlamaModel {
|
||||
readonly tokenizer: Tokenizer;
|
||||
readonly onDispose: EventRelay<void>;
|
||||
private constructor();
|
||||
dispose(): Promise<void>;
|
||||
/** @hidden */
|
||||
[Symbol.asyncDispose](): Promise<void>;
|
||||
get disposed(): boolean;
|
||||
get llama(): Llama;
|
||||
get tokens(): LlamaModelTokens;
|
||||
get filename(): string | undefined;
|
||||
get fileInfo(): GgufFileInfo;
|
||||
get fileInsights(): GgufInsights;
|
||||
/**
|
||||
* Number of layers offloaded to the GPU.
|
||||
* If GPU support is disabled, this will always be `0`.
|
||||
*/
|
||||
get gpuLayers(): number;
|
||||
/**
|
||||
* Total model size in memory in bytes.
|
||||
*
|
||||
* When using mmap, actual memory usage may be higher than this value due to `llama.cpp`'s performance optimizations.
|
||||
*/
|
||||
get size(): number;
|
||||
get flashAttentionSupported(): boolean;
|
||||
get defaultContextFlashAttention(): boolean;
|
||||
get defaultContextSwaFullCache(): boolean;
|
||||
/**
|
||||
* Transform text into tokens that can be fed to the model
|
||||
* @param text - the text to tokenize
|
||||
* @param [specialTokens] - if set to true, text that corresponds to special tokens will be tokenized to those tokens.
|
||||
* For example, `<s>` will be tokenized to the BOS token if `specialTokens` is set to `true`,
|
||||
* otherwise it will be tokenized to tokens that correspond to the plaintext `<s>` string.
|
||||
* @param [options] - additional options for tokenization.
|
||||
* If set to `"trimLeadingSpace"`, a leading space will be trimmed from the tokenized output if the output has an
|
||||
* additional space at the beginning.
|
||||
*/
|
||||
tokenize(text: string, specialTokens?: boolean, options?: "trimLeadingSpace"): Token[];
|
||||
tokenize(text: BuiltinSpecialTokenValue, specialTokens: "builtin"): Token[];
|
||||
/**
|
||||
* Transform tokens into text
|
||||
* @param tokens - the tokens to detokenize.
|
||||
* @param [specialTokens] - if set to `true`, special tokens will be detokenized to their corresponding token text representation.
|
||||
*
|
||||
* Recommended for debugging purposes only.
|
||||
*
|
||||
* > **Note:** there may be additional spaces around special tokens that were not present in the original text - this is not a bug,
|
||||
* this is [how the tokenizer is supposed to work](https://github.com/ggml-org/llama.cpp/pull/7697#issuecomment-2144003246).
|
||||
*
|
||||
* Defaults to `false`.
|
||||
* @param [lastTokens] - the last few tokens that preceded the tokens to detokenize.
|
||||
* If provided, the last few tokens will be used to determine whether a space has to be added before the current tokens or not,
|
||||
* and apply other detokenizer-specific heuristics to provide the correct text continuation to the existing tokens.
|
||||
*
|
||||
* Using it may have no effect with some models, but it is still recommended.
|
||||
*/
|
||||
detokenize(tokens: readonly Token[], specialTokens?: boolean, lastTokens?: readonly Token[]): string;
|
||||
getTokenAttributes(token: Token): TokenAttributes;
|
||||
/** Check whether the given token is a special token (a control-type token or a token with no normal text representation) */
|
||||
isSpecialToken(token: Token | undefined): boolean;
|
||||
iterateAllTokens(): Generator<Token, void, unknown>;
|
||||
/** Check whether the given token is an EOG (End Of Generation) token, like EOS or EOT. */
|
||||
isEogToken(token: Token | undefined): boolean;
|
||||
createContext(options?: LlamaContextOptions): Promise<LlamaContext>;
|
||||
/**
|
||||
* @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
|
||||
*/
|
||||
createEmbeddingContext(options?: LlamaEmbeddingContextOptions): Promise<LlamaEmbeddingContext>;
|
||||
/**
|
||||
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
|
||||
*/
|
||||
createRankingContext(options?: LlamaRankingContextOptions): Promise<LlamaRankingContext>;
|
||||
/**
|
||||
* Get warnings about the model file that would affect its usage.
|
||||
*
|
||||
* These warnings include all the warnings generated by `GgufInsights`, but are more comprehensive.
|
||||
*/
|
||||
getWarnings(): string[];
|
||||
/** @hidden `ModelTypeDescription` type alias is too long in the documentation */
|
||||
get typeDescription(): ModelTypeDescription;
|
||||
/** The context size the model was trained on */
|
||||
get trainContextSize(): number;
|
||||
/** The size of an embedding vector the model can produce */
|
||||
get embeddingVectorSize(): number;
|
||||
get vocabularyType(): LlamaVocabularyType;
|
||||
}
|
||||
export declare class LlamaModelTokens {
|
||||
private constructor();
|
||||
/**
|
||||
* @returns infill tokens
|
||||
*/
|
||||
get infill(): LlamaModelInfillTokens;
|
||||
/**
|
||||
* @returns The BOS (Beginning Of Sequence) token.
|
||||
*/
|
||||
get bos(): Token | null;
|
||||
/**
|
||||
* @returns The EOS (End Of Sequence) token.
|
||||
*/
|
||||
get eos(): Token | null;
|
||||
/**
|
||||
* @returns The EOT (End Of Turn) token.
|
||||
*/
|
||||
get eot(): Token | null;
|
||||
/**
|
||||
* @returns The SEP (Sentence Separator) token.
|
||||
*/
|
||||
get sep(): Token | null;
|
||||
/**
|
||||
* @returns The NL (New Line) token.
|
||||
*/
|
||||
get nl(): Token | null;
|
||||
/**
|
||||
* @returns The BOS (Beginning Of Sequence) token text representation.
|
||||
*/
|
||||
get bosString(): string | null;
|
||||
/**
|
||||
* @returns The EOS (End Of Sequence) token text representation.
|
||||
*/
|
||||
get eosString(): string | null;
|
||||
/**
|
||||
* @returns The EOT (End Of Turn) token text representation.
|
||||
*/
|
||||
get eotString(): string | null;
|
||||
/**
|
||||
* @returns The SEP (Sentence Separator) token text representation.
|
||||
*/
|
||||
get sepString(): string | null;
|
||||
/**
|
||||
* @returns The NL (New Line) token text representation.
|
||||
*/
|
||||
get nlString(): string | null;
|
||||
/**
|
||||
* @returns Whether we should prepend a BOS (Beginning Of Sequence) token for evaluations with this model.
|
||||
*/
|
||||
get shouldPrependBosToken(): boolean;
|
||||
/**
|
||||
* @returns Whether we should append an EOS (End Of Sequence) token for evaluations with this model.
|
||||
*/
|
||||
get shouldAppendEosToken(): boolean;
|
||||
}
|
||||
export declare class LlamaModelInfillTokens {
|
||||
private constructor();
|
||||
/**
|
||||
* @returns The beginning of infill prefix token.
|
||||
*/
|
||||
get prefix(): Token | null;
|
||||
/**
|
||||
* @returns The beginning of infill middle token.
|
||||
*/
|
||||
get middle(): Token | null;
|
||||
/**
|
||||
* @returns The beginning of infill suffix token.
|
||||
*/
|
||||
get suffix(): Token | null;
|
||||
/**
|
||||
* @returns The beginning of infill prefix token as a string.
|
||||
*/
|
||||
get prefixString(): string | null;
|
||||
/**
|
||||
* @returns The beginning of infill middle token as a string.
|
||||
*/
|
||||
get middleString(): string | null;
|
||||
/**
|
||||
* @returns The beginning of infill suffix token as a string.
|
||||
*/
|
||||
get suffixString(): string | null;
|
||||
}
|
||||
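As a usage reference for the options and methods declared above, here is a sketch of loading a model with a constrained `gpuLayers` value and round-tripping text through `tokenize`/`detokenize`. As in the earlier sketches, `getLlama`, `loadModel`, and the model path are assumptions that do not appear in this file:

import {getLlama} from "node-llama-cpp";

const llama = await getLlama(); // assumed entry point
const model = await llama.loadModel({
    modelPath: "model.gguf", // placeholder path
    // fit as many layers as possible, leaving room for an 8192-token context
    gpuLayers: {min: 10, fitContext: {contextSize: 8192}},
    onLoadProgress: (progress) => console.log(`loaded ${(progress * 100).toFixed(0)}%`)
});

console.log(model.trainContextSize, model.embeddingVectorSize, model.gpuLayers);

// tokenize/detokenize round trip; pass `true` as the second argument to tokenize special tokens
const tokens = model.tokenize("Hello world");
console.log(model.detokenize(tokens)); // "Hello world"
console.log(model.isEogToken(model.tokens.eos)); // true when the model defines an EOS token

await model.dispose();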
832
node_modules/node-llama-cpp/dist/evaluator/LlamaModel/LlamaModel.js
generated
vendored
Normal file
@@ -0,0 +1,832 @@
|
||||
import process from "process";
|
||||
import path from "path";
|
||||
import { AsyncDisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
|
||||
import { removeNullFields } from "../../utils/removeNullFields.js";
|
||||
import { DisposeGuard } from "../../utils/DisposeGuard.js";
|
||||
import { LlamaLocks, LlamaLogLevel, LlamaVocabularyType, LlamaVocabularyTypeValues } from "../../bindings/types.js";
|
||||
import { readGgufFileInfo } from "../../gguf/readGgufFileInfo.js";
|
||||
import { GgufInsights } from "../../gguf/insights/GgufInsights.js";
|
||||
import { getConsoleLogPrefix } from "../../utils/getConsoleLogPrefix.js";
|
||||
import { getReadablePath } from "../../cli/utils/getReadablePath.js";
|
||||
import { LlamaContext } from "../LlamaContext/LlamaContext.js";
|
||||
import { LlamaEmbeddingContext } from "../LlamaEmbeddingContext.js";
|
||||
import { GgufArchitectureType } from "../../gguf/types/GgufMetadataTypes.js";
|
||||
import { maxRecentDetokenizerTokens } from "../../consts.js";
|
||||
import { LlamaRankingContext } from "../LlamaRankingContext.js";
|
||||
import { TokenAttribute, TokenAttributes } from "./utils/TokenAttributes.js";
|
||||
const defaultUseMmap = true;
|
||||
const defaultUseDirectIo = true;
|
||||
const defaultContextFlashAttentionEnabled = false;
|
||||
const defaultContextSwaFullCache = false;
|
||||
export class LlamaModel {
|
||||
/** @internal */ _llama;
|
||||
/** @internal */ _model;
|
||||
/** @internal */ _backendModelDisposeGuard;
|
||||
/** @internal */ _tokens;
|
||||
/** @internal */ _modelPath;
|
||||
/** @internal */ _fileInfo;
|
||||
/** @internal */ _fileInsights;
|
||||
/** @internal */ _gpuLayers;
|
||||
/** @internal */ _vocabOnly;
|
||||
/** @internal */ _filename;
|
||||
/** @internal */ _disposedState = { disposed: false };
|
||||
/** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
|
||||
/** @internal */ _llamaPreventDisposalHandle;
|
||||
/** @internal */ _defaultContextFlashAttentionOptionEnabled;
|
||||
/** @internal */ _defaultContextFlashAttention;
|
||||
/** @internal */ _defaultContextSwaFullCache;
|
||||
/** @internal */ _flashAttentionSupported;
|
||||
/** @internal */ _loraAdapters = new Map();
|
||||
/** @internal */ _typeDescription;
|
||||
/** @internal */ _trainContextSize;
|
||||
/** @internal */ _embeddingVectorSize;
|
||||
/** @internal */ _vocabularyType;
|
||||
tokenizer;
|
||||
onDispose = new EventRelay();
|
||||
constructor({ modelPath, gpuLayers, vocabOnly = false, useMmap, useDirectIo, useMlock, checkTensors, onLoadProgress, loadSignal, metadataOverrides }, { _llama, _fileInfo, _fileInsights, _defaultContextFlashAttentionOptionEnabled, _defaultContextFlashAttention, _defaultContextSwaFullCache, _flashAttentionSupported }) {
|
||||
this._llama = _llama;
|
||||
this._fileInfo = _fileInfo;
|
||||
this._modelPath = path.resolve(process.cwd(), modelPath);
|
||||
this._fileInsights = _fileInsights;
|
||||
this._gpuLayers = gpuLayers;
|
||||
this._vocabOnly = vocabOnly ?? false;
|
||||
this._backendModelDisposeGuard = new DisposeGuard([this._llama._backendDisposeGuard]);
|
||||
this._llamaPreventDisposalHandle = this._llama._backendDisposeGuard.createPreventDisposalHandle();
|
||||
this._defaultContextFlashAttentionOptionEnabled = _defaultContextFlashAttentionOptionEnabled;
|
||||
this._defaultContextFlashAttention = _defaultContextFlashAttention;
|
||||
this._defaultContextSwaFullCache = _defaultContextSwaFullCache;
|
||||
this._flashAttentionSupported = _flashAttentionSupported;
|
||||
const overridesList = ggufMetadataOverridesToList(metadataOverrides);
|
||||
this._model = new this._llama._bindings.AddonModel(this._modelPath, removeNullFields({
|
||||
addonExports: this._llama._bindings,
|
||||
gpuLayers,
|
||||
vocabOnly: this._vocabOnly,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
useMlock: _llama.supportsMlock
|
||||
? useMlock
|
||||
: undefined,
|
||||
checkTensors: checkTensors ?? false,
|
||||
onLoadProgress: onLoadProgress == null
|
||||
? undefined
|
||||
: (loadPercentage) => {
|
||||
try {
|
||||
onLoadProgress(loadPercentage);
|
||||
}
|
||||
catch (err) {
|
||||
// the native addon code calls this function, so there's no point in throwing an error here
|
||||
console.error(err);
|
||||
}
|
||||
},
|
||||
hasLoadAbortSignal: loadSignal != null,
|
||||
overridesList: overridesList.length > 0
|
||||
? overridesList
|
||||
: undefined
|
||||
}));
|
||||
this._tokens = LlamaModelTokens._create(this._model, this._disposedState);
|
||||
this._filename = path.basename(modelPath);
|
||||
this._disposeAggregator.add(() => {
|
||||
this._disposedState.disposed = true;
|
||||
});
|
||||
this._disposeAggregator.add(this.onDispose.dispatchEvent);
|
||||
this._disposeAggregator.add(this._llama.onDispose.createListener(disposeModelIfReferenced.bind(null, new WeakRef(this))));
|
||||
this._disposeAggregator.add(async () => {
|
||||
await this._backendModelDisposeGuard.acquireDisposeLock();
|
||||
await this._model.dispose();
|
||||
this._llamaPreventDisposalHandle.dispose();
|
||||
});
|
||||
this.tokenize = this.tokenize.bind(this);
|
||||
this.detokenize = this.detokenize.bind(this);
|
||||
this.isSpecialToken = this.isSpecialToken.bind(this);
|
||||
this.isEogToken = this.isEogToken.bind(this);
|
||||
this.tokenize.detokenize = this.detokenize;
|
||||
this.tokenize.isSpecialToken = this.isSpecialToken;
|
||||
this.tokenize.isEogToken = this.isEogToken;
|
||||
Object.freeze(this.tokenize);
|
||||
this.tokenizer = this.tokenize;
|
||||
}
|
||||
async dispose() {
|
||||
if (this._disposedState.disposed)
|
||||
return;
|
||||
this._disposedState.disposed = true;
|
||||
await this._disposeAggregator.dispose();
|
||||
}
|
||||
/** @hidden */
|
||||
async [Symbol.asyncDispose]() {
|
||||
await this.dispose();
|
||||
}
|
||||
get disposed() {
|
||||
return this._disposedState.disposed;
|
||||
}
|
||||
get llama() {
|
||||
return this._llama;
|
||||
}
|
||||
get tokens() {
|
||||
return this._tokens;
|
||||
}
|
||||
get filename() {
|
||||
return this._filename;
|
||||
}
|
||||
get fileInfo() {
|
||||
return this._fileInfo;
|
||||
}
|
||||
get fileInsights() {
|
||||
return this._fileInsights;
|
||||
}
|
||||
/**
|
||||
* Number of layers offloaded to the GPU.
|
||||
* If GPU support is disabled, this will always be `0`.
|
||||
*/
|
||||
get gpuLayers() {
|
||||
return this._gpuLayers;
|
||||
}
|
||||
/**
|
||||
* Total model size in memory in bytes.
|
||||
*
|
||||
* When using mmap, actual memory usage may be higher than this value due to `llama.cpp`'s performance optimizations.
|
||||
*/
|
||||
get size() {
|
||||
this._ensureNotDisposed();
|
||||
return this._model.getModelSize();
|
||||
}
|
||||
get flashAttentionSupported() {
|
||||
return this._flashAttentionSupported;
|
||||
}
|
||||
get defaultContextFlashAttention() {
|
||||
return this._defaultContextFlashAttention;
|
||||
}
|
||||
get defaultContextSwaFullCache() {
|
||||
return this._defaultContextSwaFullCache;
|
||||
}
|
||||
tokenize(text, specialTokens = false, options) {
|
||||
this._ensureNotDisposed();
|
||||
if (text === "")
|
||||
return [];
|
||||
if (specialTokens === "builtin") {
|
||||
const builtinToken = text;
|
||||
switch (builtinToken) {
|
||||
case "BOS": return this.tokens.bos == null ? [] : [this.tokens.bos];
|
||||
case "EOS": return this.tokens.eos == null ? [] : [this.tokens.eos];
|
||||
case "NL": return this.tokens.nl == null ? [] : [this.tokens.nl];
|
||||
case "EOT": return this.tokens.eot == null ? [] : [this.tokens.eot];
|
||||
case "SEP": return this.tokens.sep == null ? [] : [this.tokens.sep];
|
||||
}
|
||||
void builtinToken;
|
||||
throw new Error(`Unknown builtin special token: ${builtinToken}`);
|
||||
}
|
||||
if (options === "trimLeadingSpace") {
|
||||
if (specialTokens) {
|
||||
const countLeadingSpaces = (text) => {
|
||||
let count = 0;
|
||||
for (; count < text.length; count++) {
|
||||
if (text[count] !== " ")
|
||||
break;
|
||||
}
|
||||
return count;
|
||||
};
|
||||
const textLeadingSpaces = countLeadingSpaces(text);
|
||||
const [workaroundToken, workaroundTokenString] = (this.tokens.bos != null && this.tokens.bosString != null)
|
||||
? [this.tokens.bos, this.tokens.bosString]
|
||||
: (this.tokens.eos != null && this.tokens.eosString != null)
|
||||
? [this.tokens.eos, this.tokens.eosString]
|
||||
: (this.tokens.nl != null && this.tokens.nlString != null)
|
||||
? [this.tokens.nl, this.tokens.nlString]
|
||||
: (this.tokens.eot != null && this.tokens.eotString != null)
|
||||
? [this.tokens.eot, this.tokens.eotString]
|
||||
: [null, null];
|
||||
if (workaroundToken != null && workaroundTokenString != null) {
|
||||
const tokens = Array.from(this._model.tokenize(workaroundTokenString + text, true));
|
||||
const workaroundTokenIndex = tokens.indexOf(workaroundToken);
|
||||
// only use the tokenized output if it can be corrected, otherwise fallback to the default tokenization
|
||||
if (workaroundTokenIndex >= 0 && workaroundTokenIndex <= 1) {
|
||||
tokens.splice(0, workaroundTokenIndex + 1);
|
||||
if (countLeadingSpaces(this.detokenize(tokens, true)) === textLeadingSpaces)
|
||||
return tokens;
|
||||
}
|
||||
}
|
||||
const workaroundTokensString = "\n";
|
||||
const workaroundTokens = Array.from(this._model.tokenize(workaroundTokensString, true));
|
||||
if (text.startsWith(workaroundTokensString)) {
|
||||
const tokens = Array.from(this._model.tokenize(text, true));
|
||||
if (this.detokenize(tokens, true).startsWith(workaroundTokensString))
|
||||
return tokens;
|
||||
}
|
||||
const tokens = Array.from(this._model.tokenize(workaroundTokensString + text, true));
|
||||
// only use the tokenized output if it can be corrected, otherwise fallback to the default tokenization
|
||||
if (workaroundTokens.length > 0 && workaroundTokens.every((token, index) => tokens[index] === token)) {
|
||||
tokens.splice(0, workaroundTokens.length);
|
||||
if (countLeadingSpaces(this.detokenize(tokens, true)) === textLeadingSpaces)
|
||||
return tokens;
|
||||
}
|
||||
}
|
||||
else {
|
||||
const workaroundTokensString = "\n";
|
||||
const workaroundTokens = Array.from(this._model.tokenize(workaroundTokensString, false));
|
||||
if (text.startsWith(workaroundTokensString)) {
|
||||
const tokens = Array.from(this._model.tokenize(text, false));
|
||||
if (this.detokenize(tokens, false).startsWith(workaroundTokensString))
|
||||
return tokens;
|
||||
}
|
||||
const tokens = Array.from(this._model.tokenize(workaroundTokensString + text, false));
|
||||
// only use the tokenized output if it can be corrected, otherwise fallback to the default tokenization
|
||||
if (workaroundTokens.length > 0 && workaroundTokens.every((token, index) => tokens[index] === token)) {
|
||||
tokens.splice(0, workaroundTokens.length);
|
||||
return tokens;
|
||||
}
|
||||
}
|
||||
}
|
||||
return Array.from(this._model.tokenize(text, specialTokens));
|
||||
}
|
||||
/**
|
||||
* Transform tokens into text
|
||||
* @param tokens - the tokens to detokenize.
|
||||
* @param [specialTokens] - if set to `true`, special tokens will be detokenized to their corresponding token text representation.
|
||||
*
|
||||
* Recommended for debugging purposes only.
|
||||
*
|
||||
* > **Note:** there may be additional spaces around special tokens that were not present in the original text - this is not a bug,
|
||||
* this is [how the tokenizer is supposed to work](https://github.com/ggml-org/llama.cpp/pull/7697#issuecomment-2144003246).
|
||||
*
|
||||
* Defaults to `false`.
|
||||
* @param [lastTokens] - the last few tokens that preceded the tokens to detokenize.
|
||||
* If provided, the last few tokens will be used to determine whether a space has to be added before the current tokens or not,
|
||||
* and apply other detokenizer-specific heuristics to provide the correct text continuation to the existing tokens.
|
||||
*
|
||||
* Using it may have no effect with some models, but it is still recommended.
|
||||
*/
|
||||
detokenize(tokens, specialTokens = false, lastTokens) {
|
||||
this._ensureNotDisposed();
|
||||
if (tokens.length === 0)
|
||||
return "";
|
||||
if (lastTokens == null || lastTokens.length === 0)
|
||||
return this._model.detokenize(Uint32Array.from(tokens), Boolean(specialTokens));
|
||||
const addedTokens = lastTokens.slice(-maxRecentDetokenizerTokens);
|
||||
const addedTokensText = this._model.detokenize(Uint32Array.from(addedTokens), Boolean(specialTokens));
|
||||
if (addedTokensText === "")
|
||||
return this._model.detokenize(Uint32Array.from(tokens), Boolean(specialTokens));
|
||||
const text = this._model.detokenize(Uint32Array.from([...addedTokens, ...tokens]), Boolean(specialTokens));
|
||||
if (text.startsWith(addedTokensText))
|
||||
return text.slice(addedTokensText.length);
|
||||
return this._model.detokenize(Uint32Array.from(tokens), Boolean(specialTokens));
|
||||
}
|
||||
getTokenAttributes(token) {
|
||||
if (token == null)
|
||||
throw new Error("Token cannot be null");
|
||||
if (this.vocabularyType === LlamaVocabularyType.none)
|
||||
return TokenAttributes._create(token, TokenAttribute.undefined);
|
||||
return TokenAttributes._create(token, this._model.getTokenAttributes(token));
|
||||
}
|
||||
/** Check whether the given token is a special token (a control-type token or a token with no normal text representation) */
|
||||
isSpecialToken(token) {
|
||||
if (token == null)
|
||||
return false;
|
||||
if (this.getTokenAttributes(token).control)
|
||||
return true;
|
||||
const normalText = this.detokenize([token], false);
|
||||
if (normalText === "")
|
||||
return this.detokenize([token], true) !== "";
|
||||
return false;
|
||||
}
|
||||
*iterateAllTokens() {
|
||||
if (this.vocabularyType === LlamaVocabularyType.none)
|
||||
return;
|
||||
const totalTokens = this.fileInfo.metadata?.tokenizer?.ggml?.tokens?.length;
|
||||
if (typeof totalTokens !== "number")
|
||||
return;
|
||||
for (let i = 0; i < totalTokens; i++)
|
||||
yield i;
|
||||
}
|
||||
/** Check whether the given token is an EOG (End Of Generation) token, like EOS or EOT. */
|
||||
isEogToken(token) {
|
||||
if (token == null)
|
||||
return false;
|
||||
return token === this.tokens.eos || token === this.tokens.eot || this._model.isEogToken(token);
|
||||
}
|
||||
async createContext(options = {}) {
|
||||
if (this._vocabOnly)
|
||||
throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
|
||||
return await withLock([this._llama._memoryLock, LlamaLocks.loadToMemory], options.createSignal, async () => {
|
||||
const preventDisposalHandle = this._backendModelDisposeGuard.createPreventDisposalHandle();
|
||||
try {
|
||||
return await LlamaContext._create(options, { _model: this });
|
||||
}
|
||||
finally {
|
||||
preventDisposalHandle.dispose();
|
||||
}
|
||||
});
|
||||
}
|
||||
/**
|
||||
* @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
|
||||
*/
|
||||
async createEmbeddingContext(options = {}) {
|
||||
if (this._vocabOnly)
|
||||
throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
|
||||
return await LlamaEmbeddingContext._create({ _model: this }, options);
|
||||
}
|
||||
/**
|
||||
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
|
||||
*/
|
||||
async createRankingContext(options = {}) {
|
||||
if (this._vocabOnly)
|
||||
throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
|
||||
return await LlamaRankingContext._create({ _model: this }, options);
|
||||
}
|
||||
/**
|
||||
* Get warnings about the model file that would affect its usage.
|
||||
*
|
||||
* These warnings include all the warnings generated by `GgufInsights`, but are more comprehensive.
|
||||
*/
|
||||
getWarnings() {
|
||||
this._ensureNotDisposed();
|
||||
const warnings = this._fileInsights.getWarnings(this._modelPath);
|
||||
const modelFilePathText = `("${getReadablePath(this._modelPath)}")`;
|
||||
try {
|
||||
const beforeTextNoSpecialTokens = "some test text here";
|
||||
const afterTextNoSpecialTokens = this.detokenize(this.tokenize(beforeTextNoSpecialTokens, false, "trimLeadingSpace"), false);
|
||||
if (beforeTextNoSpecialTokens !== afterTextNoSpecialTokens)
|
||||
warnings.push(`Using this model ${modelFilePathText} to tokenize text and then detokenize it resulted in a different text. ` +
|
||||
"There might be an issue with the model or the tokenizer implementation. " +
|
||||
"Using this model may not work as intended");
|
||||
}
|
||||
catch (err) {
|
||||
// do nothing
|
||||
}
|
||||
try {
|
||||
if (this._defaultContextFlashAttentionOptionEnabled && !this._flashAttentionSupported) {
|
||||
if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.grok)
|
||||
warnings.push("Flash attention is incompatible with Grok and thus was turned off");
|
||||
else if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2)
|
||||
warnings.push("Flash attention is incompatible with Gemma2 and thus was turned off");
|
||||
else {
|
||||
const nHead = this.fileInfo.architectureMetadata?.attention?.head_count ?? 0;
|
||||
const nEmbd = this.fileInfo.architectureMetadata?.embedding_length ?? 0;
|
||||
const nEmbdHeadK = this.fileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
|
||||
const nEmbdHeadV = this.fileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
|
||||
if (nEmbdHeadK !== nEmbdHeadV)
|
||||
warnings.push("Flash attention is incompatible with this model and thus was turned off");
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
// do nothing
|
||||
}
|
||||
return warnings;
|
||||
}
|
||||
/** @hidden `ModelTypeDescription` type alias is too long in the documentation */
|
||||
get typeDescription() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._typeDescription == null)
|
||||
this._typeDescription = this._model.getModelDescription();
|
||||
return this._typeDescription;
|
||||
}
|
||||
/** The context size the model was trained on */
|
||||
get trainContextSize() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._trainContextSize == null)
|
||||
this._trainContextSize = this._model.getTrainContextSize();
|
||||
return this._trainContextSize;
|
||||
}
|
||||
/** The size of an embedding vector the model can produce */
|
||||
get embeddingVectorSize() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._embeddingVectorSize == null)
|
||||
this._embeddingVectorSize = this._model.getEmbeddingVectorSize();
|
||||
return this._embeddingVectorSize;
|
||||
}
|
||||
get vocabularyType() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._vocabularyType == null) {
|
||||
const vocabType = this._model.getVocabularyType();
|
||||
this._vocabularyType = LlamaVocabularyTypeValues[vocabType];
|
||||
if (this._vocabularyType == null) {
|
||||
console.warn(getConsoleLogPrefix() + "Unknown vocabulary type:", vocabType);
|
||||
this._vocabularyType = LlamaVocabularyType.none;
|
||||
}
|
||||
}
|
||||
return this._vocabularyType;
|
||||
}
|
||||
/** @internal */
|
||||
_ensureNotDisposed() {
|
||||
if (this._disposedState.disposed)
|
||||
throw new DisposedError();
|
||||
}
|
||||
/** @internal */
|
||||
async _getOrLoadLora(filePath) {
|
||||
const resolvedPath = path.resolve(process.cwd(), filePath);
|
||||
if (this._loraAdapters.has(resolvedPath))
|
||||
return this._loraAdapters.get(resolvedPath);
|
||||
return await withLock([this._loraAdapters, "modify"], async () => {
|
||||
if (this._loraAdapters.has(resolvedPath))
|
||||
return this._loraAdapters.get(resolvedPath);
|
||||
const lora = new this._llama._bindings.AddonModelLora(this._model, resolvedPath);
|
||||
await this._model.loadLora(lora);
|
||||
this._loraAdapters.set(resolvedPath, lora);
|
||||
return lora;
|
||||
});
|
||||
}
|
||||
/** @internal */
|
||||
static async _create(modelOptions, { _llama }) {
|
||||
const { loadSignal, defaultContextFlashAttention } = modelOptions;
|
||||
const useMmap = _llama.supportsMmap && (modelOptions.useMmap ?? defaultUseMmap);
|
||||
const useDirectIo = modelOptions.useDirectIo ?? defaultUseDirectIo;
|
||||
const fileInfo = await readGgufFileInfo(modelOptions.modelPath, {
|
||||
sourceType: "filesystem",
|
||||
signal: loadSignal
|
||||
});
|
||||
applyGgufMetadataOverrides(fileInfo, modelOptions.metadataOverrides);
|
||||
const ggufInsights = await GgufInsights.from(fileInfo, _llama);
|
||||
const flashAttentionSupported = ggufInsights.flashAttentionSupported;
|
||||
const resolvedDefaultContextFlashAttention = flashAttentionSupported
|
||||
? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled)
|
||||
: false;
|
||||
const resolvedDefaultContextSwaFullCache = modelOptions.defaultContextSwaFullCache ?? defaultContextSwaFullCache;
|
||||
const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, {
|
||||
ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks,
|
||||
defaultContextFlashAttention: resolvedDefaultContextFlashAttention,
|
||||
defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache,
|
||||
useMmap
|
||||
});
|
||||
const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({
|
||||
gpuLayers: gpuLayers,
|
||||
useMmap
|
||||
});
|
||||
const model = new LlamaModel({ ...modelOptions, gpuLayers, useMmap, useDirectIo }, {
|
||||
_fileInfo: fileInfo,
|
||||
_fileInsights: ggufInsights,
|
||||
_llama,
|
||||
_defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? false,
|
||||
_flashAttentionSupported: flashAttentionSupported,
|
||||
_defaultContextFlashAttention: resolvedDefaultContextFlashAttention,
|
||||
_defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache
|
||||
});
|
||||
const modelCreationVramReservation = modelOptions.ignoreMemorySafetyChecks
|
||||
? null
|
||||
: _llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram);
|
||||
const modelCreationRamReservation = modelOptions.ignoreMemorySafetyChecks
|
||||
? null
|
||||
: _llama._ramOrchestrator.reserveMemory(resourceRequirementsEstimation.cpuRam);
|
||||
const loggedWarnings = new Set();
|
||||
function onAbort() {
|
||||
model._model.abortActiveModelLoad();
|
||||
loadSignal?.removeEventListener("abort", onAbort);
|
||||
}
|
||||
function logWarnings(warnings) {
|
||||
for (const warning of warnings) {
|
||||
if (loggedWarnings.has(warning))
|
||||
continue;
|
||||
_llama._log(LlamaLogLevel.warn, warning);
|
||||
loggedWarnings.add(warning);
|
||||
}
|
||||
}
|
||||
if (loadSignal != null) {
|
||||
if (loadSignal.aborted)
|
||||
throw loadSignal.reason;
|
||||
loadSignal.addEventListener("abort", onAbort);
|
||||
}
|
||||
logWarnings(ggufInsights.getWarnings(modelOptions.modelPath));
|
||||
try {
|
||||
const modelLoaded = await model._model.init();
|
||||
if (loadSignal?.aborted) {
|
||||
if (modelLoaded)
|
||||
await model._model.dispose();
|
||||
throw loadSignal.reason;
|
||||
}
|
||||
else if (!modelLoaded)
|
||||
throw new Error("Failed to load model");
|
||||
loadSignal?.removeEventListener("abort", onAbort);
|
||||
logWarnings(model.getWarnings());
|
||||
return model;
|
||||
}
|
||||
finally {
|
||||
loadSignal?.removeEventListener("abort", onAbort);
|
||||
modelCreationVramReservation?.dispose?.();
|
||||
modelCreationRamReservation?.dispose?.();
|
||||
}
|
||||
}
|
||||
}
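// Illustrative usage sketch (not part of the generated file): how a loaded model is typically used to
// create the contexts defined above. The model path and the `getLlama` entry point are assumptions based
// on the public node-llama-cpp API and are not taken from this file.
//
// import {getLlama} from "node-llama-cpp";
// const llama = await getLlama();
// const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
// for (const warning of model.getWarnings())
//     console.warn(warning);
// const context = await model.createContext();             // general-purpose evaluation context
// const embeddings = await model.createEmbeddingContext(); // for embeddings
// const ranking = await model.createRankingContext();      // for reranking documents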
|
||||
export class LlamaModelTokens {
|
||||
/** @internal */ _model;
|
||||
/** @internal */ _disposedState;
|
||||
/** @internal */ _infillTokens;
|
||||
/** @internal */ _bosToken;
|
||||
/** @internal */ _eosToken;
|
||||
/** @internal */ _eotToken;
|
||||
/** @internal */ _sepToken;
|
||||
/** @internal */ _nlToken;
|
||||
/** @internal */ _bosString;
|
||||
/** @internal */ _eosString;
|
||||
/** @internal */ _eotString;
|
||||
/** @internal */ _sepString;
|
||||
/** @internal */ _nlString;
|
||||
/** @internal */ _shouldPrependBosToken;
|
||||
/** @internal */ _shouldAppendEosToken;
|
||||
constructor(model, disposedState) {
|
||||
this._model = model;
|
||||
this._disposedState = disposedState;
|
||||
}
|
||||
/**
|
||||
* @returns infill tokens
|
||||
*/
|
||||
get infill() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._infillTokens == null)
|
||||
this._infillTokens = LlamaModelInfillTokens._create(this._model, this._disposedState);
|
||||
return this._infillTokens;
|
||||
}
|
||||
/**
|
||||
* @returns The BOS (Beginning Of Sequence) token.
|
||||
*/
|
||||
get bos() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._bosToken == null)
|
||||
this._bosToken = this._model.tokenBos();
|
||||
if (this._bosToken === -1)
|
||||
return null;
|
||||
return this._bosToken;
|
||||
}
|
||||
/**
|
||||
* @returns The EOS (End Of Sequence) token.
|
||||
*/
|
||||
get eos() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._eosToken == null)
|
||||
this._eosToken = this._model.tokenEos();
|
||||
if (this._eosToken === -1)
|
||||
return null;
|
||||
return this._eosToken;
|
||||
}
|
||||
/**
|
||||
* @returns The EOT (End Of Turn) token.
|
||||
*/
|
||||
get eot() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._eotToken == null)
|
||||
this._eotToken = this._model.eotToken();
|
||||
if (this._eotToken === -1)
|
||||
return null;
|
||||
return this._eotToken;
|
||||
}
|
||||
/**
|
||||
* @returns The SEP (Sentence Separator) token.
|
||||
*/
|
||||
get sep() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._sepToken == null)
|
||||
this._sepToken = this._model.sepToken();
|
||||
if (this._sepToken === -1)
|
||||
return null;
|
||||
return this._sepToken;
|
||||
}
|
||||
/**
|
||||
* @returns The NL (New Line) token.
|
||||
*/
|
||||
get nl() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._nlToken == null)
|
||||
this._nlToken = this._model.tokenNl();
|
||||
if (this._nlToken === -1)
|
||||
return null;
|
||||
return this._nlToken;
|
||||
}
|
||||
/**
|
||||
* @returns The BOS (Beginning Of Sequence) token text representation.
|
||||
*/
|
||||
get bosString() {
|
||||
this._ensureNotDisposed();
|
||||
const bosToken = this.bos;
|
||||
if (bosToken == null)
|
||||
return null;
|
||||
if (this._bosString == null)
|
||||
this._bosString = this._model.getTokenString(bosToken);
|
||||
return this._bosString;
|
||||
}
|
||||
/**
|
||||
* @returns The EOS (End Of Sequence) token text representation.
|
||||
*/
|
||||
get eosString() {
|
||||
this._ensureNotDisposed();
|
||||
const eosToken = this.eos;
|
||||
if (eosToken == null)
|
||||
return null;
|
||||
if (this._eosString == null)
|
||||
this._eosString = this._model.getTokenString(eosToken);
|
||||
return this._eosString;
|
||||
}
|
||||
/**
|
||||
* @returns The EOT (End Of Turn) token text representation.
|
||||
*/
|
||||
get eotString() {
|
||||
this._ensureNotDisposed();
|
||||
const eotToken = this.eot;
|
||||
if (eotToken == null)
|
||||
return null;
|
||||
if (this._eotString == null)
|
||||
this._eotString = this._model.getTokenString(eotToken);
|
||||
return this._eotString;
|
||||
}
|
||||
/**
|
||||
* @returns The SEP (Sentence Separator) token text representation.
|
||||
*/
|
||||
get sepString() {
|
||||
this._ensureNotDisposed();
|
||||
const sepToken = this.sep;
|
||||
if (sepToken == null)
|
||||
return null;
|
||||
if (this._sepString == null)
|
||||
this._sepString = this._model.getTokenString(sepToken);
|
||||
return this._sepString;
|
||||
}
|
||||
/**
|
||||
* @returns The NL (New Line) token text representation.
|
||||
*/
|
||||
get nlString() {
|
||||
this._ensureNotDisposed();
|
||||
const nlToken = this.nl;
|
||||
if (nlToken == null)
|
||||
return null;
|
||||
if (this._nlString == null)
|
||||
this._nlString = this._model.getTokenString(nlToken);
|
||||
return this._nlString;
|
||||
}
|
||||
/**
|
||||
* @returns Whether we should prepend a BOS (Beginning Of Sequence) token for evaluations with this model.
|
||||
*/
|
||||
get shouldPrependBosToken() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._shouldPrependBosToken == null)
|
||||
this._shouldPrependBosToken = this.bos != null && this._model.shouldPrependBosToken();
|
||||
return this._shouldPrependBosToken;
|
||||
}
|
||||
/**
|
||||
* @returns Whether we should append an EOS (End Of Sequence) token for evaluations with this model.
|
||||
*/
|
||||
get shouldAppendEosToken() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._shouldAppendEosToken == null)
|
||||
this._shouldAppendEosToken = this.eos != null && this._model.shouldAppendEosToken();

|
||||
return this._shouldAppendEosToken;
|
||||
}
|
||||
/** @internal */
|
||||
_ensureNotDisposed() {
|
||||
if (this._disposedState.disposed)
|
||||
throw new DisposedError();
|
||||
}
|
||||
/** @internal */
|
||||
static _create(model, disposedState) {
|
||||
return new LlamaModelTokens(model, disposedState);
|
||||
}
|
||||
}
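// Illustrative usage sketch (not part of the generated file; assumes a loaded `model`): reading the
// special tokens exposed by LlamaModelTokens and honoring `shouldPrependBosToken` before evaluation.
//
// const promptTokens = model.tokenize("Hello");
// if (model.tokens.shouldPrependBosToken && model.tokens.bos != null)
//     promptTokens.unshift(model.tokens.bos);
// console.log("BOS:", model.tokens.bosString, "EOS:", model.tokens.eosString);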
|
||||
export class LlamaModelInfillTokens {
|
||||
/** @internal */ _model;
|
||||
/** @internal */ _disposedState;
|
||||
/** @internal */ _prefixToken;
|
||||
/** @internal */ _middleToken;
|
||||
/** @internal */ _suffixToken;
|
||||
/** @internal */ _prefixString;
|
||||
/** @internal */ _middleString;
|
||||
/** @internal */ _suffixString;
|
||||
constructor(model, disposedState) {
|
||||
this._model = model;
|
||||
this._disposedState = disposedState;
|
||||
}
|
||||
/**
|
||||
* @returns The beginning of infill prefix token.
|
||||
*/
|
||||
get prefix() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._prefixToken == null)
|
||||
this._prefixToken = this._resolveSpecialToken(this._model.prefixToken(), ["<fim_prefix>"]);
|
||||
if (this._prefixToken === -1)
|
||||
return null;
|
||||
return this._prefixToken;
|
||||
}
|
||||
/**
|
||||
* @returns The beginning of infill middle token.
|
||||
*/
|
||||
get middle() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._middleToken == null)
|
||||
this._middleToken = this._resolveSpecialToken(this._model.middleToken(), ["<fim_middle>"]);
|
||||
if (this._middleToken === -1)
|
||||
return null;
|
||||
return this._middleToken;
|
||||
}
|
||||
/**
|
||||
* @returns The beginning of infill suffix token.
|
||||
*/
|
||||
get suffix() {
|
||||
this._ensureNotDisposed();
|
||||
if (this._suffixToken == null)
|
||||
this._suffixToken = this._resolveSpecialToken(this._model.suffixToken(), ["<fim_suffix>"]);
|
||||
if (this._suffixToken === -1)
|
||||
return null;
|
||||
return this._suffixToken;
|
||||
}
|
||||
/**
|
||||
* @returns The beginning of infill prefix token as a string.
|
||||
*/
|
||||
get prefixString() {
|
||||
this._ensureNotDisposed();
|
||||
const prefixToken = this.prefix;
|
||||
if (prefixToken == null)
|
||||
return null;
|
||||
if (this._prefixString == null)
|
||||
this._prefixString = this._model.getTokenString(prefixToken);
|
||||
return this._prefixString;
|
||||
}
|
||||
/**
|
||||
* @returns The beginning of infill middle token as a string.
|
||||
*/
|
||||
get middleString() {
|
||||
this._ensureNotDisposed();
|
||||
const middleToken = this.middle;
|
||||
if (middleToken == null)
|
||||
return null;
|
||||
if (this._middleString == null)
|
||||
this._middleString = this._model.getTokenString(middleToken);
|
||||
return this._middleString;
|
||||
}
|
||||
/**
|
||||
* @returns The beginning of infill suffix token as a string.
|
||||
*/
|
||||
get suffixString() {
|
||||
this._ensureNotDisposed();
|
||||
const suffixToken = this.suffix;
|
||||
if (suffixToken == null)
|
||||
return null;
|
||||
if (this._suffixString == null)
|
||||
this._suffixString = this._model.getTokenString(suffixToken);
|
||||
return this._suffixString;
|
||||
}
|
||||
/** @internal */
|
||||
_ensureNotDisposed() {
|
||||
if (this._disposedState.disposed)
|
||||
throw new DisposedError();
|
||||
}
|
||||
/** @internal */
|
||||
_resolveSpecialToken(token, fallbackTexts) {
|
||||
if (token != null && token !== -1)
|
||||
return token;
|
||||
for (const text of fallbackTexts) {
|
||||
const tokens = this._model.tokenize(text, true);
|
||||
if (tokens.length !== 1)
|
||||
continue;
|
||||
return tokens[0];
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
/** @internal */
|
||||
static _create(model, disposedState) {
|
||||
return new LlamaModelInfillTokens(model, disposedState);
|
||||
}
|
||||
}
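// Illustrative fill-in-the-middle (FIM) sketch (not part of the generated file; assumes a loaded `model`
// whose tokenizer defines infill tokens). The prefix/suffix/middle ordering below is one common FIM
// layout and should be treated as an assumption rather than a universal rule.
//
// const {prefix, suffix, middle} = model.tokens.infill;
// if (prefix != null && suffix != null && middle != null) {
//     const fimTokens = [
//         prefix, ...model.tokenize("function add(a, b) {\n    "),
//         suffix, ...model.tokenize("\n}"),
//         middle
//     ];
//     // evaluate `fimTokens` on a context sequence to generate the missing middle section
// }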
|
||||
function applyGgufMetadataOverrides(ggufFileInfo, overrides) {
|
||||
function applyOverride(object, override) {
|
||||
if (override == null || object == null)
|
||||
return;
|
||||
if (object instanceof Array || typeof object !== "object" || typeof override !== "object")
|
||||
return;
|
||||
for (const [key, value] of Object.entries(override)) {
|
||||
if (value instanceof Array || typeof value !== "object" || (typeof value === "object" && typeof object[key] !== "object"))
|
||||
object[key] = value;
|
||||
else
|
||||
applyOverride(object[key], value);
|
||||
}
|
||||
}
|
||||
applyOverride(ggufFileInfo.metadata, overrides);
|
||||
}
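// Worked example of the deep-merge behavior above (illustrative values):
//
// const fileInfo = {metadata: {general: {name: "original", architecture: "llama"}}};
// applyGgufMetadataOverrides(fileInfo, {general: {name: "patched"}});
// // fileInfo.metadata is now {general: {name: "patched", architecture: "llama"}}:
// // nested objects are merged key by key, while primitives and arrays replace the existing value.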
|
||||
function ggufMetadataOverridesToList(overrides) {
|
||||
const maxStringLength = 127;
|
||||
const maxKeyLength = 127;
|
||||
const res = [];
|
||||
function addItem(object, path) {
|
||||
if (object == null || object instanceof Array)
|
||||
return;
|
||||
if (typeof object !== "object") {
|
||||
if (typeof object === "string" && object.length > maxStringLength)
|
||||
throw new Error(`Metadata key "${path.join(".")}" override string value (${JSON.stringify(object)}) is longer than ${maxStringLength} characters`);
|
||||
const key = path.join(".");
|
||||
if (key.length > maxKeyLength)
|
||||
throw new Error(`Metadata key "${key}" override path is longer than ${maxKeyLength} characters`);
|
||||
let type = undefined;
|
||||
if (typeof object === "number") {
|
||||
if (typeof object === "bigint" || Number.isInteger(object))
|
||||
type = 0;
|
||||
else
|
||||
type = 1;
|
||||
}
|
||||
res.push([key, object, type]);
|
||||
return;
|
||||
}
|
||||
for (const [key, value] of Object.entries(object))
|
||||
addItem(value, [...path, key]);
|
||||
}
|
||||
addItem(overrides ?? {}, []);
|
||||
return res;
|
||||
}
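// Worked example of the flattening above (illustrative values):
//
// ggufMetadataOverridesToList({general: {name: "my model"}, llama: {context_length: 8192}});
// // => [["general.name", "my model", undefined], ["llama.context_length", 8192, 0]]
// // type 0 marks an integer override and type 1 a non-integer (float) override; other values carry no explicit type.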
|
||||
function disposeModelIfReferenced(modelRef) {
|
||||
const model = modelRef.deref();
|
||||
if (model != null)
|
||||
void model.dispose();
|
||||
}
|
||||
//# sourceMappingURL=LlamaModel.js.map
|
||||
1 node_modules/node-llama-cpp/dist/evaluator/LlamaModel/LlamaModel.js.map generated vendored Normal file
File diff suppressed because one or more lines are too long
29 node_modules/node-llama-cpp/dist/evaluator/LlamaModel/utils/TokenAttributes.d.ts generated vendored Normal file
@@ -0,0 +1,29 @@
|
||||
import { Token } from "../../../types.js";
|
||||
export declare const enum TokenAttribute {
|
||||
undefined = 0,
|
||||
unknown = 1,
|
||||
unused = 2,
|
||||
normal = 4,
|
||||
control = 8, // SPECIAL
|
||||
userDefined = 16,
|
||||
byte = 32,
|
||||
normalized = 64,
|
||||
lstrip = 128,
|
||||
rstrip = 256,
|
||||
singleWord = 512
|
||||
}
|
||||
export declare class TokenAttributes {
|
||||
readonly token: Token;
|
||||
private constructor();
|
||||
get undefined(): boolean;
|
||||
get unknown(): boolean;
|
||||
get unused(): boolean;
|
||||
get normal(): boolean;
|
||||
get control(): boolean;
|
||||
get userDefined(): boolean;
|
||||
get byte(): boolean;
|
||||
get normalized(): boolean;
|
||||
get lstrip(): boolean;
|
||||
get rstrip(): boolean;
|
||||
get singleWord(): boolean;
|
||||
}
|
||||
65 node_modules/node-llama-cpp/dist/evaluator/LlamaModel/utils/TokenAttributes.js generated vendored Normal file
@@ -0,0 +1,65 @@
|
||||
// updated against `enum llama_token_attr` from `llama.h`
|
||||
export var TokenAttribute;
|
||||
(function (TokenAttribute) {
|
||||
TokenAttribute[TokenAttribute["undefined"] = 0] = "undefined";
|
||||
TokenAttribute[TokenAttribute["unknown"] = 1] = "unknown";
|
||||
TokenAttribute[TokenAttribute["unused"] = 2] = "unused";
|
||||
TokenAttribute[TokenAttribute["normal"] = 4] = "normal";
|
||||
TokenAttribute[TokenAttribute["control"] = 8] = "control";
|
||||
TokenAttribute[TokenAttribute["userDefined"] = 16] = "userDefined";
|
||||
TokenAttribute[TokenAttribute["byte"] = 32] = "byte";
|
||||
TokenAttribute[TokenAttribute["normalized"] = 64] = "normalized";
|
||||
TokenAttribute[TokenAttribute["lstrip"] = 128] = "lstrip";
|
||||
TokenAttribute[TokenAttribute["rstrip"] = 256] = "rstrip";
|
||||
TokenAttribute[TokenAttribute["singleWord"] = 512] = "singleWord";
|
||||
})(TokenAttribute || (TokenAttribute = {}));
|
||||
export class TokenAttributes {
|
||||
token;
|
||||
/** @internal */ _attributes;
|
||||
constructor(token, attributes) {
|
||||
this.token = token;
|
||||
this._attributes = attributes;
|
||||
}
|
||||
get undefined() {
|
||||
return this._attributes === TokenAttribute.undefined;
|
||||
}
|
||||
get unknown() {
|
||||
return this._hasAttribute(TokenAttribute.unknown);
|
||||
}
|
||||
get unused() {
|
||||
return this._hasAttribute(TokenAttribute.unused);
|
||||
}
|
||||
get normal() {
|
||||
return this._hasAttribute(TokenAttribute.normal);
|
||||
}
|
||||
get control() {
|
||||
return this._hasAttribute(TokenAttribute.control);
|
||||
}
|
||||
get userDefined() {
|
||||
return this._hasAttribute(TokenAttribute.userDefined);
|
||||
}
|
||||
get byte() {
|
||||
return this._hasAttribute(TokenAttribute.byte);
|
||||
}
|
||||
get normalized() {
|
||||
return this._hasAttribute(TokenAttribute.normalized);
|
||||
}
|
||||
get lstrip() {
|
||||
return this._hasAttribute(TokenAttribute.lstrip);
|
||||
}
|
||||
get rstrip() {
|
||||
return this._hasAttribute(TokenAttribute.rstrip);
|
||||
}
|
||||
get singleWord() {
|
||||
return this._hasAttribute(TokenAttribute.singleWord);
|
||||
}
|
||||
/** @internal */
|
||||
_hasAttribute(attribute) {
|
||||
return (this._attributes & attribute) === attribute;
|
||||
}
|
||||
/** @internal */
|
||||
static _create(token, attributes) {
|
||||
return new TokenAttributes(token, attributes);
|
||||
}
|
||||
}
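// Illustrative bitmask check mirroring `_hasAttribute` above (values taken from the TokenAttribute enum):
//
// const attributes = TokenAttribute.control | TokenAttribute.normalized; // 8 | 64 === 72
// (attributes & TokenAttribute.control) === TokenAttribute.control;      // true
// (attributes & TokenAttribute.byte) === TokenAttribute.byte;            // false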
|
||||
//# sourceMappingURL=TokenAttributes.js.map
|
||||
1 node_modules/node-llama-cpp/dist/evaluator/LlamaModel/utils/TokenAttributes.js.map generated vendored Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"TokenAttributes.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaModel/utils/TokenAttributes.ts"],"names":[],"mappings":"AAEA,yDAAyD;AACzD,MAAM,CAAN,IAAkB,cAYjB;AAZD,WAAkB,cAAc;IAC5B,6DAAa,CAAA;IACb,yDAAgB,CAAA;IAChB,uDAAe,CAAA;IACf,uDAAe,CAAA;IACf,yDAAgB,CAAA;IAChB,kEAAoB,CAAA;IACpB,oDAAa,CAAA;IACb,gEAAmB,CAAA;IACnB,yDAAe,CAAA;IACf,yDAAe,CAAA;IACf,iEAAmB,CAAA;AACvB,CAAC,EAZiB,cAAc,KAAd,cAAc,QAY/B;AAED,MAAM,OAAO,eAAe;IACR,KAAK,CAAQ;IAC7B,gBAAgB,CAAkB,WAAW,CAAiB;IAE9D,YAAoB,KAAY,EAAE,UAA0B;QACxD,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,WAAW,GAAG,UAAU,CAAC;IAClC,CAAC;IAED,IAAW,SAAS;QAChB,OAAO,IAAI,CAAC,WAAW,KAAK,cAAc,CAAC,SAAS,CAAC;IACzD,CAAC;IAED,IAAW,OAAO;QACd,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC;IACtD,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC;IAED,IAAW,OAAO;QACd,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC;IACtD,CAAC;IAED,IAAW,WAAW;QAClB,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,WAAW,CAAC,CAAC;IAC1D,CAAC;IAED,IAAW,IAAI;QACX,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IACnD,CAAC;IAED,IAAW,UAAU;QACjB,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,UAAU,CAAC,CAAC;IACzD,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC;IAED,IAAW,MAAM;QACb,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC;IAED,IAAW,UAAU;QACjB,OAAO,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,UAAU,CAAC,CAAC;IACzD,CAAC;IAED,gBAAgB;IACR,aAAa,CAAC,SAAyB;QAC3C,OAAO,CAAC,IAAI,CAAC,WAAW,GAAG,SAAS,CAAC,KAAK,SAAS,CAAC;IACxD,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,OAAO,CAAC,KAAY,EAAE,UAA0B;QAC1D,OAAO,IAAI,eAAe,CAAC,KAAK,EAAE,UAAU,CAAC,CAAC;IAClD,CAAC;CACJ"}
|
||||
91 node_modules/node-llama-cpp/dist/evaluator/LlamaRankingContext.d.ts generated vendored Normal file
@@ -0,0 +1,91 @@
|
||||
import { EventRelay } from "lifecycle-utils";
|
||||
import { Token } from "../types.js";
|
||||
import { LlamaText } from "../utils/LlamaText.js";
|
||||
import type { LlamaModel } from "./LlamaModel/LlamaModel.js";
|
||||
export type LlamaRankingContextOptions = {
|
||||
/**
|
||||
* The number of tokens the model can see at once.
|
||||
* - **`"auto"`** - adapt to the current VRAM state and attempt to set the context size as high as possible up to the size
|
||||
* the model was trained on.
|
||||
* - **`number`** - set the context size to a specific number of tokens.
|
||||
* If there's not enough VRAM, an error will be thrown.
|
||||
* Use with caution.
|
||||
* - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attempt to set the context size as high as possible
|
||||
* up to the size the model was trained on, but at least `min` and at most `max`.
|
||||
*
|
||||
* Defaults to `"auto"`.
|
||||
*/
|
||||
contextSize?: "auto" | number | {
|
||||
min?: number;
|
||||
max?: number;
|
||||
};
|
||||
/** prompt processing batch size */
|
||||
batchSize?: number;
|
||||
/**
|
||||
* The number of threads to use to evaluate tokens.
* Set to 0 to use the maximum number of threads supported by the current machine hardware.
|
||||
*/
|
||||
threads?: number;
|
||||
/** An abort signal to abort the context creation */
|
||||
createSignal?: AbortSignal;
|
||||
/**
|
||||
* The template to use for the ranking evaluation.
|
||||
* If not provided, the model's template will be used by default.
|
||||
*
|
||||
* The template is tokenized with special tokens enabled, but the provided query and document are not.
|
||||
*
|
||||
* **<span v-pre>`{{query}}`</span>** is replaced with the query content.
|
||||
*
|
||||
* **<span v-pre>`{{document}}`</span>** is replaced with the document content.
|
||||
*
|
||||
* It's recommended to not set this option unless you know what you're doing.
|
||||
*
|
||||
* Defaults to the model's template.
|
||||
*/
|
||||
template?: `${string}{{query}}${string}{{document}}${string}` | `${string}{{document}}${string}{{query}}${string}`;
|
||||
/**
|
||||
* Ignore insufficient memory errors and continue with the context creation.
|
||||
* Can cause the process to crash if there's not enough VRAM for the new context.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
ignoreMemorySafetyChecks?: boolean;
|
||||
};
|
||||
/**
|
||||
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
|
||||
*/
|
||||
export declare class LlamaRankingContext {
|
||||
readonly onDispose: EventRelay<void>;
|
||||
private constructor();
|
||||
/**
|
||||
* Get the ranking score for a document for a query.
|
||||
*
|
||||
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
|
||||
* @returns a ranking score between 0 and 1 representing the probability that the document is relevant to the query.
|
||||
*/
|
||||
rank(query: Token[] | string | LlamaText, document: Token[] | string | LlamaText): Promise<number>;
|
||||
/**
|
||||
* Get the ranking scores for all the given documents for a query.
|
||||
*
|
||||
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
|
||||
* @returns an array of ranking scores between 0 and 1 representing the probability that the document is relevant to the query.
|
||||
*/
|
||||
rankAll(query: Token[] | string | LlamaText, documents: Array<Token[] | string | LlamaText>): Promise<number[]>;
|
||||
/**
|
||||
* Get the ranking scores for all the given documents for a query and sort them by score from highest to lowest.
|
||||
*
|
||||
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
|
||||
*/
|
||||
rankAndSort<const T extends string>(query: Token[] | string | LlamaText, documents: T[]): Promise<Array<{
|
||||
document: T;
|
||||
/**
|
||||
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
|
||||
*/
|
||||
score: number;
|
||||
}>>;
|
||||
dispose(): Promise<void>;
|
||||
/** @hidden */
|
||||
[Symbol.asyncDispose](): Promise<void>;
|
||||
get disposed(): boolean;
|
||||
get model(): LlamaModel;
|
||||
}
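// Illustrative usage sketch (not part of the generated file; assumes a loaded `model`, and the query
// and documents are placeholder values):
//
// const ranker = await model.createRankingContext();
// const sorted = await ranker.rankAndSort("Which planet is the largest?", [
//     "Jupiter is the largest planet in the solar system.",
//     "The capital of France is Paris."
// ]);
// console.log(sorted[0]?.document, sorted[0]?.score); // highest-scoring document first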
|
||||
178 node_modules/node-llama-cpp/dist/evaluator/LlamaRankingContext.js generated vendored Normal file
@@ -0,0 +1,178 @@
|
||||
import { AsyncDisposeAggregator, EventRelay, splitText, withLock } from "lifecycle-utils";
|
||||
import { tokenizeInput } from "../utils/tokenizeInput.js";
|
||||
import { resolveBeginningTokenToPrepend, resolveEndTokenToAppend } from "../utils/tokenizerUtils.js";
|
||||
import { isRankingTemplateValid, parseRankingTemplate } from "../gguf/insights/GgufInsights.js";
|
||||
/**
|
||||
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
|
||||
*/
|
||||
export class LlamaRankingContext {
|
||||
/** @internal */ _llamaContext;
|
||||
/** @internal */ _template;
|
||||
/** @internal */ _sequence;
|
||||
/** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
|
||||
onDispose = new EventRelay();
|
||||
constructor({ _llamaContext, _template }) {
|
||||
this._llamaContext = _llamaContext;
|
||||
this._template = _template;
|
||||
this._sequence = this._llamaContext.getSequence();
|
||||
this._disposeAggregator.add(this._llamaContext.onDispose.createListener(() => {
|
||||
void this._disposeAggregator.dispose();
|
||||
}));
|
||||
this._disposeAggregator.add(this.onDispose.dispatchEvent);
|
||||
this._disposeAggregator.add(async () => {
|
||||
await this._llamaContext.dispose();
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Get the ranking score for a document for a query.
|
||||
*
|
||||
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
|
||||
* @returns a ranking score between 0 and 1 representing the probability that the document is relevant to the query.
|
||||
*/
|
||||
async rank(query, document) {
|
||||
const resolvedInput = this._getEvaluationInput(query, document);
|
||||
if (resolvedInput.length > this._llamaContext.contextSize)
|
||||
throw new Error("The input length exceed the context size. " +
|
||||
`Try to increase the context size to at least ${resolvedInput.length + 1} ` +
|
||||
"or use another model that supports longer contexts.");
|
||||
return this._evaluateRankingForInput(resolvedInput);
|
||||
}
|
||||
/**
|
||||
* Get the ranking scores for all the given documents for a query.
|
||||
*
|
||||
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
|
||||
* @returns an array of ranking scores between 0 and 1 representing the probability that the document is relevant to the query.
|
||||
*/
|
||||
async rankAll(query, documents) {
|
||||
const resolvedTokens = documents.map((document) => this._getEvaluationInput(query, document));
|
||||
const maxInputTokensLength = resolvedTokens.reduce((max, tokens) => Math.max(max, tokens.length), 0);
|
||||
if (maxInputTokensLength > this._llamaContext.contextSize)
|
||||
throw new Error("The input lengths of some of the given documents exceed the context size. " +
|
||||
`Try to increase the context size to at least ${maxInputTokensLength + 1} ` +
|
||||
"or use another model that supports longer contexts.");
|
||||
else if (resolvedTokens.length === 0)
|
||||
return [];
|
||||
return await Promise.all(resolvedTokens.map((tokens) => this._evaluateRankingForInput(tokens)));
|
||||
}
|
||||
/**
|
||||
* Get the ranking scores for all the given documents for a query and sort them by score from highest to lowest.
|
||||
*
|
||||
* A ranking score is a number between 0 and 1 representing the probability that the document is relevant to the query.
|
||||
*/
|
||||
async rankAndSort(query, documents) {
|
||||
const scores = await this.rankAll(query, documents);
|
||||
return documents
|
||||
.map((document, index) => ({ document: document, score: scores[index] }))
|
||||
.sort((a, b) => b.score - a.score);
|
||||
}
|
||||
async dispose() {
|
||||
await this._disposeAggregator.dispose();
|
||||
}
|
||||
/** @hidden */
|
||||
[Symbol.asyncDispose]() {
|
||||
return this.dispose();
|
||||
}
|
||||
get disposed() {
|
||||
return this._llamaContext.disposed;
|
||||
}
|
||||
get model() {
|
||||
return this._llamaContext.model;
|
||||
}
|
||||
/** @internal */
|
||||
_getEvaluationInput(query, document) {
|
||||
if (this._template != null) {
|
||||
const resolvedInput = splitText(this._template, ["{{query}}", "{{document}}"])
|
||||
.flatMap((item) => {
|
||||
if (typeof item === "string")
|
||||
return this._llamaContext.model.tokenize(item, true, "trimLeadingSpace");
|
||||
else if (item.separator === "{{query}}")
|
||||
return tokenizeInput(query, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
|
||||
else if (item.separator === "{{document}}")
|
||||
return tokenizeInput(document, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
|
||||
else
void item;
return [];
|
||||
});
|
||||
const beginningTokens = resolveBeginningTokenToPrepend(this.model.vocabularyType, this.model.tokens);
|
||||
const endToken = resolveEndTokenToAppend(this.model.vocabularyType, this.model.tokens);
|
||||
if (beginningTokens != null && resolvedInput.at(0) !== beginningTokens)
|
||||
resolvedInput.unshift(beginningTokens);
|
||||
if (endToken != null && resolvedInput.at(-1) !== endToken)
|
||||
resolvedInput.push(endToken);
|
||||
return resolvedInput;
|
||||
}
|
||||
if (this.model.tokens.eos == null && this.model.tokens.sep == null)
|
||||
throw new Error("Computing rankings is not supported for this model.");
|
||||
const resolvedQuery = tokenizeInput(query, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
|
||||
const resolvedDocument = tokenizeInput(document, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
|
||||
if (resolvedQuery.length === 0 && resolvedDocument.length === 0)
|
||||
return [];
|
||||
const resolvedInput = [
|
||||
...(this.model.tokens.bos == null ? [] : [this.model.tokens.bos]),
|
||||
...resolvedQuery,
|
||||
...(this.model.tokens.eos == null ? [] : [this.model.tokens.eos]),
|
||||
...(this.model.tokens.sep == null ? [] : [this.model.tokens.sep]),
|
||||
...resolvedDocument,
|
||||
...(this.model.tokens.eos == null ? [] : [this.model.tokens.eos])
|
||||
];
|
||||
return resolvedInput;
|
||||
}
|
||||
/** @internal */
|
||||
_evaluateRankingForInput(input) {
|
||||
if (input.length === 0)
|
||||
return Promise.resolve(0);
|
||||
return withLock([this, "evaluate"], async () => {
|
||||
await this._sequence.eraseContextTokenRanges([{
|
||||
start: 0,
|
||||
end: this._sequence.nextTokenIndex
|
||||
}]);
|
||||
const iterator = this._sequence.evaluate(input, { _noSampling: true });
|
||||
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
||||
for await (const token of iterator) {
|
||||
break; // only generate one token to get embeddings
|
||||
}
|
||||
const embedding = this._llamaContext._ctx.getEmbedding(input.length, 1);
|
||||
if (embedding.length === 0)
|
||||
return 0;
|
||||
const logit = embedding[0];
|
||||
const probability = logitToSigmoid(logit);
|
||||
return probability;
|
||||
});
|
||||
}
|
||||
/** @internal */
|
||||
static async _create({ _model }, { contextSize, batchSize, threads = 6, createSignal, template, ignoreMemorySafetyChecks }) {
|
||||
const resolvedTemplate = template ?? parseRankingTemplate(_model.fileInfo.metadata?.tokenizer?.["chat_template.rerank"]);
|
||||
if (_model.tokens.eos == null && _model.tokens.sep == null) {
|
||||
if (!isRankingTemplateValid(resolvedTemplate)) {
|
||||
if (resolvedTemplate === _model.fileInfo.metadata?.tokenizer?.["chat_template.rerank"])
|
||||
throw new Error("The model's builtin template is invalid. It must contain both {query} and {document} placeholders.");
|
||||
else
|
||||
throw new Error("The provided template is invalid. It must contain both {{query}} and {{document}} placeholders.");
|
||||
}
|
||||
else if (resolvedTemplate == null)
|
||||
throw new Error("Computing rankings is not supported for this model.");
|
||||
}
|
||||
if (_model.fileInsights.hasEncoder && _model.fileInsights.hasDecoder)
|
||||
throw new Error("Computing rankings is not supported for encoder-decoder models.");
|
||||
if (!_model.fileInsights.supportsRanking)
|
||||
throw new Error("Computing rankings is not supported for this model.");
|
||||
const llamaContext = await _model.createContext({
|
||||
contextSize,
|
||||
batchSize,
|
||||
threads,
|
||||
createSignal,
|
||||
ignoreMemorySafetyChecks,
|
||||
_embeddings: true,
|
||||
_ranking: true
|
||||
});
|
||||
return new LlamaRankingContext({
|
||||
_llamaContext: llamaContext,
|
||||
_template: resolvedTemplate
|
||||
});
|
||||
}
|
||||
}
|
||||
function logitToSigmoid(logit) {
|
||||
return 1 / (1 + Math.exp(-logit));
|
||||
}
|
||||
//# sourceMappingURL=LlamaRankingContext.js.map
|
||||
1 node_modules/node-llama-cpp/dist/evaluator/LlamaRankingContext.js.map generated vendored Normal file
File diff suppressed because one or more lines are too long
37 node_modules/node-llama-cpp/dist/evaluator/TokenBias.d.ts generated vendored Normal file
@@ -0,0 +1,37 @@
|
||||
import { Token, Tokenizer } from "../types.js";
|
||||
import { LlamaText } from "../utils/LlamaText.js";
|
||||
import type { LlamaModel } from "./LlamaModel/LlamaModel.js";
|
||||
/**
|
||||
* @see [Using Token Bias](https://node-llama-cpp.withcat.ai/guide/token-bias) tutorial
|
||||
*/
|
||||
export declare class TokenBias {
|
||||
constructor(tokenizer: Tokenizer);
|
||||
/**
|
||||
* Adjust the bias of the given token(s).
|
||||
*
|
||||
* If a text is provided, the bias will be applied to each individual token in the text.
|
||||
*
|
||||
* Setting a bias to `"never"` will prevent the token from being generated, unless it is required to comply with a grammar.
|
||||
*
|
||||
* Setting the bias of the EOS or EOT tokens to `"never"` has no effect and will be ignored.
|
||||
* @param input - The token(s) to apply the bias to
|
||||
* @param bias - The probability bias to apply to the token(s).
|
||||
*
|
||||
* Setting to a positive number increases the probability of the token(s) being generated.
|
||||
*
|
||||
* Setting to a negative number decreases the probability of the token(s) being generated.
|
||||
*
|
||||
* Setting to `0` has no effect.
|
||||
*
|
||||
* For example, setting to `0.5` will increase the probability of the token(s) being generated by 50%.
|
||||
* Setting to `-0.5` will decrease the probability of the token(s) being generated by 50%.
|
||||
*
|
||||
* Setting to `"never"` will prevent the token from being generated, unless it is required to comply with a grammar.
|
||||
*
|
||||
* Try to play around with values between `0.9` and `-0.9` to see what works for your use case.
|
||||
*/
|
||||
set(input: Token | Token[] | string | LlamaText, bias: "never" | number | {
|
||||
logit: number;
|
||||
}): this;
|
||||
static for(modelOrTokenizer: LlamaModel | Tokenizer): TokenBias;
|
||||
}
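// Illustrative usage sketch (not part of the generated file; assumes a loaded `model`, and the biased
// texts are placeholders):
//
// const bias = TokenBias.for(model)
//     .set("as an AI language model", "never") // never generate these tokens
//     .set("hello", 0.8);                      // make these tokens more likely
// // pass the resulting `bias` to a generation call that accepts a token bias option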
|
||||
68 node_modules/node-llama-cpp/dist/evaluator/TokenBias.js generated vendored Normal file
@@ -0,0 +1,68 @@
|
||||
import { tokenizeInput } from "../utils/tokenizeInput.js";
|
||||
/**
|
||||
* @see [Using Token Bias](https://node-llama-cpp.withcat.ai/guide/token-bias) tutorial
|
||||
*/
|
||||
export class TokenBias {
|
||||
/** @internal */ _tokenizer;
|
||||
/** @internal */ _biases = new Map();
|
||||
constructor(tokenizer) {
|
||||
this._tokenizer = tokenizer;
|
||||
}
|
||||
/**
|
||||
* Adjust the bias of the given token(s).
|
||||
*
|
||||
* If a text is provided, the bias will be applied to each individual token in the text.
|
||||
*
|
||||
* Setting a bias to `"never"` will prevent the token from being generated, unless it is required to comply with a grammar.
|
||||
*
|
||||
* Setting the bias of the EOS or EOT tokens to `"never"` has no effect and will be ignored.
|
||||
* @param input - The token(s) to apply the bias to
|
||||
* @param bias - The probability bias to apply to the token(s).
|
||||
*
|
||||
* Setting to a positive number increases the probability of the token(s) being generated.
|
||||
*
|
||||
* Setting to a negative number decreases the probability of the token(s) being generated.
|
||||
*
|
||||
* Setting to `0` has no effect.
|
||||
*
|
||||
* For example, setting to `0.5` will increase the probability of the token(s) being generated by 50%.
|
||||
* Setting to `-0.5` will decrease the probability of the token(s) being generated by 50%.
|
||||
*
|
||||
* Setting to `"never"` will prevent the token from being generated, unless it is required to comply with a grammar.
|
||||
*
|
||||
* Try to play around with values between `0.9` and `-0.9` to see what works for your use case.
|
||||
*/
|
||||
set(input, bias) {
|
||||
const resolvedLogit = bias === "never"
|
||||
? -Infinity
|
||||
: typeof bias === "number"
|
||||
? probabilityToLogit(bias)
|
||||
: bias.logit;
|
||||
for (const token of tokenizeInput(input, this._tokenizer)) {
|
||||
if (this._tokenizer.isEogToken(token))
|
||||
continue;
|
||||
this._biases.set(token, resolvedLogit);
|
||||
}
|
||||
for (const token of tokenizeInput(input, this._tokenizer, "trimLeadingSpace")) {
|
||||
if (this._tokenizer.isEogToken(token))
|
||||
continue;
|
||||
this._biases.set(token, resolvedLogit);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
static for(modelOrTokenizer) {
|
||||
if (modelOrTokenizer.tokenizer != null)
|
||||
return new TokenBias(modelOrTokenizer.tokenizer);
|
||||
return new TokenBias(modelOrTokenizer);
|
||||
}
|
||||
}
|
||||
function probabilityToLogit(probability) {
|
||||
if (probability <= -1)
|
||||
return -Infinity;
|
||||
else if (probability >= 1)
|
||||
return Infinity;
|
||||
else if (probability === 0)
|
||||
return 0;
|
||||
return Math.log(probability / (1 - probability));
|
||||
}
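// The conversion above is the standard log-odds (inverse sigmoid) mapping, for example:
// probabilityToLogit(0.9) === Math.log(9) (about 2.197)
// probabilityToLogit(0.5) === Math.log(1) === 0
// probabilityToLogit(1) === Infinity and probabilityToLogit(-1) === -Infinity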
|
||||
//# sourceMappingURL=TokenBias.js.map
|
||||
1 node_modules/node-llama-cpp/dist/evaluator/TokenBias.js.map generated vendored Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"TokenBias.js","sourceRoot":"","sources":["../../src/evaluator/TokenBias.ts"],"names":[],"mappings":"AAEA,OAAO,EAAC,aAAa,EAAC,MAAM,2BAA2B,CAAC;AAGxD;;GAEG;AACH,MAAM,OAAO,SAAS;IAClB,gBAAgB,CAAiB,UAAU,CAAY;IACvD,gBAAgB,CAAiB,OAAO,GAAG,IAAI,GAAG,EAAiB,CAAC;IAEpE,YAAmB,SAAoB;QACnC,IAAI,CAAC,UAAU,GAAG,SAAS,CAAC;IAChC,CAAC;IAED;;;;;;;;;;;;;;;;;;;;;;;OAuBG;IACI,GAAG,CAAC,KAA2C,EAAE,IAAwC;QAC5F,MAAM,aAAa,GAAG,IAAI,KAAK,OAAO;YAClC,CAAC,CAAC,CAAC,QAAQ;YACX,CAAC,CAAC,OAAO,IAAI,KAAK,QAAQ;gBACtB,CAAC,CAAC,kBAAkB,CAAC,IAAI,CAAC;gBAC1B,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC;QAErB,KAAK,MAAM,KAAK,IAAI,aAAa,CAAC,KAAK,EAAE,IAAI,CAAC,UAAU,CAAC,EAAE,CAAC;YACxD,IAAI,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,KAAK,CAAC;gBACjC,SAAS;YAEb,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAC3C,CAAC;QAED,KAAK,MAAM,KAAK,IAAI,aAAa,CAAC,KAAK,EAAE,IAAI,CAAC,UAAU,EAAE,kBAAkB,CAAC,EAAE,CAAC;YAC5E,IAAI,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,KAAK,CAAC;gBACjC,SAAS;YAEb,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAC3C,CAAC;QAED,OAAO,IAAI,CAAC;IAChB,CAAC;IAEM,MAAM,CAAC,GAAG,CAAC,gBAAwC;QACtD,IAAK,gBAA+B,CAAC,SAAS,IAAI,IAAI;YAClD,OAAO,IAAI,SAAS,CAAE,gBAA+B,CAAC,SAAS,CAAC,CAAC;QAErE,OAAO,IAAI,SAAS,CAAC,gBAA6B,CAAC,CAAC;IACxD,CAAC;CACJ;AAED,SAAS,kBAAkB,CAAC,WAAmB;IAC3C,IAAI,WAAW,IAAI,CAAC,CAAC;QACjB,OAAO,CAAC,QAAQ,CAAC;SAChB,IAAI,WAAW,IAAI,CAAC;QACrB,OAAO,QAAQ,CAAC;SACf,IAAI,WAAW,KAAK,CAAC;QACtB,OAAO,CAAC,CAAC;IAEb,OAAO,IAAI,CAAC,GAAG,CAAC,WAAW,GAAG,CAAC,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC;AACrD,CAAC"}
|
||||
45 node_modules/node-llama-cpp/dist/evaluator/TokenMeter.d.ts generated vendored Normal file
@@ -0,0 +1,45 @@
|
||||
/**
|
||||
* Tracks the usage of tokens.
|
||||
*/
|
||||
export declare class TokenMeter {
|
||||
private _inputTokens;
|
||||
private _outputTokens;
|
||||
/**
|
||||
* The number of input tokens used
|
||||
*/
|
||||
get usedInputTokens(): number;
|
||||
/**
|
||||
* The number of tokens generated by a model
|
||||
*/
|
||||
get usedOutputTokens(): number;
|
||||
/**
|
||||
* Get the current state of the token meter
|
||||
*/
|
||||
getState(): TokenMeterState;
|
||||
/**
|
||||
* Log the usage of tokens
|
||||
*/
|
||||
useTokens(tokens: number, type: "input" | "output"): void;
|
||||
/**
|
||||
* Get the difference between the current meter and another meter
|
||||
*/
|
||||
diff(meter: TokenMeter | TokenMeterState): {
|
||||
usedInputTokens: number;
|
||||
usedOutputTokens: number;
|
||||
};
|
||||
/**
|
||||
* Log the usage of tokens on multiple meters
|
||||
*/
|
||||
static useTokens(meters: null | undefined | TokenMeter | readonly TokenMeter[] | ReadonlySet<TokenMeter>, tokens: number, type: "input" | "output"): void;
|
||||
/**
|
||||
* Get the difference between two meters
|
||||
*/
|
||||
static diff(meter1: TokenMeter | TokenMeterState, meter2: TokenMeter | TokenMeterState): {
|
||||
usedInputTokens: number;
|
||||
usedOutputTokens: number;
|
||||
};
|
||||
}
|
||||
export type TokenMeterState = {
|
||||
usedInputTokens: number;
|
||||
usedOutputTokens: number;
|
||||
};
|
||||
74 node_modules/node-llama-cpp/dist/evaluator/TokenMeter.js generated vendored Normal file
@@ -0,0 +1,74 @@
|
||||
/**
|
||||
* Tracks the usage of tokens.
|
||||
*/
|
||||
export class TokenMeter {
|
||||
_inputTokens = 0;
|
||||
_outputTokens = 0;
|
||||
/**
|
||||
* The number of input tokens used
|
||||
*/
|
||||
get usedInputTokens() {
|
||||
return this._inputTokens;
|
||||
}
|
||||
/**
|
||||
* The number of tokens generated by a model
|
||||
*/
|
||||
get usedOutputTokens() {
|
||||
return this._outputTokens;
|
||||
}
|
||||
/**
|
||||
* Get the current state of the token meter
|
||||
*/
|
||||
getState() {
|
||||
return {
|
||||
usedInputTokens: this.usedInputTokens,
|
||||
usedOutputTokens: this.usedOutputTokens
|
||||
};
|
||||
}
|
||||
/**
|
||||
* Log the usage of tokens
|
||||
*/
|
||||
useTokens(tokens, type) {
|
||||
if (tokens < 0)
|
||||
throw new RangeError("Tokens cannot be negative");
|
||||
else if (tokens === 0)
|
||||
return;
|
||||
if (type === "input")
|
||||
this._inputTokens += tokens;
|
||||
else if (type === "output")
|
||||
this._outputTokens += tokens;
|
||||
else {
|
||||
void type;
|
||||
throw new TypeError(`Unknown token type: ${type}`);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Get the difference between the current meter and another meter
|
||||
*/
|
||||
diff(meter) {
|
||||
return TokenMeter.diff(this, meter);
|
||||
}
|
||||
/**
|
||||
* Log the usage of tokens on multiple meters
|
||||
*/
|
||||
static useTokens(meters, tokens, type) {
|
||||
if (meters == null)
|
||||
return;
|
||||
if (meters instanceof TokenMeter)
|
||||
meters.useTokens(tokens, type);
|
||||
else {
|
||||
for (const meter of meters)
|
||||
meter.useTokens(tokens, type);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Get the difference between two meters
|
||||
*/
|
||||
static diff(meter1, meter2) {
|
||||
return {
|
||||
usedInputTokens: meter1.usedInputTokens - meter2.usedInputTokens,
|
||||
usedOutputTokens: meter1.usedOutputTokens - meter2.usedOutputTokens
|
||||
};
|
||||
}
|
||||
}
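// Illustrative usage sketch (not part of the generated file; values are placeholders):
//
// const meter = new TokenMeter();
// meter.useTokens(12, "input");
// meter.useTokens(30, "output");
// TokenMeter.diff(meter, {usedInputTokens: 10, usedOutputTokens: 5});
// // => {usedInputTokens: 2, usedOutputTokens: 25}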
|
||||
//# sourceMappingURL=TokenMeter.js.map
|
||||
1 node_modules/node-llama-cpp/dist/evaluator/TokenMeter.js.map generated vendored Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"TokenMeter.js","sourceRoot":"","sources":["../../src/evaluator/TokenMeter.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,MAAM,OAAO,UAAU;IACX,YAAY,GAAW,CAAC,CAAC;IACzB,aAAa,GAAW,CAAC,CAAC;IAElC;;OAEG;IACH,IAAW,eAAe;QACtB,OAAO,IAAI,CAAC,YAAY,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,IAAW,gBAAgB;QACvB,OAAO,IAAI,CAAC,aAAa,CAAC;IAC9B,CAAC;IAED;;OAEG;IACI,QAAQ;QACX,OAAO;YACH,eAAe,EAAE,IAAI,CAAC,eAAe;YACrC,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;SAC1C,CAAC;IACN,CAAC;IAED;;OAEG;IACI,SAAS,CAAC,MAAc,EAAE,IAAwB;QACrD,IAAI,MAAM,GAAG,CAAC;YACV,MAAM,IAAI,UAAU,CAAC,2BAA2B,CAAC,CAAC;aACjD,IAAI,MAAM,KAAK,CAAC;YACjB,OAAO;QAEX,IAAI,IAAI,KAAK,OAAO;YAChB,IAAI,CAAC,YAAY,IAAI,MAAM,CAAC;aAC3B,IAAI,IAAI,KAAK,QAAQ;YACtB,IAAI,CAAC,aAAa,IAAI,MAAM,CAAC;aAC5B,CAAC;YACF,KAAM,IAAqB,CAAC;YAC5B,MAAM,IAAI,SAAS,CAAC,uBAAuB,IAAI,EAAE,CAAC,CAAC;QACvD,CAAC;IACL,CAAC;IAED;;OAEG;IACI,IAAI,CAAC,KAAmC;QAC3C,OAAO,UAAU,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;IACxC,CAAC;IAED;;OAEG;IACI,MAAM,CAAC,SAAS,CACnB,MAAuF,EACvF,MAAc,EACd,IAAwB;QAExB,IAAI,MAAM,IAAI,IAAI;YACd,OAAO;QAEX,IAAI,MAAM,YAAY,UAAU;YAC5B,MAAM,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;aAC9B,CAAC;YACF,KAAK,MAAM,KAAK,IAAI,MAAM;gBACtB,KAAK,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;QACtC,CAAC;IACL,CAAC;IAED;;OAEG;IACI,MAAM,CAAC,IAAI,CACd,MAAoC,EACpC,MAAoC;QAEpC,OAAO;YACH,eAAe,EAAE,MAAM,CAAC,eAAe,GAAG,MAAM,CAAC,eAAe;YAChE,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,GAAG,MAAM,CAAC,gBAAgB;SACtE,CAAC;IACN,CAAC;CACJ"}
|
||||
86 node_modules/node-llama-cpp/dist/evaluator/utils/chunkDocument.d.ts generated vendored Normal file
@@ -0,0 +1,86 @@
|
||||
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
|
||||
import { Token, Tokenizer } from "../../types.js";
|
||||
import { LlamaText } from "../../utils/LlamaText.js";
|
||||
/**
|
||||
* Chunk the given document using a given context sequence to use the chunks for RAG (Retrieval Augmented Generation) embeddings.
|
||||
*
|
||||
* This chunking method is fast and efficient, and utilizes as much parallelization as your hardware allows.
|
||||
*
|
||||
* Based on https://github.com/ZeroEntropy-AI/llama-chunk
|
||||
* @experimental - this API is experimental and may change or be removed in subsequent releases
|
||||
* @hidden
|
||||
*/
|
||||
export declare function experimentalChunkDocument(options: {
|
||||
contextSequence: LlamaContextSequence;
|
||||
document: string;
|
||||
/**
|
||||
* The tokens to use as separators for chunking the document.
|
||||
* Passed to the `getSystemPrompt` function to generate the prompt.
|
||||
*/
|
||||
separatorTokens?: Token[];
|
||||
getSystemPrompt?(options: {
|
||||
separatorTokens: Token[];
|
||||
tokenizer: Tokenizer;
|
||||
maxChunkSize?: number;
|
||||
}): LlamaText | string;
|
||||
/**
|
||||
* Maximum number of tokens to allow in a chunk.
|
||||
*
|
||||
* As the chunk size approaches this limit, the probability of a separator token being inserted increases.
|
||||
*
|
||||
* Set to `0` to disable this mechanism.
|
||||
*
|
||||
* Defaults to `500`.
|
||||
*/
|
||||
maxChunkSize?: number;
|
||||
/**
|
||||
* The alignment curve for the maximum chunk size mechanism.
|
||||
*
|
||||
* Adjust the value based on the behavior of the model.
|
||||
*
|
||||
* Play around with values between `1` and `4` to see what works best for you.
|
||||
*
|
||||
* Set to `1` to disable this mechanism.
|
||||
*
|
||||
* Defaults to `4`.
|
||||
*/
|
||||
maxChunkSizeAlignmentCurve?: number;
|
||||
/**
|
||||
* Append the next few tokens (up to `maxTokens`) to the current chunk if their trimmed content
|
||||
* matches any of the texts in `trimmedTexts`
|
||||
*/
|
||||
syntaxAlignment?: {
|
||||
/**
|
||||
* The maximum number of tokens to append to the current chunk if their trimmed content matches any of the texts in `trimmedTexts`.
|
||||
*
|
||||
* Default: `4`
|
||||
*/
|
||||
maxTokens?: number;
|
||||
/**
|
||||
* The trimmed texts to match for, to append the token to the current chunk.
|
||||
*
|
||||
* Default: `["", ".", ";"]`
|
||||
*/
|
||||
trimmedTexts?: string[];
|
||||
};
|
||||
/**
|
||||
* The number of tokens to skip before starting to use the generated separator tokens to split the document.
|
||||
*/
|
||||
skipFirstTokens?: number;
|
||||
/**
|
||||
* The number of recent probabilities to keep in the trail for normalization.
|
||||
*
|
||||
* Adjust the value based on the behavior of the model.
|
||||
*
|
||||
* Defaults to `200`.
|
||||
*/
|
||||
normalizationTrailSize?: number;
|
||||
/**
|
||||
* Called when a chunk is generated with the tokens that make up the chunk and the separator token used to split the chunk.
|
||||
*/
|
||||
onChunkTokens?(chunkTokens: Token[], usedSeparatorToken: Token): void;
|
||||
/**
|
||||
* Called when a chunk is generated with the text that makes up the chunk and the separator token used to split the chunk.
|
||||
*/
|
||||
onChunkText?(chunkText: string, usedSeparatorToken: Token): void;
|
||||
}): Promise<string[]>;
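// Illustrative usage sketch (not part of the generated file; assumes a loaded `model` and an existing
// context, and the option values are placeholders):
//
// const context = await model.createContext();
// const chunks = await experimentalChunkDocument({
//     contextSequence: context.getSequence(),
//     document: longDocumentText,
//     maxChunkSize: 400,
//     onChunkText(chunkText) {
//         console.log("chunk length:", chunkText.length);
//     }
// });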
|
||||
212 node_modules/node-llama-cpp/dist/evaluator/utils/chunkDocument.js generated vendored Normal file
@@ -0,0 +1,212 @@
|
||||
import { LlamaText, SpecialTokensText } from "../../utils/LlamaText.js";
|
||||
import { resolveChatWrapper } from "../../chatWrappers/utils/resolveChatWrapper.js";
|
||||
import { safeEventCallback } from "../../utils/safeEventCallback.js";
|
||||
import { maxRecentDetokenizerTokens } from "../../consts.js";
|
||||
/**
|
||||
* Chunk the given document using a given context sequence to use the chunks for RAG (Retrieval Augmented Generation) embeddings.
|
||||
*
|
||||
* This chunking method is fast and efficient, and utilizes as much parallelization as your hardware allows.
|
||||
*
|
||||
* Based on https://github.com/ZeroEntropy-AI/llama-chunk
|
||||
* @experimental - this API is experimental and may change or be removed in subsequent releases
|
||||
* @hidden
|
||||
*/
|
||||
export async function experimentalChunkDocument(options) {
|
||||
const { contextSequence, document, separatorTokens = findAppropriateSeparatorTokens(contextSequence.model), getSystemPrompt = getDefaultPrompt, maxChunkSize = 500, maxChunkSizeAlignmentCurve = 4, syntaxAlignment: { maxTokens: maxSyntaxAlignment = 4, trimmedTexts: syntaxAlignmentTrimmedTexts = ["", ".", ";"] } = {}, skipFirstTokens = 3, normalizationTrailSize = 100 } = options;
|
||||
const onChunkTokens = safeEventCallback(options.onChunkTokens);
|
||||
const onChunkText = safeEventCallback(options.onChunkText);
|
||||
if (separatorTokens.length === 0)
|
||||
throw new Error("Separator tokens must be provided");
|
||||
const chatHistory = [{
|
||||
type: "system",
|
||||
text: LlamaText(getSystemPrompt({
|
||||
separatorTokens,
|
||||
tokenizer: contextSequence.model.tokenizer,
|
||||
maxChunkSize: maxChunkSize <= 0
|
||||
? undefined
|
||||
: maxChunkSize
|
||||
})).toJSON()
|
||||
}, {
|
||||
type: "user",
|
||||
text: document
|
||||
}, {
|
||||
type: "model",
|
||||
response: [""]
|
||||
}];
|
||||
const chatWrapper = resolveChatWrapper(contextSequence.model);
|
||||
const { contextText } = chatWrapper.generateContextState({ chatHistory });
|
||||
const initialContextTokens = contextText.tokenize(contextSequence.model.tokenizer, "trimLeadingSpace");
|
||||
const documentTokens = contextSequence.model.tokenize(document, false, "trimLeadingSpace");
|
||||
const syntaxAlignmentTrimmedTextsSet = new Set(syntaxAlignmentTrimmedTexts);
|
||||
if (initialContextTokens.length + documentTokens.length > contextSequence.context.contextSize)
|
||||
throw new Error("The context size is too small to chunk the given document");
|
||||
const evaluateInput = initialContextTokens.slice();
|
||||
for (let i = 0; i < documentTokens.length - 1; i++) {
|
||||
const token = documentTokens[i];
|
||||
evaluateInput.push([token, {
|
||||
generateNext: {
|
||||
probabilities: true
|
||||
}
|
||||
}]);
|
||||
}
|
||||
let weight = 1;
|
||||
const recentProbabilitiesTrail = [];
|
||||
let chunkStartIndex = 0;
|
||||
let lastPushedSeparatorIndex = 0;
|
||||
const chunks = [];
|
||||
const res = [];
    function pushSeparatorIndex(separateIndex, separatorToken) {
        lastPushedSeparatorIndex = separateIndex;
        if (separateIndex <= chunkStartIndex)
            return;
        let endIndex = separateIndex;
        for (let i = 0; i < maxSyntaxAlignment && documentTokens[endIndex + i] != null; i++) {
            const text = contextSequence.model.detokenize([documentTokens[endIndex + i]]);
            if (!syntaxAlignmentTrimmedTextsSet.has(text.trim()))
                break;
            endIndex++;
        }
        const chunk = documentTokens.slice(chunkStartIndex, endIndex);
        const text = contextSequence.model.detokenize(chunk, false, documentTokens.slice(chunkStartIndex - maxRecentDetokenizerTokens, chunkStartIndex));
        chunks.push(chunk);
        chunkStartIndex = endIndex;
        onChunkTokens?.(chunk, separatorToken);
        onChunkText?.(text, separatorToken);
        res.push(text);
    }
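    // Evaluate the prompt and document tokens in a single pass. For each document token, `onTokenResult`
    // compares the probability of a separator token being generated next against the probability of the
    // most likely next token, and splits the document when the separator wins.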
    await contextSequence.controlledEvaluate(evaluateInput, {
        onTokenResult(inputTokenIndex, result) {
            const i = inputTokenIndex - initialContextTokens.length;
            const nextProbabilities = result?.next?.probabilities;
            const nextDocumentToken = documentTokens[i + 1];
            if (nextProbabilities == null)
                throw new Error("received no result for token " + i);
            const topProbabilityScore = nextProbabilities.entries()
                .next().value?.[1];
            const [usedSeparatorToken, separatorProbability] = separatorTokens
                .filter((token) => token !== nextDocumentToken) // avoid splitting on document tokens
                .map((token) => [token, nextProbabilities.get(token)])
                .filter((pair) => pair[1] != null)
                .reduce(([tokenA, probabilityA], [tokenB, probabilityB]) => {
                    if (probabilityA >= probabilityB)
                        return [tokenA, probabilityA];
                    return [tokenB, probabilityB];
                }, [separatorTokens[0], 0]);
            if (topProbabilityScore == null || separatorProbability == null || separatorProbability === 0)
                return;
            // console.log(
            // i, contextSequence.model.detokenize([documentTokens[i]!]),
            // Array.from(nextProbabilities.entries()).slice(0, 5)
            //     .map(([token, probability]) => [contextSequence.model.detokenize([token], true), probability])
            // );
            if (separatorProbability >= topProbabilityScore)
                pushSeparatorIndex(i + 1, usedSeparatorToken);
            else if (i > skipFirstTokens) {
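                // The separator is not the top prediction, so boost it using `weight` - a running
                // product of the probabilities of recently seen document tokens:
                // adjustedProbability = separatorProbability + weight * (1 - separatorProbability)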
                const adjustedProbability = separatorProbability + (weight * (1 - separatorProbability));
                let maxChunkSizeAlignment = 0;
                if (maxChunkSize !== 0 && adjustedProbability < topProbabilityScore) {
                    const leftProbability = 1 - adjustedProbability;
                    const currentChunkSize = Math.max(0, 1 + i - chunkStartIndex);
                    maxChunkSizeAlignment = currentChunkSize === 0
                        ? 0
                        : adjustExponential(leftProbability * Math.min(1, currentChunkSize / maxChunkSize), maxChunkSizeAlignmentCurve <= 0
                            ? 1
                            : maxChunkSizeAlignmentCurve, 0.8);
                    if (currentChunkSize === maxChunkSize)
                        maxChunkSizeAlignment = 1;
                }
                if (adjustedProbability + maxChunkSizeAlignment >= topProbabilityScore && adjustedProbability > 0) {
                    pushSeparatorIndex(i + 1, usedSeparatorToken);
                    // update the weight of the current token with the adjusted probability in the trail
                    if (recentProbabilitiesTrail.length > 1) {
                        weight /= recentProbabilitiesTrail.pop();
                        recentProbabilitiesTrail.push(adjustedProbability);
                        weight *= adjustedProbability;
                    }
                }
            }
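            // Maintain the normalization trail: multiply `weight` by the probability of the actual next
            // document token, and once the trail exceeds `normalizationTrailSize` entries, divide the
            // oldest probability back out.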
            const nextDocumentTokenProbability = nextDocumentToken == null
                ? undefined
                : nextProbabilities.get(nextDocumentToken);
            if (nextDocumentTokenProbability != null && nextDocumentTokenProbability > 0) {
                recentProbabilitiesTrail.push(nextDocumentTokenProbability);
                weight *= nextDocumentTokenProbability;
                if (recentProbabilitiesTrail.length > normalizationTrailSize)
                    weight /= recentProbabilitiesTrail.shift();
            }
        }
    });
    if (lastPushedSeparatorIndex !== documentTokens.length)
        pushSeparatorIndex(documentTokens.length, separatorTokens[0]);
    return res;
}
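/*
 * Rough usage sketch (not part of the library output; shown only for illustration). It assumes
 * `experimentalChunkDocument` is imported from this file - the import path and the model path below
 * are hypothetical placeholders:
 *
 *     import { getLlama } from "node-llama-cpp";
 *     import { experimentalChunkDocument } from "node-llama-cpp/dist/evaluator/utils/chunkDocument.js";
 *
 *     const llama = await getLlama();
 *     const model = await llama.loadModel({ modelPath: "path/to/model.gguf" });
 *     const context = await model.createContext();
 *
 *     const chunks = await experimentalChunkDocument({
 *         contextSequence: context.getSequence(),
 *         document: "A long document to chunk...",
 *         maxChunkSize: 500,
 *         onChunkText(text) {
 *             console.log("chunk:", text);
 *         }
 *     });
 *     // `chunks` is an array of chunk texts that can then be embedded for RAG retrieval.
 */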
const idealTokenTexts = [
    "\u6bb5", // means "section" in Chinese (according to https://github.com/ZeroEntropy-AI/llama-chunk)
    "\u987f", // means "pause" in Chinese (according to Llama 3.1 8B and Qwen 2.5 3B)
    "\u00a1", // inverted exclamation mark
    "|",
    "_"
];
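// Scans the model vocabulary for tokens whose trimmed text matches one of the preferred separator texts
// above, preferring tokens that match exactly (without surrounding whitespace), and returns up to
// `maxTokens` of them in the priority order of `idealTokenTexts`.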
function findAppropriateSeparatorTokens(model, maxTokens = 2) {
    const idealTextsSet = new Set(idealTokenTexts);
    const foundTokens = [];
    for (const token of model.iterateAllTokens()) {
        if (model.isSpecialToken(token))
            continue;
        const text = model.detokenize([token]);
        const trimmedText = text.trim();
        if (idealTextsSet.has(trimmedText)) {
            const textIndex = idealTokenTexts.findIndex((idealText) => idealText === trimmedText);
            if (foundTokens[textIndex] == null || text === trimmedText)
                foundTokens[textIndex] = token;
        }
    }
    const res = [];
    for (let i = 0; i < idealTokenTexts.length; i++) {
        const token = foundTokens[i];
        if (token != null)
            res.push(token);
    }
    return res.slice(0, maxTokens);
}
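// Builds the default system prompt: it instructs the model to repeat the document verbatim while
// inserting the separator token(s) at reasonable split points. At most 2 separator tokens are
// supported - a "big split token" and an optional "small split token".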
function getDefaultPrompt({ separatorTokens, tokenizer, maxChunkSize = 500 }) {
    if (separatorTokens.length === 0)
        throw new Error("No separator tokens provided");
    else if (separatorTokens.length > 2)
        throw new Error("Maximum of 2 separator tokens are supported");
    return LlamaText.joinValues("\n", [
        'Your job is to act as a "Chunker", for usage in RAG pipelines. The user will provide a long document.',
        "",
        "You should repeat the exact same message verbatim. EXCEPT, you should insert split tokens throughout the document.",
        "",
        "# Instructions",
        LlamaText([
            "- For splits, use `",
            new SpecialTokensText(tokenizer.detokenize([separatorTokens[0]])),
            '` as the "big split token" separator.'
        ]),
        separatorTokens.length > 1 && (LlamaText([
            "- For small splits, use `",
            new SpecialTokensText(tokenizer.detokenize([separatorTokens[1]])),
            '` as the "small split token" separator.'
        ])),
        "- For example, in a text document, small splits will be per-sentence, and big splits will be per-section. Do a big split BEFORE the header that defines a section.",
        LlamaText([
            "- You may get a user message that is unstructured or not structured cleanly. " +
                "Still try to split that input as best as you can, even if it means doing a small split every ", Math.ceil(maxChunkSize / 5),
            " characters, and a big split every ", Math.floor(maxChunkSize), " characters."
        ]),
        "- You should prefer to wait until the end of a newline or period to break, instead of breaking one or two tokens before that. If there are no newlines or periods, pick some other reasonable breakpoints instead.",
        "- Your input could be anything - code, HTML, markdown, etc. You MUST try to output SOME split regardless of the input. Pick something reasonable! E.g. for nodejs, do a small split after every line or code block, and a big split after every function or class definition.",
        '- For HTML, add a small split token after every closing tag and sentence. Add a big split token after every closing tag of an "important" tag.',
        "- Please note that you will sometimes not see your own splits in your previous output; that's OK, you MUST continue to try to output split tokens."
    ].filter((x) => x !== false));
}
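// Blends a linear and an exponential response for inputs clamped to [0, 1]:
// returns value * (1 - weight) + weight * value^exponent. Used above to shape how strongly the
// current chunk size pushes toward a forced split as it approaches `maxChunkSize`.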
function adjustExponential(value, exponent, weight) {
    if (value < 0)
        return 0;
    else if (value > 1)
        return 1;
    return (value * (1 - weight)) + (weight * Math.pow(value, exponent));
}
//# sourceMappingURL=chunkDocument.js.map
1
node_modules/node-llama-cpp/dist/evaluator/utils/chunkDocument.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long