First upload version 0.0.1
This commit is contained in:
245
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.d.ts
generated
vendored
Normal file
245
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.d.ts
generated
vendored
Normal file
@@ -0,0 +1,245 @@
|
||||
import { EventRelay } from "lifecycle-utils";
|
||||
import { Token } from "../../types.js";
|
||||
import { TokenMeter } from "../TokenMeter.js";
|
||||
import { LlamaModel } from "../LlamaModel/LlamaModel.js";
|
||||
import { ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem, EvaluationPriority, SequenceEvaluateMetadataOptions, SequenceEvaluateOptions, SequenceEvaluateOutput } from "./types.js";
|
||||
import { TokenPredictor } from "./TokenPredictor.js";
|
||||
export declare class LlamaContext {
|
||||
readonly onDispose: EventRelay<void>;
|
||||
private constructor();
|
||||
dispose(): Promise<void>;
|
||||
/** @hidden */
|
||||
[Symbol.asyncDispose](): Promise<void>;
|
||||
get disposed(): boolean;
|
||||
get model(): LlamaModel;
|
||||
get contextSize(): number;
|
||||
get batchSize(): number;
|
||||
get flashAttention(): boolean;
|
||||
/**
|
||||
* The actual size of the state in the memory in bytes.
|
||||
* This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
|
||||
*/
|
||||
get stateSize(): number;
|
||||
/** The number of threads currently used to evaluate tokens */
|
||||
get currentThreads(): number;
|
||||
/**
|
||||
* The number of threads that are preferred to be used to evaluate tokens.
|
||||
*
|
||||
* The actual number of threads used may be lower when other evaluations are running in parallel.
|
||||
*/
|
||||
get idealThreads(): number;
|
||||
getAllocatedContextSize(): number;
|
||||
get totalSequences(): number;
|
||||
get sequencesLeft(): number;
|
||||
/**
|
||||
* Before calling this method, make sure to call `sequencesLeft` to check if there are any sequences left.
|
||||
* When there are no sequences left, this method will throw an error.
|
||||
*/
|
||||
getSequence(options?: {
|
||||
contextShift?: ContextShiftOptions;
|
||||
/**
|
||||
* Token predictor to use for the sequence.
|
||||
* Don't share the same token predictor between multiple sequences.
|
||||
*
|
||||
* Using a token predictor doesn't affect the generation output itself -
|
||||
* it only allows for greater parallelization of the token evaluation to speed up the generation.
|
||||
*
|
||||
* > **Note:** that if a token predictor is too resource intensive,
|
||||
* > it can slow down the generation process due to the overhead of running the predictor.
|
||||
* >
|
||||
* > Testing the effectiveness of a token predictor on the target machine is recommended before using it in production.
|
||||
*
|
||||
* Automatically disposed when disposing the sequence.
|
||||
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction)
|
||||
*/
|
||||
tokenPredictor?: TokenPredictor;
|
||||
}): LlamaContextSequence;
|
||||
dispatchPendingBatch(): void;
|
||||
/**
|
||||
* Print the timings of token evaluation since that last print for this context.
|
||||
*
|
||||
* Requires the `performanceTracking` option to be enabled.
|
||||
*
|
||||
* > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
|
||||
* it won't print anything.
|
||||
*/
|
||||
printTimings(): Promise<void>;
|
||||
}
|
||||
export declare class LlamaContextSequence {
|
||||
readonly onDispose: EventRelay<void>;
|
||||
private constructor();
|
||||
dispose(): void;
|
||||
/** @hidden */
|
||||
[Symbol.dispose](): void;
|
||||
get disposed(): boolean;
|
||||
get context(): LlamaContext;
|
||||
get model(): LlamaModel;
|
||||
/** The maximum number of tokens that the sequence state can hold */
|
||||
get contextSize(): number;
|
||||
/** The index where the next evaluated token will be placed in the context */
|
||||
get nextTokenIndex(): number;
|
||||
/** The current context state tokens */
|
||||
get contextTokens(): Token[];
|
||||
get tokenMeter(): TokenMeter;
|
||||
/**
|
||||
* The token predictor used when creating this sequence.
|
||||
*/
|
||||
get tokenPredictor(): TokenPredictor | undefined;
|
||||
/**
|
||||
* Get the index of the first token in the KV cache.
|
||||
*
|
||||
* If you remove any tokens from the state that come before this index,
|
||||
* no cached prefix tokens evaluation state will be used for the next evaluation.
|
||||
*
|
||||
* For example, if `stateCellsStartIndex` is `10` and you remove the range `{start: 11, end: 16}`
|
||||
* then the cached state for range `0-10` will be used in the next evaluation,
|
||||
* but if you remove the range `{start: 10, end: 16}` (or `{start: 9, end: 16}`) then the cached state will not be used at all
|
||||
* and will be re-evaluated in the next evaluation.
|
||||
*
|
||||
* This index can be greater than `0` only when SWA (Sliding Window Attention) is used (only on supported models).
|
||||
*
|
||||
* When SWA is used, this index will usually be `Math.max(-1, .nextTokenIndex - .model.fileInsights.swaSize)` or larger.
|
||||
*
|
||||
* When the KV cache is empty, this index will be `-1`.
|
||||
*
|
||||
* You can disable SWA by setting the `swaFullCache` option to `true` when creating a context.
|
||||
*/
|
||||
get stateCellsStartIndex(): number;
|
||||
/**
|
||||
* Statistics of token predictions using the sequence's `tokenPredictor`.
|
||||
*
|
||||
* The statistics change only when token prediction is used in this sequence.
|
||||
*
|
||||
* `validated` + `refuted` = total number of evaluated predictions.
|
||||
*
|
||||
* Prefer using `validated` and `refuted` to evaluate the effectiveness of token prediction.
|
||||
*/
|
||||
get tokenPredictions(): {
|
||||
/** Number of token predictions that were actually used (tokens that were validated and then consumed) */
|
||||
used: number;
|
||||
/** Number of token predictions that were not used (tokens that were validated and were not consumed) */
|
||||
unused: number;
|
||||
/** Number of token predictions that were validated successfully */
|
||||
validated: number;
|
||||
/** Number of token predictions that were refuted */
|
||||
refuted: number;
|
||||
};
|
||||
get isLoadedToMemory(): boolean;
|
||||
compareContextTokens(tokens: Token[]): {
|
||||
firstDifferentIndex: number;
|
||||
};
|
||||
/**
|
||||
* Erase parts of the context state to align it with the given tokens.
|
||||
*
|
||||
* If the given tokens do not align with the current context state, the context state will be erased to align with the given tokens.
|
||||
*
|
||||
* To find the first different token index between the context state and the given tokens, access the `nextTokenIndex` property.
|
||||
*
|
||||
* If `allowShift` is `true` (the default), shifting tokens may happen to align the context state with the given tokens,
|
||||
* which incurs token evaluation of the shifted tokens.
|
||||
*/
|
||||
adaptStateToTokens(tokens: Token[], allowShift?: boolean): Promise<void>;
|
||||
/**
|
||||
* Clear the history of the sequence.
|
||||
*/
|
||||
clearHistory(): Promise<void>;
|
||||
/**
|
||||
* Erase context tokens in the provided ranges to free up space for new tokens to be generated.
|
||||
* The start of each range is inclusive, and the end of each range is exclusive.
|
||||
* For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
|
||||
*/
|
||||
eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]): Promise<void>;
|
||||
/**
|
||||
* Evaluate the provided tokens into the context sequence, and continue generating new tokens on iterator iterations.
|
||||
*
|
||||
* This method uses the token predictor (when provided) to generate new tokens faster.
|
||||
*/
|
||||
evaluate(tokens: Token[], options?: SequenceEvaluateOptions): AsyncGenerator<Token, void, void | Token | Token[]>;
|
||||
/**
|
||||
* Like {@link evaluate `.evaluate(...)`}, but with additional metadata for each generated token.
|
||||
*
|
||||
* Configure the additional metadata options to choose which metadata to include.
|
||||
*/
|
||||
evaluateWithMetadata<const Metadata extends SequenceEvaluateMetadataOptions>(tokens: Token[], metadata: Metadata, options?: SequenceEvaluateOptions): AsyncGenerator<SequenceEvaluateOutput<Metadata>, void, void | Token | Token[]>;
|
||||
/**
|
||||
* Evaluate the provided tokens into the context sequence without generating new tokens.
|
||||
*/
|
||||
evaluateWithoutGeneratingNewTokens(tokens: Token[], options?: {
|
||||
/**
|
||||
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
|
||||
* evaluated based on the strategy chosen for the context.
|
||||
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
|
||||
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
|
||||
* highest evaluation priority.
|
||||
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
|
||||
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
|
||||
*/
|
||||
evaluationPriority?: EvaluationPriority;
|
||||
/** Override the sequence context shift options for this evaluation */
|
||||
contextShift?: ContextShiftOptions;
|
||||
}): Promise<void>;
|
||||
/**
|
||||
* Evaluate the provided tokens into the context sequence with custom options for each token.
|
||||
*
|
||||
* This method allows for more precise control of the generation process.
|
||||
*
|
||||
* A next token will be generated for a given token only if any of the `generateNext` options for it are used.
|
||||
*
|
||||
* To generate more tokens after this method finishes,
|
||||
* use it again with token(s) you selected to add to the context from the previous evaluation.
|
||||
*
|
||||
* This method doesn't use the token predictor (when provided) since it cannot predict which tokens are actually needed.
|
||||
* Use the `evaluate` method when you need to use token prediction.
|
||||
* @returns An array where for each token in the input array, there can be an output item at the same index in the output array.
|
||||
* For indexes that have no output, there won't be any value at the corresponding index in the output array.
|
||||
*
|
||||
* It's recommended to iterate from `0` up to the length of the input array to check the results in the output array.
|
||||
*/
|
||||
controlledEvaluate(input: ControlledEvaluateInputItem[], options?: {
|
||||
/**
|
||||
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
|
||||
* evaluated based on the strategy chosen for the context.
|
||||
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
|
||||
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
|
||||
* highest evaluation priority.
|
||||
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
|
||||
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
|
||||
*/
|
||||
evaluationPriority?: EvaluationPriority;
|
||||
/** Override the sequence context shift options for this evaluation */
|
||||
contextShift?: ContextShiftOptions;
|
||||
/** Called on each token result after it's generated */
|
||||
onTokenResult?(inputTokenIndex: number, result: ControlledEvaluateIndexOutput): void;
|
||||
}): Promise<Array<undefined | ControlledEvaluateIndexOutput>>;
|
||||
/**
|
||||
* Save the current context sequence evaluation state to a file.
|
||||
* @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state)
|
||||
*/
|
||||
saveStateToFile(filePath: string): Promise<{
|
||||
fileSize: number;
|
||||
}>;
|
||||
/**
|
||||
* Load a context sequence evaluation state from a file.
|
||||
*
|
||||
* Trying to load a state file with a longer context size than the current sequence's context size will fail and throw an error.
|
||||
*
|
||||
* You must ensure that the file was created from the exact same model, otherwise, using this function may crash the process.
|
||||
* @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state)
|
||||
*/
|
||||
loadStateFromFile(filePath: string, acceptRisk: {
|
||||
/**
|
||||
* Loading a state file created using a different model may crash the process.
|
||||
*
|
||||
* You must accept this risk to use this feature.
|
||||
*/
|
||||
acceptRisk: true;
|
||||
}): Promise<void>;
|
||||
}
|
||||
export declare function getDefaultContextBatchSize({ contextSize, sequences }: {
|
||||
contextSize: number;
|
||||
sequences: number;
|
||||
}): number;
|
||||
export declare function getDefaultContextSequences(): number;
|
||||
export declare function getDefaultModelContextSize({ trainContextSize }: {
|
||||
trainContextSize?: number;
|
||||
}): number;
|
||||
1691
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.js
generated
vendored
Normal file
1691
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.js
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaContext.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.d.ts
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.d.ts
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
export {};
|
||||
31
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.js
generated
vendored
Normal file
31
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.js
generated
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
/** @internal */
|
||||
export class LlamaSampler {
|
||||
/** @internal */ _llama;
|
||||
/** @internal */ _sampler;
|
||||
/** @internal */ disposed = false;
|
||||
constructor(model) {
|
||||
this._llama = model._llama;
|
||||
this._sampler = new this._llama._bindings.AddonSampler(model._model);
|
||||
this.asyncDispose = this.asyncDispose.bind(this);
|
||||
}
|
||||
dispose() {
|
||||
this.disposed = true;
|
||||
this._sampler.dispose();
|
||||
}
|
||||
async asyncDispose() {
|
||||
this.disposed = true;
|
||||
this._sampler.dispose();
|
||||
}
|
||||
applyConfig(config) {
|
||||
return this._sampler.applyConfig(config);
|
||||
}
|
||||
/** @internal */
|
||||
static _canBeNextTokenForGrammarEvaluationState(llama, grammarEvaluationState, token) {
|
||||
return llama._bindings.AddonSampler.canBeNextTokenForGrammarEvaluationState(grammarEvaluationState._state, token);
|
||||
}
|
||||
/** @internal */
|
||||
static _acceptTokenOnGrammarEvaluationState(llama, grammarEvaluationState, token) {
|
||||
llama._bindings.AddonSampler.acceptGrammarEvaluationStateToken(grammarEvaluationState._state, token);
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=LlamaSampler.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/LlamaSampler.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"LlamaSampler.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/LlamaSampler.ts"],"names":[],"mappings":"AAMA,gBAAgB;AAChB,MAAM,OAAO,YAAY;IACrB,gBAAgB,CAAiB,MAAM,CAAQ;IAC/C,gBAAgB,CAAiB,QAAQ,CAAe;IACxD,gBAAgB,CAAQ,QAAQ,GAAY,KAAK,CAAC;IAElD,YAAmB,KAAiB;QAChC,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;QAC3B,IAAI,CAAC,QAAQ,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAErE,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACrD,CAAC;IAEM,OAAO;QACV,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;QACrB,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;IAC5B,CAAC;IAEM,KAAK,CAAC,YAAY;QACrB,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;QACrB,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;IAC5B,CAAC;IAEM,WAAW,CAAC,MAAkD;QACjE,OAAO,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;IAC7C,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,wCAAwC,CAClD,KAAY,EACZ,sBAAmD,EACnD,KAAY;QAEZ,OAAO,KAAK,CAAC,SAAS,CAAC,YAAY,CAAC,uCAAuC,CACvE,sBAAsB,CAAC,MAAM,EAC7B,KAAK,CACR,CAAC;IACN,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,oCAAoC,CAC9C,KAAY,EACZ,sBAAmD,EACnD,KAAY;QAEZ,KAAK,CAAC,SAAS,CAAC,YAAY,CAAC,iCAAiC,CAAC,sBAAsB,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;IACzG,CAAC;CACJ"}
|
||||
55
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.d.ts
generated
vendored
Normal file
55
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.d.ts
generated
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
import { Token } from "../../types.js";
|
||||
import { SequenceEvaluateOptions } from "./types.js";
|
||||
import { LlamaContextSequence } from "./LlamaContext.js";
|
||||
/**
|
||||
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction#custom)
|
||||
*/
|
||||
export declare abstract class TokenPredictor {
|
||||
/**
|
||||
* Resets the state of the predictor.
|
||||
*
|
||||
* Called before the generation starts.
|
||||
*/
|
||||
abstract reset(params: {
|
||||
/** The target sequence that this token predictor is generating tokens for */
|
||||
targetSequence: LlamaContextSequence;
|
||||
/**
|
||||
* The tokens that are or will be loaded into the state.
|
||||
*
|
||||
* The initial predictions should be based on these tokens.
|
||||
*
|
||||
* When additional tokens are pushed into the state, the `pushTokens` method will be called with those tokens.
|
||||
*/
|
||||
stateTokens: Token[];
|
||||
/**
|
||||
* Options used for the evaluation on the target sequence.
|
||||
*
|
||||
* The `grammarEvaluationState` is cloned before being passed to the token predictor,
|
||||
* so it can be modified without affecting the original state.
|
||||
*/
|
||||
evaluateOptions: Readonly<SequenceEvaluateOptions>;
|
||||
}): Promise<void> | void;
|
||||
abstract pushTokens(tokens: Token[]): void;
|
||||
/**
|
||||
* Predicts the next tokens based on the current state.
|
||||
*
|
||||
* If the generation should wait until the minimum predications are ready,
|
||||
* this method should return a promise that resolves when the minimum predictions are ready.
|
||||
*
|
||||
* A background prediction process can be started when this function is called,
|
||||
* so that the next predictions will be ready when this function is called again.
|
||||
*/
|
||||
abstract predictTokens(): Promise<Token[]> | Token[];
|
||||
/**
|
||||
* Stops the prediction process when it runs in the background.
|
||||
* @param untilPredictionsExhausted - If true, the prediction process should not resume until the current predictions are exhausted.
|
||||
*/
|
||||
stop(untilPredictionsExhausted?: boolean): Promise<void> | void;
|
||||
/**
|
||||
* Called with the input tokens before the generation starts when using `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
|
||||
*/
|
||||
updateInputTokens(tokens: Token[]): void;
|
||||
dispose(): Promise<void> | void;
|
||||
/** @hidden */
|
||||
[Symbol.dispose](): void | Promise<void>;
|
||||
}
|
||||
20
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.js
generated
vendored
Normal file
20
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.js
generated
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
/**
|
||||
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction#custom)
|
||||
*/
|
||||
export class TokenPredictor {
|
||||
/**
|
||||
* Stops the prediction process when it runs in the background.
|
||||
* @param untilPredictionsExhausted - If true, the prediction process should not resume until the current predictions are exhausted.
|
||||
*/
|
||||
stop(untilPredictionsExhausted) { }
|
||||
/**
|
||||
* Called with the input tokens before the generation starts when using `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
|
||||
*/
|
||||
updateInputTokens(tokens) { }
|
||||
dispose() { }
|
||||
/** @hidden */
|
||||
[Symbol.dispose]() {
|
||||
return this.dispose();
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=TokenPredictor.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/TokenPredictor.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"TokenPredictor.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/TokenPredictor.ts"],"names":[],"mappings":"AAIA;;GAEG;AACH,MAAM,OAAgB,cAAc;IAwChC;;;OAGG;IACI,IAAI,CAAC,yBAAmC,IAAyB,CAAC;IAEzE;;OAEG;IACI,iBAAiB,CAAC,MAAe,IAAS,CAAC;IAE3C,OAAO,KAA0B,CAAC;IAEzC,cAAc;IACP,CAAC,MAAM,CAAC,OAAO,CAAC;QACnB,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;IAC1B,CAAC;CACJ"}
|
||||
56
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts
generated
vendored
Normal file
56
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts
generated
vendored
Normal file
@@ -0,0 +1,56 @@
|
||||
import { Token } from "../../../types.js";
|
||||
import { SequenceEvaluateOptions } from "../types.js";
|
||||
import { LlamaContextSequence } from "../LlamaContext.js";
|
||||
import { TokenPredictor } from "../TokenPredictor.js";
|
||||
/**
|
||||
* Predicts the next tokens by evaluating the current state of the target sequence
|
||||
* on a draft sequence from a smaller and faster draft model.
|
||||
* @see [Using Token Predictors: Draft Model Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#draft-model)
|
||||
*/
|
||||
export declare class DraftSequenceTokenPredictor extends TokenPredictor {
|
||||
constructor(draftSequence: LlamaContextSequence, options?: {
|
||||
/**
|
||||
* The minimum number of tokens to draft.
|
||||
*
|
||||
* Defaults to `0`.
|
||||
*/
|
||||
minTokens?: number;
|
||||
/**
|
||||
* Maximum number of tokens to draft.
|
||||
*
|
||||
* Defaults to `16`.
|
||||
*/
|
||||
maxTokens?: number;
|
||||
/**
|
||||
* Evaluate options default to the values of the target sequence.
|
||||
*
|
||||
* You can override any of the options for the prediction here.
|
||||
*/
|
||||
evaluateOptions?: Pick<SequenceEvaluateOptions, "temperature" | "minP" | "topK" | "topP" | "seed" | "repeatPenalty" | "tokenBias" | "evaluationPriority" | "contextShift">;
|
||||
/**
|
||||
* Minimum token confidence (probability of the token to be generated, assigned by the model) to consider the token as a prediction.
|
||||
* When the generated token confidence is lower than this value, the prediction process will stop until all the predicted tokens
|
||||
* are exhausted (either by a token that was not predicted being pushed, or all the generated predictions are consumed).
|
||||
*
|
||||
* A number between `0` and `1` representing the minimum probability of the token to be generated.
|
||||
*
|
||||
* Set to `0` to disable.
|
||||
*
|
||||
* Defaults to `0.6`.
|
||||
*/
|
||||
minConfidence?: number;
|
||||
});
|
||||
get draftSequence(): LlamaContextSequence;
|
||||
get minTokens(): number;
|
||||
get maxTokens(): number;
|
||||
get minConfidence(): number | undefined;
|
||||
reset({ targetSequence, stateTokens, evaluateOptions }: {
|
||||
targetSequence: LlamaContextSequence;
|
||||
stateTokens: Token[];
|
||||
evaluateOptions: Readonly<SequenceEvaluateOptions>;
|
||||
}): Promise<void>;
|
||||
pushTokens(tokens: Token[]): void;
|
||||
predictTokens(): Token[] | Promise<Token[]>;
|
||||
stop(untilPredictionsExhausted?: boolean): void;
|
||||
dispose(): void;
|
||||
}
|
||||
266
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js
generated
vendored
Normal file
266
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js
generated
vendored
Normal file
@@ -0,0 +1,266 @@
|
||||
import { withLock } from "lifecycle-utils";
|
||||
import { pushAll } from "../../../utils/pushAll.js";
|
||||
import { getConsoleLogPrefix } from "../../../utils/getConsoleLogPrefix.js";
|
||||
import { LlamaSampler } from "../LlamaSampler.js";
|
||||
import { TokenPredictor } from "../TokenPredictor.js";
|
||||
const defaultPredictionMinTokens = 0;
|
||||
const defaultPredictionMaxTokens = 16;
|
||||
const defaultPredictionMinConfidence = 0.6;
|
||||
/**
|
||||
* Predicts the next tokens by evaluating the current state of the target sequence
|
||||
* on a draft sequence from a smaller and faster draft model.
|
||||
* @see [Using Token Predictors: Draft Model Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#draft-model)
|
||||
*/
|
||||
export class DraftSequenceTokenPredictor extends TokenPredictor {
|
||||
/** @internal */ _draftSequence;
|
||||
/** @internal */ _minTokens;
|
||||
/** @internal */ _maxTokens;
|
||||
/** @internal */ _minConfidence;
|
||||
/** @internal */ _stateTokens = [];
|
||||
/** @internal */ _pendingEvalTokens = [];
|
||||
/** @internal */ _predictedTokens = [];
|
||||
/** @internal */ _evaluateOptions = {};
|
||||
/** @internal */ _overrideEvaluateOptions = {};
|
||||
/** @internal */ _grammarEvaluationStateOption;
|
||||
/** @internal */ _currentEvaluationAbortController = new AbortController();
|
||||
/** @internal */ _resetAbortController = new AbortController();
|
||||
/** @internal */ _stopped = true;
|
||||
/** @internal */ _waitForPredictionExhaustion = false;
|
||||
/** @internal */ _minTokensCallbacks = [];
|
||||
/** @internal */ _resetPredictions = false;
|
||||
/** @internal */ _iterator;
|
||||
/** @internal */ _active = false;
|
||||
/** @internal */ _disposed = false;
|
||||
constructor(draftSequence, options = {}) {
|
||||
super();
|
||||
this._draftSequence = draftSequence;
|
||||
this._minTokens = Math.floor(Math.max(0, options?.minTokens ?? defaultPredictionMinTokens));
|
||||
this._maxTokens = Math.floor(Math.max(this._minTokens, options?.maxTokens ?? defaultPredictionMaxTokens));
|
||||
this._overrideEvaluateOptions = options.evaluateOptions ?? {};
|
||||
this._minConfidence = Math.min(1, Math.max(0, options?.minConfidence ?? defaultPredictionMinConfidence));
|
||||
if (draftSequence.disposed)
|
||||
throw new Error("The draft sequence is disposed");
|
||||
}
|
||||
get draftSequence() {
|
||||
return this._draftSequence;
|
||||
}
|
||||
get minTokens() {
|
||||
return this._minTokens;
|
||||
}
|
||||
get maxTokens() {
|
||||
return this._maxTokens;
|
||||
}
|
||||
get minConfidence() {
|
||||
return this._minConfidence;
|
||||
}
|
||||
async reset({ targetSequence, stateTokens, evaluateOptions }) {
|
||||
this._currentEvaluationAbortController.abort();
|
||||
this._resetAbortController.abort();
|
||||
this._currentEvaluationAbortController = new AbortController();
|
||||
this._resetAbortController = new AbortController();
|
||||
this._stopped = true;
|
||||
this._waitForPredictionExhaustion = false;
|
||||
this._iterator?.return();
|
||||
this._iterator = undefined;
|
||||
const currentAbortSignal = this._resetAbortController.signal;
|
||||
targetSequence.context._ctx.ensureDraftContextIsCompatibleForSpeculative(this._draftSequence.context._ctx);
|
||||
try {
|
||||
await withLock([this, "evaluate"], currentAbortSignal, async () => {
|
||||
this._stateTokens = stateTokens.slice();
|
||||
this._pendingEvalTokens = [];
|
||||
this._predictedTokens = [];
|
||||
this._resetPredictions = false;
|
||||
while (this._minTokensCallbacks.length > 0)
|
||||
this._minTokensCallbacks.shift()?.();
|
||||
const lastToken = this._stateTokens.pop();
|
||||
if (lastToken != null)
|
||||
this._pendingEvalTokens.push(lastToken);
|
||||
this._evaluateOptions = evaluateOptions;
|
||||
this._grammarEvaluationStateOption = this._evaluateOptions.grammarEvaluationState instanceof Function
|
||||
? this._evaluateOptions.grammarEvaluationState()?.clone()
|
||||
: this._evaluateOptions.grammarEvaluationState?.clone();
|
||||
const newStateTokens = this._stateTokens.slice(-this._draftSequence.context.contextSize + 1);
|
||||
await this._draftSequence.adaptStateToTokens(newStateTokens, true);
|
||||
newStateTokens.splice(0, this._draftSequence.nextTokenIndex);
|
||||
await this._draftSequence.evaluateWithoutGeneratingNewTokens(newStateTokens, {
|
||||
contextShift: this._evaluateOptions.contextShift,
|
||||
evaluationPriority: this._evaluateOptions.evaluationPriority
|
||||
});
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (err !== currentAbortSignal.reason)
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
pushTokens(tokens) {
|
||||
const grammarEvaluationStateOption = this._evaluateOptions.grammarEvaluationState instanceof Function
|
||||
? this._evaluateOptions.grammarEvaluationState()?.clone()
|
||||
: this._evaluateOptions.grammarEvaluationState?.clone();
|
||||
void withLock([this, "pushTokens"], async () => {
|
||||
this._grammarEvaluationStateOption = grammarEvaluationStateOption;
|
||||
const tokensToPush = tokens.slice();
|
||||
while (!this._resetPredictions && tokensToPush.length > 0) {
|
||||
const token = tokensToPush.shift();
|
||||
if (this._predictedTokens.length > 0 && this._predictedTokens[0] === token) {
|
||||
this._predictedTokens.shift();
|
||||
}
|
||||
else {
|
||||
tokensToPush.unshift(token);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (tokensToPush.length === 0) {
|
||||
if (!this._waitForPredictionExhaustion || this._predictedTokens.length === 0)
|
||||
this._resume();
|
||||
return;
|
||||
}
|
||||
this._currentEvaluationAbortController.abort();
|
||||
this._currentEvaluationAbortController = new AbortController();
|
||||
pushAll(this._pendingEvalTokens, tokensToPush);
|
||||
this._resetPredictions = true;
|
||||
this._resume();
|
||||
});
|
||||
}
|
||||
predictTokens() {
|
||||
if (this._stopped && this._pendingEvalTokens.length === 0 && !this._resetPredictions)
|
||||
return this._predictedTokens;
|
||||
this._stopped = false;
|
||||
if (!this._waitForPredictionExhaustion || this._predictedTokens.length === 0) {
|
||||
this._waitForPredictionExhaustion = false;
|
||||
this._resume();
|
||||
}
|
||||
if (this._predictedTokens.length >= this._minTokens && !this._resetPredictions)
|
||||
return this._predictedTokens;
|
||||
if (!this._active || (this._waitForPredictionExhaustion && this._predictedTokens.length > 0)) {
|
||||
if (this._resetPredictions)
|
||||
return [];
|
||||
return this._predictedTokens;
|
||||
}
|
||||
return new Promise((accept) => void this._minTokensCallbacks.push(accept))
|
||||
.then(() => {
|
||||
if (this._resetPredictions)
|
||||
return [];
|
||||
return this._predictedTokens;
|
||||
});
|
||||
}
|
||||
stop(untilPredictionsExhausted = false) {
|
||||
this._stopped = true;
|
||||
this._currentEvaluationAbortController.abort();
|
||||
this._currentEvaluationAbortController = new AbortController();
|
||||
if (untilPredictionsExhausted)
|
||||
this._waitForPredictionExhaustion = true;
|
||||
void withLock([this, "evaluate"], async () => {
|
||||
this._iterator?.return();
|
||||
this._iterator = undefined;
|
||||
});
|
||||
}
|
||||
dispose() {
|
||||
this._disposed = true;
|
||||
this._stopped = true;
|
||||
this._resetAbortController.abort();
|
||||
this._currentEvaluationAbortController.abort();
|
||||
void withLock([this, "evaluate"], async () => {
|
||||
this._iterator?.return();
|
||||
this._iterator = undefined;
|
||||
});
|
||||
}
|
||||
/** @internal */
|
||||
_canIterate() {
|
||||
return !this._disposed && !this._stopped && (this._predictedTokens.length < this._maxTokens || this._resetPredictions);
|
||||
}
|
||||
/** @internal */
|
||||
_resume() {
|
||||
if (this._active || !this._canIterate())
|
||||
return;
|
||||
this._active = true;
|
||||
void withLock([this, "evaluate"], async () => {
|
||||
try {
|
||||
const abortSignal = this._currentEvaluationAbortController.signal;
|
||||
if (!this._canIterate() || abortSignal.aborted)
|
||||
return;
|
||||
const resetPredications = async () => {
|
||||
this._iterator?.return();
|
||||
this._iterator = undefined;
|
||||
this._waitForPredictionExhaustion = false;
|
||||
this._resetPredictions = false;
|
||||
const tokenToDelete = Math.max(0, Math.min(this._predictedTokens.length - 1, this._draftSequence.context.contextSize));
|
||||
this._predictedTokens = [];
|
||||
await this._draftSequence.eraseContextTokenRanges([{
|
||||
start: this._draftSequence.nextTokenIndex - tokenToDelete,
|
||||
end: this._draftSequence.nextTokenIndex
|
||||
}]);
|
||||
};
|
||||
const createIterator = () => {
|
||||
const tokens = this._pendingEvalTokens;
|
||||
this._pendingEvalTokens = [];
|
||||
return this.draftSequence.evaluateWithMetadata(tokens, { confidence: true }, {
|
||||
...this._evaluateOptions,
|
||||
...this._overrideEvaluateOptions,
|
||||
grammarEvaluationState: this._getGrammarEvaluationStateWithTokens(tokens)
|
||||
});
|
||||
};
|
||||
if (this._resetPredictions)
|
||||
await resetPredications();
|
||||
if (!this._canIterate() || abortSignal.aborted)
|
||||
return;
|
||||
let iterator = createIterator();
|
||||
this._iterator = iterator;
|
||||
while (this._canIterate() && !abortSignal.aborted) {
|
||||
const { value, done } = await iterator.next();
|
||||
let shouldBreak = done;
|
||||
if (value != null) {
|
||||
const { token, confidence } = value;
|
||||
if (this._minConfidence != null && this._minConfidence !== 0 && this._minConfidence !== 1 &&
|
||||
confidence < this._minConfidence) {
|
||||
this._iterator = undefined;
|
||||
await iterator.return();
|
||||
this._waitForPredictionExhaustion = true;
|
||||
shouldBreak = true;
|
||||
}
|
||||
else
|
||||
this._predictedTokens.push(token);
|
||||
}
|
||||
if (this._resetPredictions && !abortSignal.aborted) {
|
||||
await resetPredications();
|
||||
iterator = createIterator();
|
||||
this._iterator = iterator;
|
||||
continue;
|
||||
}
|
||||
if (this._predictedTokens.length >= this._minTokens) {
|
||||
while (this._minTokensCallbacks.length > 0)
|
||||
this._minTokensCallbacks.shift()?.();
|
||||
}
|
||||
if (shouldBreak) {
|
||||
this._iterator = undefined;
|
||||
await iterator.return();
|
||||
this._waitForPredictionExhaustion = true;
|
||||
while (this._minTokensCallbacks.length > 0)
|
||||
this._minTokensCallbacks.shift()?.();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
finally {
|
||||
this._active = false;
|
||||
}
|
||||
});
|
||||
}
|
||||
/** @internal */
|
||||
_getGrammarEvaluationStateWithTokens(tokens) {
|
||||
if (this._grammarEvaluationStateOption == null)
|
||||
return undefined;
|
||||
const clone = this._grammarEvaluationStateOption.clone();
|
||||
for (const token of tokens) {
|
||||
const canAddToken = LlamaSampler._canBeNextTokenForGrammarEvaluationState(this._draftSequence.model._llama, clone, token);
|
||||
if (!canAddToken) {
|
||||
console.warn(getConsoleLogPrefix(false, false), "The pushed tokens are incompatible with the grammar evaluation state. The grammar will be ignored.");
|
||||
this._grammarEvaluationStateOption = undefined;
|
||||
return undefined;
|
||||
}
|
||||
LlamaSampler._acceptTokenOnGrammarEvaluationState(this._draftSequence.model._llama, clone, token);
|
||||
}
|
||||
return clone;
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=DraftSequenceTokenPredictor.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
58
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.d.ts
generated
vendored
Normal file
58
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.d.ts
generated
vendored
Normal file
@@ -0,0 +1,58 @@
|
||||
import { Token } from "../../../types.js";
|
||||
import { TokenPredictor } from "../TokenPredictor.js";
|
||||
/**
|
||||
* Attempts to find the last few generated tokens in the input (prompt) tokens to predict the next tokens.
|
||||
*
|
||||
* This is useful in input-grounded tasks (when the model frequently repeats some of the input tokens in the output,
|
||||
* such as in text summarization or modifying code).
|
||||
*
|
||||
* This works in all completion classes, including `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
|
||||
*
|
||||
* Based on https://github.com/apoorvumang/prompt-lookup-decoding.
|
||||
* @see [Using Token Predictors: Input Lookup Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#input-lookup)
|
||||
*/
|
||||
export declare class InputLookupTokenPredictor extends TokenPredictor {
|
||||
constructor(options?: {
|
||||
patternLength?: {
|
||||
/**
|
||||
* Min pattern length to look for in the input tokens.
|
||||
*
|
||||
* Defaults to `1`.
|
||||
*/
|
||||
min?: number;
|
||||
/**
|
||||
* Max pattern length to look for in the input tokens.
|
||||
*
|
||||
* Set to `0` to disable the max pattern size.
|
||||
*
|
||||
* Defaults to `0`.
|
||||
*/
|
||||
max?: number;
|
||||
};
|
||||
predictionLength?: {
|
||||
/**
|
||||
* Minimum number of tokens to predict.
|
||||
*
|
||||
* Defaults to `1`.
|
||||
*/
|
||||
min?: number;
|
||||
/**
|
||||
* Maximum number of tokens to predict.
|
||||
*
|
||||
* Defaults to `3`.
|
||||
*/
|
||||
max?: number;
|
||||
};
|
||||
});
|
||||
get patternMinLength(): number;
|
||||
get patternMaxLength(): number;
|
||||
get predictionMinLength(): number;
|
||||
get predictionMaxLength(): number;
|
||||
reset({ stateTokens }: {
|
||||
stateTokens: Token[];
|
||||
}): void;
|
||||
updateInputTokens(tokens: Token[]): void;
|
||||
pushTokens(tokens: Token[]): void;
|
||||
predictTokens(): Token[];
|
||||
dispose(): void;
|
||||
}
|
||||
138
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js
generated
vendored
Normal file
138
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js
generated
vendored
Normal file
@@ -0,0 +1,138 @@
|
||||
import { DisposedError } from "lifecycle-utils";
|
||||
import { pushAll } from "../../../utils/pushAll.js";
|
||||
import { TokenPredictor } from "../TokenPredictor.js";
|
||||
const defaultPatternMinLength = 1;
|
||||
const defaultPatternMaxLength = 0;
|
||||
const defaultPredictionMinLength = 1;
|
||||
const defaultPredictionMaxLength = 3;
|
||||
/**
|
||||
* Attempts to find the last few generated tokens in the input (prompt) tokens to predict the next tokens.
|
||||
*
|
||||
* This is useful in input-grounded tasks (when the model frequently repeats some of the input tokens in the output,
|
||||
* such as in text summarization or modifying code).
|
||||
*
|
||||
* This works in all completion classes, including `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
|
||||
*
|
||||
* Based on https://github.com/apoorvumang/prompt-lookup-decoding.
|
||||
* @see [Using Token Predictors: Input Lookup Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#input-lookup)
|
||||
*/
|
||||
export class InputLookupTokenPredictor extends TokenPredictor {
|
||||
/** @internal */ _patternMinLength;
|
||||
/** @internal */ _patternMaxLength;
|
||||
/** @internal */ _predictionMinLength;
|
||||
/** @internal */ _predictionMaxLength;
|
||||
/** @internal */ _lastPredictionMatchStartIndex = undefined;
|
||||
/** @internal */ _lastPredictionMatchLength = undefined;
|
||||
/** @internal */ _stateTokens = [];
|
||||
/** @internal */ _inputTokens = [];
|
||||
/** @internal */ _disposed = false;
|
||||
constructor(options = {}) {
|
||||
super();
|
||||
this._patternMinLength = Math.floor(Math.max(1, options?.patternLength?.min ?? defaultPatternMinLength));
|
||||
this._patternMaxLength = Math.floor(Math.max(0, Math.max(this._patternMinLength, options?.patternLength?.max ?? defaultPatternMaxLength)));
|
||||
this._predictionMinLength = Math.floor(Math.max(1, options.predictionLength?.min ?? defaultPredictionMinLength));
|
||||
this._predictionMaxLength = Math.floor(Math.max(this._patternMinLength, options.predictionLength?.max ?? defaultPredictionMaxLength));
|
||||
}
|
||||
get patternMinLength() {
|
||||
return this._patternMinLength;
|
||||
}
|
||||
get patternMaxLength() {
|
||||
return this._patternMaxLength;
|
||||
}
|
||||
get predictionMinLength() {
|
||||
return this._predictionMinLength;
|
||||
}
|
||||
get predictionMaxLength() {
|
||||
return this._predictionMaxLength;
|
||||
}
|
||||
reset({ stateTokens }) {
|
||||
this._stateTokens = stateTokens.slice();
|
||||
delete this._lastPredictionMatchStartIndex;
|
||||
delete this._lastPredictionMatchLength;
|
||||
}
|
||||
updateInputTokens(tokens) {
|
||||
this._inputTokens = tokens.slice();
|
||||
delete this._lastPredictionMatchStartIndex;
|
||||
delete this._lastPredictionMatchLength;
|
||||
}
|
||||
pushTokens(tokens) {
|
||||
pushAll(this._stateTokens, tokens);
|
||||
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
|
||||
this._lastPredictionMatchLength += tokens.length;
|
||||
}
|
||||
}
|
||||
predictTokens() {
|
||||
if (this._disposed)
|
||||
throw new DisposedError();
|
||||
if (this._inputTokens.length === 0 || this._stateTokens.length === 0)
|
||||
return [];
|
||||
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
|
||||
for (let p = this._lastPredictionMatchStartIndex + this._lastPredictionMatchLength - 1, s = this._stateTokens.length - 1; p >= this._lastPredictionMatchStartIndex && s >= 0; p--, s--) {
|
||||
if (this._inputTokens[p] !== this._stateTokens[s]) {
|
||||
delete this._lastPredictionMatchStartIndex;
|
||||
delete this._lastPredictionMatchLength;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
|
||||
const predictionEndIndex = this._lastPredictionMatchStartIndex + this._lastPredictionMatchLength;
|
||||
if (predictionEndIndex < this._inputTokens.length) {
|
||||
return this._inputTokens.slice(predictionEndIndex, predictionEndIndex + this._predictionMaxLength);
|
||||
}
|
||||
}
|
||||
}
|
||||
const [matchStartIndex, matchLength] = this._findLongestPatternIndex(this._inputTokens, this._stateTokens);
|
||||
if (matchStartIndex == null || matchLength == null)
|
||||
return [];
|
||||
const predictionEndIndex = matchStartIndex + matchLength;
|
||||
const res = this._inputTokens.slice(predictionEndIndex, predictionEndIndex + this._predictionMaxLength);
|
||||
if (res.length >= this._predictionMinLength) {
|
||||
this._lastPredictionMatchStartIndex = matchStartIndex;
|
||||
this._lastPredictionMatchLength = matchLength;
|
||||
return res;
|
||||
}
|
||||
return [];
|
||||
}
|
||||
dispose() {
|
||||
this._disposed = true;
|
||||
this._stateTokens = [];
|
||||
this._inputTokens = [];
|
||||
delete this._lastPredictionMatchStartIndex;
|
||||
delete this._lastPredictionMatchLength;
|
||||
}
|
||||
/** @internal */
|
||||
_findLongestPatternIndex(findIn, lookupPattern) {
|
||||
const checkIndexes = [];
|
||||
let bestIndex = -1;
|
||||
let bestIndexDiff = -1;
|
||||
for (let i = findIn.length - this._predictionMinLength; i >= 0; i--) {
|
||||
const token = findIn[i];
|
||||
for (let j = checkIndexes.length - 1; j >= 0; j--) {
|
||||
const startIndex = checkIndexes[j];
|
||||
const indexDiff = startIndex - i;
|
||||
if (lookupPattern[lookupPattern.length - 1 - indexDiff] !== token || (this._patternMaxLength > 0 && indexDiff >= this._patternMaxLength)) {
|
||||
checkIndexes.splice(j, 1);
|
||||
if (indexDiff >= this._patternMinLength && indexDiff >= bestIndexDiff) {
|
||||
bestIndex = startIndex;
|
||||
bestIndexDiff = indexDiff;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (token === lookupPattern[lookupPattern.length - 1])
|
||||
checkIndexes.unshift(i);
|
||||
}
|
||||
for (let j = checkIndexes.length - 1; j >= 0; j--) {
|
||||
const startIndex = checkIndexes[j];
|
||||
const indexDiff = startIndex + 1;
|
||||
checkIndexes.splice(j, 1);
|
||||
if (indexDiff >= this._patternMinLength && indexDiff >= bestIndexDiff) {
|
||||
bestIndex = startIndex;
|
||||
bestIndexDiff = indexDiff;
|
||||
}
|
||||
}
|
||||
if (bestIndex >= 0)
|
||||
return [bestIndex - (bestIndexDiff - 1), bestIndexDiff];
|
||||
return [];
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=InputLookupTokenPredictor.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
458
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.d.ts
generated
vendored
Normal file
458
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.d.ts
generated
vendored
Normal file
@@ -0,0 +1,458 @@
|
||||
import { PickOptions } from "../../utils/utilTypes.js";
|
||||
import type { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
|
||||
import type { TokenBias } from "../TokenBias.js";
|
||||
import type { Token } from "../../types.js";
|
||||
import type { LlamaContextSequence } from "./LlamaContext.js";
|
||||
export type LlamaContextOptions = {
|
||||
/**
|
||||
* number of sequences for the context.
|
||||
* Each sequence is a different "text generation process" that can run in parallel to other sequences in the same context.
|
||||
* Although a single context has multiple sequences, the sequences are separate from each other and do not share data with each other.
|
||||
* This is beneficial for performance, as multiple sequences can be evaluated in parallel (on the same batch).
|
||||
*
|
||||
* Each sequence increases the memory usage of the context.
|
||||
*
|
||||
* Defaults to `1`.
|
||||
*/
|
||||
sequences?: number;
|
||||
/**
|
||||
* The number of tokens the model can see at once.
|
||||
* - **`"auto"`** - adapt to the current VRAM state and attemp to set the context size as high as possible up to the size
|
||||
* the model was trained on.
|
||||
* - **`number`** - set the context size to a specific number of tokens.
|
||||
* If there's not enough VRAM, an error will be thrown.
|
||||
* Use with caution.
|
||||
* - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attemp to set the context size as high as possible
|
||||
* up to the size the model was trained on, but at least `min` and at most `max`.
|
||||
*
|
||||
* The actual context size may be slightly larger than your request (by up to 256) due to the implementation in `llama.cpp` that
|
||||
* aligns the context size to multiples of 256 for performance reasons.
|
||||
* To check the actual context size that gets created, use the `.contextSize` property
|
||||
* of the created context instance or any of its sequences.
|
||||
*
|
||||
* Defaults to `"auto"`.
|
||||
*/
|
||||
contextSize?: "auto" | number | {
|
||||
min?: number;
|
||||
max?: number;
|
||||
};
|
||||
/**
|
||||
* The number of tokens that can be processed at once by the GPU.
|
||||
*
|
||||
* Defaults to `512` or `contextSize` if `contextSize` is less than `512`.
|
||||
*/
|
||||
batchSize?: number;
|
||||
/**
|
||||
* Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory.
|
||||
*
|
||||
* The support for flash attention is currently experimental and may not always work as expected.
|
||||
* Use with caution.
|
||||
*
|
||||
* This option will be ignored if flash attention is not supported by the model.
|
||||
*
|
||||
* Defaults to `false` (inherited from the model option `defaultContextFlashAttention`).
|
||||
*
|
||||
* Upon flash attention exiting the experimental status, the default value will become `true`
|
||||
* (the inherited value from the model option `defaultContextFlashAttention` will become `true`).
|
||||
*/
|
||||
flashAttention?: boolean;
|
||||
/**
|
||||
* number of threads to use to evaluate tokens.
|
||||
* set to 0 to use the maximum threads supported by the current machine hardware.
|
||||
*
|
||||
* This value is considered as a hint, and the actual number of threads used may be lower when other evaluations are running.
|
||||
* To ensure the minimum number of threads you want to use are always used,
|
||||
* set this to an object with a `min` property (see the `min` property description for more details).
|
||||
*
|
||||
* If `maxThreads` from the Llama instance is set to `0`, this value will always be the actual number of threads used.
|
||||
*
|
||||
* If `maxThreads` from the Llama instance is set to `0`, defaults to the `.cpuMathCores` value from the Llama instance,
|
||||
* otherwise defaults to `maxThreads` from the Llama instance (see the `maxThreads` option of `getLlama` method for more details).
|
||||
*/
|
||||
threads?: number | {
|
||||
/**
|
||||
* The ideal number of threads to use for evaluations.
|
||||
*
|
||||
* If other evaluations are running, the actual number of threads may be lower than this value.
|
||||
*
|
||||
* If `maxThreads` from the Llama instance is set to `0`, this value will always be the actual number of threads used.
|
||||
*
|
||||
* If `maxThreads` from the Llama instance is set to `0`, defaults to the `.cpuMathCores` value from the Llama instance,
|
||||
* otherwise defaults to `maxThreads` from the Llama instance (see the `maxThreads` option of `getLlama` method for more details).
|
||||
*/
|
||||
ideal?: number;
|
||||
/**
|
||||
* Ensure evaluations always use at least this number of threads.
|
||||
*
|
||||
* Use with caution, since setting this value too high can lead to the context waiting too much time
|
||||
* to reserve this number of threads before the evaluation can start.
|
||||
*/
|
||||
min?: number;
|
||||
};
|
||||
/**
|
||||
* Control the parallel sequences processing behavior.
|
||||
*
|
||||
* See {@link BatchingOptions} for more information.
|
||||
*/
|
||||
batching?: BatchingOptions;
|
||||
/**
|
||||
* When using SWA (Sliding Window Attention) on a supported model,
|
||||
* extend the sliding window size to the current context size (meaning practically disabling SWA).
|
||||
*
|
||||
* Enabling this option will consume more memory on models that support SWA (Sliding Window Attention),
|
||||
* but will allow reusing the evaluation cache of any prefix length of the context sequence state
|
||||
* (instead of just the size of the sliding window when SWA is used).
|
||||
*
|
||||
* This option has no effect on models that do not support SWA (Sliding Window Attention).
|
||||
*
|
||||
* > **Note:** you can check the SWA size using `model.fileInsights.swaSize`.
|
||||
*
|
||||
* Defaults to `false` (inherited from the model option `defaultContextSwaFullCache`);
|
||||
*/
|
||||
swaFullCache?: boolean;
|
||||
/**
|
||||
* Load the provided LoRA adapters onto the context.
|
||||
* LoRA adapters are used to modify the weights of a pretrained model to adapt to new tasks or domains
|
||||
* without the need for extensive retraining from scratch.
|
||||
*
|
||||
* If a string is provided, it will be treated as a path to a single LoRA adapter file.
|
||||
*
|
||||
* The adapters will be released from memory once the model (not just the context) is disposed.
|
||||
*/
|
||||
lora?: string | {
|
||||
adapters: Array<{
|
||||
filePath: string;
|
||||
/**
|
||||
* Defaults to `1`
|
||||
*/
|
||||
scale?: number;
|
||||
}>;
|
||||
/**
|
||||
* Called with the LoRA adapters load percentage when the LoRA adapters are being loaded.
|
||||
* @param loadProgress - a number between 0 (exclusive) and 1 (inclusive).
|
||||
*/
|
||||
onLoadProgress?(loadProgress: number): void;
|
||||
};
|
||||
/** An abort signal to abort the context creation */
|
||||
createSignal?: AbortSignal;
|
||||
/**
|
||||
* Ignore insufficient memory errors and continue with the context creation.
|
||||
* Can cause the process to crash if there's not enough VRAM for the new context.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
ignoreMemorySafetyChecks?: boolean;
|
||||
/**
|
||||
* On failed context creation, retry the creation with a smaller context size.
|
||||
*
|
||||
* Only works if `contextSize` is set to `"auto"`, left as default or set to an object with `min` and/or `max` properties.
|
||||
*
|
||||
* Set `retries` to `false` to disable.
|
||||
*/
|
||||
failedCreationRemedy?: false | {
|
||||
/**
|
||||
* Retries to attempt to create the context.
|
||||
*
|
||||
* Defaults to `6`.
|
||||
*/
|
||||
retries?: number;
|
||||
/**
|
||||
* The percentage to decrease the context size by on each retry.
|
||||
* Should be a number between `0` and `1`.
|
||||
*
|
||||
* If a function is provided, it will be called with the current context size and should return the new context size.
|
||||
*
|
||||
* Defaults to `0.16`.
|
||||
*/
|
||||
autoContextSizeShrink?: number | ((contextSize: number) => number);
|
||||
};
|
||||
/**
|
||||
* Track the inference performance of the context, so using `.printTimings()` will work.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
performanceTracking?: boolean;
|
||||
};
|
||||
export type LlamaContextSequenceRepeatPenalty = {
|
||||
/** Tokens to lower the predication probability of to be the next predicted token */
|
||||
punishTokens: Token[] | (() => Token[]);
|
||||
/**
|
||||
* The maximum number of tokens that will be provided in the `punishTokens` array.
|
||||
*
|
||||
* This is used as a hint for a performance optimization for avoiding frequent memory deallocation and reallocation.
|
||||
*
|
||||
* Don't set this value too high, as it can allocate too much memory.
|
||||
*
|
||||
* Defaults to `64`.
|
||||
*/
|
||||
maxPunishTokens?: number;
|
||||
/**
|
||||
* The relative amount to lower the probability of the tokens in `punishTokens` by.
|
||||
*
|
||||
* Defaults to `1.1`.
|
||||
* Set to `1` to disable.
|
||||
*/
|
||||
penalty?: number;
|
||||
/**
|
||||
* For n time a token is in the `punishTokens` array, lower its probability by `n * frequencyPenalty`.
|
||||
*
|
||||
* Disabled by default (`0`).
|
||||
* Set to a value between `0` and `1` to enable.
|
||||
*/
|
||||
frequencyPenalty?: number;
|
||||
/**
|
||||
* Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`.
|
||||
*
|
||||
* Disabled by default (`0`).
|
||||
* Set to a value between `0` and `1` to enable.
|
||||
*/
|
||||
presencePenalty?: number;
|
||||
};
|
||||
export type BatchingOptions = {
|
||||
/**
|
||||
* The strategy used to dispatch items to be processed when there are items pending to be processed.
|
||||
* - **`"nextCycle"`** - dispatch the items on the next event loop cycle.
|
||||
* You can provide a custom function to define a custom dispatch schedule.
|
||||
*
|
||||
* Defaults to `"nextCycle"`.
|
||||
*/
|
||||
dispatchSchedule?: "nextCycle" | CustomBatchingDispatchSchedule;
|
||||
/**
|
||||
* The strategy used to prioritize pending items to be processed.
|
||||
* - **`"maximumParallelism"`** - process as many different sequences in parallel as possible.
|
||||
* - **`"firstInFirstOut"`** - process items in the order they were added.
|
||||
* - **Custom prioritization function** - a custom function that prioritizes the items to be processed.
|
||||
* See the {@link CustomBatchingPrioritizationStrategy} type for more information.
|
||||
*
|
||||
* Defaults to `"maximumParallelism"`.
|
||||
*/
|
||||
itemPrioritizationStrategy?: "maximumParallelism" | "firstInFirstOut" | CustomBatchingPrioritizationStrategy;
|
||||
};
|
||||
/**
|
||||
* A function that schedules the dispatch of the batch items.
|
||||
* Call the `dispatch` function to dispatch the items.
|
||||
*/
|
||||
export type CustomBatchingDispatchSchedule = (dispatch: () => void) => void;
|
||||
/**
|
||||
* A function that prioritizes the batch items to be processed.
|
||||
* The function receives an array of `items` and the `size` of how many tokens can be processed in this batch.
|
||||
*
|
||||
* The function should return an array of prioritized items,
|
||||
* where the sum of `processAmount` of all the items is less or equal to the given `size` that the function received,
|
||||
* and where the `item` of each prioritized item is the same reference to an original item in the `items` array.
|
||||
*/
|
||||
export type CustomBatchingPrioritizationStrategy = (options: {
|
||||
items: readonly BatchItem[];
|
||||
size: number;
|
||||
}) => PrioritizedBatchItem[];
|
||||
export type ContextShiftOptions = {
|
||||
size?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
|
||||
strategy?: "eraseBeginning" | ((options: {
|
||||
sequence: LlamaContextSequence;
|
||||
size: number;
|
||||
}) => ContextTokensDeleteRange[] | Promise<ContextTokensDeleteRange[]>);
|
||||
};
|
||||
export type ContextTokensDeleteRange = {
|
||||
start: number;
|
||||
end: number;
|
||||
};
|
||||
export type SequenceEvaluateOptions = {
|
||||
temperature?: number;
|
||||
minP?: number;
|
||||
topK?: number;
|
||||
topP?: number;
|
||||
/**
|
||||
* Used to control the randomness of the generated text.
|
||||
*
|
||||
* Change the seed to get different results.
|
||||
*
|
||||
* Defaults to the current epoch time.
|
||||
*
|
||||
* Only relevant when using `temperature`.
|
||||
*/
|
||||
seed?: number;
|
||||
grammarEvaluationState?: LlamaGrammarEvaluationState | (() => LlamaGrammarEvaluationState | undefined);
|
||||
repeatPenalty?: LlamaContextSequenceRepeatPenalty;
|
||||
/**
|
||||
* Adjust the probability of tokens being generated.
|
||||
* Can be used to bias the model to generate tokens that you want it to lean towards,
|
||||
* or to avoid generating tokens that you want it to avoid.
|
||||
*/
|
||||
tokenBias?: TokenBias | (() => TokenBias);
|
||||
/**
|
||||
* When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
|
||||
* evaluated based on the strategy chosen for the context.
|
||||
* By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
|
||||
* but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
|
||||
* highest evaluation priority.
|
||||
* Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
|
||||
* is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
|
||||
*/
|
||||
evaluationPriority?: EvaluationPriority;
|
||||
/**
|
||||
* Override the sequence context shift options for this evaluation
|
||||
*
|
||||
* See {@link ContextShiftOptions} for more information.
|
||||
*/
|
||||
contextShift?: ContextShiftOptions;
|
||||
/**
|
||||
* Yield an EOG (End Of Generation) token (like EOS and EOT) when it's generated.
|
||||
* When `false` the generation will stop when an EOG token is generated and the token won't be yielded.
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
yieldEogToken?: boolean;
|
||||
};
|
||||
export type SequenceEvaluateMetadataOptions = {
|
||||
/**
|
||||
* Get the confidence (probability) of the selected token.
|
||||
*
|
||||
* Same as `probabilities.get(token)` from the output.
|
||||
*
|
||||
* If you need only this value, you can skip getting the full probabilities list to improve performance.
|
||||
*
|
||||
* This value might be slightly different when evaluated on different GPUs and configurations.
|
||||
*/
|
||||
readonly confidence?: boolean;
|
||||
/**
|
||||
* Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
|
||||
*
|
||||
* Only enable when needed, as it impacts the performance.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
readonly probabilities?: boolean;
|
||||
};
|
||||
export type SequenceEvaluateOutput<Options extends {
|
||||
readonly confidence?: boolean;
|
||||
readonly probabilities?: boolean;
|
||||
} = {
|
||||
readonly confidence: true;
|
||||
readonly probabilities: true;
|
||||
}> = PickOptions<{
|
||||
/**
|
||||
* The next token generated by the model and selected using the given options (such a temperature).
|
||||
*/
|
||||
token: Token;
|
||||
/**
|
||||
* The confidence (probability) of the selected token.
|
||||
*
|
||||
* Same as `probabilities.get(token)`.
|
||||
*
|
||||
* If you need only this value, you can skip getting the full probabilities list to improve performance.
|
||||
*
|
||||
* This value might be slightly different when evaluated on different GPUs and configurations.
|
||||
*/
|
||||
confidence: number;
|
||||
/**
|
||||
* The probabilities of the tokens from the vocabulary to be the next token.
|
||||
*
|
||||
* A probability is a number from `0` to `1`.
|
||||
*
|
||||
* The probabilities might be slightly different when evaluated on different GPUs and configurations.
|
||||
*
|
||||
* The map is sorted by the probability of the tokens from the highest to the lowest,
|
||||
* and is reflected in the order of the entries when iterating over the map.
|
||||
* Use `.entries().next().value` to get the top probability pair
|
||||
* ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
|
||||
*/
|
||||
probabilities: Map<Token, number>;
|
||||
}, Options & {
|
||||
token: true;
|
||||
}>;
|
||||
export type ControlledEvaluateInputItem = Token | [
|
||||
token: Token,
|
||||
options: {
|
||||
generateNext?: {
|
||||
/**
|
||||
* Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
|
||||
*
|
||||
* Only enable when needed, as it impacts the performance.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
probabilities?: boolean;
|
||||
/**
|
||||
* Get the confidence (probability) of the selected token.
|
||||
*
|
||||
* Same as `next.probabilities.get(next.token)` from the output.
|
||||
*
|
||||
* If you need only this value, you can skip getting the full probabilities list to improve performance.
|
||||
*
|
||||
* This value might be slightly different when evaluated on different GPUs and configurations.
|
||||
*/
|
||||
confidence?: boolean;
|
||||
/**
|
||||
* Generate the next token with the provided options using sampling.
|
||||
*
|
||||
* Setting this to `true` will generate probabilities for the next token and sample it.
|
||||
*/
|
||||
token?: boolean;
|
||||
options?: {
|
||||
temperature?: number;
|
||||
minP?: number;
|
||||
topK?: number;
|
||||
topP?: number;
|
||||
/**
|
||||
* Used to control the randomness of the generated text.
|
||||
*
|
||||
* Change the seed to get different results.
|
||||
*
|
||||
* Defaults to the current epoch time.
|
||||
*
|
||||
* Only relevant when using `temperature`.
|
||||
*/
|
||||
seed?: number;
|
||||
repeatPenalty?: LlamaContextSequenceRepeatPenalty;
|
||||
/**
|
||||
* Adjust the probability of tokens being generated.
|
||||
* Can be used to bias the model to generate tokens that you want it to lean towards,
|
||||
* or to avoid generating tokens that you want it to avoid.
|
||||
*/
|
||||
tokenBias?: TokenBias | (() => TokenBias);
|
||||
};
|
||||
};
|
||||
}
|
||||
];
|
||||
/**
 * Output for a single evaluated index of a controlled evaluation.
 */
export type ControlledEvaluateIndexOutput = {
    /** Results requested (via `ControlledEvaluateInputItem` options) for the token following this index. */
    next: {
        // NOTE(review): `null` presumably means no token was sampled for this position — confirm against the implementation
        token?: Token | null;
        /**
         * The confidence (probability) of the selected token (the `token` field in this object).
         *
         * Same as `next.probabilities.get(next.token)`.
         *
         * If you need only this value, you can skip getting the full probabilities list to improve performance.
         *
         * This value might be slightly different when evaluated on different GPUs and configurations.
         */
        confidence?: number;
        /**
         * The probabilities of the tokens from the vocabulary to be the next token.
         *
         * A probability is a number from `0` to `1`.
         *
         * The probabilities might be slightly different when evaluated on different GPUs and configurations.
         *
         * The map is sorted by the probability of the tokens from the highest to the lowest,
         * and is reflected in the order of the entries when iterating over the map.
         * Use `.entries().next().value` to get the top probability pair
         * ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
         */
        probabilities?: Map<Token, number>;
    };
};
|
||||
/**
 * Priority of a batch item when dispatching evaluations.
 *
 * 1 - low
 *
 * 5 - high
 */
export type EvaluationPriority = 1 | 2 | 3 | 4 | 5;
|
||||
/** A unit of work queued for batched evaluation. */
export type BatchItem = {
    /** Tokens to evaluate for this item. */
    readonly tokens: readonly Token[];
    // NOTE(review): `true` presumably marks positions whose logits output is needed — confirm against the evaluator
    readonly logits: readonly (true | undefined)[];
    /** 1 (low) to 5 (high); higher-priority items are served first by the prioritization strategies. */
    readonly evaluationPriority: EvaluationPriority;
};
|
||||
/** A batch item together with the portion of it selected for the current batch. */
export type PrioritizedBatchItem = {
    item: BatchItem;
    /** How many of `item.tokens` were granted a slot in the current batch. */
    processAmount: number;
};
|
||||
2
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.js
generated
vendored
Normal file
2
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.js
generated
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
// Type-only module: the empty export marks this compiled file as an ES module.
export {};
//# sourceMappingURL=types.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/types.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/evaluator/LlamaContext/types.ts"],"names":[],"mappings":""}
|
||||
5
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.d.ts
generated
vendored
Normal file
5
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.d.ts
generated
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
import { BatchItem, PrioritizedBatchItem } from "../../types.js";
/**
 * Batching prioritization strategy: serves items in order of
 * `evaluationPriority` (highest first), granting each item as many of the
 * remaining `size` tokens as it needs before moving on to the next item.
 */
export declare function firstInFirstOutStrategy({ items, size }: {
    items: readonly BatchItem[];
    size: number;
}): PrioritizedBatchItem[];
|
||||
16
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js
generated
vendored
Normal file
16
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js
generated
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
/**
 * Batching prioritization strategy: serves items in order of
 * `evaluationPriority` (highest first), giving each item as many of the
 * remaining `size` tokens as it needs before moving on to the next one.
 */
export function firstInFirstOutStrategy({ items, size }) {
    // Copy before sorting so the caller's array is not mutated.
    const byPriority = [...items]
        .sort((first, second) => second.evaluationPriority - first.evaluationPriority);

    const result = [];
    let remainingTokens = size;

    for (const item of byPriority) {
        // Grant the item as many of its tokens as still fit in the batch.
        const grantedTokens = Math.min(item.tokens.length, remainingTokens);
        result.push({ item, processAmount: grantedTokens });
        remainingTokens -= grantedTokens;

        if (remainingTokens === 0)
            break;
    }

    return result;
}
|
||||
//# sourceMappingURL=firstInFirstOutStrategy.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"firstInFirstOutStrategy.js","sourceRoot":"","sources":["../../../../../src/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,uBAAuB,CAAC,EAAC,KAAK,EAAE,IAAI,EAA8C;IAC9F,MAAM,GAAG,GAA2B,EAAE,CAAC;IAEvC,MAAM,WAAW,GAAG,KAAK;SACpB,KAAK,EAAE;SACP,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,kBAAkB,GAAG,CAAC,CAAC,kBAAkB,CAAC,CAAC;IAEjE,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;QAC7B,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;QACnE,GAAG,CAAC,IAAI,CAAC,EAAC,IAAI,EAAE,aAAa,EAAC,CAAC,CAAC;QAChC,cAAc,IAAI,aAAa,CAAC;QAEhC,IAAI,cAAc,KAAK,CAAC;YACpB,MAAM;IACd,CAAC;IAED,OAAO,GAAG,CAAC;AACf,CAAC"}
|
||||
5
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.d.ts
generated
vendored
Normal file
5
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.d.ts
generated
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
import { BatchItem, PrioritizedBatchItem } from "../../types.js";
/**
 * Batching prioritization strategy: splits the `size` token budget as evenly
 * as possible across all items so every sequence makes progress in the same
 * batch, then redistributes leftover budget to clipped items (favoring higher
 * `evaluationPriority` for the final remainder).
 */
export declare function maximumParallelismStrategy({ items, size }: {
    items: readonly BatchItem[];
    size: number;
}): PrioritizedBatchItem[];
|
||||
42
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js
generated
vendored
Normal file
42
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js
generated
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
/**
 * Batching prioritization strategy that splits the batch token budget (`size`)
 * as evenly as possible across all items, so every sequence makes progress in
 * the same batch. Leftover budget is then redistributed to items that were
 * clipped, with the final remainder going to the highest-`evaluationPriority`
 * items first.
 */
export function maximumParallelismStrategy({ items, size }) {
    let leftFreeTokens = size;
    // Equal share of the budget for every item in the first pass.
    const minTokensForEachItem = Math.floor(leftFreeTokens / items.length);
    const res = [];
    // Items that received fewer tokens than they have and can absorb more.
    const clippedItems = [];
    for (const item of items) {
        const processAmount = Math.min(item.tokens.length, leftFreeTokens, minTokensForEachItem);
        const prioritizeItem = { item, processAmount };
        res.push(prioritizeItem);
        leftFreeTokens -= processAmount;
        if (processAmount < item.tokens.length)
            clippedItems.push(prioritizeItem);
        if (leftFreeTokens === 0)
            break;
    }
    // Up to 3 even redistribution passes: hand each clipped item an equal share
    // of the remaining budget; fully satisfied items drop out of `clippedItems`.
    for (let passesLeft = 3; leftFreeTokens > 0 && clippedItems.length > 0 && passesLeft > 0; passesLeft--) {
        const minIncreaseAmount = Math.ceil(leftFreeTokens / clippedItems.length);
        for (let i = 0; i < clippedItems.length && leftFreeTokens > 0; i++) {
            const prioritizeItem = clippedItems[i];
            const unprocessedAmount = prioritizeItem.item.tokens.length - prioritizeItem.processAmount;
            const increaseAmount = Math.min(unprocessedAmount, leftFreeTokens, minIncreaseAmount);
            prioritizeItem.processAmount += increaseAmount;
            // Fix: consume the granted tokens from the budget. Without this,
            // the loop guards on `leftFreeTokens` never trip and the strategy
            // can allocate more than `size` tokens in total.
            leftFreeTokens -= increaseAmount;
            if (increaseAmount === unprocessedAmount) {
                clippedItems.splice(i, 1);
                i--; // compensate the index for the removal
            }
        }
    }
    // Final greedy pass: whatever budget is left goes to the highest-priority
    // clipped items first.
    clippedItems.sort((a, b) => b.item.evaluationPriority - a.item.evaluationPriority);
    for (let i = 0; i < clippedItems.length && leftFreeTokens > 0; i++) {
        const prioritizeItem = clippedItems[i];
        const unprocessedAmount = prioritizeItem.item.tokens.length - prioritizeItem.processAmount;
        const increaseAmount = Math.min(unprocessedAmount, leftFreeTokens);
        prioritizeItem.processAmount += increaseAmount;
        // Fix: consume the granted tokens from the budget here as well.
        leftFreeTokens -= increaseAmount;
        if (increaseAmount === unprocessedAmount) {
            clippedItems.splice(i, 1);
            i--;
        }
    }
    return res;
}
|
||||
//# sourceMappingURL=maximumParallelismStrategy.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"maximumParallelismStrategy.js","sourceRoot":"","sources":["../../../../../src/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,0BAA0B,CAAC,EAAC,KAAK,EAAE,IAAI,EAA8C;IACjG,IAAI,cAAc,GAAG,IAAI,CAAC;IAC1B,MAAM,oBAAoB,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;IAEvE,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,MAAM,YAAY,GAA2B,EAAE,CAAC;IAEhD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACvB,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,cAAc,EAAE,oBAAoB,CAAC,CAAC;QACzF,MAAM,cAAc,GAAG,EAAC,IAAI,EAAE,aAAa,EAAC,CAAC;QAE7C,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QACzB,cAAc,IAAI,aAAa,CAAC;QAEhC,IAAI,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM;YAClC,YAAY,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAEtC,IAAI,cAAc,KAAK,CAAC;YACpB,MAAM;IACd,CAAC;IAED,KAAK,IAAI,UAAU,GAAG,CAAC,EAAE,cAAc,GAAG,CAAC,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,UAAU,GAAG,CAAC,EAAE,UAAU,EAAE,EAAE,CAAC;QACrG,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC;QAE1E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACjE,MAAM,cAAc,GAAG,YAAY,CAAC,CAAC,CAAE,CAAC;YACxC,MAAM,iBAAiB,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,cAAc,CAAC,aAAa,CAAC;YAC3F,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,iBAAiB,EAAE,cAAc,EAAE,iBAAiB,CAAC,CAAC;YACtF,cAAc,CAAC,aAAa,IAAI,cAAc,CAAC;YAE/C,IAAI,cAAc,KAAK,iBAAiB,EAAE,CAAC;gBACvC,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;gBAC1B,CAAC,EAAE,CAAC;YACR,CAAC;QACL,CAAC;IACL,CAAC;IAED,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,kBAAkB,GAAG,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;IAEnF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,IAAI,cAAc,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACjE,MAAM,cAAc,GAAG,YAAY,CAAC,CAAC,CAAE,CAAC;QACxC,MAAM,iBAAiB,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,cAAc,CAAC,aAAa,CAAC;QAC3F,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,iBAAiB,EAAE,cAAc,CAAC,CAAC;QACnE,cAAc,CAAC,aAAa,IAAI,cAAc,CAAC;Q
AE/C,IAAI,cAAc,KAAK,iBAAiB,EAAE,CAAC;YACvC,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAC1B,CAAC,EAAE,CAAC;QACR,CAAC;IACL,CAAC;IAED,OAAO,GAAG,CAAC;AACf,CAAC"}
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.d.ts
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.d.ts
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/**
 * Rounds `value` to a multiple of `padding` in the given `padDirection`
 * ("down" never goes below one padding unit; in that case it rounds up).
 * Uses bit-mask rounding, so `padding` is assumed to be a power of two.
 */
export declare function padSafeContextSize(value: number, padDirection: "up" | "down", padding?: number): number;
|
||||
18
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.js
generated
vendored
Normal file
18
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.js
generated
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
import { contextSizePad } from "../../../config.js";
|
||||
/**
 * Rounds `value` to a multiple of `padding` in the given `padDirection`.
 * Rounding "down" never produces a result below one padding unit; when it
 * would, the value is rounded up instead.
 */
export function padSafeContextSize(value, padDirection, padding = contextSizePad) {
    const roundedUp = ggmlPad(value, padding);

    // Already aligned — nothing to do.
    if (roundedUp === value)
        return value;

    if (padDirection === "down") {
        // Stepping one padding unit back and rounding up yields the nearest
        // aligned size below `value`; only usable if it is still at least
        // one padding unit large.
        const roundedDown = ggmlPad(value - padding, padding);
        if (roundedDown >= padding)
            return roundedDown;
    }

    // "up" direction, or "down" could not produce a usable smaller size.
    return roundedUp;
}
// Round `value` up to the next multiple of `padding`.
// Bit-mask trick: assumes `padding` is a power of two.
function ggmlPad(value, padding) {
    return ((value + padding - 1) & ~(padding - 1));
}
|
||||
//# sourceMappingURL=padSafeContextSize.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/padSafeContextSize.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"padSafeContextSize.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaContext/utils/padSafeContextSize.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,cAAc,EAAC,MAAM,oBAAoB,CAAC;AAElD,MAAM,UAAU,kBAAkB,CAAC,KAAa,EAAE,YAA2B,EAAE,UAAkB,cAAc;IAC3G,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAE3C,IAAI,UAAU,KAAK,KAAK;QACpB,OAAO,KAAK,CAAC;SACZ,IAAI,YAAY,KAAK,IAAI;QAC1B,OAAO,UAAU,CAAC;SACjB,IAAI,YAAY,KAAK,MAAM,EAAE,CAAC;QAC/B,MAAM,iBAAiB,GAAG,OAAO,CAAC,KAAK,GAAG,OAAO,EAAE,OAAO,CAAC,CAAC;QAC5D,IAAI,iBAAiB,IAAI,OAAO;YAC5B,OAAO,iBAAiB,CAAC;IACjC,CAAC;IAED,OAAO,UAAU,CAAC;AACtB,CAAC;AACD,SAAS,OAAO,CAAC,KAAa,EAAE,OAAe;IAC3C,OAAO,CAAC,CAAC,KAAK,GAAG,OAAO,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC;AACpD,CAAC"}
|
||||
2
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.d.ts
generated
vendored
Normal file
2
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.d.ts
generated
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
import { BatchingOptions } from "../types.js";
/**
 * Resolves a batching prioritization strategy — either a custom strategy
 * function (returned as-is) or a built-in strategy name — to the concrete
 * strategy function to use.
 * @throws when the strategy name is not recognized
 */
export declare function resolveBatchItemsPrioritizationStrategy(strategy: Required<BatchingOptions>["itemPrioritizationStrategy"]): import("../types.js").CustomBatchingPrioritizationStrategy;
|
||||
13
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js
generated
vendored
Normal file
13
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js
generated
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
import { maximumParallelismStrategy } from "./batchItemsPrioritizationStrategies/maximumParallelismStrategy.js";
|
||||
import { firstInFirstOutStrategy } from "./batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js";
|
||||
/**
 * Resolves a batching prioritization strategy — a custom strategy function
 * (returned as-is) or a built-in strategy name — to the concrete strategy
 * function to use. Throws for unrecognized strategy names.
 */
export function resolveBatchItemsPrioritizationStrategy(strategy) {
    // A custom strategy function is used directly.
    if (strategy instanceof Function)
        return strategy;

    switch (strategy) {
        case "maximumParallelism":
            return maximumParallelismStrategy;
        case "firstInFirstOut":
            return firstInFirstOutStrategy;
    }

    void strategy;
    throw new Error(`Unknown batch items prioritize strategy: ${strategy}`);
}
|
||||
//# sourceMappingURL=resolveBatchItemsPrioritizationStrategy.js.map
|
||||
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"resolveBatchItemsPrioritizationStrategy.js","sourceRoot":"","sources":["../../../../src/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,0BAA0B,EAAC,MAAM,oEAAoE,CAAC;AAC9G,OAAO,EAAC,uBAAuB,EAAC,MAAM,iEAAiE,CAAC;AAExG,MAAM,UAAU,uCAAuC,CAAC,QAAiE;IACrH,IAAI,QAAQ,YAAY,QAAQ;QAC5B,OAAO,QAAQ,CAAC;SACf,IAAI,QAAQ,KAAK,oBAAoB;QACtC,OAAO,0BAA0B,CAAC;SACjC,IAAI,QAAQ,KAAK,iBAAiB;QACnC,OAAO,uBAAuB,CAAC;IAEnC,KAAM,QAAyB,CAAC;IAEhC,MAAM,IAAI,KAAK,CAAC,4CAA4C,QAAQ,EAAE,CAAC,CAAC;AAC5E,CAAC"}
|
||||
Reference in New Issue
Block a user