First upload version 0.0.1
67
node_modules/node-llama-cpp/dist/gguf/insights/GgufInsights.d.ts
generated
vendored
Normal file
@@ -0,0 +1,67 @@
import { Llama } from "../../bindings/Llama.js";
import { GgufFileInfo } from "../types/GgufFileInfoTypes.js";
import { GgufInsightsConfigurationResolver } from "./GgufInsightsConfigurationResolver.js";
import { GgufInsightsTokens } from "./GgufInsightsTokens.js";
export type GgufInsightsResourceRequirements = {
    cpuRam: number;
    gpuVram: number;
};
export declare class GgufInsights {
    private constructor();
    /**
     * Get warnings about the model file that would affect its usage.
     *
     * Most of these warnings are also generated by `llama.cpp`
     */
    getWarnings(modelFilePath?: string): string[];
    get ggufFileInfo(): GgufFileInfo;
    get configurationResolver(): GgufInsightsConfigurationResolver;
    get tokens(): GgufInsightsTokens;
    /** The context size the model was trained on */
    get trainContextSize(): number | undefined;
    /** The size of an embedding vector the model can produce */
    get embeddingVectorSize(): number | undefined;
    get totalLayers(): number;
    get modelSize(): number;
    get flashAttentionSupported(): boolean;
    get hasEncoder(): boolean;
    get hasDecoder(): boolean;
    get isRecurrent(): boolean;
    get supportsRanking(): boolean;
    /**
     * The size of the SWA (Sliding Window Attention).
     *
     * When `undefined`, the model does not use sliding window attention.
     */
    get swaSize(): number | undefined;
    estimateModelResourceRequirements({ gpuLayers, useMmap, gpuSupportsMmap }: {
        gpuLayers: number;
        useMmap?: boolean;
        gpuSupportsMmap?: boolean;
    }): GgufInsightsResourceRequirements;
    /**
     * Estimates the memory required to create a context of the given parameters based on the implementation details of `llama.cpp`.
     * The calculation doesn't include a precise estimation of the graph overhead memory, so it uses a rough estimate for that.
     * The estimation for the graph overhead memory will be improved in the future to be more precise, but it's good enough for now.
     */
    estimateContextResourceRequirements({ contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext, includeGraphOverhead, flashAttention, swaFullCache }: {
        contextSize: number;
        modelGpuLayers: number;
        batchSize?: number;
        sequences?: number;
        isEmbeddingContext?: boolean;
        flashAttention?: boolean;
        includeGraphOverhead?: boolean;
        swaFullCache?: boolean;
    }): GgufInsightsResourceRequirements;
    /**
     * @param ggufFileInfo
     * @param llama - If you already have a `Llama` instance, pass it to reuse it for the `GgufInsights` instance.
     * If you don't pass a `Llama` instance, a basic `Llama` instance is created as a fallback - it's a slim instance that
     * doesn't instantiate a `llama.cpp` backend, so it won't utilize the GPU at all, and be shared with other `GgufInsights` instances
     * that need a fallback `Llama` instance.
     */
    static from(ggufFileInfo: GgufFileInfo, llama?: Llama): Promise<GgufInsights>;
}
export declare function parseRankingTemplate(template: string | undefined | null): string | undefined;
export declare function isRankingTemplateValid(template: string | undefined | null): boolean;
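To make the declarations above concrete, here is a minimal usage sketch. It assumes `getLlama`, `readGgufFileInfo`, and `GgufInsights` are available from the package root and uses a hypothetical model path; treat it as an illustration of the API surface rather than canonical usage.

import { getLlama, readGgufFileInfo, GgufInsights } from "node-llama-cpp";

// hypothetical model path - replace with a real GGUF file
const modelPath = "./models/my-model.Q4_K_M.gguf";

const llama = await getLlama();
const ggufFileInfo = await readGgufFileInfo(modelPath);
const insights = await GgufInsights.from(ggufFileInfo, llama);

console.log("Train context size:", insights.trainContextSize);
console.log("Total layers:", insights.totalLayers);
console.log("Warnings:", insights.getWarnings(modelPath));

// estimate the memory needed to fully offload the model and run a 4096-token context
const modelReqs = insights.estimateModelResourceRequirements({ gpuLayers: insights.totalLayers });
const contextReqs = insights.estimateContextResourceRequirements({
    contextSize: 4096,
    modelGpuLayers: insights.totalLayers
});
console.log("Estimated VRAM (bytes):", modelReqs.gpuVram + contextReqs.gpuVram);
console.log("Estimated RAM (bytes):", modelReqs.cpuRam + contextReqs.cpuRam);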
653
node_modules/node-llama-cpp/dist/gguf/insights/GgufInsights.js
generated
vendored
Normal file
@@ -0,0 +1,653 @@
|
||||
import { getLlamaWithoutBackend } from "../../bindings/utils/getLlamaWithoutBackend.js";
|
||||
import { getDefaultContextBatchSize, getDefaultContextSequences } from "../../evaluator/LlamaContext/LlamaContext.js";
|
||||
import { GgufArchitectureType } from "../types/GgufMetadataTypes.js";
|
||||
import { getReadablePath } from "../../cli/utils/getReadablePath.js";
|
||||
import { padSafeContextSize } from "../../evaluator/LlamaContext/utils/padSafeContextSize.js";
|
||||
import { GgufInsightsConfigurationResolver } from "./GgufInsightsConfigurationResolver.js";
|
||||
import { GgufInsightsTokens } from "./GgufInsightsTokens.js";
|
||||
export class GgufInsights {
|
||||
/** @internal */ _llama;
|
||||
/** @internal */ _modelSize;
|
||||
/** @internal */ _totalFileLayers = null;
|
||||
/** @internal */ _supportsRanking;
|
||||
/** @internal */ _ggufFileInfo;
|
||||
/** @internal */ _configurationResolver;
|
||||
/** @internal */ _tokens;
|
||||
constructor(ggufFileInfo, llama) {
|
||||
this._llama = llama;
|
||||
this._ggufFileInfo = ggufFileInfo;
|
||||
this._modelSize = calculateTensorsSize(ggufFileInfo.fullTensorInfo ?? [], llama, true, true);
|
||||
this._configurationResolver = GgufInsightsConfigurationResolver._create(this);
|
||||
this._tokens = GgufInsightsTokens._create(this);
|
||||
}
|
||||
/**
|
||||
* Get warnings about the model file that would affect its usage.
|
||||
*
|
||||
* Most of these warnings are also generated by `llama.cpp`
|
||||
*/
|
||||
getWarnings(modelFilePath) {
|
||||
const warnings = [];
|
||||
const modelFilePathText = (modelFilePath != null && modelFilePath !== "")
|
||||
? ` ("${getReadablePath(modelFilePath)}")`
|
||||
: "";
|
||||
if (this._ggufFileInfo?.metadata?.tokenizer?.ggml?.model === "gpt2" &&
this._ggufFileInfo?.metadata?.tokenizer?.ggml?.pre == null) {
|
||||
// equivalent to the warning in `llama.cpp` under `llm_load_vocab`: "missing pre-tokenizer type, using: 'default'"
|
||||
warnings.push(`This model file${modelFilePathText} is missing a pre-tokenizer configuration. ` +
|
||||
"This may cause incorrect tokenization and thus degrade the generation quality. " +
|
||||
"Consider using a newer model or regenerating this GGUF model file");
|
||||
}
|
||||
return warnings;
|
||||
}
|
||||
get ggufFileInfo() {
|
||||
return this._ggufFileInfo;
|
||||
}
|
||||
get configurationResolver() {
|
||||
return this._configurationResolver;
|
||||
}
|
||||
get tokens() {
|
||||
return this._tokens;
|
||||
}
|
||||
/** The context size the model was trained on */
|
||||
get trainContextSize() {
|
||||
return this._ggufFileInfo.architectureMetadata.context_length;
|
||||
}
|
||||
/** The size of an embedding vector the model can produce */
|
||||
get embeddingVectorSize() {
|
||||
return this._ggufFileInfo.architectureMetadata.embedding_length;
|
||||
}
|
||||
get totalLayers() {
|
||||
const outputLayers = 1;
|
||||
return this._getTotalFileLayers() + outputLayers;
|
||||
}
|
||||
get modelSize() {
|
||||
return this._modelSize;
|
||||
}
|
||||
get flashAttentionSupported() {
|
||||
// source: `llama_new_context_with_model` in `llama.cpp`
|
||||
if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.grok)
|
||||
return false;
|
||||
else if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2)
|
||||
return false;
|
||||
else {
|
||||
const nHead = this._ggufFileInfo.architectureMetadata?.attention?.head_count ?? 0;
|
||||
const nEmbd = this._ggufFileInfo.architectureMetadata?.embedding_length ?? 0;
|
||||
const nEmbdHeadK = this._ggufFileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
|
||||
const nEmbdHeadV = this._ggufFileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
|
||||
if (nEmbdHeadK !== nEmbdHeadV)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
get hasEncoder() {
|
||||
switch (this._ggufFileInfo.metadata?.general?.architecture) {
|
||||
case GgufArchitectureType.t5:
|
||||
case GgufArchitectureType.t5encoder:
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
get hasDecoder() {
|
||||
switch (this._ggufFileInfo.metadata?.general?.architecture) {
|
||||
case GgufArchitectureType.t5encoder:
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
get isRecurrent() {
|
||||
switch (this._ggufFileInfo.metadata?.general?.architecture) {
|
||||
case GgufArchitectureType.mamba:
|
||||
case GgufArchitectureType.mamba2:
|
||||
case GgufArchitectureType.rwkv6:
|
||||
case GgufArchitectureType.rwkv6qwen2:
|
||||
case GgufArchitectureType.rwkv7:
|
||||
case GgufArchitectureType.arwkv7:
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
get supportsRanking() {
|
||||
if (this._supportsRanking != null)
|
||||
return this._supportsRanking;
|
||||
const layers = this._ggufFileInfo.fullTensorInfo ?? [];
|
||||
for (let i = layers.length - 1; i >= 0; i--) {
|
||||
const tensor = layers[i];
|
||||
if (tensor == null)
|
||||
continue;
|
||||
if (tensor.name === "cls.weight" || tensor.name === "cls.output.weight") {
|
||||
this._supportsRanking = this.tokens.sepToken != null || this.tokens.eosToken != null ||
|
||||
isRankingTemplateValid(parseRankingTemplate(this._ggufFileInfo.metadata?.tokenizer?.["chat_template.rerank"]));
|
||||
this._supportsRanking &&= !(this.hasEncoder && this.hasDecoder); // encoder-decoder models are not supported
|
||||
return this._supportsRanking;
|
||||
}
|
||||
}
|
||||
this._supportsRanking = false;
|
||||
return this._supportsRanking;
|
||||
}
|
||||
/**
|
||||
* The size of the SWA (Sliding Window Attention).
|
||||
*
|
||||
* When `undefined`, the model does not use sliding window attention.
|
||||
*/
|
||||
get swaSize() {
|
||||
const slidingWindow = this._ggufFileInfo?.architectureMetadata?.attention?.sliding_window;
|
||||
if (slidingWindow == null || slidingWindow <= 0)
|
||||
return undefined;
|
||||
const trainContextSize = this.trainContextSize;
|
||||
if (trainContextSize != null && slidingWindow >= trainContextSize)
|
||||
return undefined;
|
||||
return slidingWindow;
|
||||
}
|
||||
estimateModelResourceRequirements({ gpuLayers, useMmap = this._llama.supportsMmap, gpuSupportsMmap = this._llama.gpuSupportsMmap }) {
|
||||
const { cpu, gpu } = this._getTensorResourceSplit(gpuLayers);
|
||||
return {
|
||||
cpuRam: calculateTensorsSize(cpu, this._llama, false),
|
||||
gpuVram: calculateTensorsSize(gpu, this._llama, useMmap && gpuSupportsMmap)
|
||||
};
|
||||
}
|
||||
/**
|
||||
* Estimates the memory required to create a context of the given parameters based on the implementation details of `llama.cpp`.
|
||||
* The calculation doesn't include a precise estimation of the graph overhead memory, so it uses a rough estimate for that.
|
||||
* The estimation for the graph overhead memory will be improved in the future to be more precise, but it's good enough for now.
|
||||
*/
|
||||
estimateContextResourceRequirements({ contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false, swaFullCache = false }) {
|
||||
if (sequences == null)
|
||||
sequences = getDefaultContextSequences();
|
||||
if (batchSize == null)
|
||||
batchSize = getDefaultContextBatchSize({ contextSize, sequences });
|
||||
const llmData = this._ggufFileInfo.architectureMetadata;
|
||||
const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
|
||||
const slidingWindow = this.swaSize ?? 0;
|
||||
const kvUnified = false;
|
||||
const usingSWA = !swaFullCache && slidingWindow > 0 && slidingWindow < contextSize &&
|
||||
(this.trainContextSize == null || slidingWindow < this.trainContextSize);
|
||||
const swaPattern = getSwaPatternForArchitecture(this._ggufFileInfo.metadata?.general?.architecture);
|
||||
const nonSwaPercent = swaPattern <= 1
|
||||
? 1
|
||||
: (1 / (swaPattern + (flashAttention ? -0.5 : -1)));
|
||||
// source: `llama_kv_cache_unified::get_padding` in `llama-kv-cache.cpp`
|
||||
const kvCachePadding = 1;
|
||||
const actualContextSize = kvUnified
|
||||
? padSafeContextSize(sequences * contextSize, "up")
|
||||
: sequences * padSafeContextSize(contextSize, "up");
|
||||
const kvSize = usingSWA
|
||||
? ((1 - nonSwaPercent) * Math.min(actualContextSize, ggmlPad(sequences * slidingWindow + batchSize, kvCachePadding)) +
|
||||
nonSwaPercent * actualContextSize)
|
||||
: actualContextSize;
|
||||
const totalFileLayers = this._getTotalFileLayers();
|
||||
const finalGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalFileLayers, totalFileLayers));
|
||||
const finalCpuLayers = totalFileLayers - finalGpuLayers;
|
||||
const usingGpu = finalGpuLayers !== 0;
|
||||
const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0;
|
||||
const embeddingSize = llmData.embedding_length ?? 0;
|
||||
const floatBytes = 4; // sizeof(float)
|
||||
const int32TBytes = 4; // sizeof(int32_t)
|
||||
const estimateOutput = (nOutputs) => {
|
||||
// source: `llama_context::output_reserve` in `llama-context.cpp`
|
||||
const nOutputsMax = Math.max(batchSize, nOutputs);
|
||||
const isT5 = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.t5;
|
||||
const hasLogits = isT5 || !isEmbeddingContext;
|
||||
const hasEmbd = isT5 || isEmbeddingContext;
|
||||
const logitsSize = hasLogits
|
||||
? (vocabularySize * nOutputsMax)
|
||||
: 0;
|
||||
const embdSize = hasEmbd
|
||||
? (embeddingSize * nOutputsMax)
|
||||
: 0;
|
||||
const outputBufferSize = (logitsSize + embdSize) * floatBytes;
|
||||
const outputIdsArr = int32TBytes * batchSize;
|
||||
return outputBufferSize + outputIdsArr;
|
||||
};
|
||||
const estimateGraphOverheadMemory = () => {
|
||||
const s1MB = Math.pow(1024, 2);
|
||||
const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
|
||||
const expertCount = llmData?.expert_count ?? 0;
|
||||
const headCount = llmData?.attention?.head_count ?? 0;
|
||||
const embeddingLength = llmData?.embedding_length ?? 0;
|
||||
let defaultCalculationAdjustment = 0;
|
||||
if (batchSize == null)
|
||||
return 0;
|
||||
if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.llama) {
|
||||
if (expertCount > 0) {
|
||||
const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2;
|
||||
return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount));
|
||||
}
|
||||
return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount));
|
||||
}
|
||||
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen2) {
|
||||
if (modelGpuLayers === this.totalLayers) {
|
||||
defaultCalculationAdjustment -= (s1MB * 340) * (this.trainContextSize == null
|
||||
? 1
|
||||
: kvSize / this.trainContextSize);
|
||||
}
|
||||
else {
|
||||
defaultCalculationAdjustment -= (s1MB * 250) + ((s1MB * 50) * (this.trainContextSize == null
|
||||
? 1
|
||||
: kvSize / this.trainContextSize));
|
||||
}
|
||||
}
|
||||
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.gemma) {
|
||||
// only works properly when all layers are on the GPU, which is why it's commented out:
|
||||
// return int32TBytes * batchSize * ((llmData.embedding_length ?? 0));
|
||||
if (modelGpuLayers === this.totalLayers) {
|
||||
defaultCalculationAdjustment += (s1MB * 40) - ((s1MB * 270) * (this.trainContextSize == null
|
||||
? 1
|
||||
: kvSize / this.trainContextSize));
|
||||
}
|
||||
else {
|
||||
defaultCalculationAdjustment += -(s1MB * 550) + ((s1MB * 150) * (this.trainContextSize == null
|
||||
? 1
|
||||
: Math.max(0, (1 - (kvSize / this.trainContextSize)))));
|
||||
}
|
||||
}
|
||||
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.stablelm) {
|
||||
const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
|
||||
return (int32TBytes * batchSize * kvSize * headCount) - (50 * s1MB);
|
||||
// if (modelGpuLayers === this.totalLayers) {
|
||||
// defaultCalculationAdjustment += -(s1MB * 20) + (
|
||||
// (s1MB * 250) * (
|
||||
// this.trainContextSize == null
|
||||
// ? 1
|
||||
// : kvSize / this.trainContextSize
|
||||
// )
|
||||
// );
|
||||
// } else {
|
||||
// defaultCalculationAdjustment += -(s1MB * 40) + (
|
||||
// (s1MB * 300) * (
|
||||
// this.trainContextSize == null
|
||||
// ? 1
|
||||
// : kvSize / this.trainContextSize
|
||||
// )
|
||||
// );
|
||||
// }
|
||||
}
|
||||
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen3) {
|
||||
return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount));
|
||||
}
|
||||
else if (expertCount > 0) {
|
||||
const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2;
|
||||
return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount));
|
||||
}
|
||||
const totalElements = tensorInfo.length === 0
|
||||
? this.totalLayers * (((llmData.embedding_length ?? 0) +
|
||||
(llmData.feed_forward_length ?? 0)) / 2)
|
||||
: tensorInfo.reduce((res, tensor) => {
|
||||
return res + tensor.dimensions.reduce((res, dim) => res + Number(dim), 0);
|
||||
}, 0);
|
||||
if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.phi3) {
|
||||
// magic numbers for estimation. will be improved in the future
|
||||
return (totalElements * 123 * (kvSize / 4096)) + defaultCalculationAdjustment;
|
||||
}
|
||||
// magic numbers for estimation. will be improved in the future
|
||||
return (totalElements * 77.655 * (kvSize / 4096)) + defaultCalculationAdjustment;
|
||||
};
|
||||
const gpuKVCacheSize = usingGpu
|
||||
? this._estimateKvMemorySizeInBytes(kvSize, finalGpuLayers < totalFileLayers
|
||||
? (finalGpuLayers + 1)
|
||||
: finalGpuLayers)
|
||||
: 0;
|
||||
const cpuKVCacheSize = this._estimateKvMemorySizeInBytes(kvSize, finalCpuLayers);
|
||||
// source: `llama_context::graph_max_nodes` in `llama-context.cpp`
|
||||
const getMaxNodesMultiplier = (arch, nTokens) => {
|
||||
if (arch === GgufArchitectureType.qwen3next)
|
||||
return {
|
||||
min: nTokens * 40,
|
||||
multiplier: 32
|
||||
};
|
||||
return {
|
||||
min: 1024,
|
||||
multiplier: 8
|
||||
};
|
||||
};
|
||||
const maxNodesMultiplier = getMaxNodesMultiplier(this._ggufFileInfo.metadata?.general?.architecture, Math.min(actualContextSize, batchSize));
|
||||
const maxNodes = Math.max(maxNodesMultiplier.min, maxNodesMultiplier.multiplier * tensorInfo.length);
|
||||
const cpuNodes = maxNodesMultiplier.multiplier * (tensorInfo.length * (finalCpuLayers / totalFileLayers));
|
||||
const gpuNodes = maxNodes - cpuNodes;
|
||||
const gpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * gpuNodes) +
|
||||
this._llama._bindings.getGgmlGraphOverheadCustom(gpuNodes, false);
|
||||
const cpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * cpuNodes) +
|
||||
this._llama._bindings.getGgmlGraphOverheadCustom(cpuNodes, false);
|
||||
const graphOverheadMemory = (flashAttention || !includeGraphOverhead)
|
||||
? 0
|
||||
: estimateGraphOverheadMemory();
|
||||
const graphOverheadGpuSize = usingGpu
|
||||
? Math.round(graphOverheadMemory * (finalGpuLayers / totalFileLayers))
|
||||
: 0;
|
||||
const graphOverheadCpuSize = graphOverheadMemory - graphOverheadGpuSize;
|
||||
const outputBufferSize = estimateOutput(sequences);
|
||||
const gpuVram = gpuKVCacheSize + gpuComputeBufferSize + graphOverheadGpuSize + outputBufferSize;
|
||||
const cpuRam = cpuKVCacheSize + cpuComputeBufferSize + graphOverheadCpuSize + outputBufferSize;
|
||||
return {
|
||||
cpuRam,
|
||||
gpuVram: usingGpu
|
||||
? gpuVram
|
||||
: 0
|
||||
};
|
||||
}
|
||||
/**
|
||||
* Get the split tensor resources for CPU and GPU based on the number of GPU layers
|
||||
* @internal
|
||||
*/
|
||||
_getTensorResourceSplit(gpuLayers) {
|
||||
const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
|
||||
const architecture = this._ggufFileInfo.metadata?.general?.architecture;
|
||||
if (gpuLayers === 0) {
|
||||
return {
|
||||
cpu: tensorInfo,
|
||||
gpu: []
|
||||
};
|
||||
}
|
||||
const fileLayers = this._getFileLayers();
|
||||
const startGpuLayer = Math.max(0, fileLayers - gpuLayers);
|
||||
const gpuTensors = [];
|
||||
const cpuTensors = [];
|
||||
let tokenEmbedLayer;
|
||||
let mainOutputLayer;
|
||||
for (const singleTensorInfo of tensorInfo) {
|
||||
if (isMainOutputLayer(singleTensorInfo.name))
|
||||
mainOutputLayer = singleTensorInfo;
|
||||
else if (isTokenEmbedLayer(singleTensorInfo.name))
|
||||
tokenEmbedLayer = singleTensorInfo;
|
||||
// in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_INPUT` are always
|
||||
// loaded with `model.dev_input`, which is always set to the CPU
|
||||
if (isInputLayer(singleTensorInfo.name)) {
|
||||
cpuTensors.push(singleTensorInfo);
|
||||
continue;
|
||||
// in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_OUTPUT` are always
|
||||
// loaded with `model.dev_output`, which is set to the GPU only if all the layers are on the GPU
|
||||
}
|
||||
else if (isOutputLayer(singleTensorInfo.name)) {
|
||||
if (gpuLayers === this.totalLayers) {
|
||||
gpuTensors.push(singleTensorInfo);
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
cpuTensors.push(singleTensorInfo);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
const { layerNumber } = parseTensorName(singleTensorInfo.name);
|
||||
if (gpuLayers !== this.totalLayers) {
|
||||
if (architecture === GgufArchitectureType.qwen2 || architecture === GgufArchitectureType.gemma) {
|
||||
if (layerNumber != null && layerNumber >= startGpuLayer)
|
||||
gpuTensors.push(singleTensorInfo);
|
||||
else
|
||||
cpuTensors.push(singleTensorInfo);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (layerNumber == null || layerNumber >= startGpuLayer)
|
||||
gpuTensors.push(singleTensorInfo);
|
||||
else
|
||||
cpuTensors.push(singleTensorInfo);
|
||||
}
|
||||
if (mainOutputLayer == null && tokenEmbedLayer != null && gpuLayers === this.totalLayers && !gpuTensors.includes(tokenEmbedLayer))
|
||||
gpuTensors.push(tokenEmbedLayer);
|
||||
return {
|
||||
cpu: cpuTensors,
|
||||
gpu: gpuTensors
|
||||
};
|
||||
}
|
||||
/** @internal */
|
||||
_determineNumberOfLayersFromTensorInfo() {
|
||||
const layerNumbers = new Set();
|
||||
for (const singleTensorInfo of (this._ggufFileInfo.fullTensorInfo ?? [])) {
|
||||
const { layerNumber } = parseTensorName(singleTensorInfo.name);
|
||||
if (layerNumber != null)
|
||||
layerNumbers.add(layerNumber);
|
||||
}
|
||||
return layerNumbers.size;
|
||||
}
|
||||
/** @internal */
|
||||
_getFileLayers() {
|
||||
return this._ggufFileInfo.architectureMetadata.block_count ?? this._determineNumberOfLayersFromTensorInfo();
|
||||
}
|
||||
/** @internal */
|
||||
_estimateKvMemorySizeInBytes(kvSize, layers) {
|
||||
// source: `llama_kv_cache_init` in `llama.cpp`
|
||||
const nHead = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
|
||||
const nEmbd = this._ggufFileInfo.architectureMetadata.embedding_length ?? 0;
|
||||
const nEmbdHeadK = this._ggufFileInfo.architectureMetadata.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
|
||||
const nHeadKv = this._ggufFileInfo.architectureMetadata.attention?.head_count_kv ?? nHead;
|
||||
const nEmbdHeadV = this._ggufFileInfo.architectureMetadata.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
|
||||
const ssmDConv = this._ggufFileInfo.architectureMetadata.ssm?.conv_kernel ?? 0;
|
||||
const ssmDInner = this._ggufFileInfo.architectureMetadata.ssm?.inner_size ?? 0;
|
||||
const modelNEmbdKS = (this._ggufFileInfo.architectureMetadata.wkv?.head_size ?? 0) !== 0
|
||||
? (this._ggufFileInfo.architectureMetadata.token_shift_count ?? 0) * nEmbd
|
||||
: (ssmDConv > 0 ? (ssmDConv - 1) : 0) * ssmDInner;
|
||||
const ssmDState = this._ggufFileInfo.architectureMetadata.ssm?.state_size ?? 0;
|
||||
const modelNEmbdVS = (this._ggufFileInfo.architectureMetadata.wkv?.head_size ?? 0) !== 0
|
||||
? nEmbd * (this._ggufFileInfo.architectureMetadata.wkv?.head_size ?? 0)
|
||||
: ssmDState * ssmDInner;
|
||||
let totalElementsK = 0;
|
||||
let totalElementsV = 0;
|
||||
for (let i = 0; i < layers; i++) {
|
||||
const nHeadKvArrayItem = (typeof nHeadKv === "number")
|
||||
? nHeadKv
|
||||
: nHeadKv[i] !== 0
|
||||
? nHeadKv[i]
|
||||
: nHead;
|
||||
const nEmbdKGqa = nEmbdHeadK * nHeadKvArrayItem;
|
||||
const nEmbdVGqa = nEmbdHeadV * nHeadKvArrayItem;
|
||||
const totalNEmbdKGqa = nEmbdKGqa + modelNEmbdKS;
|
||||
const totalNEmbdVGqa = nEmbdVGqa + modelNEmbdVS;
|
||||
totalElementsK += totalNEmbdKGqa * kvSize;
|
||||
totalElementsV += totalNEmbdVGqa * kvSize;
|
||||
}
|
||||
const keyTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba
|
||||
// if `type_k` of `llama_context_params` changes to be configurable in `LlamaContext`,
|
||||
// this would have to depend on that value
|
||||
? this._llama._consts.ggmlTypeF32Size
|
||||
: this._llama._consts.ggmlTypeF16Size;
|
||||
const valueTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba
|
||||
// if `type_v` of `llama_context_params` changes to be configurable in `LlamaContext`,
|
||||
// this would have to depend on that value
|
||||
? this._llama._consts.ggmlTypeF32Size
|
||||
: this._llama._consts.ggmlTypeF16Size;
|
||||
return ((totalElementsK * keyTypeSize) +
|
||||
(totalElementsV * valueTypeSize));
|
||||
}
|
||||
/** @internal */
|
||||
_getTotalFileLayers() {
|
||||
if (this._totalFileLayers != null)
|
||||
return this._totalFileLayers;
|
||||
this._totalFileLayers = this._getFileLayers();
|
||||
return this._totalFileLayers;
|
||||
}
|
||||
/**
|
||||
* @param ggufFileInfo
|
||||
* @param llama - If you already have a `Llama` instance, pass it to reuse it for the `GgufInsights` instance.
|
||||
* If you don't pass a `Llama` instance, a basic `Llama` instance is created as a fallback - it's a slim instance that
|
||||
* doesn't instantiate a `llama.cpp` backend, so it won't utilize the GPU at all, and be shared with other `GgufInsights` instances
|
||||
* that need a fallback `Llama` instance.
|
||||
*/
|
||||
static async from(ggufFileInfo, llama) {
|
||||
let resolvedLlama = llama;
|
||||
if (resolvedLlama == null)
|
||||
resolvedLlama = await getLlamaWithoutBackend();
|
||||
return new GgufInsights(ggufFileInfo, resolvedLlama);
|
||||
}
|
||||
}
|
||||
function parseTensorName(tensorName) {
|
||||
if (tensorName == null)
|
||||
return { layerNumber: undefined };
|
||||
const layerTensorPrefix = "blk.";
|
||||
if (!tensorName.startsWith(layerTensorPrefix))
|
||||
return { layerNumber: undefined };
|
||||
const dotIndex = tensorName.indexOf(".", layerTensorPrefix.length);
|
||||
const layerNumberString = tensorName.slice(layerTensorPrefix.length, dotIndex < 0
|
||||
? tensorName.length
|
||||
: dotIndex);
|
||||
const layerNumber = parseInt(layerNumberString);
|
||||
if (Number.isFinite(layerNumber))
|
||||
return { layerNumber };
|
||||
return { layerNumber: undefined };
|
||||
}
|
||||
function calculateTensorsSize(tensorsInfo, llama, useMmap, startFromTensorDataOffset = false) {
|
||||
if (!useMmap) {
|
||||
let size = 0;
|
||||
for (const tensorInfo of tensorsInfo)
|
||||
size += calculateTensorSize(tensorInfo, llama);
|
||||
return size;
|
||||
}
|
||||
const fileStats = new Map();
|
||||
for (const tensorInfo of tensorsInfo) {
|
||||
let stats = fileStats.get(tensorInfo.filePart);
|
||||
if (stats == null) {
|
||||
stats = {
|
||||
tensorsSize: 0
|
||||
};
|
||||
fileStats.set(tensorInfo.filePart, stats);
|
||||
}
|
||||
const tensorSize = calculateTensorSize(tensorInfo, llama);
|
||||
stats.tensorsSize += tensorSize;
|
||||
const startOffset = tensorInfo.offset;
|
||||
const endOffset = typeof startOffset === "number"
|
||||
? startOffset + tensorSize
|
||||
: startOffset + BigInt(tensorSize);
|
||||
if (startFromTensorDataOffset)
|
||||
stats.startOffset = Number(BigInt(tensorInfo.fileOffset) - BigInt(tensorInfo.offset));
|
||||
else if (stats.startOffset == null || startOffset < stats.startOffset)
|
||||
stats.startOffset = startOffset;
|
||||
if (stats.endOffset == null || endOffset > stats.endOffset)
|
||||
stats.endOffset = endOffset;
|
||||
}
|
||||
let size = 0;
|
||||
for (const [, stats] of fileStats) {
|
||||
const offsetSize = (stats.endOffset == null || stats.startOffset == null)
|
||||
? 0
|
||||
: Number(BigInt(stats.endOffset) - BigInt(stats.startOffset));
|
||||
const tensorsSize = stats.tensorsSize;
|
||||
size += Math.max(offsetSize, tensorsSize);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
function calculateTensorSize(tensor, llama) {
|
||||
const typeSize = llama._bindings.getTypeSizeForGgmlType(tensor.ggmlType);
|
||||
const blockSize = llama._bindings.getBlockSizeForGgmlType(tensor.ggmlType);
|
||||
const ggmlMaxDims = llama._consts.ggmlMaxDims;
|
||||
if (typeSize == null || blockSize == null)
|
||||
throw new Error("Invalid type or block size");
|
||||
const { ne, nb } = getTensorNeAndNb(tensor, { typeSize, blockSize, ggmlMaxDims });
|
||||
if (blockSize === 1) {
|
||||
let totalBytes = typeSize;
|
||||
for (let i = 0; i < ggmlMaxDims; i++) {
|
||||
totalBytes += (ne[i] - 1) * nb[i];
|
||||
}
|
||||
return totalBytes;
|
||||
}
|
||||
else {
|
||||
let totalBytes = Math.floor((ne[0] * nb[0]) / blockSize);
|
||||
for (let i = 1; i < ggmlMaxDims; i++) {
|
||||
totalBytes += (ne[i] - 1) * nb[i];
|
||||
}
|
||||
return totalBytes;
|
||||
}
|
||||
}
|
||||
function getTensorNeAndNb(tensor, { typeSize, blockSize, ggmlMaxDims }) {
|
||||
// number of elements
|
||||
// source: `ggml_new_tensor_impl` in `ggml.c`
|
||||
const ne = [
|
||||
...tensor.dimensions,
|
||||
...(Array(Math.max(0, ggmlMaxDims - tensor.dimensions.length)).fill(1))
|
||||
].slice(0, ggmlMaxDims);
|
||||
// number of bytes
|
||||
// source: `ggml_new_tensor_impl` in `ggml.c`
|
||||
const nb = [
|
||||
typeSize,
|
||||
Math.floor(typeSize * (ne[0] / blockSize)),
|
||||
...Array(ggmlMaxDims - 2).fill(0)
|
||||
];
|
||||
for (let i = 2; i < ggmlMaxDims; i++) {
|
||||
nb[i] = nb[i - 1] * ne[i - 1];
|
||||
}
|
||||
return {
|
||||
ne,
|
||||
nb
|
||||
};
|
||||
}
|
||||
function isInputLayer(layerName) {
|
||||
const [firstPart] = layerName.split(".");
|
||||
if (firstPart == null)
|
||||
return false;
|
||||
// source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` where
|
||||
// in `llm_tensor_info_mapping` have a mapping to `LLM_TENSOR_LAYER_INPUT`
|
||||
switch (firstPart) {
|
||||
case "token_embd":
|
||||
case "token_embd_norm":
|
||||
case "token_types":
|
||||
case "position_embd":
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
function isOutputLayer(layerName) {
|
||||
const [firstPart, secondPart] = layerName.split(".");
|
||||
if (firstPart == null)
|
||||
return false;
|
||||
// source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` where
|
||||
// in `llm_tensor_info_mapping` have a mapping to `LLM_TENSOR_LAYER_OUTPUT`
|
||||
switch (firstPart) {
|
||||
case "output":
|
||||
case "output_norm":
|
||||
case "cls":
|
||||
return true;
|
||||
}
|
||||
if (secondPart == null)
|
||||
return false;
|
||||
// source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` where
|
||||
// in `llm_tensor_info_mapping` have a mapping to `LLM_TENSOR_LAYER_OUTPUT`
|
||||
switch (firstPart + "." + secondPart) {
|
||||
case "cls.output":
|
||||
case "dec.output_norm":
|
||||
case "enc.output_norm":
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
function isMainOutputLayer(layerName) {
|
||||
const [firstPart] = layerName.split(".");
|
||||
return firstPart === "output";
|
||||
}
|
||||
function isTokenEmbedLayer(layerName) {
|
||||
const [firstPart] = layerName.split(".");
|
||||
return firstPart === "token_embd";
|
||||
}
|
||||
function ggmlPad(value, padding) {
|
||||
return ((value + padding - 1) & ~(padding - 1));
|
||||
}
|
||||
function getSwaPatternForArchitecture(architecture) {
|
||||
// source: `llama_model::load_hparams` in `llama-model.cpp` - calls to `hparams.set_swa_pattern`
|
||||
switch (architecture) {
|
||||
case GgufArchitectureType.llama4:
|
||||
return 4;
|
||||
case GgufArchitectureType.phi3:
|
||||
return 1;
|
||||
case GgufArchitectureType.gemma2:
|
||||
return 2;
|
||||
case GgufArchitectureType.gemma3:
|
||||
return 6;
|
||||
case GgufArchitectureType.gemma3n:
|
||||
return 5;
|
||||
case GgufArchitectureType.cohere2:
|
||||
return 4;
|
||||
case GgufArchitectureType.exaone4:
|
||||
return 4;
|
||||
case GgufArchitectureType.gptOss:
|
||||
return 2;
|
||||
case GgufArchitectureType.smallthinker:
|
||||
return 4;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
export function parseRankingTemplate(template) {
|
||||
if (template == null)
|
||||
return undefined;
|
||||
return template
|
||||
.replaceAll("{query}", "{{query}}")
|
||||
.replaceAll("{document}", "{{document}}");
|
||||
}
|
||||
export function isRankingTemplateValid(template) {
|
||||
return template != null && template.includes("{{query}}") && template.includes("{{document}}");
|
||||
}
|
||||
//# sourceMappingURL=GgufInsights.js.map
|
||||
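As a quick illustration of the module-private helpers above, the following standalone sketch mirrors the `blk.<n>.*` tensor-name parsing and the power-of-two rounding done by `ggmlPad`; these are re-implementations for demonstration, not exports of this module.

// standalone mirrors of `parseTensorName` and `ggmlPad`, for illustration only
function parseLayerNumber(tensorName: string): number | undefined {
    if (!tensorName.startsWith("blk."))
        return undefined; // e.g. "token_embd.weight", "output.weight" carry no layer number
    const layerNumber = parseInt(tensorName.slice("blk.".length), 10); // parsing stops at the next "."
    return Number.isFinite(layerNumber) ? layerNumber : undefined;
}

function pad(value: number, padding: number): number {
    // rounds `value` up to the next multiple of `padding` (padding must be a power of two)
    return (value + padding - 1) & ~(padding - 1);
}

console.log(parseLayerNumber("blk.12.attn_q.weight")); // 12
console.log(parseLayerNumber("output.weight")); // undefined
console.log(pad(4097, 256)); // 4352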
1
node_modules/node-llama-cpp/dist/gguf/insights/GgufInsights.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
194
node_modules/node-llama-cpp/dist/gguf/insights/GgufInsightsConfigurationResolver.d.ts
generated
vendored
Normal file
@@ -0,0 +1,194 @@
|
||||
import { BuildGpu } from "../../bindings/types.js";
|
||||
import { LlamaModelOptions } from "../../evaluator/LlamaModel/LlamaModel.js";
|
||||
import { LlamaContextOptions } from "../../evaluator/LlamaContext/types.js";
|
||||
import type { GgufInsights } from "./GgufInsights.js";
|
||||
export declare const defaultTrainContextSizeForEstimationPurposes = 4096;
|
||||
export declare class GgufInsightsConfigurationResolver {
|
||||
private constructor();
|
||||
get ggufInsights(): GgufInsights;
|
||||
/**
|
||||
* Resolve the best configuration for loading a model and creating a context using the current hardware.
|
||||
*
|
||||
* Specifying a `targetGpuLayers` and/or `targetContextSize` will ensure the resolved configuration matches those values,
|
||||
* but note it can lower the compatibility score if the hardware doesn't support it.
|
||||
*
|
||||
* Overriding hardware values is possible by configuring `hardwareOverrides`.
|
||||
* @param options
|
||||
* @param hardwareOverrides
|
||||
*/
|
||||
resolveAndScoreConfig({ targetGpuLayers, targetContextSize, embeddingContext, flashAttention, swaFullCache, useMmap }?: {
|
||||
targetGpuLayers?: number | "max";
|
||||
targetContextSize?: number;
|
||||
embeddingContext?: boolean;
|
||||
flashAttention?: boolean;
|
||||
swaFullCache?: boolean;
|
||||
useMmap?: boolean;
|
||||
}, { getVramState, getRamState, getSwapState, llamaVramPaddingSize, llamaGpu, llamaSupportsGpuOffloading }?: {
|
||||
getVramState?(): Promise<{
|
||||
total: number;
|
||||
free: number;
|
||||
unifiedSize: number;
|
||||
}>;
|
||||
getRamState?(): Promise<{
|
||||
total: number;
|
||||
free: number;
|
||||
}>;
|
||||
getSwapState?(): Promise<{
|
||||
total: number;
|
||||
free: number;
|
||||
}>;
|
||||
llamaVramPaddingSize?: number;
|
||||
llamaGpu?: BuildGpu;
|
||||
llamaSupportsGpuOffloading?: boolean;
|
||||
}): Promise<{
|
||||
/**
|
||||
* A number between `0` (inclusive) and `1` (inclusive) representing the compatibility score.
|
||||
*/
|
||||
compatibilityScore: number;
|
||||
/**
|
||||
* A number starting at `0` with no upper limit representing the bonus score.
|
||||
* For each multiplier of the specified `contextSize` that the resolved context size is larger by, 1 bonus point is given.
|
||||
*/
|
||||
bonusScore: number;
|
||||
/**
|
||||
* The total score, which is the sum of the compatibility and bonus scores.
|
||||
*/
|
||||
totalScore: number;
|
||||
/**
|
||||
* The resolved values used to calculate the scores.
|
||||
*/
|
||||
resolvedValues: {
|
||||
gpuLayers: number;
|
||||
contextSize: number;
|
||||
modelRamUsage: number;
|
||||
contextRamUsage: number;
|
||||
totalRamUsage: number;
|
||||
modelVramUsage: number;
|
||||
contextVramUsage: number;
|
||||
totalVramUsage: number;
|
||||
};
|
||||
}>;
|
||||
/**
|
||||
* Score the compatibility of the model configuration with the current GPU and VRAM state.
|
||||
* Assumes a model is loaded with the default `"auto"` configurations.
|
||||
* Scored based on the following criteria:
|
||||
* - The number of GPU layers that can be offloaded to the GPU (only if there's a GPU. If there's no GPU then by how small the model is)
|
||||
* - Whether all layers can be offloaded to the GPU (gives additional points)
|
||||
* - Whether the resolved context size is at least as large as the specified `contextSize`
|
||||
*
|
||||
* If the resolved context size is larger than the specified context size, for each multiplier of the specified `contextSize`
|
||||
* that the resolved context size is larger by, 1 bonus point is given in the `bonusScore`.
|
||||
*
|
||||
* `maximumFittedContextSizeMultiplier` is used to improve the proportionality of the bonus score between models.
|
||||
* Set this to any value higher than `<max compared model context size> / contextSize`.
|
||||
* Defaults to `100`.
|
||||
*
|
||||
* `maximumUnfitConfigurationResourceMultiplier` is used to improve the proportionality of the bonus score between unfit models.
|
||||
* Set this to any value higher than `<max compared model resource usage> / <total available resources>`.
|
||||
* Defaults to `100`.
|
||||
*
|
||||
* `contextSize` defaults to `4096` (if the model train context size is lower than this, the model train context size is used instead).
|
||||
*/
|
||||
scoreModelConfigurationCompatibility({ contextSize, embeddingContext, flashAttention, swaFullCache, maximumFittedContextSizeMultiplier, maximumUnfitConfigurationResourceMultiplier, forceStrictContextSize, forceGpuLayers, useMmap }?: {
|
||||
contextSize?: number;
|
||||
embeddingContext?: boolean;
|
||||
flashAttention?: boolean;
|
||||
swaFullCache?: boolean;
|
||||
maximumFittedContextSizeMultiplier?: number;
|
||||
maximumUnfitConfigurationResourceMultiplier?: number;
|
||||
/**
|
||||
* Do not resolve a context size larger than the specified `contextSize`.
|
||||
*
|
||||
* Defaults to `false`.
|
||||
*/
|
||||
forceStrictContextSize?: boolean;
|
||||
forceGpuLayers?: number | "max";
|
||||
useMmap?: boolean;
|
||||
}, { getVramState, getRamState, getSwapState, llamaVramPaddingSize, llamaGpu, llamaSupportsGpuOffloading }?: {
|
||||
getVramState?(): Promise<{
|
||||
total: number;
|
||||
free: number;
|
||||
unifiedSize: number;
|
||||
}>;
|
||||
getRamState?(): Promise<{
|
||||
total: number;
|
||||
free: number;
|
||||
}>;
|
||||
getSwapState?(): Promise<{
|
||||
total: number;
|
||||
free: number;
|
||||
}>;
|
||||
llamaVramPaddingSize?: number;
|
||||
llamaGpu?: BuildGpu;
|
||||
llamaSupportsGpuOffloading?: boolean;
|
||||
}): Promise<{
|
||||
/**
|
||||
* A number between `0` (inclusive) and `1` (inclusive) representing the compatibility score.
|
||||
*/
|
||||
compatibilityScore: number;
|
||||
/**
|
||||
* A number starting at `0` with no upper limit representing the bonus score.
|
||||
* For each multiplier of the specified `contextSize` that the resolved context size is larger by, 1 bonus point is given.
|
||||
*/
|
||||
bonusScore: number;
|
||||
/**
|
||||
* The total score, which is the sum of the compatibility and bonus scores.
|
||||
*/
|
||||
totalScore: number;
|
||||
/**
|
||||
* The resolved values used to calculate the scores.
|
||||
*/
|
||||
resolvedValues: {
|
||||
gpuLayers: number;
|
||||
contextSize: number;
|
||||
modelRamUsage: number;
|
||||
contextRamUsage: number;
|
||||
totalRamUsage: number;
|
||||
modelVramUsage: number;
|
||||
contextVramUsage: number;
|
||||
totalVramUsage: number;
|
||||
};
|
||||
}>;
|
||||
resolveModelGpuLayers(gpuLayers?: LlamaModelOptions["gpuLayers"], { ignoreMemorySafetyChecks, getVramState, llamaVramPaddingSize, llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, defaultContextSwaFullCache, useMmap }?: {
|
||||
ignoreMemorySafetyChecks?: boolean;
|
||||
getVramState?(): Promise<{
|
||||
total: number;
|
||||
free: number;
|
||||
}>;
|
||||
llamaVramPaddingSize?: number;
|
||||
llamaGpu?: BuildGpu;
|
||||
llamaSupportsGpuOffloading?: boolean;
|
||||
defaultContextFlashAttention?: boolean;
|
||||
defaultContextSwaFullCache?: boolean;
|
||||
useMmap?: boolean;
|
||||
}): Promise<number>;
|
||||
/**
|
||||
* Resolve a context size option for the given options and constraints.
|
||||
*
|
||||
* If there's no context size that can fit the available resources, an `InsufficientMemoryError` is thrown.
|
||||
*/
|
||||
resolveContextContextSize(contextSize: LlamaContextOptions["contextSize"], { modelGpuLayers, batchSize, modelTrainContextSize, flashAttention, swaFullCache, getVramState, getRamState, getSwapState, llamaGpu, ignoreMemorySafetyChecks, isEmbeddingContext, sequences }: {
|
||||
modelGpuLayers: number;
|
||||
modelTrainContextSize: number;
|
||||
flashAttention?: boolean;
|
||||
swaFullCache?: boolean;
|
||||
batchSize?: LlamaContextOptions["batchSize"];
|
||||
sequences?: number;
|
||||
getVramState?(): Promise<{
|
||||
total: number;
|
||||
free: number;
|
||||
unifiedSize: number;
|
||||
}>;
|
||||
getRamState?(): Promise<{
|
||||
total: number;
|
||||
free: number;
|
||||
}>;
|
||||
getSwapState?(): Promise<{
|
||||
total: number;
|
||||
free: number;
|
||||
}>;
|
||||
llamaGpu?: BuildGpu;
|
||||
ignoreMemorySafetyChecks?: boolean;
|
||||
isEmbeddingContext?: boolean;
|
||||
}): Promise<number>;
|
||||
}
|
||||
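A minimal sketch of how the resolver declared above is typically reached, via `GgufInsights.configurationResolver`. The imports are assumed to come from the package root, and the model path and target values are hypothetical.

import { getLlama, readGgufFileInfo, GgufInsights } from "node-llama-cpp";

const llama = await getLlama();
const ggufFileInfo = await readGgufFileInfo("./models/my-model.Q4_K_M.gguf"); // hypothetical path
const insights = await GgufInsights.from(ggufFileInfo, llama);

// score how well this model fits the current machine with an 8192-token context
const result = await insights.configurationResolver.resolveAndScoreConfig({
    targetContextSize: 8192,
    flashAttention: false
});

console.log("Compatibility score:", result.compatibilityScore); // between 0 and 1
console.log("Resolved GPU layers:", result.resolvedValues.gpuLayers);
console.log("Resolved context size:", result.resolvedValues.contextSize);
console.log("Estimated total VRAM usage (bytes):", result.resolvedValues.totalVramUsage);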
272
node_modules/node-llama-cpp/dist/gguf/insights/GgufInsightsConfigurationResolver.js
generated
vendored
Normal file
@@ -0,0 +1,272 @@
|
||||
import { getDefaultContextSequences } from "../../evaluator/LlamaContext/LlamaContext.js";
|
||||
import { InsufficientMemoryError } from "../../utils/InsufficientMemoryError.js";
|
||||
import { resolveModelGpuLayersOption } from "./utils/resolveModelGpuLayersOption.js";
|
||||
import { resolveContextContextSizeOption } from "./utils/resolveContextContextSizeOption.js";
|
||||
import { scoreLevels } from "./utils/scoreLevels.js";
|
||||
import { getRamUsageFromUnifiedVram } from "./utils/getRamUsageFromUnifiedVram.js";
|
||||
export const defaultTrainContextSizeForEstimationPurposes = 4096;
|
||||
const defaultContextSizeForUnfitContextSizeConfiguration = 2048;
|
||||
export class GgufInsightsConfigurationResolver {
|
||||
/** @internal */ _ggufInsights;
|
||||
constructor(ggufInsights) {
|
||||
this._ggufInsights = ggufInsights;
|
||||
}
|
||||
get ggufInsights() {
|
||||
return this._ggufInsights;
|
||||
}
|
||||
/**
|
||||
* Resolve the best configuration for loading a model and creating a context using the current hardware.
|
||||
*
|
||||
* Specifying a `targetGpuLayers` and/or `targetContextSize` will ensure the resolved configuration matches those values,
|
||||
* but note it can lower the compatibility score if the hardware doesn't support it.
|
||||
*
|
||||
* Overriding hardware values is possible by configuring `hardwareOverrides`.
|
||||
* @param options
|
||||
* @param hardwareOverrides
|
||||
*/
|
||||
async resolveAndScoreConfig({ targetGpuLayers, targetContextSize, embeddingContext = false, flashAttention = false, swaFullCache = false, useMmap = this._ggufInsights._llama.supportsMmap } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading } = {}) {
|
||||
const compatibilityScore = await this.scoreModelConfigurationCompatibility({
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
contextSize: targetContextSize,
|
||||
embeddingContext,
|
||||
forceGpuLayers: targetGpuLayers,
|
||||
forceStrictContextSize: targetContextSize != null,
|
||||
useMmap
|
||||
}, {
|
||||
getVramState,
|
||||
getRamState,
|
||||
getSwapState,
|
||||
llamaVramPaddingSize,
|
||||
llamaGpu,
|
||||
llamaSupportsGpuOffloading
|
||||
});
|
||||
return compatibilityScore;
|
||||
}
|
||||
/**
|
||||
* Score the compatibility of the model configuration with the current GPU and VRAM state.
|
||||
* Assumes a model is loaded with the default `"auto"` configurations.
|
||||
* Scored based on the following criteria:
|
||||
* - The number of GPU layers that can be offloaded to the GPU (only if there's a GPU. If there's no GPU then by how small the model is)
|
||||
* - Whether all layers can be offloaded to the GPU (gives additional points)
|
||||
* - Whether the resolved context size is at least as large as the specified `contextSize`
|
||||
*
|
||||
* If the resolved context size is larger than the specified context size, for each multiplier of the specified `contextSize`
|
||||
* that the resolved context size is larger by, 1 bonus point is given in the `bonusScore`.
|
||||
*
|
||||
* `maximumFittedContextSizeMultiplier` is used to improve the proportionality of the bonus score between models.
|
||||
* Set this to any value higher than `<max compared model context size> / contextSize`.
|
||||
* Defaults to `100`.
|
||||
*
|
||||
* `maximumUnfitConfigurationResourceMultiplier` is used to improve the proportionality of the bonus score between unfit models.
|
||||
* Set this to any value higher than `<max compared model resource usage> / <total available resources>`.
|
||||
* Defaults to `100`.
|
||||
*
|
||||
* `contextSize` defaults to `4096` (if the model train context size is lower than this, the model train context size is used instead).
|
||||
*/
|
||||
async scoreModelConfigurationCompatibility({ contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 4096), embeddingContext = false, flashAttention = false, swaFullCache = false, maximumFittedContextSizeMultiplier = 100, maximumUnfitConfigurationResourceMultiplier = 100, forceStrictContextSize = false, forceGpuLayers, useMmap = this._ggufInsights._llama.supportsMmap } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading } = {}) {
|
||||
const [vramState, ramState, swapState] = await Promise.all([
|
||||
getVramState(),
|
||||
getRamState(),
|
||||
getSwapState()
|
||||
]);
|
||||
let resolvedGpuLayers = (forceGpuLayers == null || forceGpuLayers == "max")
|
||||
? this.ggufInsights.totalLayers
|
||||
: forceGpuLayers;
|
||||
let gpuLayersFitMemory = false;
|
||||
try {
|
||||
resolvedGpuLayers = await this.resolveModelGpuLayers(forceGpuLayers != null
|
||||
? forceGpuLayers
|
||||
: embeddingContext
|
||||
? {
|
||||
fitContext: {
|
||||
embeddingContext: true,
|
||||
contextSize: forceStrictContextSize
|
||||
? contextSize
|
||||
: undefined
|
||||
}
|
||||
}
|
||||
: forceStrictContextSize != null
|
||||
? { fitContext: { contextSize } }
|
||||
: "auto", {
|
||||
getVramState: async () => vramState,
|
||||
llamaVramPaddingSize,
|
||||
llamaGpu,
|
||||
llamaSupportsGpuOffloading,
|
||||
defaultContextFlashAttention: flashAttention,
|
||||
defaultContextSwaFullCache: swaFullCache,
|
||||
ignoreMemorySafetyChecks: forceGpuLayers != null,
|
||||
useMmap
|
||||
});
|
||||
gpuLayersFitMemory = true;
|
||||
}
|
||||
catch (err) {
|
||||
if (!(err instanceof InsufficientMemoryError))
|
||||
throw err;
|
||||
}
|
||||
const canUseGpu = llamaSupportsGpuOffloading && llamaGpu !== false;
|
||||
const estimatedModelResourceUsage = this._ggufInsights.estimateModelResourceRequirements({
|
||||
gpuLayers: resolvedGpuLayers,
|
||||
useMmap
|
||||
});
|
||||
let resolvedContextSize = forceStrictContextSize
|
||||
? contextSize
|
||||
: Math.min(this.ggufInsights.trainContextSize ?? defaultContextSizeForUnfitContextSizeConfiguration, defaultContextSizeForUnfitContextSizeConfiguration);
|
||||
let contextFitsMemory = false;
|
||||
try {
|
||||
resolvedContextSize = await this.resolveContextContextSize("auto", {
|
||||
getVramState: async () => ({
|
||||
total: vramState.total,
|
||||
free: Math.max(0, vramState.free - estimatedModelResourceUsage.gpuVram),
|
||||
unifiedSize: vramState.unifiedSize
|
||||
}),
|
||||
getRamState: async () => ({
|
||||
total: ramState.total,
|
||||
free: Math.max(0, ramState.free - estimatedModelResourceUsage.cpuRam +
|
||||
(-getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState)))
|
||||
}),
|
||||
getSwapState: async () => ({
|
||||
total: swapState.total,
|
||||
free: Math.max(0, swapState.free - Math.max(0, estimatedModelResourceUsage.cpuRam +
|
||||
(-getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState)) +
|
||||
(-ramState.free)))
|
||||
}),
|
||||
llamaGpu,
|
||||
isEmbeddingContext: embeddingContext,
|
||||
modelGpuLayers: resolvedGpuLayers,
|
||||
modelTrainContextSize: this._ggufInsights.trainContextSize ?? defaultTrainContextSizeForEstimationPurposes,
|
||||
ignoreMemorySafetyChecks: forceStrictContextSize,
|
||||
flashAttention,
|
||||
swaFullCache
|
||||
});
|
||||
contextFitsMemory = true;
|
||||
if (forceStrictContextSize && resolvedContextSize < contextSize) {
|
||||
contextFitsMemory = false;
|
||||
resolvedContextSize = contextSize;
|
||||
}
|
||||
else if (forceStrictContextSize && resolvedContextSize > contextSize) {
|
||||
resolvedContextSize = contextSize;
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
if (!(err instanceof InsufficientMemoryError))
|
||||
throw err;
|
||||
}
|
||||
const estimatedContextResourceUsage = this._ggufInsights.estimateContextResourceRequirements({
|
||||
contextSize: resolvedContextSize,
|
||||
isEmbeddingContext: embeddingContext,
|
||||
modelGpuLayers: resolvedGpuLayers,
|
||||
flashAttention,
|
||||
swaFullCache
|
||||
});
|
||||
const rankPoints = {
|
||||
gpuLayers: 60,
|
||||
allLayersAreOffloaded: 10,
|
||||
contextSize: 30,
|
||||
ramUsageFitsInRam: 10,
|
||||
cpuOnlySmallModelSize: 70, // also defined inside `scoreModelSizeForCpuOnlyUsage`
|
||||
bonusContextSize: 10
|
||||
};
|
||||
const gpuLayersPoints = rankPoints.gpuLayers * Math.min(1, resolvedGpuLayers / this._ggufInsights.totalLayers);
|
||||
const allLayersAreOffloadedPoints = rankPoints.allLayersAreOffloaded * (resolvedGpuLayers === this._ggufInsights.totalLayers ? 1 : 0);
|
||||
const contextSizePoints = contextFitsMemory
|
||||
? rankPoints.contextSize * Math.min(1, resolvedContextSize / contextSize)
|
||||
: 0;
|
||||
const ramUsageFitsInRamPoints = rankPoints.ramUsageFitsInRam * (estimatedModelResourceUsage.cpuRam <= ramState.free
|
||||
? 1
|
||||
: estimatedModelResourceUsage.cpuRam <= ramState.free + swapState.free
|
||||
? 0.8
|
||||
: estimatedModelResourceUsage.cpuRam <= ramState.total
|
||||
? 0.5
|
||||
: (0.5 - Math.min(0.5, 0.5 * ((estimatedModelResourceUsage.cpuRam - ramState.total) / ramState.total))));
|
||||
const bonusContextSizePoints = contextFitsMemory
|
||||
? (10 * Math.min(1, (Math.max(0, resolvedContextSize - contextSize) / contextSize) / maximumFittedContextSizeMultiplier))
|
||||
: 0;
|
||||
let compatibilityScore = canUseGpu
|
||||
? ((gpuLayersPoints + allLayersAreOffloadedPoints + contextSizePoints + ramUsageFitsInRamPoints) /
|
||||
(rankPoints.gpuLayers + rankPoints.allLayersAreOffloaded + rankPoints.contextSize + rankPoints.ramUsageFitsInRam))
|
||||
: ((contextSizePoints + ramUsageFitsInRamPoints + scoreModelSizeForCpuOnlyUsage(this._ggufInsights.modelSize)) /
|
||||
(rankPoints.contextSize + rankPoints.ramUsageFitsInRam + rankPoints.cpuOnlySmallModelSize));
|
||||
let bonusScore = bonusContextSizePoints / rankPoints.bonusContextSize;
|
||||
if (!gpuLayersFitMemory || !contextFitsMemory ||
|
||||
estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram > vramState.total ||
|
||||
estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam > ramState.total + swapState.total) {
|
||||
const totalVramRequirement = estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram;
|
||||
const totalRamRequirement = estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam;
|
||||
compatibilityScore = 0;
|
||||
bonusScore = ((1 - (totalVramRequirement / (vramState.total * maximumUnfitConfigurationResourceMultiplier))) +
|
||||
(1 - (totalRamRequirement / ((ramState.total + swapState.total) * maximumUnfitConfigurationResourceMultiplier)))) / 2;
|
||||
}
|
||||
return {
|
||||
compatibilityScore,
|
||||
bonusScore,
|
||||
totalScore: compatibilityScore + bonusScore,
|
||||
resolvedValues: {
|
||||
gpuLayers: resolvedGpuLayers,
|
||||
contextSize: resolvedContextSize,
|
||||
modelRamUsage: estimatedModelResourceUsage.cpuRam,
|
||||
contextRamUsage: estimatedContextResourceUsage.cpuRam,
|
||||
totalRamUsage: estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam,
|
||||
modelVramUsage: estimatedModelResourceUsage.gpuVram,
|
||||
contextVramUsage: estimatedContextResourceUsage.gpuVram,
|
||||
totalVramUsage: estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram
|
||||
}
|
||||
};
|
||||
}
|
||||
async resolveModelGpuLayers(gpuLayers, { ignoreMemorySafetyChecks = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, defaultContextFlashAttention = false, defaultContextSwaFullCache = false, useMmap = this._ggufInsights._llama.supportsMmap } = {}) {
|
||||
return resolveModelGpuLayersOption(gpuLayers, {
|
||||
ggufInsights: this._ggufInsights,
|
||||
ignoreMemorySafetyChecks,
|
||||
getVramState,
|
||||
llamaVramPaddingSize,
|
||||
llamaGpu,
|
||||
llamaSupportsGpuOffloading,
|
||||
defaultContextFlashAttention,
|
||||
defaultContextSwaFullCache,
|
||||
useMmap
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Resolve a context size option for the given options and constraints.
|
||||
*
|
||||
* If there's no context size that can fit the available resources, an `InsufficientMemoryError` is thrown.
|
||||
*/
|
||||
async resolveContextContextSize(contextSize, { modelGpuLayers, batchSize, modelTrainContextSize, flashAttention = false, swaFullCache = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), llamaGpu = this._ggufInsights._llama.gpu, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, sequences = getDefaultContextSequences() }) {
|
||||
return await resolveContextContextSizeOption({
|
||||
contextSize,
|
||||
batchSize,
|
||||
sequences,
|
||||
modelFileInsights: this._ggufInsights,
|
||||
modelGpuLayers,
|
||||
modelTrainContextSize,
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
getVramState,
|
||||
getRamState,
|
||||
getSwapState,
|
||||
llamaGpu,
|
||||
ignoreMemorySafetyChecks,
|
||||
isEmbeddingContext
|
||||
});
|
||||
}
|
||||
/** @internal */
|
||||
static _create(ggufInsights) {
|
||||
return new GgufInsightsConfigurationResolver(ggufInsights);
|
||||
}
|
||||
}
|
||||
function scoreModelSizeForCpuOnlyUsage(modelSize) {
|
||||
const s1GB = Math.pow(1024, 3);
|
||||
return 70 - scoreLevels(modelSize, [{
|
||||
start: s1GB,
|
||||
end: s1GB * 2.5,
|
||||
points: 46
|
||||
}, {
|
||||
start: s1GB * 2.5,
|
||||
end: s1GB * 4,
|
||||
points: 17
|
||||
}, {
|
||||
start: s1GB * 4,
|
||||
points: 7
|
||||
}]);
|
||||
}
|
||||
//# sourceMappingURL=GgufInsightsConfigurationResolver.js.map
|
||||
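For orientation, the GPU branch of the compatibility score above can be reproduced by hand. This sketch recomputes `compatibilityScore` for a hypothetical machine using the same `rankPoints` weights (60/10/30/10) from `scoreModelConfigurationCompatibility`; the layer and context numbers are made up.

// hypothetical inputs mirroring the GPU branch of the scoring formula above
const rankPoints = { gpuLayers: 60, allLayersAreOffloaded: 10, contextSize: 30, ramUsageFitsInRam: 10 };
const totalLayers = 33;
const resolvedGpuLayers = 33; // all layers offloaded
const requestedContextSize = 4096;
const resolvedContextSize = 4096; // the requested context fits
const ramFitsFactor = 1; // the model's CPU RAM usage fits in free RAM

const gpuLayersPoints = rankPoints.gpuLayers * Math.min(1, resolvedGpuLayers / totalLayers); // 60
const allOffloadedPoints = rankPoints.allLayersAreOffloaded * (resolvedGpuLayers === totalLayers ? 1 : 0); // 10
const contextSizePoints = rankPoints.contextSize * Math.min(1, resolvedContextSize / requestedContextSize); // 30
const ramPoints = rankPoints.ramUsageFitsInRam * ramFitsFactor; // 10

const compatibilityScore = (gpuLayersPoints + allOffloadedPoints + contextSizePoints + ramPoints) /
    (rankPoints.gpuLayers + rankPoints.allLayersAreOffloaded + rankPoints.contextSize + rankPoints.ramUsageFitsInRam);
console.log(compatibilityScore); // 1 for this perfect fit; partial offloading or a smaller context lowers it proportionally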
1
node_modules/node-llama-cpp/dist/gguf/insights/GgufInsightsConfigurationResolver.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
5
node_modules/node-llama-cpp/dist/gguf/insights/GgufInsightsTokens.d.ts
generated
vendored
Normal file
5
node_modules/node-llama-cpp/dist/gguf/insights/GgufInsightsTokens.d.ts
generated
vendored
Normal file
@@ -0,0 +1,5 @@
export declare class GgufInsightsTokens {
    private constructor();
    get sepToken(): number | null;
    get eosToken(): number | null;
}
40
node_modules/node-llama-cpp/dist/gguf/insights/GgufInsightsTokens.js
generated
vendored
Normal file
40
node_modules/node-llama-cpp/dist/gguf/insights/GgufInsightsTokens.js
generated
vendored
Normal file
@@ -0,0 +1,40 @@
export class GgufInsightsTokens {
    /** @internal */ _ggufInsights;
    constructor(ggufInsights) {
        this._ggufInsights = ggufInsights;
    }
    get sepToken() {
        const tokenizerModel = this._ggufInsights._ggufFileInfo?.metadata?.tokenizer?.ggml?.model;
        const totalTokens = this._ggufInsights._ggufFileInfo?.metadata?.tokenizer?.ggml?.tokens?.length;
        let sepTokenId = this._ggufInsights._ggufFileInfo?.metadata?.tokenizer?.ggml?.["seperator_token_id"];
        if (sepTokenId == null && tokenizerModel === "bert") {
            sepTokenId = 102; // source: `llama_vocab::impl::load` in `llama-vocab.cpp`
        }
        if (totalTokens != null && sepTokenId != null && sepTokenId >= totalTokens)
            return null;
        return sepTokenId ?? null;
    }
    get eosToken() {
        const tokenizerModel = this._ggufInsights._ggufFileInfo?.metadata?.tokenizer?.ggml?.model;
        const totalTokens = this._ggufInsights._ggufFileInfo?.metadata?.tokenizer?.ggml?.tokens?.length;
        const eosTokenId = this._ggufInsights._ggufFileInfo?.metadata?.tokenizer?.ggml?.["eos_token_id"];
        if (eosTokenId != null && totalTokens != null && eosTokenId < totalTokens)
            return eosTokenId;
        switch (tokenizerModel) {
            case "no_vocab": return null;
            case "none": return null;
            case "bert": return null;
            case "rwkv": return null;
            case "llama": return 2;
            case "gpt2": return 11;
            case "t5": return 1;
            case "plamo2": return 2;
        }
        return 2; // source: `llama_vocab::impl::load` in `llama-vocab.cpp`
    }
    /** @internal */
    static _create(ggufInsights) {
        return new GgufInsightsTokens(ggufInsights);
    }
}
//# sourceMappingURL=GgufInsightsTokens.js.map
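The two getters above encode llama.cpp's fallback defaults for models whose GGUF metadata omits the token IDs. A brief, hypothetical usage sketch, assuming an existing `GgufInsights` instance named `insights` as in the earlier sketch:

// Both getters return `null` when the model genuinely has no such token.
const sepToken = insights.tokens.sepToken; // e.g. falls back to 102 for BERT-style models without explicit metadata
const eosToken = insights.tokens.eosToken; // falls back per tokenizer model ("llama" -> 2, "gpt2" -> 11, "t5" -> 1, ...)

if (eosToken == null)
    console.warn("Model declares no EOS token; generation may not stop on its own.");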
1
node_modules/node-llama-cpp/dist/gguf/insights/GgufInsightsTokens.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/gguf/insights/GgufInsightsTokens.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
{"version":3,"file":"GgufInsightsTokens.js","sourceRoot":"","sources":["../../../src/gguf/insights/GgufInsightsTokens.ts"],"names":[],"mappings":"AAGA,MAAM,OAAO,kBAAkB;IAC3B,gBAAgB,CAAkB,aAAa,CAAe;IAE9D,YAAoB,YAA0B;QAC1C,IAAI,CAAC,aAAa,GAAG,YAAY,CAAC;IACtC,CAAC;IAED,IAAW,QAAQ;QACf,MAAM,cAAc,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,CAAC;QAC1F,MAAM,WAAW,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC;QAEhG,IAAI,UAAU,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,oBAAoB,CAAC,CAAC;QACrG,IAAI,UAAU,IAAI,IAAI,IAAI,cAAc,KAAK,MAAM,EAAE,CAAC;YAClD,UAAU,GAAG,GAAG,CAAC,CAAC,yDAAyD;QAC/E,CAAC;QAED,IAAI,WAAW,IAAI,IAAI,IAAI,UAAU,IAAI,IAAI,IAAI,UAAU,IAAI,WAAW;YACtE,OAAO,IAAI,CAAC;QAEhB,OAAO,UAAU,IAAI,IAAI,CAAC;IAC9B,CAAC;IAED,IAAW,QAAQ;QACf,MAAM,cAAc,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,CAAC;QAC1F,MAAM,WAAW,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC;QAEhG,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,cAAc,CAAC,CAAC;QACjG,IAAI,UAAU,IAAI,IAAI,IAAI,WAAW,IAAI,IAAI,IAAI,UAAU,GAAG,WAAW;YACrE,OAAO,UAAU,CAAC;QAEtB,QAAQ,cAAc,EAAE,CAAC;YACrB,KAAK,UAAU,CAAC,CAAC,OAAO,IAAI,CAAC;YAC7B,KAAK,MAAM,CAAC,CAAC,OAAO,IAAI,CAAC;YACzB,KAAK,MAAM,CAAC,CAAC,OAAO,IAAI,CAAC;YACzB,KAAK,MAAM,CAAC,CAAC,OAAO,IAAI,CAAC;YACzB,KAAK,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC;YACvB,KAAK,MAAM,CAAC,CAAC,OAAO,EAAE,CAAC;YACvB,KAAK,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC;YACpB,KAAK,QAAQ,CAAC,CAAC,OAAO,CAAC,CAAC;QAC5B,CAAC;QACD,OAAO,CAAC,CAAC,CAAC,yDAAyD;IACvE,CAAC;IAED,gBAAgB;IACT,MAAM,CAAC,OAAO,CAAC,YAA0B;QAC5C,OAAO,IAAI,kBAAkB,CAAC,YAAY,CAAC,CAAC;IAChD,CAAC;CACJ"}
5
node_modules/node-llama-cpp/dist/gguf/insights/utils/getRamUsageFromUnifiedVram.d.ts
generated
vendored
Normal file
5
node_modules/node-llama-cpp/dist/gguf/insights/utils/getRamUsageFromUnifiedVram.d.ts
generated
vendored
Normal file
@@ -0,0 +1,5 @@
export declare function getRamUsageFromUnifiedVram(vramUsage: number, vramState: {
    total: number;
    free: number;
    unifiedSize: number;
}): number;
7
node_modules/node-llama-cpp/dist/gguf/insights/utils/getRamUsageFromUnifiedVram.js
generated
vendored
Normal file
7
node_modules/node-llama-cpp/dist/gguf/insights/utils/getRamUsageFromUnifiedVram.js
generated
vendored
Normal file
@@ -0,0 +1,7 @@
export function getRamUsageFromUnifiedVram(vramUsage, vramState) {
    const onlyVramSize = vramState.total - vramState.unifiedSize;
    const existingUsage = Math.max(0, vramState.total - vramState.free);
    const unifiedRamUsage = Math.min(vramState.unifiedSize, Math.max(0, vramUsage - Math.max(0, onlyVramSize - existingUsage)));
    return unifiedRamUsage;
}
//# sourceMappingURL=getRamUsageFromUnifiedVram.js.map
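The helper above estimates how much of a VRAM allocation is actually backed by system RAM on unified-memory machines: only the part that cannot be satisfied by the dedicated (non-unified) portion of VRAM is attributed to unified memory. A worked example with made-up numbers:

// Made-up figures: 16 GiB of VRAM, all of it unified, 10 GiB currently free.
// A 4 GiB allocation has no dedicated-only VRAM to land in, so all of it counts as RAM usage.
const GiB = Math.pow(1024, 3);
const usage = getRamUsageFromUnifiedVram(4 * GiB, {
    total: 16 * GiB,
    free: 10 * GiB,
    unifiedSize: 16 * GiB
});
console.log(usage / GiB); // 4 - the whole allocation is backed by unified (shared) memory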
1
node_modules/node-llama-cpp/dist/gguf/insights/utils/getRamUsageFromUnifiedVram.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/gguf/insights/utils/getRamUsageFromUnifiedVram.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
{"version":3,"file":"getRamUsageFromUnifiedVram.js","sourceRoot":"","sources":["../../../../src/gguf/insights/utils/getRamUsageFromUnifiedVram.ts"],"names":[],"mappings":"AAAA,MAAM,UAAU,0BAA0B,CAAC,SAAiB,EAAE,SAA6D;IACvH,MAAM,YAAY,GAAG,SAAS,CAAC,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC;IAC7D,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,CAAC,KAAK,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAEpE,MAAM,eAAe,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,WAAW,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,YAAY,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;IAE5H,OAAO,eAAe,CAAC;AAC3B,CAAC"}
30
node_modules/node-llama-cpp/dist/gguf/insights/utils/resolveContextContextSizeOption.d.ts
generated
vendored
Normal file
30
node_modules/node-llama-cpp/dist/gguf/insights/utils/resolveContextContextSizeOption.d.ts
generated
vendored
Normal file
@@ -0,0 +1,30 @@
import { LlamaContextOptions } from "../../../evaluator/LlamaContext/types.js";
import { GgufInsights } from "../GgufInsights.js";
import { BuildGpu } from "../../../bindings/types.js";
export declare function resolveContextContextSizeOption({ contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, swaFullCache, getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks, isEmbeddingContext, maxContextSizeSwapUse }: {
    contextSize?: LlamaContextOptions["contextSize"];
    batchSize?: LlamaContextOptions["batchSize"];
    sequences: number;
    modelFileInsights: GgufInsights;
    modelGpuLayers: number;
    modelTrainContextSize: number;
    flashAttention: boolean;
    swaFullCache: boolean;
    getVramState(): Promise<{
        total: number;
        free: number;
        unifiedSize: number;
    }>;
    getRamState(): Promise<{
        total: number;
        free: number;
    }>;
    getSwapState(): Promise<{
        total: number;
        free: number;
    }>;
    llamaGpu: BuildGpu;
    ignoreMemorySafetyChecks?: boolean;
    isEmbeddingContext?: boolean;
    maxContextSizeSwapUse?: number;
}): Promise<number>;
111
node_modules/node-llama-cpp/dist/gguf/insights/utils/resolveContextContextSizeOption.js
generated
vendored
Normal file
111
node_modules/node-llama-cpp/dist/gguf/insights/utils/resolveContextContextSizeOption.js
generated
vendored
Normal file
@@ -0,0 +1,111 @@
import { minAllowedContextSizeInCalculations } from "../../../config.js";
import { getDefaultContextBatchSize, getDefaultModelContextSize } from "../../../evaluator/LlamaContext/LlamaContext.js";
import { InsufficientMemoryError } from "../../../utils/InsufficientMemoryError.js";
import { getRamUsageFromUnifiedVram } from "./getRamUsageFromUnifiedVram.js";
const defaultMaxContextSizeSwapUse = 2048;
export async function resolveContextContextSizeOption({ contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, swaFullCache, getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, maxContextSizeSwapUse = defaultMaxContextSizeSwapUse }) {
    if (contextSize == null)
        contextSize = "auto";
    if (typeof contextSize === "number") {
        const resolvedContextSize = Math.max(1, Math.floor(contextSize));
        if (ignoreMemorySafetyChecks)
            return resolvedContextSize;
        const [vramState, ramState, swapState] = await Promise.all([
            getVramState(),
            getRamState(),
            getSwapState()
        ]);
        const contextResourceRequirements = modelFileInsights.estimateContextResourceRequirements({
            contextSize: resolvedContextSize,
            batchSize: batchSize ?? getDefaultContextBatchSize({ contextSize: resolvedContextSize, sequences }),
            modelGpuLayers: modelGpuLayers,
            sequences,
            flashAttention,
            swaFullCache,
            isEmbeddingContext
        });
        if (contextResourceRequirements.gpuVram > vramState.free)
            throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`);
        else if (contextResourceRequirements.cpuRam > (ramState.free + swapState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState)))
            throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}`);
        return resolvedContextSize;
    }
    else if (contextSize === "auto" || typeof contextSize === "object") {
        const [vramState, ramState, swapState] = await Promise.all([
            getVramState(),
            getRamState(),
            getSwapState()
        ]);
        const maxContextSize = contextSize === "auto"
            ? getDefaultModelContextSize({ trainContextSize: modelTrainContextSize })
            : Math.min(contextSize.max ?? getDefaultModelContextSize({ trainContextSize: modelTrainContextSize }), getDefaultModelContextSize({ trainContextSize: modelTrainContextSize }));
        const minContextSize = contextSize === "auto"
            ? minAllowedContextSizeInCalculations
            : Math.max(contextSize.min ?? minAllowedContextSizeInCalculations, minAllowedContextSizeInCalculations);
        let highestCompatibleContextSize = null;
        let step = -Math.max(1, Math.floor((maxContextSize - minContextSize) / 4));
        for (let testContextSize = maxContextSize; testContextSize >= minContextSize && testContextSize <= maxContextSize;) {
            const contextResourceRequirements = modelFileInsights.estimateContextResourceRequirements({
                contextSize: testContextSize,
                batchSize: batchSize ?? getDefaultContextBatchSize({ contextSize: testContextSize, sequences }),
                modelGpuLayers: modelGpuLayers,
                sequences,
                flashAttention,
                swaFullCache,
                isEmbeddingContext
            });
            if (contextResourceRequirements.gpuVram <= vramState.free &&
                contextResourceRequirements.cpuRam <= (ramState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) + (testContextSize <= maxContextSizeSwapUse
                    ? swapState.free
                    : 0))) {
                if (highestCompatibleContextSize == null || testContextSize >= highestCompatibleContextSize) {
                    highestCompatibleContextSize = testContextSize;
                    if (step === -1)
                        break;
                    else if (step < 0)
                        step = Math.max(1, Math.floor(-step / 2));
                }
            }
            else if (step > 0)
                step = -Math.max(1, Math.floor(step / 2));
            if (testContextSize == minContextSize && step === -1)
                break;
            testContextSize += step;
            if (testContextSize < minContextSize) {
                testContextSize = minContextSize;
                step = Math.max(1, Math.floor(Math.abs(step) / 2));
            }
            else if (testContextSize > maxContextSize) {
                testContextSize = maxContextSize;
                step = -Math.max(1, Math.floor(Math.abs(step) / 2));
            }
        }
        if (highestCompatibleContextSize != null)
            return highestCompatibleContextSize;
        if (ignoreMemorySafetyChecks)
            return minContextSize;
        const minContextSizeResourceRequirements = modelFileInsights.estimateContextResourceRequirements({
            contextSize: minContextSize,
            batchSize: batchSize ?? getDefaultContextBatchSize({ contextSize: minContextSize, sequences }),
            modelGpuLayers: modelGpuLayers,
            sequences,
            flashAttention,
            swaFullCache,
            isEmbeddingContext
        });
        const unifiedRamUsage = getRamUsageFromUnifiedVram(minContextSizeResourceRequirements.gpuVram, vramState);
        if (minContextSizeResourceRequirements.gpuVram > vramState.free &&
            minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage)
            throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM and RAM${swapState.total > 0 ? " (including swap)" : ""}`);
        else if (minContextSizeResourceRequirements.gpuVram > vramState.free)
            throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`);
        else if (minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage)
            throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}`);
        else if (minContextSizeResourceRequirements.cpuRam > ramState.free - unifiedRamUsage)
            throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM`);
        else
            throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available resources`);
    }
    throw new Error(`Invalid context size: "${contextSize}"`);
}
//# sourceMappingURL=resolveContextContextSizeOption.js.map
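The `"auto"`/object branch above searches for the largest context size whose estimated memory still fits, using a step that starts coarse and halves whenever it overshoots. Under the assumption that the memory estimate grows monotonically with the context size, the same outcome can be pictured as a plain bisection; the sketch below is illustrative only and not part of the package:

// Illustrative sketch: find the largest integer in [min, max] for which `fits` is true,
// assuming `fits` is monotone (true up to some threshold, false beyond it).
function largestFitting(min: number, max: number, fits: (value: number) => boolean): number | null {
    if (!fits(min))
        return null; // even the minimum does not fit
    let low = min;   // known to fit
    let high = max;  // may or may not fit
    while (low < high) {
        const mid = Math.ceil((low + high) / 2); // bias upward so the loop always makes progress
        if (fits(mid))
            low = mid;
        else
            high = mid - 1;
    }
    return low;
}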
1
node_modules/node-llama-cpp/dist/gguf/insights/utils/resolveContextContextSizeOption.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/gguf/insights/utils/resolveContextContextSizeOption.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
17
node_modules/node-llama-cpp/dist/gguf/insights/utils/resolveModelGpuLayersOption.d.ts
generated
vendored
Normal file
17
node_modules/node-llama-cpp/dist/gguf/insights/utils/resolveModelGpuLayersOption.d.ts
generated
vendored
Normal file
@@ -0,0 +1,17 @@
import { LlamaModelOptions } from "../../../evaluator/LlamaModel/LlamaModel.js";
import { BuildGpu } from "../../../bindings/types.js";
import type { GgufInsights } from "../GgufInsights.js";
export declare function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], { ggufInsights, ignoreMemorySafetyChecks, getVramState, llamaVramPaddingSize, llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, defaultContextSwaFullCache, useMmap }: {
    ggufInsights: GgufInsights;
    ignoreMemorySafetyChecks?: boolean;
    getVramState(): Promise<{
        total: number;
        free: number;
    }>;
    llamaVramPaddingSize: number;
    llamaGpu: BuildGpu;
    llamaSupportsGpuOffloading: boolean;
    defaultContextFlashAttention: boolean;
    defaultContextSwaFullCache: boolean;
    useMmap?: boolean;
}): Promise<number>;
239
node_modules/node-llama-cpp/dist/gguf/insights/utils/resolveModelGpuLayersOption.js
generated
vendored
Normal file
239
node_modules/node-llama-cpp/dist/gguf/insights/utils/resolveModelGpuLayersOption.js
generated
vendored
Normal file
@@ -0,0 +1,239 @@
import { InsufficientMemoryError } from "../../../utils/InsufficientMemoryError.js";
import { findBestOption } from "../../../utils/findBestOption.js";
import { getDefaultContextBatchSize, getDefaultModelContextSize } from "../../../evaluator/LlamaContext/LlamaContext.js";
import { minAllowedContextSizeInCalculations } from "../../../config.js";
import { scoreLevels } from "./scoreLevels.js";
const fitContextExtraMemoryPaddingPercentage = 0.5;
export async function resolveModelGpuLayersOption(gpuLayers, { ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, defaultContextSwaFullCache, useMmap }) {
    if (gpuLayers == null)
        gpuLayers = "auto";
    if (!llamaSupportsGpuOffloading)
        return 0;
    if (gpuLayers === "max" || typeof gpuLayers === "number") {
        const resolvedGpuLayers = typeof gpuLayers === "number"
            ? Math.max(0, Math.min(ggufInsights.totalLayers, gpuLayers))
            : ggufInsights.totalLayers;
        if (ignoreMemorySafetyChecks)
            return resolvedGpuLayers;
        const vramState = await getVramState();
        const maxLayersRequirements = getVramRequiredForGpuLayers({
            gpuLayers: resolvedGpuLayers,
            ggufInsights,
            currentVram: vramState.free,
            defaultContextFlashAttention,
            defaultContextSwaFullCache,
            useMmap
        });
        if (maxLayersRequirements == null)
            throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings");
        return resolvedGpuLayers;
    }
    else if (gpuLayers === "auto" || typeof gpuLayers === "object") {
        if (llamaGpu === false)
            return 0;
        const vramState = await getVramState();
        if (vramState.total === 0)
            return 0;
        let freeVram = vramState.free;
        if (typeof gpuLayers === "object" && gpuLayers.fitContext?.contextSize != null) {
            freeVram -= llamaVramPaddingSize * fitContextExtraMemoryPaddingPercentage;
            if (freeVram < 0)
                freeVram = 0;
        }
        const bestGpuLayersOption = getBestGpuLayersForFreeVram({
            ggufInsights,
            freeVram,
            fitContext: typeof gpuLayers === "object"
                ? gpuLayers.fitContext
                : undefined,
            minGpuLayers: typeof gpuLayers === "object"
                ? gpuLayers.min
                : undefined,
            maxGpuLayers: typeof gpuLayers === "object"
                ? gpuLayers.max
                : undefined,
            defaultContextFlashAttention,
            defaultContextSwaFullCache,
            useMmap
        });
        const hasGpuLayersRequirements = typeof gpuLayers === "object" &&
            (gpuLayers.min != null || gpuLayers.max != null || gpuLayers.fitContext?.contextSize != null);
        if (!ignoreMemorySafetyChecks && bestGpuLayersOption == null && hasGpuLayersRequirements)
            throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings");
        return bestGpuLayersOption ?? 0;
    }
    throw new Error(`Invalid gpuLayers value: ${gpuLayers}`);
}
function getBestGpuLayersForFreeVram({ ggufInsights, freeVram, fitContext, minGpuLayers, maxGpuLayers, defaultContextFlashAttention, defaultContextSwaFullCache, useMmap }) {
    return findBestOption({
        *generator() {
            const minLayers = Math.floor(Math.max(0, minGpuLayers ?? 0));
            const maxLayers = Math.floor(Math.min(ggufInsights.totalLayers, maxGpuLayers ?? ggufInsights.totalLayers));
            for (let layers = maxLayers; layers >= minLayers; layers--) {
                yield {
                    gpuLayers: layers
                };
            }
        },
        score(option) {
            const layersRequirements = getVramRequiredForGpuLayers({
                gpuLayers: option.gpuLayers,
                ggufInsights,
                currentVram: freeVram,
                fitContext,
                defaultContextFlashAttention,
                defaultContextSwaFullCache,
                useMmap
            });
            if (layersRequirements == null)
                return null;
            return scoreGpuLayersAndContextCombination({ gpuLayers: option.gpuLayers, contextSize: layersRequirements.contextSize }, {
                totalGpuLayers: ggufInsights.totalLayers,
                trainContextSize: getDefaultModelContextSize({ trainContextSize: ggufInsights.trainContextSize })
            });
        }
    })?.gpuLayers ?? null;
}
function scoreGpuLayersAndContextCombination({ gpuLayers, contextSize }, { totalGpuLayers, trainContextSize }) {
    function scoreGpuLayers() {
        return scoreLevels(gpuLayers, [{
                start: 0,
                points: 4
            }, {
                start: 1,
                points: 26
            }, {
                start: totalGpuLayers,
                points: 14,
                end: totalGpuLayers
            }]);
    }
    function scoreContextSize() {
        const gpuLayersPercentage = gpuLayers / totalGpuLayers;
        return scoreLevels(contextSize, [{
                start: 0,
                points: 2
            }, {
                start: 1024,
                points: 4
            }, {
                start: 2048,
                points: gpuLayersPercentage < 0.1 ? 1 : 8
            }, {
                start: 4096,
                points: gpuLayersPercentage < 0.3 ? 4 : 16
            }, {
                start: 8192,
                points: gpuLayersPercentage < 0.6 ? 1 : 8,
                end: Math.max(trainContextSize, 16384)
            }]);
    }
    return scoreGpuLayers() + scoreContextSize();
}
function getVramRequiredForGpuLayers({ gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, defaultContextSwaFullCache = false, useMmap }) {
    const modelVram = ggufInsights.estimateModelResourceRequirements({
        gpuLayers,
        useMmap
    }).gpuVram;
    if (modelVram > currentVram)
        return null;
    if (fitContext != null && fitContext.contextSize != null) {
        const contextVram = ggufInsights.estimateContextResourceRequirements({
            contextSize: fitContext.contextSize,
            batchSize: getDefaultContextBatchSize({ contextSize: fitContext.contextSize, sequences: 1 }),
            modelGpuLayers: gpuLayers,
            sequences: 1,
            isEmbeddingContext: fitContext.embeddingContext ?? false,
            flashAttention: defaultContextFlashAttention,
            swaFullCache: defaultContextSwaFullCache
        }).gpuVram;
        const totalVram = modelVram + contextVram;
        if (totalVram > currentVram)
            return null;
        return {
            contextSize: fitContext.contextSize,
            contextVram,
            totalVram
        };
    }
    const maxContext = findMaxPossibleContextSizeForVram({
        gpuLayers,
        ggufInsights,
        vram: currentVram - modelVram,
        isEmbeddingContext: fitContext?.embeddingContext ?? false,
        flashAttention: defaultContextFlashAttention,
        swaFullCache: defaultContextSwaFullCache
    });
    if (maxContext == null || modelVram + maxContext.vram > currentVram)
        return null;
    return {
        contextSize: maxContext.contextSize,
        contextVram: maxContext.vram,
        totalVram: modelVram + maxContext.vram
    };
}
function findMaxPossibleContextSizeForVram({ gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, swaFullCache }) {
    const maxContextSize = getDefaultModelContextSize({ trainContextSize: ggufInsights.trainContextSize });
    return findMaxValidValue({
        maxValue: maxContextSize,
        minValue: minAllowedContextSizeInCalculations,
        minStep: 1,
        test(contextSize) {
            const contextVram = ggufInsights.estimateContextResourceRequirements({
                contextSize,
                batchSize: getDefaultContextBatchSize({ contextSize, sequences: 1 }),
                modelGpuLayers: gpuLayers,
                sequences: 1,
                isEmbeddingContext,
                flashAttention,
                swaFullCache
            }).gpuVram;
            if (contextVram <= vram)
                return {
                    contextSize,
                    vram: contextVram
                };
            return null;
        }
    });
}
function findMaxValidValue({ maxValue, minValue, minStep = 1, test }) {
    let step = -Math.max(minStep, Math.floor((maxValue - minValue) / 4));
    let bestValue = null;
    for (let value = maxValue; value >= minValue;) {
        const result = (bestValue != null && value === bestValue.value)
            ? bestValue.result
            : test(value);
        if (result != null) {
            if (bestValue == null || value >= bestValue.value) {
                bestValue = { value: value, result: result };
                if (step === -minStep)
                    break;
                else if (step < 0)
                    step = Math.max(minStep, Math.floor(-step / 2));
            }
        }
        else if (bestValue != null && value < bestValue.value) {
            value = bestValue.value;
            step = Math.max(minStep, Math.floor(Math.abs(step) / 2));
            continue;
        }
        else if (step > 0)
            step = -Math.max(minStep, Math.floor(step / 2));
        if (value === minValue && step === -minStep)
            break;
        value += step;
        if (value < minValue) {
            value = minValue;
            step = Math.max(minStep, Math.floor(Math.abs(step) / 2));
        }
        else if (value > maxValue) {
            value = maxValue;
            step = -Math.max(minStep, Math.floor(Math.abs(step) / 2));
        }
    }
    if (bestValue != null)
        return bestValue.result;
    return null;
}
//# sourceMappingURL=resolveModelGpuLayersOption.js.map
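The value shapes this resolver accepts ("auto", a number, "max", or an object with `min`, `max`, and `fitContext`) correspond to the public `gpuLayers` model option referenced in the declaration file above. A hypothetical sketch of the three forms, assuming `getLlama` and a placeholder model path and not taken from this commit:

import { getLlama } from "node-llama-cpp";

const llama = await getLlama();
const modelPath = "./models/example.gguf"; // placeholder path

// "auto" (the default when omitted): pick the highest-scoring layer count for the free VRAM.
const autoModel = await llama.loadModel({ modelPath });

// A fixed number or "max": offload exactly that many layers, or throw InsufficientMemoryError.
const fixedModel = await llama.loadModel({ modelPath, gpuLayers: 20 });

// The object form: constrain the search and reserve room for a specific context size.
const fittedModel = await llama.loadModel({
    modelPath,
    gpuLayers: { min: 8, fitContext: { contextSize: 8192 } }
});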
1
node_modules/node-llama-cpp/dist/gguf/insights/utils/resolveModelGpuLayersOption.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/gguf/insights/utils/resolveModelGpuLayersOption.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
5
node_modules/node-llama-cpp/dist/gguf/insights/utils/scoreLevels.d.ts
generated
vendored
Normal file
5
node_modules/node-llama-cpp/dist/gguf/insights/utils/scoreLevels.d.ts
generated
vendored
Normal file
@@ -0,0 +1,5 @@
export declare function scoreLevels(num: number, levels: {
    start: number;
    end?: number;
    points: number;
}[]): number;
16
node_modules/node-llama-cpp/dist/gguf/insights/utils/scoreLevels.js
generated
vendored
Normal file
16
node_modules/node-llama-cpp/dist/gguf/insights/utils/scoreLevels.js
generated
vendored
Normal file
@@ -0,0 +1,16 @@
export function scoreLevels(num, levels) {
    let res = 0;
    for (let i = 0; i < levels.length; i++) {
        const level = levels[i];
        const start = level.start;
        const end = level.end ?? levels[i + 1]?.start ?? Math.max(start, num);
        if (num < start)
            break;
        else if (num >= end)
            res += level.points;
        else
            res += level.points * ((num - start) / (end - start));
    }
    return res;
}
//# sourceMappingURL=scoreLevels.js.map
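`scoreLevels` awards a level's full points once `num` reaches the level's end, a proportional share while `num` is inside the level, and stops at the first level that hasn't started yet. A worked example using the same model-size levels as `scoreModelSizeForCpuOnlyUsage` earlier in this commit (the 2 GiB model size is made up):

// Score a hypothetical 2 GiB model against the CPU-only usage levels.
const s1GB = Math.pow(1024, 3);
const score = scoreLevels(2 * s1GB, [
    { start: s1GB, end: s1GB * 2.5, points: 46 }, // 2 GiB is inside this level: 46 * (1 / 1.5) ~= 30.67
    { start: s1GB * 2.5, end: s1GB * 4, points: 17 }, // not reached
    { start: s1GB * 4, points: 7 } // not reached
]);
console.log(score.toFixed(2)); // "30.67" -> the CPU-only score would be 70 - 30.67 ~= 39.33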
1
node_modules/node-llama-cpp/dist/gguf/insights/utils/scoreLevels.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/gguf/insights/utils/scoreLevels.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
{"version":3,"file":"scoreLevels.js","sourceRoot":"","sources":["../../../../src/gguf/insights/utils/scoreLevels.ts"],"names":[],"mappings":"AAAA,MAAM,UAAU,WAAW,CAAC,GAAW,EAAE,MAAuD;IAC5F,IAAI,GAAG,GAAG,CAAC,CAAC;IAEZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC;QACzB,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;QAC1B,MAAM,GAAG,GAAG,KAAK,CAAC,GAAG,IAAI,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;QAEtE,IAAI,GAAG,GAAG,KAAK;YACX,MAAM;aACL,IAAI,GAAG,IAAI,GAAG;YACf,GAAG,IAAI,KAAK,CAAC,MAAM,CAAC;;YAEpB,GAAG,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,GAAG,KAAK,CAAC,GAAG,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC;IAC9D,CAAC;IAED,OAAO,GAAG,CAAC;AACf,CAAC"}