First upload version 0.0.1
272 node_modules/node-llama-cpp/dist/gguf/insights/GgufInsightsConfigurationResolver.js (generated, vendored) Normal file
@@ -0,0 +1,272 @@
import { getDefaultContextSequences } from "../../evaluator/LlamaContext/LlamaContext.js";
import { InsufficientMemoryError } from "../../utils/InsufficientMemoryError.js";
import { resolveModelGpuLayersOption } from "./utils/resolveModelGpuLayersOption.js";
import { resolveContextContextSizeOption } from "./utils/resolveContextContextSizeOption.js";
import { scoreLevels } from "./utils/scoreLevels.js";
import { getRamUsageFromUnifiedVram } from "./utils/getRamUsageFromUnifiedVram.js";
export const defaultTrainContextSizeForEstimationPurposes = 4096;
const defaultContextSizeForUnfitContextSizeConfiguration = 2048;
export class GgufInsightsConfigurationResolver {
    /** @internal */ _ggufInsights;
    constructor(ggufInsights) {
        this._ggufInsights = ggufInsights;
    }
    get ggufInsights() {
        return this._ggufInsights;
    }
    /**
     * Resolve the best configuration for loading a model and creating a context using the current hardware.
     *
     * Specifying a `targetGpuLayers` and/or `targetContextSize` will ensure the resolved configuration matches those values,
     * but note that it can lower the compatibility score if the hardware doesn't support it.
     *
     * Overriding hardware values is possible by configuring `hardwareOverrides`.
     * @param options
     * @param hardwareOverrides
     */
    async resolveAndScoreConfig({
        targetGpuLayers,
        targetContextSize,
        embeddingContext = false,
        flashAttention = false,
        swaFullCache = false,
        useMmap = this._ggufInsights._llama.supportsMmap
    } = {}, {
        getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()),
        getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()),
        getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()),
        llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize,
        llamaGpu = this._ggufInsights._llama.gpu,
        llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading
    } = {}) {
        const compatibilityScore = await this.scoreModelConfigurationCompatibility({
            flashAttention,
            swaFullCache,
            contextSize: targetContextSize,
            embeddingContext,
            forceGpuLayers: targetGpuLayers,
            forceStrictContextSize: targetContextSize != null,
            useMmap
        }, {
            getVramState,
            getRamState,
            getSwapState,
            llamaVramPaddingSize,
            llamaGpu,
            llamaSupportsGpuOffloading
        });
        return compatibilityScore;
    }
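    // A minimal usage sketch (editor's addition, not part of the original file),
    // assuming a `GgufInsights` instance exposes this resolver via its
    // `configurationResolver` getter; the variable names are hypothetical:
    //
    //     const result = await ggufInsights.configurationResolver.resolveAndScoreConfig({
    //         targetContextSize: 8192
    //     });
    //     console.log(result.totalScore, result.resolvedValues.gpuLayers);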
    /**
     * Score the compatibility of the model configuration with the current GPU and VRAM state.
     * Assumes a model is loaded with the default `"auto"` configurations.
     * Scored based on the following criteria:
     * - The number of GPU layers that can be offloaded to the GPU (only if a GPU is available; otherwise, by how small the model is)
     * - Whether all layers can be offloaded to the GPU (gives additional points)
     * - Whether the resolved context size is at least as large as the specified `contextSize`
     *
     * If the resolved context size is larger than the specified context size, then for each multiple of the specified `contextSize`
     * by which the resolved context size is larger, 1 bonus point is given in the `bonusScore`.
     *
     * `maximumFittedContextSizeMultiplier` is used to improve the proportionality of the bonus score between models.
     * Set this to any value higher than `<max compared model context size> / contextSize`.
     * Defaults to `100`.
     *
     * `maximumUnfitConfigurationResourceMultiplier` is used to improve the proportionality of the bonus score between unfit models.
     * Set this to any value higher than `<max compared model resource usage> / <total available resources>`.
     * Defaults to `100`.
     *
     * `contextSize` defaults to `4096` (if the model train context size is lower than this, the model train context size is used instead).
     */
    async scoreModelConfigurationCompatibility({
        contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 4096),
        embeddingContext = false,
        flashAttention = false,
        swaFullCache = false,
        maximumFittedContextSizeMultiplier = 100,
        maximumUnfitConfigurationResourceMultiplier = 100,
        forceStrictContextSize = false,
        forceGpuLayers,
        useMmap = this._ggufInsights._llama.supportsMmap
    } = {}, {
        getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()),
        getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()),
        getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()),
        llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize,
        llamaGpu = this._ggufInsights._llama.gpu,
        llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading
    } = {}) {
        const [vramState, ramState, swapState] = await Promise.all([
            getVramState(),
            getRamState(),
            getSwapState()
        ]);
        // Resolve how many layers can be offloaded to the GPU;
        // an InsufficientMemoryError means the configuration doesn't fit
        let resolvedGpuLayers = (forceGpuLayers == null || forceGpuLayers == "max")
            ? this.ggufInsights.totalLayers
            : forceGpuLayers;
        let gpuLayersFitMemory = false;
        try {
            resolvedGpuLayers = await this.resolveModelGpuLayers(forceGpuLayers != null
                ? forceGpuLayers
                : embeddingContext
                    ? {
                        fitContext: {
                            embeddingContext: true,
                            contextSize: forceStrictContextSize
                                ? contextSize
                                : undefined
                        }
                    }
                    : forceStrictContextSize != null
                        ? { fitContext: { contextSize } }
                        : "auto", {
                getVramState: async () => vramState,
                llamaVramPaddingSize,
                llamaGpu,
                llamaSupportsGpuOffloading,
                defaultContextFlashAttention: flashAttention,
                defaultContextSwaFullCache: swaFullCache,
                ignoreMemorySafetyChecks: forceGpuLayers != null,
                useMmap
            });
            gpuLayersFitMemory = true;
        }
        catch (err) {
            if (!(err instanceof InsufficientMemoryError))
                throw err;
        }
        const canUseGpu = llamaSupportsGpuOffloading && llamaGpu !== false;
        const estimatedModelResourceUsage = this._ggufInsights.estimateModelResourceRequirements({
            gpuLayers: resolvedGpuLayers,
            useMmap
        });
        // Resolve the context size against the resources left after loading the model
        let resolvedContextSize = forceStrictContextSize
            ? contextSize
            : Math.min(this.ggufInsights.trainContextSize ?? defaultContextSizeForUnfitContextSizeConfiguration, defaultContextSizeForUnfitContextSizeConfiguration);
        let contextFitsMemory = false;
        try {
            resolvedContextSize = await this.resolveContextContextSize("auto", {
                getVramState: async () => ({
                    total: vramState.total,
                    free: Math.max(0, vramState.free - estimatedModelResourceUsage.gpuVram),
                    unifiedSize: vramState.unifiedSize
                }),
                getRamState: async () => ({
                    total: ramState.total,
                    free: Math.max(0, ramState.free - estimatedModelResourceUsage.cpuRam +
                        (-getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState)))
                }),
                getSwapState: async () => ({
                    total: swapState.total,
                    free: Math.max(0, swapState.free - Math.max(0, estimatedModelResourceUsage.cpuRam +
                        (-getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState)) +
                        (-ramState.free)))
                }),
                llamaGpu,
                isEmbeddingContext: embeddingContext,
                modelGpuLayers: resolvedGpuLayers,
                modelTrainContextSize: this._ggufInsights.trainContextSize ?? defaultTrainContextSizeForEstimationPurposes,
                ignoreMemorySafetyChecks: forceStrictContextSize,
                flashAttention,
                swaFullCache
            });
            contextFitsMemory = true;
            if (forceStrictContextSize && resolvedContextSize < contextSize) {
                contextFitsMemory = false;
                resolvedContextSize = contextSize;
            }
            else if (forceStrictContextSize && resolvedContextSize > contextSize) {
                resolvedContextSize = contextSize;
            }
        }
        catch (err) {
            if (!(err instanceof InsufficientMemoryError))
                throw err;
        }
        const estimatedContextResourceUsage = this._ggufInsights.estimateContextResourceRequirements({
            contextSize: resolvedContextSize,
            isEmbeddingContext: embeddingContext,
            modelGpuLayers: resolvedGpuLayers,
            flashAttention,
            swaFullCache
        });
        // Combine the weighted criteria into a normalized score
        const rankPoints = {
            gpuLayers: 60,
            allLayersAreOffloaded: 10,
            contextSize: 30,
            ramUsageFitsInRam: 10,
            cpuOnlySmallModelSize: 70, // also defined inside `scoreModelSizeForCpuOnlyUsage`
            bonusContextSize: 10
        };
        const gpuLayersPoints = rankPoints.gpuLayers * Math.min(1, resolvedGpuLayers / this._ggufInsights.totalLayers);
        const allLayersAreOffloadedPoints = rankPoints.allLayersAreOffloaded * (resolvedGpuLayers === this._ggufInsights.totalLayers ? 1 : 0);
        const contextSizePoints = contextFitsMemory
            ? rankPoints.contextSize * Math.min(1, resolvedContextSize / contextSize)
            : 0;
        const ramUsageFitsInRamPoints = rankPoints.ramUsageFitsInRam * (estimatedModelResourceUsage.cpuRam <= ramState.free
            ? 1
            : estimatedModelResourceUsage.cpuRam <= ramState.free + swapState.free
                ? 0.8
                : estimatedModelResourceUsage.cpuRam <= ramState.total
                    ? 0.5
                    : (0.5 - Math.min(0.5, 0.5 * ((estimatedModelResourceUsage.cpuRam - ramState.total) / ramState.total))));
        const bonusContextSizePoints = contextFitsMemory
            ? (10 * Math.min(1, (Math.max(0, resolvedContextSize - contextSize) / contextSize) / maximumFittedContextSizeMultiplier))
            : 0;
        let compatibilityScore = canUseGpu
            ? ((gpuLayersPoints + allLayersAreOffloadedPoints + contextSizePoints + ramUsageFitsInRamPoints) /
                (rankPoints.gpuLayers + rankPoints.allLayersAreOffloaded + rankPoints.contextSize + rankPoints.ramUsageFitsInRam))
            : ((contextSizePoints + ramUsageFitsInRamPoints + scoreModelSizeForCpuOnlyUsage(this._ggufInsights.modelSize)) /
                (rankPoints.contextSize + rankPoints.ramUsageFitsInRam + rankPoints.cpuOnlySmallModelSize));
        let bonusScore = bonusContextSizePoints / rankPoints.bonusContextSize;
        // Unfit configurations get a zero compatibility score and are ranked
        // among themselves by how far they exceed the available resources
        if (!gpuLayersFitMemory || !contextFitsMemory ||
            estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram > vramState.total ||
            estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam > ramState.total + swapState.total) {
            const totalVramRequirement = estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram;
            const totalRamRequirement = estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam;
            compatibilityScore = 0;
            bonusScore = ((1 - (totalVramRequirement / (vramState.total * maximumUnfitConfigurationResourceMultiplier))) +
                (1 - (totalRamRequirement / ((ramState.total + swapState.total) * maximumUnfitConfigurationResourceMultiplier)))) / 2;
        }
        return {
            compatibilityScore,
            bonusScore,
            totalScore: compatibilityScore + bonusScore,
            resolvedValues: {
                gpuLayers: resolvedGpuLayers,
                contextSize: resolvedContextSize,
                modelRamUsage: estimatedModelResourceUsage.cpuRam,
                contextRamUsage: estimatedContextResourceUsage.cpuRam,
                totalRamUsage: estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam,
                modelVramUsage: estimatedModelResourceUsage.gpuVram,
                contextVramUsage: estimatedContextResourceUsage.gpuVram,
                totalVramUsage: estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram
            }
        };
    }
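    // An illustrative reading of the scoring math above (editor's addition, not
    // part of the original file; the numbers are hypothetical): on a GPU machine
    // where all layers fit (gpuLayersPoints = 60, allLayersAreOffloadedPoints = 10),
    // the full requested context fits (contextSizePoints = 30), and the model's RAM
    // usage fits in free RAM (ramUsageFitsInRamPoints = 10), the compatibility score
    // is (60 + 10 + 30 + 10) / (60 + 10 + 30 + 10) = 1. If the resolved context size
    // is 3x the requested `contextSize` with the default
    // `maximumFittedContextSizeMultiplier` of 100, the bonus score is
    // min(1, ((3 - 1) * contextSize / contextSize) / 100) = 0.02,
    // for a total score of 1.02.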
    async resolveModelGpuLayers(gpuLayers, {
        ignoreMemorySafetyChecks = false,
        getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()),
        llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize,
        llamaGpu = this._ggufInsights._llama.gpu,
        llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading,
        defaultContextFlashAttention = false,
        defaultContextSwaFullCache = false,
        useMmap = this._ggufInsights._llama.supportsMmap
    } = {}) {
        return resolveModelGpuLayersOption(gpuLayers, {
            ggufInsights: this._ggufInsights,
            ignoreMemorySafetyChecks,
            getVramState,
            llamaVramPaddingSize,
            llamaGpu,
            llamaSupportsGpuOffloading,
            defaultContextFlashAttention,
            defaultContextSwaFullCache,
            useMmap
        });
    }
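    // A minimal sketch of calling this method directly (editor's addition, not
    // part of the original file; `resolver` is a hypothetical instance of this
    // class): `gpuLayers` accepts the same kinds of values used by
    // `scoreModelConfigurationCompatibility` above, such as a number, "max",
    // "auto", or a `{ fitContext: ... }` object.
    //
    //     const gpuLayers = await resolver.resolveModelGpuLayers("auto");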
    /**
     * Resolve a context size option for the given options and constraints.
     *
     * If there's no context size that can fit the available resources, an `InsufficientMemoryError` is thrown.
     */
    async resolveContextContextSize(contextSize, {
        modelGpuLayers,
        batchSize,
        modelTrainContextSize,
        flashAttention = false,
        swaFullCache = false,
        getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()),
        getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()),
        getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()),
        llamaGpu = this._ggufInsights._llama.gpu,
        ignoreMemorySafetyChecks = false,
        isEmbeddingContext = false,
        sequences = getDefaultContextSequences()
    }) {
        return await resolveContextContextSizeOption({
            contextSize,
            batchSize,
            sequences,
            modelFileInsights: this._ggufInsights,
            modelGpuLayers,
            modelTrainContextSize,
            flashAttention,
            swaFullCache,
            getVramState,
            getRamState,
            getSwapState,
            llamaGpu,
            ignoreMemorySafetyChecks,
            isEmbeddingContext
        });
    }
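    // A hedged usage sketch (editor's addition, not part of the original file;
    // `resolver` and the values are hypothetical): resolving the largest context
    // size that fits once the model's GPU layer split is already known.
    //
    //     const contextSize = await resolver.resolveContextContextSize("auto", {
    //         modelGpuLayers: 32,
    //         modelTrainContextSize: 8192
    //     });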
    /** @internal */
    static _create(ggufInsights) {
        return new GgufInsightsConfigurationResolver(ggufInsights);
    }
}
function scoreModelSizeForCpuOnlyUsage(modelSize) {
    const s1GB = Math.pow(1024, 3);
    // 70 matches `rankPoints.cpuOnlySmallModelSize`; smaller models lose fewer points
    return 70 - scoreLevels(modelSize, [{
            start: s1GB,
            end: s1GB * 2.5,
            points: 46
        }, {
            start: s1GB * 2.5,
            end: s1GB * 4,
            points: 17
        }, {
            start: s1GB * 4,
            points: 7
        }]);
}
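// An illustrative calculation (editor's addition, not part of the original file),
// assuming `scoreLevels` awards each level's points proportionally to how far
// `modelSize` progresses through its [start, end] range: a 3GB model would pass
// all of the first level (46 points) and a third of the second level (~5.7
// points), so scoreLevels ≈ 51.7 and the function returns ≈ 18.3 out of 70;
// a model under 1GB would keep the full 70 points.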
//# sourceMappingURL=GgufInsightsConfigurationResolver.js.map