239 lines
9.8 KiB
JavaScript
239 lines
9.8 KiB
JavaScript
import { InsufficientMemoryError } from "../../../utils/InsufficientMemoryError.js";
|
|
import { findBestOption } from "../../../utils/findBestOption.js";
|
|
import { getDefaultContextBatchSize, getDefaultModelContextSize } from "../../../evaluator/LlamaContext/LlamaContext.js";
|
|
import { minAllowedContextSizeInCalculations } from "../../../config.js";
|
|
import { scoreLevels } from "./scoreLevels.js";
|
|
// Fraction of `llamaVramPaddingSize` that `resolveModelGpuLayersOption` subtracts from the free VRAM
// budget when the caller asks to fit a specific context size (`gpuLayers.fitContext.contextSize`).
const fitContextExtraMemoryPaddingPercentage = 0.5;
|
|
/**
 * Resolve a `gpuLayers` option into a concrete number of model layers to offload to the GPU.
 *
 * Accepted `gpuLayers` values:
 * - `null` / `undefined`: treated as `"auto"`.
 * - a number: clamped to the range `[0, ggufInsights.totalLayers]`.
 * - `"max"`: `ggufInsights.totalLayers`.
 * - `"auto"`: whatever `getBestGpuLayersForFreeVram` picks for the currently free VRAM, or `0` when nothing fits.
 * - an object with optional `min`, `max` and `fitContext` properties: like `"auto"`, but constrained by those
 *   properties. When none of the constrained options fit, an error is thrown instead of falling back to `0`.
 *
 * @param {number | "max" | "auto" | object | null | undefined} gpuLayers - the requested option
 * @param {object} options
 * @param {object} options.ggufInsights - model insights; `totalLayers` is read here, the rest is forwarded
 * @param {boolean} [options.ignoreMemorySafetyChecks] - skip the VRAM checks (and the errors they raise)
 * @param {Function} options.getVramState - async function resolving to an object with `total` and `free`
 * @param {number} options.llamaVramPaddingSize - VRAM padding; a fraction of it is deducted from the free VRAM
 *   when `fitContext.contextSize` is set
 * @param {*} options.llamaGpu - `false` means no GPU is used, so the automatic modes resolve to `0`
 * @param {boolean} options.llamaSupportsGpuOffloading - when falsy, the result is always `0`
 * @param {boolean} [options.defaultContextFlashAttention] - forwarded to the VRAM estimations
 * @param {boolean} [options.defaultContextSwaFullCache] - forwarded to the VRAM estimations
 * @param {boolean} [options.useMmap] - forwarded to the VRAM estimations
 * @returns {Promise<number>} the number of layers to offload
 * @throws {InsufficientMemoryError} when the explicitly requested layers (or constraints) don't fit in VRAM
 * @throws {Error} when `gpuLayers` is not one of the accepted values
 */
export async function resolveModelGpuLayersOption(gpuLayers, {
    ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, llamaGpu,
    llamaSupportsGpuOffloading, defaultContextFlashAttention, defaultContextSwaFullCache, useMmap
}) {
    const requestedGpuLayers = gpuLayers ?? "auto";

    // This check comes before any validation, so without offloading support every input resolves to 0
    if (!llamaSupportsGpuOffloading)
        return 0;

    if (requestedGpuLayers === "max" || typeof requestedGpuLayers === "number") {
        const resolvedGpuLayers = typeof requestedGpuLayers === "number"
            ? Math.max(0, Math.min(ggufInsights.totalLayers, requestedGpuLayers))
            : ggufInsights.totalLayers;

        if (ignoreMemorySafetyChecks)
            return resolvedGpuLayers;

        const vramState = await getVramState();
        const maxLayersRequirements = getVramRequiredForGpuLayers({
            gpuLayers: resolvedGpuLayers,
            ggufInsights,
            currentVram: vramState.free,
            defaultContextFlashAttention,
            defaultContextSwaFullCache,
            useMmap
        });

        // An explicit layer count is never lowered silently; it either fits or the call fails
        if (maxLayersRequirements == null)
            throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings");

        return resolvedGpuLayers;
    }

    if (requestedGpuLayers === "auto" || typeof requestedGpuLayers === "object") {
        if (llamaGpu === false)
            return 0;

        const vramState = await getVramState();
        if (vramState.total === 0)
            return 0;

        // `undefined` in the plain "auto" mode, so every constraint below resolves to `undefined` as well
        const layersConstraints = typeof requestedGpuLayers === "object"
            ? requestedGpuLayers
            : undefined;
        const hasFixedContextSize = layersConstraints?.fitContext?.contextSize != null;

        let freeVram = vramState.free;
        if (hasFixedContextSize) {
            // Keep extra headroom when a specific context size has to fit next to the model
            freeVram -= llamaVramPaddingSize * fitContextExtraMemoryPaddingPercentage;

            if (freeVram < 0)
                freeVram = 0;
        }

        const bestGpuLayersOption = getBestGpuLayersForFreeVram({
            ggufInsights,
            freeVram,
            fitContext: layersConstraints?.fitContext,
            minGpuLayers: layersConstraints?.min,
            maxGpuLayers: layersConstraints?.max,
            defaultContextFlashAttention,
            defaultContextSwaFullCache,
            useMmap
        });

        const hasGpuLayersRequirements = layersConstraints != null && (
            layersConstraints.min != null || layersConstraints.max != null || hasFixedContextSize
        );

        // Only explicit constraints turn "nothing fits" into an error; otherwise fall back to 0 layers
        if (!ignoreMemorySafetyChecks && bestGpuLayersOption == null && hasGpuLayersRequirements)
            throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings");

        return bestGpuLayersOption ?? 0;
    }

    throw new Error(`Invalid gpuLayers value: ${requestedGpuLayers}`);
}
|
|
/**
 * Pick the GPU layer count that `findBestOption` rates best for the given amount of free VRAM.
 *
 * Candidates run from the upper bound (`maxGpuLayers`, capped at `ggufInsights.totalLayers`) down to the
 * lower bound (`minGpuLayers`, at least 0); both bounds are floored. A candidate whose memory requirements
 * cannot be satisfied gets a `null` score.
 *
 * @param {object} options
 * @param {object} options.ggufInsights - model insights; `totalLayers` and `trainContextSize` are read here
 * @param {number} options.freeVram - the VRAM budget used for the estimations
 * @param {object} [options.fitContext] - forwarded to `getVramRequiredForGpuLayers`
 * @param {number} [options.minGpuLayers] - lower bound for the candidates
 * @param {number} [options.maxGpuLayers] - upper bound for the candidates
 * @param {boolean} [options.defaultContextFlashAttention] - forwarded to the VRAM estimations
 * @param {boolean} [options.defaultContextSwaFullCache] - forwarded to the VRAM estimations
 * @param {boolean} [options.useMmap] - forwarded to the VRAM estimations
 * @returns {number | null} the chosen layer count, or `null` when `findBestOption` yields no option
 */
function getBestGpuLayersForFreeVram({
    ggufInsights, freeVram, fitContext, minGpuLayers, maxGpuLayers,
    defaultContextFlashAttention, defaultContextSwaFullCache, useMmap
}) {
    const bestOption = findBestOption({
        *generator() {
            const minLayers = Math.floor(Math.max(0, minGpuLayers ?? 0));
            const maxLayers = Math.floor(Math.min(ggufInsights.totalLayers, maxGpuLayers ?? ggufInsights.totalLayers));

            // Highest layer count first
            for (let layers = maxLayers; layers >= minLayers; layers--) {
                yield { gpuLayers: layers };
            }
        },
        score(option) {
            const layersRequirements = getVramRequiredForGpuLayers({
                gpuLayers: option.gpuLayers,
                ggufInsights,
                currentVram: freeVram,
                fitContext,
                defaultContextFlashAttention,
                defaultContextSwaFullCache,
                useMmap
            });

            // This layer count doesn't fit in the VRAM budget
            if (layersRequirements == null)
                return null;

            return scoreGpuLayersAndContextCombination(
                { gpuLayers: option.gpuLayers, contextSize: layersRequirements.contextSize },
                {
                    totalGpuLayers: ggufInsights.totalLayers,
                    trainContextSize: getDefaultModelContextSize({ trainContextSize: ggufInsights.trainContextSize })
                }
            );
        }
    });

    return bestOption?.gpuLayers ?? null;
}
|
|
/**
 * Rate a combination of offloaded GPU layers and context size; the result is the sum of two `scoreLevels`
 * calls, one for the layer count and one for the context size.
 *
 * The points given to the larger context-size tiers depend on the fraction of layers that is offloaded:
 * below the 0.1 / 0.3 / 0.6 thresholds the 2048 / 4096 / 8192 tiers are worth fewer points.
 * NOTE(review): how `scoreLevels` turns `start` / `end` / `points` into a score is defined in
 * `./scoreLevels.js` and isn't visible from here.
 *
 * @param {object} combination
 * @param {number} combination.gpuLayers - the number of offloaded layers
 * @param {number} combination.contextSize - the context size that goes with that layer count
 * @param {object} limits
 * @param {number} limits.totalGpuLayers - the total number of layers in the model
 * @param {number} limits.trainContextSize - upper end of the last context-size tier (at least 16384 is used)
 * @returns {number} the combined score
 */
function scoreGpuLayersAndContextCombination({ gpuLayers, contextSize }, { totalGpuLayers, trainContextSize }) {
    // The layers score is computed first, then the context score (same call order as before)
    const gpuLayersScore = scoreLevels(gpuLayers, [
        { start: 0, points: 4 },
        { start: 1, points: 26 },
        // A tier that starts and ends at the full layer count
        { start: totalGpuLayers, points: 14, end: totalGpuLayers }
    ]);

    const gpuLayersPercentage = gpuLayers / totalGpuLayers;
    const contextSizeScore = scoreLevels(contextSize, [
        { start: 0, points: 2 },
        { start: 1024, points: 4 },
        { start: 2048, points: gpuLayersPercentage < 0.1 ? 1 : 8 },
        { start: 4096, points: gpuLayersPercentage < 0.3 ? 4 : 16 },
        {
            start: 8192,
            points: gpuLayersPercentage < 0.6 ? 1 : 8,
            end: Math.max(trainContextSize, 16384)
        }
    ]);

    return gpuLayersScore + contextSizeScore;
}
|
|
/**
 * Estimate the VRAM needed to run the model with `gpuLayers` offloaded layers plus a context, and check it
 * against `currentVram`.
 *
 * When `fitContext.contextSize` is set, exactly that context size has to fit. Otherwise the context size is
 * whatever `findMaxPossibleContextSizeForVram` finds for the VRAM left after the model itself.
 *
 * @param {object} options
 * @param {number} options.gpuLayers - the number of layers to offload
 * @param {object} options.ggufInsights - provides `estimateModelResourceRequirements` and
 *   `estimateContextResourceRequirements`; the `gpuVram` property of their results is used
 * @param {number} options.currentVram - the VRAM budget
 * @param {object} [options.fitContext] - optional `contextSize` and `embeddingContext` properties
 * @param {boolean} [options.defaultContextFlashAttention] - passed to the context estimation
 * @param {boolean} [options.defaultContextSwaFullCache] - passed to the context estimation
 * @param {boolean} [options.useMmap] - passed to the model estimation
 * @returns {{contextSize: number, contextVram: number, totalVram: number} | null} the chosen context size
 *   with its VRAM usage, or `null` when the model or the context doesn't fit
 */
function getVramRequiredForGpuLayers({
    gpuLayers, ggufInsights, currentVram, fitContext,
    defaultContextFlashAttention = false, defaultContextSwaFullCache = false, useMmap
}) {
    const { gpuVram: modelVram } = ggufInsights.estimateModelResourceRequirements({ gpuLayers, useMmap });

    // The model alone is already over budget
    if (modelVram > currentVram)
        return null;

    if (fitContext?.contextSize != null) {
        const { gpuVram: contextVram } = ggufInsights.estimateContextResourceRequirements({
            contextSize: fitContext.contextSize,
            batchSize: getDefaultContextBatchSize({ contextSize: fitContext.contextSize, sequences: 1 }),
            modelGpuLayers: gpuLayers,
            sequences: 1,
            isEmbeddingContext: fitContext.embeddingContext ?? false,
            flashAttention: defaultContextFlashAttention,
            swaFullCache: defaultContextSwaFullCache
        });
        const totalVram = modelVram + contextVram;

        if (totalVram > currentVram)
            return null;

        return {
            contextSize: fitContext.contextSize,
            contextVram,
            totalVram
        };
    }

    // No fixed context size: look for the largest one that fits in what the model leaves free
    const maxContext = findMaxPossibleContextSizeForVram({
        gpuLayers,
        ggufInsights,
        vram: currentVram - modelVram,
        isEmbeddingContext: fitContext?.embeddingContext ?? false,
        flashAttention: defaultContextFlashAttention,
        swaFullCache: defaultContextSwaFullCache
    });

    if (maxContext == null)
        return null;

    const totalVram = modelVram + maxContext.vram;
    if (totalVram > currentVram)
        return null;

    return {
        contextSize: maxContext.contextSize,
        contextVram: maxContext.vram,
        totalVram
    };
}
|
|
/**
 * Search for the largest context size whose estimated VRAM usage is at most `vram`.
 *
 * The search is delegated to `findMaxValidValue`, between `minAllowedContextSizeInCalculations` and the
 * value `getDefaultModelContextSize` returns for the model's `trainContextSize`, with a step of at least 1.
 *
 * @param {object} options
 * @param {number} options.gpuLayers - the number of model layers offloaded to the GPU
 * @param {object} options.ggufInsights - provides `trainContextSize` and `estimateContextResourceRequirements`
 * @param {number} options.vram - the VRAM available for the context
 * @param {boolean} options.isEmbeddingContext - passed to the context estimation
 * @param {boolean} options.flashAttention - passed to the context estimation
 * @param {boolean} options.swaFullCache - passed to the context estimation
 * @returns {{contextSize: number, vram: number} | null} the context size found together with its estimated
 *   VRAM usage, or `null` when the search finds none
 */
function findMaxPossibleContextSizeForVram({ gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, swaFullCache }) {
    const maxContextSize = getDefaultModelContextSize({ trainContextSize: ggufInsights.trainContextSize });

    // Returns the candidate together with its VRAM usage when it fits, `null` otherwise
    function testContextSize(contextSize) {
        const { gpuVram: contextVram } = ggufInsights.estimateContextResourceRequirements({
            contextSize,
            batchSize: getDefaultContextBatchSize({ contextSize, sequences: 1 }),
            modelGpuLayers: gpuLayers,
            sequences: 1,
            isEmbeddingContext,
            flashAttention,
            swaFullCache
        });

        if (contextVram > vram)
            return null;

        return { contextSize, vram: contextVram };
    }

    return findMaxValidValue({
        maxValue: maxContextSize,
        minValue: minAllowedContextSizeInCalculations,
        minStep: 1,
        test: testContextSize
    });
}
|
|
/**
 * Search the range `[minValue, maxValue]` for the largest value that `test` accepts.
 *
 * The search starts at `maxValue` and walks down in steps of a quarter of the range. Once a value is
 * accepted, the step is halved and the direction flips around the boundary until the step is down to
 * `minStep`. This finds the maximum when every value below an accepted one is accepted too; with a
 * non-monotonic `test`, an accepted value is still returned, but not necessarily the largest one.
 *
 * Each value is passed to `test` at most once and the outcome is reused when the search revisits it,
 * so `test` is expected to be deterministic.
 *
 * @param {object} options
 * @param {number} options.maxValue - upper end of the range; the first value to be tested
 * @param {number} options.minValue - lower end of the range
 * @param {number} [options.minStep] - the smallest step to refine to; has to be greater than 0
 * @param {Function} options.test - receives a value; returns a non-nullish result when the value is valid
 * @returns {*} the result `test` returned for the best value found, or `null` when no value was accepted
 * @throws {RangeError} when `minStep` is not greater than 0 (the step could then collapse to 0 and the
 *   search would never finish)
 */
function findMaxValidValue({ maxValue, minValue, minStep = 1, test }) {
    if (!(minStep > 0))
        throw new RangeError(`minStep must be greater than 0, got: ${minStep}`);

    // Results by value (including rejections), so revisiting a value never calls `test` again
    const testedResults = new Map();
    function evaluate(value) {
        if (testedResults.has(value))
            return testedResults.get(value);

        const result = test(value);
        testedResults.set(value, result);
        return result;
    }

    // A negative step moves down towards `minValue`, a positive one moves up towards `maxValue`
    let step = -Math.max(minStep, Math.floor((maxValue - minValue) / 4));
    let bestValue = null;
    let value = maxValue;

    while (value >= minValue) {
        const result = evaluate(value);

        if (result != null) {
            if (bestValue == null || value >= bestValue.value) {
                bestValue = { value, result };

                // Accepted while stepping down at the finest resolution: this is the boundary
                if (step === -minStep)
                    break;

                // Accepted while stepping down: turn around and go up with half the step
                if (step < 0)
                    step = Math.max(minStep, Math.floor(-step / 2));
            }
        } else if (bestValue != null && value < bestValue.value) {
            // Rejected below an accepted value: go back to the accepted value with a smaller step
            value = bestValue.value;
            step = Math.max(minStep, Math.floor(Math.abs(step) / 2));
            continue;
        } else if (step > 0) {
            // Rejected while stepping up: turn around and go down with half the step
            step = -Math.max(minStep, Math.floor(step / 2));
        }

        // At the bottom of the range with the finest step: nothing is left to try
        if (value === minValue && step === -minStep)
            break;

        value += step;

        // Keep the search inside the range; bounce off an edge with half the step
        if (value < minValue) {
            value = minValue;
            step = Math.max(minStep, Math.floor(Math.abs(step) / 2));
        } else if (value > maxValue) {
            value = maxValue;
            step = -Math.max(minStep, Math.floor(Math.abs(step) / 2));
        }
    }

    return bestValue?.result ?? null;
}
|
|
//# sourceMappingURL=resolveModelGpuLayersOption.js.map
|