First upload version 0.0.1
node_modules/node-llama-cpp/dist/cli/commands/inspect/InspectCommand.d.ts (generated, vendored, new file: 4 lines)
@@ -0,0 +1,4 @@
import { CommandModule } from "yargs";
type InspectCommand = {};
export declare const InspectCommand: CommandModule<object, InspectCommand>;
export {};
node_modules/node-llama-cpp/dist/cli/commands/inspect/InspectCommand.js (generated, vendored, new file: 21 lines)
@@ -0,0 +1,21 @@
import { withCliCommandDescriptionDocsUrl } from "../../utils/withCliCommandDescriptionDocsUrl.js";
import { documentationPageUrls } from "../../../config.js";
import { InspectGgufCommand } from "./commands/InspectGgufCommand.js";
import { InspectGpuCommand } from "./commands/InspectGpuCommand.js";
import { InspectMeasureCommand } from "./commands/InspectMeasureCommand.js";
import { InspectEstimateCommand } from "./commands/InspectEstimateCommand.js";
export const InspectCommand = {
    command: "inspect <command>",
    describe: withCliCommandDescriptionDocsUrl("Inspect the inner workings of `node-llama-cpp`", documentationPageUrls.CLI.Inspect.index),
    builder(yargs) {
        return yargs
            .command(InspectGpuCommand)
            .command(InspectGgufCommand)
            .command(InspectMeasureCommand)
            .command(InspectEstimateCommand);
    },
    async handler() {
        // this function must exist, even though we do nothing here
    }
};
//# sourceMappingURL=InspectCommand.js.map
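Note: the file above uses the standard yargs pattern for grouping subcommands: a parent CommandModule registers child modules in builder() and keeps an intentionally empty handler. A minimal, self-contained sketch of that pattern follows; the command names and messages here are illustrative, not part of node-llama-cpp.

import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import type { CommandModule } from "yargs";

// child command that does the actual work (illustrative)
const GpuSubcommand: CommandModule = {
    command: "gpu",
    describe: "Show GPU info (illustrative)",
    handler() {
        console.info("GPU info would be printed here");
    }
};

// parent command that only groups children; yargs still requires a handler
const InspectLikeCommand: CommandModule = {
    command: "inspect <command>",
    describe: "Inspection subcommands (illustrative)",
    builder: (y) => y.command(GpuSubcommand),
    handler() {
        // intentionally empty: the selected subcommand's handler runs instead
    }
};

await yargs(hideBin(process.argv))
    .command(InspectLikeCommand)
    .demandCommand(1)
    .parseAsync();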
node_modules/node-llama-cpp/dist/cli/commands/inspect/InspectCommand.js.map (generated, vendored, new file: 1 line)
@@ -0,0 +1 @@
{"version":3,"file":"InspectCommand.js","sourceRoot":"","sources":["../../../../src/cli/commands/inspect/InspectCommand.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,gCAAgC,EAAC,MAAM,iDAAiD,CAAC;AACjG,OAAO,EAAC,qBAAqB,EAAC,MAAM,oBAAoB,CAAC;AACzD,OAAO,EAAC,kBAAkB,EAAC,MAAM,kCAAkC,CAAC;AACpE,OAAO,EAAC,iBAAiB,EAAC,MAAM,iCAAiC,CAAC;AAClE,OAAO,EAAC,qBAAqB,EAAC,MAAM,qCAAqC,CAAC;AAC1E,OAAO,EAAC,sBAAsB,EAAC,MAAM,sCAAsC,CAAC;AAM5E,MAAM,CAAC,MAAM,cAAc,GAA0C;IACjE,OAAO,EAAE,mBAAmB;IAC5B,QAAQ,EAAE,gCAAgC,CACtC,gDAAgD,EAChD,qBAAqB,CAAC,GAAG,CAAC,OAAO,CAAC,KAAK,CAC1C;IACD,OAAO,CAAC,KAAK;QACT,OAAO,KAAK;aACP,OAAO,CAAC,iBAAiB,CAAC;aAC1B,OAAO,CAAC,kBAAkB,CAAC;aAC3B,OAAO,CAAC,qBAAqB,CAAC;aAC9B,OAAO,CAAC,sBAAsB,CAAC,CAAC;IACzC,CAAC;IACD,KAAK,CAAC,OAAO;QACT,2DAA2D;IAC/D,CAAC;CACJ,CAAC"}
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectEstimateCommand.d.ts (generated, vendored, new file: 14 lines)
@@ -0,0 +1,14 @@
import { CommandModule } from "yargs";
import { BuildGpu } from "../../../../bindings/types.js";
type InspectEstimateCommand = {
    modelPath: string;
    header?: string[];
    gpu?: BuildGpu | "auto";
    gpuLayers?: number | "max";
    contextSize?: number | "train";
    embedding?: boolean;
    noMmap?: boolean;
    swaFullCache?: boolean;
};
export declare const InspectEstimateCommand: CommandModule<object, InspectEstimateCommand>;
export {};
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectEstimateCommand.js (generated, vendored, new file: 248 lines)
@@ -0,0 +1,248 @@
|
||||
import process from "process";
|
||||
import chalk from "chalk";
|
||||
import fs from "fs-extra";
|
||||
import { readGgufFileInfo } from "../../../../gguf/readGgufFileInfo.js";
|
||||
import { resolveHeaderFlag } from "../../../utils/resolveHeaderFlag.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { documentationPageUrls } from "../../../../config.js";
|
||||
import { printInfoLine } from "../../../utils/printInfoLine.js";
|
||||
import { renderModelCompatibilityPercentageWithColors } from "../../../utils/renderModelCompatibilityPercentageWithColors.js";
|
||||
import { getReadableContextSize } from "../../../../utils/getReadableContextSize.js";
|
||||
import { GgufInsights } from "../../../../gguf/insights/GgufInsights.js";
|
||||
import { getLlama } from "../../../../bindings/getLlama.js";
|
||||
import { LlamaLogLevel, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption } from "../../../../bindings/types.js";
|
||||
import { defaultTrainContextSizeForEstimationPurposes } from "../../../../gguf/insights/GgufInsightsConfigurationResolver.js";
|
||||
import { getGgufFileTypeName } from "../../../../gguf/utils/getGgufFileTypeName.js";
|
||||
import { getPrettyBuildGpuName } from "../../../../bindings/consts.js";
|
||||
import withOra from "../../../../utils/withOra.js";
|
||||
import { resolveModelArgToFilePathOrUrl } from "../../../../utils/resolveModelDestination.js";
|
||||
import { printModelDestination } from "../../../utils/printModelDestination.js";
|
||||
import { toBytes } from "../../../utils/toBytes.js";
|
||||
import { printDidYouMeanUri } from "../../../utils/resolveCommandGgufPath.js";
|
||||
import { isModelUri } from "../../../../utils/parseModelUri.js";
|
||||
export const InspectEstimateCommand = {
|
||||
command: "estimate [modelPath]",
|
||||
describe: withCliCommandDescriptionDocsUrl("Estimate the compatibility of a model with the current hardware", documentationPageUrls.CLI.Inspect.Estimate),
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.option("modelPath", {
|
||||
alias: ["m", "model", "path", "url", "uri"],
|
||||
type: "string",
|
||||
demandOption: true,
|
||||
description: "The path or URI of the GGUF file to use. If a URI is provided, the metadata will be read from the remote file without downloading the entire file.",
|
||||
group: "Required:"
|
||||
})
|
||||
.option("header", {
|
||||
alias: ["H"],
|
||||
type: "string",
|
||||
array: true,
|
||||
description: "Headers to use when reading a model file from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers.",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("gpu", {
|
||||
type: "string",
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: nodeLlamaCppGpuOptions,
|
||||
coerce: (value) => {
|
||||
if (value == null || value == "")
|
||||
return undefined;
|
||||
return parseNodeLlamaCppGpuOption(value);
|
||||
},
|
||||
defaultDescription: "Uses the latest local build, and fallbacks to \"auto\"",
|
||||
description: "Compute layer implementation type to use for llama.cpp. If omitted, uses the latest local build, and fallbacks to \"auto\"",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("gpuLayers", {
|
||||
alias: "gl",
|
||||
type: "number",
|
||||
description: "number of layers to store in VRAM. Set to `max` to use all the layers the model has",
|
||||
string: true,
|
||||
coerce: (value) => {
|
||||
if (value === "max")
|
||||
return -2;
|
||||
return parseInt(value);
|
||||
},
|
||||
default: -1,
|
||||
defaultDescription: "Automatically determined based on the available VRAM",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("contextSize", {
|
||||
alias: "c",
|
||||
type: "number",
|
||||
description: "Context size to use for the model context. Set to `max` or `train` to use the training context size. " +
|
||||
"Note that the train context size is not necessarily what you should use for inference, " +
|
||||
"and a big context size will use a lot of memory",
|
||||
string: true,
|
||||
coerce: (value) => {
|
||||
if (value === "max" || value === "train")
|
||||
return -2;
|
||||
return parseInt(value);
|
||||
},
|
||||
default: -1,
|
||||
defaultDescription: "Automatically determined based on the available VRAM",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("embedding", {
|
||||
alias: "e",
|
||||
type: "boolean",
|
||||
description: "Whether to estimate for creating an embedding context",
|
||||
default: false,
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("noMmap", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable mmap (memory-mapped file) usage"
|
||||
})
|
||||
.option("swaFullCache", {
|
||||
alias: "noSwa",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable SWA (Sliding Window Attention) on supported models"
|
||||
});
|
||||
},
|
||||
async handler({ modelPath: ggufPath, header: headerArg, gpu, gpuLayers, contextSize: contextSizeArg, embedding, noMmap, swaFullCache }) {
|
||||
if (gpuLayers === -1)
|
||||
gpuLayers = undefined;
|
||||
if (gpuLayers === -2)
|
||||
gpuLayers = "max";
|
||||
if (contextSizeArg === -1)
|
||||
contextSizeArg = undefined;
|
||||
if (contextSizeArg === -2)
|
||||
contextSizeArg = "train";
|
||||
const headers = resolveHeaderFlag(headerArg);
|
||||
const [resolvedModelDestination, resolvedGgufPath] = isModelUri(ggufPath)
|
||||
? await withOra({
|
||||
loading: chalk.blue("Resolving model URI"),
|
||||
success: chalk.blue("Resolved model URI"),
|
||||
fail: chalk.blue("Failed to resolve model URI"),
|
||||
noSuccessLiveStatus: true
|
||||
}, () => resolveModelArgToFilePathOrUrl(ggufPath, headers))
|
||||
: await resolveModelArgToFilePathOrUrl(ggufPath, headers);
|
||||
if (resolvedModelDestination.type === "file" && !await fs.pathExists(resolvedGgufPath)) {
|
||||
console.error(`${chalk.red("File does not exist:")} ${resolvedGgufPath}`);
|
||||
printDidYouMeanUri(ggufPath);
|
||||
process.exit(1);
|
||||
}
|
||||
const llama = gpu == null
|
||||
? await getLlama("lastBuild", {
|
||||
logLevel: LlamaLogLevel.error
|
||||
})
|
||||
: await getLlama({
|
||||
gpu,
|
||||
logLevel: LlamaLogLevel.error
|
||||
});
|
||||
const useMmap = !noMmap && llama.supportsMmap;
|
||||
printModelDestination(resolvedModelDestination);
|
||||
if (embedding)
|
||||
console.info(`${chalk.yellow("Estimating for an embedding context")}`);
|
||||
const ggufFileInfo = await withOra({
|
||||
loading: chalk.blue("Reading model metadata"),
|
||||
success: chalk.blue("Read model metadata"),
|
||||
fail: chalk.blue("Failed to read model metadata"),
|
||||
noSuccessLiveStatus: true
|
||||
}, async () => {
|
||||
return await readGgufFileInfo(resolvedGgufPath, {
|
||||
fetchHeaders: resolvedModelDestination.type === "file"
|
||||
? undefined
|
||||
: headers
|
||||
});
|
||||
});
|
||||
const ggufInsights = await GgufInsights.from(ggufFileInfo, llama);
|
||||
const contextSize = contextSizeArg === "train"
|
||||
? ggufInsights.trainContextSize ?? defaultTrainContextSizeForEstimationPurposes
|
||||
: contextSizeArg;
|
||||
async function resolveCompatibilityScore(flashAttention) {
|
||||
return await ggufInsights.configurationResolver.resolveAndScoreConfig({
|
||||
flashAttention,
|
||||
targetContextSize: contextSize,
|
||||
targetGpuLayers: gpuLayers,
|
||||
embeddingContext: embedding,
|
||||
useMmap,
|
||||
swaFullCache
|
||||
});
|
||||
}
|
||||
const [compatibilityScore, compatibilityScoreWithFlashAttention] = await Promise.all([
|
||||
resolveCompatibilityScore(false),
|
||||
resolveCompatibilityScore(true)
|
||||
]);
|
||||
const longestTitle = Math.max("GPU info".length, "Model info".length, "Resolved config".length, "With flash attention".length) + 1;
|
||||
if (llama.gpu !== false) {
|
||||
const [vramState, deviceNames] = await Promise.all([
|
||||
llama.getVramState(),
|
||||
llama.getGpuDeviceNames()
|
||||
]);
|
||||
printInfoLine({
|
||||
title: "GPU info",
|
||||
padTitle: longestTitle,
|
||||
info: [{
|
||||
title: "Type",
|
||||
value: getPrettyBuildGpuName(llama.gpu)
|
||||
}, {
|
||||
title: "VRAM",
|
||||
value: toBytes(vramState.total)
|
||||
}, {
|
||||
title: "Name",
|
||||
value: toOneLine(deviceNames.join(", "))
|
||||
}]
|
||||
});
|
||||
}
|
||||
printInfoLine({
|
||||
title: "Model info",
|
||||
padTitle: longestTitle,
|
||||
info: [{
|
||||
title: "Type",
|
||||
value: toOneLine([
|
||||
ggufFileInfo.metadata?.general?.architecture,
|
||||
ggufFileInfo.metadata?.general?.size_label,
|
||||
getGgufFileTypeName(ggufFileInfo.metadata.general?.file_type)
|
||||
].filter(Boolean).join(" "))
|
||||
}, {
|
||||
title: "Size",
|
||||
value: toBytes(ggufInsights.modelSize)
|
||||
}, {
|
||||
show: ggufInsights.trainContextSize != null,
|
||||
title: "Train context size",
|
||||
value: getReadableContextSize(ggufInsights.trainContextSize ?? 0)
|
||||
}]
|
||||
});
|
||||
console.info();
|
||||
logCompatibilityScore("Resolved config", longestTitle, compatibilityScore, ggufInsights, llama, false);
|
||||
logCompatibilityScore("With flash attention", longestTitle, compatibilityScoreWithFlashAttention, ggufInsights, llama, true);
|
||||
}
|
||||
};
|
||||
function logCompatibilityScore(title, padTitle, compatibilityScore, ggufInsights, llama, flashAttention) {
|
||||
printInfoLine({
|
||||
title,
|
||||
padTitle,
|
||||
separateLines: false,
|
||||
info: [{
|
||||
title: "",
|
||||
value: renderModelCompatibilityPercentageWithColors(compatibilityScore.compatibilityScore * 100) + " compatibility"
|
||||
}, {
|
||||
show: ggufInsights.trainContextSize != null,
|
||||
title: "Context size",
|
||||
value: getReadableContextSize(compatibilityScore.resolvedValues.contextSize)
|
||||
}, {
|
||||
show: llama.gpu !== false,
|
||||
title: "GPU layers",
|
||||
value: () => (compatibilityScore.resolvedValues.gpuLayers + "/" + ggufInsights.totalLayers + " " +
|
||||
chalk.dim(`(${Math.floor((compatibilityScore.resolvedValues.gpuLayers / ggufInsights.totalLayers) * 100)}%)`))
|
||||
}, {
|
||||
show: llama.gpu !== false,
|
||||
title: "VRAM usage",
|
||||
value: () => toBytes(compatibilityScore.resolvedValues.totalVramUsage)
|
||||
}, {
|
||||
title: "RAM usage",
|
||||
value: () => toBytes(compatibilityScore.resolvedValues.totalRamUsage)
|
||||
}, {
|
||||
show: flashAttention,
|
||||
title: "Flash attention",
|
||||
value: "enabled"
|
||||
}]
|
||||
});
|
||||
}
|
||||
function toOneLine(text) {
|
||||
return text.replaceAll("\n", chalk.gray("\\n"));
|
||||
}
|
||||
//# sourceMappingURL=InspectEstimateCommand.js.map
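Side note on the handler above: keyword values such as "max" and "train" are coerced to the numeric sentinels -2 (keyword) and -1 (default/unset) at parse time, then mapped back before use. A small illustrative sketch of that round-trip; the helper names are hypothetical and not part of the package.

// -1 means "not set" (resolve automatically later), -2 means the keyword form was passed.
function coerceContextSize(value: string): number {
    if (value === "max" || value === "train")
        return -2;
    return parseInt(value, 10);
}

function resolveContextSize(raw: number): number | "train" | undefined {
    if (raw === -1)
        return undefined; // let the resolver pick a size based on available memory
    if (raw === -2)
        return "train"; // use the model's training context size
    return raw;
}

// e.g. `--contextSize train` -> -2 -> "train"; `--contextSize 4096` -> 4096
console.info(resolveContextSize(coerceContextSize("train")));
console.info(resolveContextSize(coerceContextSize("4096")));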
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectEstimateCommand.js.map (generated, vendored, new file: 1 line)
File diff suppressed because one or more lines are too long
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGgufCommand.d.ts (generated, vendored, new file: 13 lines)
@@ -0,0 +1,13 @@
import { CommandModule } from "yargs";
type InspectGgufCommand = {
    modelPath: string;
    header?: string[];
    key?: string;
    noSplice: boolean;
    fullTensorInfo: boolean;
    fullMetadataArrays: boolean;
    plainJson: boolean;
    outputToJsonFile?: string;
};
export declare const InspectGgufCommand: CommandModule<object, InspectGgufCommand>;
export {};
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGgufCommand.js (generated, vendored, new file: 225 lines)
@@ -0,0 +1,225 @@
|
||||
import path from "path";
|
||||
import process from "process";
|
||||
import chalk from "chalk";
|
||||
import fs from "fs-extra";
|
||||
import { Template } from "@huggingface/jinja";
|
||||
import { readGgufFileInfo } from "../../../../gguf/readGgufFileInfo.js";
|
||||
import { prettyPrintObject } from "../../../../utils/prettyPrintObject.js";
|
||||
import { getGgufFileTypeName } from "../../../../gguf/utils/getGgufFileTypeName.js";
|
||||
import { resolveHeaderFlag } from "../../../utils/resolveHeaderFlag.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { documentationPageUrls } from "../../../../config.js";
|
||||
import withOra from "../../../../utils/withOra.js";
|
||||
import { resolveModelArgToFilePathOrUrl } from "../../../../utils/resolveModelDestination.js";
|
||||
import { printModelDestination } from "../../../utils/printModelDestination.js";
|
||||
import { getGgufMetadataKeyValue } from "../../../../gguf/utils/getGgufMetadataKeyValue.js";
|
||||
import { toBytes } from "../../../utils/toBytes.js";
|
||||
import { printDidYouMeanUri } from "../../../utils/resolveCommandGgufPath.js";
|
||||
import { isModelUri } from "../../../../utils/parseModelUri.js";
|
||||
const chatTemplateKey = ".chatTemplate";
|
||||
export const InspectGgufCommand = {
|
||||
command: "gguf [modelPath]",
|
||||
describe: withCliCommandDescriptionDocsUrl("Inspect a GGUF file", documentationPageUrls.CLI.Inspect.GGUF),
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.option("modelPath", {
|
||||
alias: ["m", "model", "path", "url", "uri"],
|
||||
type: "string",
|
||||
demandOption: true,
|
||||
description: "The path or URI of the GGUF file to inspect. If a URI is provided, the metadata will be read from the remote file without downloading the entire file.",
|
||||
group: "Required:"
|
||||
})
|
||||
.option("header", {
|
||||
alias: ["H"],
|
||||
type: "string",
|
||||
array: true,
|
||||
description: "Headers to use when reading a model file from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers.",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("key", {
|
||||
alias: ["k"],
|
||||
type: "string",
|
||||
description: "A single metadata key to print the value of. If not provided, all metadata will be printed. " +
|
||||
"If the key is `" + chatTemplateKey + "` then the chat template of the model will be formatted and printed.",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("noSplice", {
|
||||
alias: "s",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "When split files are detected, it reads the metadata of the first file and splices the tensorInfo from all the parts. Use this flag to disable that behavior and read only the given file",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("fullTensorInfo", {
|
||||
alias: "t",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Show the full tensor info",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("fullMetadataArrays", {
|
||||
alias: "ma",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print the full arrays in the metadata. Caution: those arrays can be extremely large and cover the entire terminal screen. Use with caution.",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("plainJson", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print the output as plain JSON with no formatting. Useful for piping the output to other commands. The output won't truncate any values, so it may be extremely large. Use with caution.",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("outputToJsonFile", {
|
||||
type: "string",
|
||||
description: "Path to a file to write the output to as JSON. The output won't truncate any values. The output won't be printed to the console",
|
||||
group: "Optional:"
|
||||
});
|
||||
},
|
||||
async handler({ modelPath: ggufPath, header: headerArg, key, noSplice, fullTensorInfo, fullMetadataArrays, plainJson, outputToJsonFile }) {
|
||||
const headers = resolveHeaderFlag(headerArg);
|
||||
const [resolvedModelDestination, resolvedGgufPath] = (!plainJson && isModelUri(ggufPath))
|
||||
? await withOra({
|
||||
loading: chalk.blue("Resolving model URI"),
|
||||
success: chalk.blue("Resolved model URI"),
|
||||
fail: chalk.blue("Failed to resolve model URI"),
|
||||
noSuccessLiveStatus: true
|
||||
}, () => resolveModelArgToFilePathOrUrl(ggufPath, headers))
|
||||
: await resolveModelArgToFilePathOrUrl(ggufPath, headers);
|
||||
if (resolvedModelDestination.type === "file" && !await fs.pathExists(resolvedGgufPath)) {
|
||||
console.error(`${chalk.red("File does not exist:")} ${resolvedGgufPath}`);
|
||||
printDidYouMeanUri(ggufPath);
|
||||
process.exit(1);
|
||||
}
|
||||
if (!plainJson)
|
||||
printModelDestination(resolvedModelDestination);
|
||||
const parsedMetadata = plainJson
|
||||
? await readGgufFileInfo(resolvedGgufPath, {
|
||||
fetchHeaders: resolvedModelDestination.type === "file"
|
||||
? undefined
|
||||
: headers,
|
||||
spliceSplitFiles: !noSplice
|
||||
})
|
||||
: await withOra({
|
||||
loading: chalk.blue("Reading model metadata"),
|
||||
success: chalk.blue("Read model metadata"),
|
||||
fail: chalk.blue("Failed to read model metadata"),
|
||||
noSuccessLiveStatus: true
|
||||
}, async () => {
|
||||
return await readGgufFileInfo(resolvedGgufPath, {
|
||||
fetchHeaders: resolvedModelDestination.type === "file"
|
||||
? undefined
|
||||
: headers,
|
||||
spliceSplitFiles: !noSplice
|
||||
});
|
||||
});
|
||||
removeAdditionalTensorInfoFields(parsedMetadata.fullTensorInfo);
|
||||
const fileTypeName = getGgufFileTypeName(parsedMetadata.metadata.general?.file_type);
|
||||
if (plainJson || outputToJsonFile != null) {
|
||||
const getOutputJson = () => {
|
||||
if (key != null) {
|
||||
const keyValue = key === chatTemplateKey
|
||||
? tryFormattingJinja(getGgufMetadataKeyValue(parsedMetadata.metadata, "tokenizer.chat_template"))
|
||||
: getGgufMetadataKeyValue(parsedMetadata.metadata, key);
|
||||
if (keyValue === undefined) {
|
||||
console.log(`Key not found: ${key}`);
|
||||
process.exit(1);
|
||||
}
|
||||
return JSON.stringify(keyValue, undefined, 4);
|
||||
}
|
||||
return JSON.stringify({
|
||||
splicedParts: parsedMetadata.splicedParts,
|
||||
version: parsedMetadata.version,
|
||||
fileType: fileTypeName,
|
||||
tensorCount: parsedMetadata.totalTensorCount,
|
||||
metadataSize: parsedMetadata.totalMetadataSize,
|
||||
tensorInfoSize: parsedMetadata.totalTensorInfoSize,
|
||||
metadata: parsedMetadata.metadata,
|
||||
tensorInfo: parsedMetadata.fullTensorInfo
|
||||
}, undefined, 4);
|
||||
};
|
||||
const outputJson = getOutputJson();
|
||||
if (outputToJsonFile != null) {
|
||||
const filePath = path.resolve(process.cwd(), outputToJsonFile);
|
||||
await fs.writeFile(filePath, outputJson, "utf8");
|
||||
console.info(`${chalk.yellow("JSON written to file:")} ${filePath}`);
|
||||
}
|
||||
else {
|
||||
console.info(outputJson);
|
||||
}
|
||||
}
|
||||
else if (key != null) {
|
||||
const keyValue = key === chatTemplateKey
|
||||
? tryFormattingJinja(getGgufMetadataKeyValue(parsedMetadata.metadata, "tokenizer.chat_template"))
|
||||
: getGgufMetadataKeyValue(parsedMetadata.metadata, key);
|
||||
if (keyValue === undefined) {
|
||||
console.log(`${chalk.red("Metadata key not found:")} ${key}`);
|
||||
process.exit(1);
|
||||
}
|
||||
const metadataPrettyPrintOptions = {
|
||||
maxArrayValues: fullMetadataArrays
|
||||
? undefined
|
||||
: 10,
|
||||
useNumberGrouping: true,
|
||||
maxArrayItemsWidth: process.stdout.columns - 1
|
||||
};
|
||||
console.info(`${chalk.yellow("Metadata key:")} ${prettyPrintObject(key)}`);
|
||||
console.info(`${chalk.yellow("Metadata:")} ${typeof keyValue === "string"
|
||||
? keyValue
|
||||
: prettyPrintObject(keyValue, undefined, metadataPrettyPrintOptions)}`);
|
||||
}
|
||||
else {
|
||||
const metadataPrettyPrintOptions = {
|
||||
maxArrayValues: fullMetadataArrays
|
||||
? undefined
|
||||
: 10,
|
||||
useNumberGrouping: true,
|
||||
maxArrayItemsWidth: process.stdout.columns - 1
|
||||
};
|
||||
const tensorInfoPrettyPrintOptions = {
|
||||
maxArrayValues: fullTensorInfo
|
||||
? undefined
|
||||
: 4,
|
||||
useNumberGrouping: true,
|
||||
maxArrayItemsWidth: process.stdout.columns - 1,
|
||||
multilineObjects: false
|
||||
};
|
||||
const numberLocaleFormattingOptions = {
|
||||
style: "decimal",
|
||||
useGrouping: true
|
||||
};
|
||||
if (parsedMetadata.splicedParts > 1)
|
||||
console.info(`${chalk.yellow("Spliced parts:")} ${parsedMetadata.splicedParts}`);
|
||||
console.info(`${chalk.yellow("GGUF version:")} ${parsedMetadata.version}`);
|
||||
console.info(`${chalk.yellow("Tensor count:")} ${parsedMetadata.totalTensorCount.toLocaleString("en-US", numberLocaleFormattingOptions)}`);
|
||||
console.info(`${chalk.yellow("Metadata size:")} ${toBytes(parsedMetadata.totalMetadataSize)}`);
|
||||
console.info(`${chalk.yellow("Tensor info size:")} ${toBytes(parsedMetadata.totalTensorInfoSize)}`);
|
||||
console.info(`${chalk.yellow("File type:")} ${fileTypeName ?? ""} ${chalk.white(`(${parsedMetadata.metadata.general?.file_type})`)}`);
|
||||
console.info(`${chalk.yellow("Metadata:")} ${prettyPrintObject(parsedMetadata.metadata, undefined, metadataPrettyPrintOptions)}`);
|
||||
console.info(`${chalk.yellow("Tensor info:")} ${prettyPrintObject(parsedMetadata.fullTensorInfo, undefined, tensorInfoPrettyPrintOptions)}`);
|
||||
}
|
||||
}
|
||||
};
|
||||
// these fields are added by the parser for ease of use and are not found in the gguf file itself
|
||||
function removeAdditionalTensorInfoFields(tensorInfo) {
|
||||
if (tensorInfo == null)
|
||||
return;
|
||||
for (const tensor of tensorInfo) {
|
||||
delete tensor.fileOffset;
|
||||
delete tensor.filePart;
|
||||
}
|
||||
}
|
||||
function tryFormattingJinja(template) {
|
||||
if (typeof template !== "string")
|
||||
return template;
|
||||
try {
|
||||
const parsedTemplate = new Template(template);
|
||||
return parsedTemplate.format({
|
||||
indent: 4
|
||||
}) ?? template;
|
||||
}
|
||||
catch (err) {
|
||||
return template;
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=InspectGgufCommand.js.map
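For reference, the same metadata read can likely be done programmatically, since readGgufFileInfo appears among node-llama-cpp's public exports; this is a hedged sketch only, and the model path and exact option shapes are illustrative and may differ between versions.

import { readGgufFileInfo } from "node-llama-cpp";

// read only the header/metadata of a local GGUF file (path is illustrative)
const fileInfo = await readGgufFileInfo("./models/example.gguf");

console.info(fileInfo.metadata?.general?.architecture); // e.g. "llama"
console.info(fileInfo.version); // GGUF format version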
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map (generated, vendored, new file: 1 line)
File diff suppressed because one or more lines are too long
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGpuCommand.d.ts (generated, vendored, new file: 4 lines)
@@ -0,0 +1,4 @@
import { CommandModule } from "yargs";
type InspectGpuCommand = {};
export declare const InspectGpuCommand: CommandModule<object, InspectGpuCommand>;
export {};
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGpuCommand.js (generated, vendored, new file: 249 lines)
@@ -0,0 +1,249 @@
|
||||
import os from "os";
|
||||
import chalk from "chalk";
|
||||
import { getLlamaForOptions } from "../../../../bindings/getLlama.js";
|
||||
import { detectAvailableComputeLayers } from "../../../../bindings/utils/detectAvailableComputeLayers.js";
|
||||
import { getPlatform } from "../../../../bindings/utils/getPlatform.js";
|
||||
import { LlamaLogLevel } from "../../../../bindings/types.js";
|
||||
import { getPrettyBuildGpuName } from "../../../../bindings/consts.js";
|
||||
import { getModuleVersion } from "../../../../utils/getModuleVersion.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { builtinLlamaCppGitHubRepo, documentationPageUrls } from "../../../../config.js";
|
||||
import { getPlatformInfo } from "../../../../bindings/utils/getPlatformInfo.js";
|
||||
import { getLinuxDistroInfo } from "../../../../bindings/utils/getLinuxDistroInfo.js";
|
||||
import { isRunningUnderRosetta } from "../../../utils/isRunningUnderRosetta.js";
|
||||
import { toBytes } from "../../../utils/toBytes.js";
|
||||
import { getBinariesGithubRelease } from "../../../../bindings/utils/binariesGithubRelease.js";
|
||||
import { getClonedLlamaCppRepoReleaseInfo } from "../../../../bindings/utils/cloneLlamaCppRepo.js";
|
||||
export const InspectGpuCommand = {
|
||||
command: "gpu",
|
||||
describe: withCliCommandDescriptionDocsUrl("Show the detected GPU types and their VRAM usage", documentationPageUrls.CLI.Inspect.GPU),
|
||||
async handler() {
|
||||
const platform = getPlatform();
|
||||
const arch = process.arch;
|
||||
const availableComputeLayers = await detectAvailableComputeLayers({ platform });
|
||||
const gpusToLogVramUsageOf = [];
|
||||
const gpuToLlama = new Map();
|
||||
let lastLlama;
|
||||
async function loadLlamaForGpu(gpu) {
|
||||
if (!gpuToLlama.has(gpu)) {
|
||||
const loadedLlama = await getLlamaForGpu(gpu);
|
||||
gpuToLlama.set(gpu, loadedLlama);
|
||||
if (loadedLlama != null)
|
||||
lastLlama = loadedLlama;
|
||||
}
|
||||
return gpuToLlama.get(gpu);
|
||||
}
|
||||
if (platform === "linux") {
|
||||
const linuxDistroInfo = await getLinuxDistroInfo();
|
||||
if (linuxDistroInfo.prettyName !== "")
|
||||
console.info(`${chalk.yellow("OS:")} ${linuxDistroInfo.prettyName} ${chalk.dim("(" + os.arch() + ")")}`);
|
||||
else
|
||||
console.info(`${chalk.yellow("OS:")} ${linuxDistroInfo.name || os.type()} ${linuxDistroInfo.version || os.release()} ${chalk.dim("(" + os.arch() + ")")}`);
|
||||
}
|
||||
else {
|
||||
const platformInfo = await getPlatformInfo();
|
||||
const osName = platformInfo.name === "Unknown"
|
||||
? os.type()
|
||||
: platformInfo.name;
|
||||
console.info(`${chalk.yellow("OS:")} ${osName} ${platformInfo.version} ${chalk.dim("(" + os.arch() + ")")}`);
|
||||
}
|
||||
if (process.versions.node != null)
|
||||
console.info(`${chalk.yellow("Node:")} ${process.versions.node} ${chalk.dim("(" + arch + ")")}`);
|
||||
if (process.versions.bun != null)
|
||||
console.info(`${chalk.yellow("Bun:")} ${process.versions.bun}`);
|
||||
const typeScriptVersion = await getInstalledTypescriptVersion();
|
||||
if (typeScriptVersion != null)
|
||||
console.info(`${chalk.yellow("TypeScript:")} ${typeScriptVersion}`);
|
||||
try {
|
||||
const moduleVersion = await getModuleVersion();
|
||||
if (moduleVersion != null) {
|
||||
console.info();
|
||||
console.info(`${chalk.yellow("node-llama-cpp:")} ${moduleVersion}`);
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
// do nothing
|
||||
}
|
||||
try {
|
||||
const prebuiltBinariesRelease = await getBinariesGithubRelease();
|
||||
console.info(`${chalk.yellow("Prebuilt binaries:")} ${prebuiltBinariesRelease}`);
|
||||
}
|
||||
catch (err) {
|
||||
// do nothing
|
||||
}
|
||||
try {
|
||||
const clonedLlamaCppRelease = await getClonedLlamaCppRepoReleaseInfo();
|
||||
if (clonedLlamaCppRelease != null)
|
||||
console.info(`${chalk.yellow("Cloned source:")} ${clonedLlamaCppRelease.tag}` + (clonedLlamaCppRelease.llamaCppGithubRepo !== builtinLlamaCppGitHubRepo
|
||||
? ` (${clonedLlamaCppRelease.llamaCppGithubRepo})`
|
||||
: ""));
|
||||
}
|
||||
catch (err) {
|
||||
// do nothing
|
||||
}
|
||||
console.info();
|
||||
if (platform === "mac" && arch === "arm64") {
|
||||
const llama = await loadLlamaForGpu("metal");
|
||||
if (llama == null) {
|
||||
console.info(`${chalk.yellow("Metal:")} ${chalk.red("Metal is detected, but using it failed")}`);
|
||||
}
|
||||
else {
|
||||
console.info(`${chalk.yellow("Metal:")} ${chalk.green("available")}`);
|
||||
gpusToLogVramUsageOf.push("metal");
|
||||
}
|
||||
}
|
||||
else if (platform === "mac") {
|
||||
if (await isRunningUnderRosetta()) {
|
||||
console.error(chalk.red("llama.cpp is not supported under Rosetta on Apple Silicone Macs. " +
|
||||
"Ensure that you're using a native arm64 node.js installation."));
|
||||
console.error("process.platform: " + process.platform + ", process.arch: " + process.arch);
|
||||
console.error("troubleshooting: " + documentationPageUrls.troubleshooting.RosettaIllegalHardwareInstruction);
|
||||
}
|
||||
console.info(`${chalk.yellow("Metal:")} ${chalk.red("not supported by llama.cpp on Intel Macs")}`);
|
||||
const llama = await loadLlamaForGpu(false);
|
||||
if (llama == null) {
|
||||
console.info(`${chalk.yellow("CPU:")} ${chalk.red("Loading a binding with only CPU support failed")}`);
|
||||
}
|
||||
}
|
||||
if (availableComputeLayers.cuda.hasNvidiaDriver && !availableComputeLayers.cuda.hasCudaRuntime) {
|
||||
console.info(`${chalk.yellow("CUDA:")} ${chalk.red("NVIDIA driver is installed, but CUDA runtime is not")}`);
|
||||
console.info(chalk.yellow("To resolve errors related to CUDA, see the CUDA guide: ") + documentationPageUrls.CUDA);
|
||||
}
|
||||
else if (availableComputeLayers.cuda.hasCudaRuntime && !availableComputeLayers.cuda.hasNvidiaDriver) {
|
||||
console.info(`${chalk.yellow("CUDA:")} ${chalk.red("CUDA runtime is installed, but NVIDIA driver is not")}`);
|
||||
console.info(chalk.yellow("To resolve errors related to CUDA, see the CUDA guide: ") + documentationPageUrls.CUDA);
|
||||
}
|
||||
else if (availableComputeLayers.cuda.hasCudaRuntime && availableComputeLayers.cuda.hasNvidiaDriver) {
|
||||
const llama = await loadLlamaForGpu("cuda");
|
||||
if (llama == null) {
|
||||
console.info(`${chalk.yellow("CUDA:")} ${chalk.red("CUDA is detected, but using it failed")}`);
|
||||
console.info(chalk.yellow("To resolve errors related to CUDA, see the CUDA guide: ") + documentationPageUrls.CUDA);
|
||||
}
|
||||
else {
|
||||
console.info(`${chalk.yellow("CUDA:")} ${chalk.green("available")}`);
|
||||
gpusToLogVramUsageOf.push("cuda");
|
||||
if (llama._hadErrorLogs)
|
||||
console.info(chalk.yellow("To resolve errors related to CUDA, see the CUDA guide: ") + documentationPageUrls.CUDA);
|
||||
}
|
||||
}
|
||||
if (availableComputeLayers.vulkan) {
|
||||
const llama = await loadLlamaForGpu("vulkan");
|
||||
if (llama == null) {
|
||||
console.info(`${chalk.yellow("Vulkan:")} ${chalk.red("Vulkan is detected, but using it failed")}`);
|
||||
console.info(chalk.yellow("To resolve errors related to Vulkan, see the Vulkan guide: ") + documentationPageUrls.Vulkan);
|
||||
}
|
||||
else {
|
||||
console.info(`${chalk.yellow("Vulkan:")} ${chalk.green("available")}`);
|
||||
gpusToLogVramUsageOf.push("vulkan");
|
||||
if (llama._hadErrorLogs)
|
||||
console.info(chalk.yellow("To resolve errors related to Vulkan, see the Vulkan guide: ") + documentationPageUrls.Vulkan);
|
||||
}
|
||||
}
|
||||
if (lastLlama == null)
|
||||
await loadLlamaForGpu(false);
|
||||
for (const gpu of gpusToLogVramUsageOf) {
|
||||
const llama = gpuToLlama.get(gpu);
|
||||
if (llama == null || llama.gpu !== gpu)
|
||||
continue;
|
||||
console.info();
|
||||
await logGpuVramUsage(llama);
|
||||
}
|
||||
console.info();
|
||||
await logRamUsage(lastLlama?.cpuMathCores);
|
||||
if (lastLlama != null) {
|
||||
await logSwapUsage(lastLlama);
|
||||
console.info(`${chalk.yellow("mmap:")} ${lastLlama.supportsMmap ? "supported" : "unsupported"}`);
|
||||
}
|
||||
}
|
||||
};
|
||||
async function getLlamaForGpu(gpu) {
|
||||
try {
|
||||
// if you're reading this line, then you're probably looking for the `dryRun` option on `getLlama`
|
||||
return await getLlamaForOptions({
|
||||
gpu: gpu,
|
||||
build: "never",
|
||||
progressLogs: false,
|
||||
logLevel: LlamaLogLevel.warn,
|
||||
vramPadding: 0
|
||||
}, {
|
||||
skipLlamaInit: true,
|
||||
pipeBinaryTestErrorLogs: true
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
async function logGpuVramUsage(llama) {
|
||||
try {
|
||||
const gpuName = getPrettyBuildGpuName(llama.gpu);
|
||||
const vramState = await llama.getVramState();
|
||||
const gpuDeviceNames = await llama.getGpuDeviceNames();
|
||||
if (gpuDeviceNames.length > 0)
|
||||
console.info(`${chalk.yellow(`${gpuName} device${gpuDeviceNames.length > 1 ? "s" : ""}:`)} ${gpuDeviceNames.join(", ")}`);
|
||||
console.info(`${chalk.yellow(`${gpuName} used VRAM:`)} ${getPercentageString(vramState.used, vramState.total)}% ${chalk.gray("(" + toBytes(vramState.used) + "/" + toBytes(vramState.total) + ")")}`);
|
||||
console.info(`${chalk.yellow(`${gpuName} free VRAM:`)} ${getPercentageString(vramState.free, vramState.total)}% ${chalk.gray("(" + toBytes(vramState.free) + "/" + toBytes(vramState.total) + ")")}`);
|
||||
if (vramState.unifiedSize > 0)
|
||||
console.info(`${chalk.yellow(`${gpuName} unified memory:`)} ${toBytes(vramState.unifiedSize)} ${chalk.gray("(" + getPercentageString(vramState.unifiedSize, vramState.total) + "%)")}`);
|
||||
}
|
||||
catch (err) { }
|
||||
}
|
||||
async function logRamUsage(cpuMathCores) {
|
||||
const totalMemory = os.totalmem();
|
||||
const freeMemory = os.freemem();
|
||||
const usedMemory = totalMemory - freeMemory;
|
||||
const cpuDeviceNames = Array.from(new Set(os.cpus()
|
||||
.map((cpu) => (cpu.model?.trim?.() ?? ""))
|
||||
.filter((deviceName) => deviceName.length > 0)));
|
||||
if (cpuDeviceNames.length > 0)
|
||||
console.info(`${chalk.yellow("CPU model" + (cpuDeviceNames.length > 1 ? "s" : "") + ":")} ${cpuDeviceNames.join(", ")}`);
|
||||
if (cpuMathCores != null)
|
||||
console.info(`${chalk.yellow("Math cores:")} ${cpuMathCores}`);
|
||||
console.info(`${chalk.yellow("Used RAM:")} ${getPercentageString(usedMemory, totalMemory)}% ${chalk.gray("(" + toBytes(usedMemory) + "/" + toBytes(totalMemory) + ")")}`);
|
||||
console.info(`${chalk.yellow("Free RAM:")} ${getPercentageString(freeMemory, totalMemory)}% ${chalk.gray("(" + toBytes(freeMemory) + "/" + toBytes(totalMemory) + ")")}`);
|
||||
}
|
||||
async function logSwapUsage(llama) {
|
||||
const swapState = await llama.getSwapState();
|
||||
console.info(`${chalk.yellow("Used swap:")} ${getPercentageString(swapState.used, swapState.allocated)}% ${chalk.gray("(" + toBytes(swapState.used) + "/" + toBytes(swapState.allocated) + ")")}`);
|
||||
console.info(`${chalk.yellow("Max swap size:")} ${swapState.maxSize === Infinity ? "dynamic" : toBytes(swapState.maxSize)}`);
|
||||
}
|
||||
function getPercentageString(amount, total) {
|
||||
if (total === 0)
|
||||
return "0";
|
||||
return String(Math.floor((amount / total) * 100 * 100) / 100);
|
||||
}
|
||||
async function getInstalledTypescriptVersion() {
|
||||
try {
|
||||
const ts = await import("typescript");
|
||||
const version = ts?.version ?? ts?.default?.version;
|
||||
if (version != null && typeof version === "string" && version.length > 0)
|
||||
return version;
|
||||
return null;
|
||||
}
|
||||
catch (err) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
// // simple script to copy console logs as ansi to clipboard. Used to update the documentation
|
||||
// import {spawn} from "child_process";
|
||||
// const pendingLog: string[] = [];
|
||||
// const originalConsoleInfo = console.info;
|
||||
// console.info = function info(...args: any[]) {
|
||||
// originalConsoleInfo.call(console, ...args);
|
||||
// pendingLog.push(args.join(" "));
|
||||
// };
|
||||
//
|
||||
// function copyLogs() {
|
||||
// const res = pendingLog.join("\n");
|
||||
//
|
||||
// pbcopy(res);
|
||||
// originalConsoleInfo.call(console, "Copied logs to clipboard");
|
||||
// }
|
||||
// function pbcopy(text: string) {
|
||||
// const pbcopyProcess = spawn("pbcopy");
|
||||
// pbcopyProcess.stdin.write(text);
|
||||
// pbcopyProcess.stdin.end();
|
||||
// }
|
||||
//
|
||||
// process.on("exit", copyLogs);
|
||||
//# sourceMappingURL=InspectGpuCommand.js.map
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map (generated, vendored, new file: 1 line)
File diff suppressed because one or more lines are too long
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts (generated, vendored, new file: 23 lines)
@@ -0,0 +1,23 @@
import { CommandModule } from "yargs";
import { BuildGpu } from "../../../../bindings/types.js";
type InspectMeasureCommand = {
    modelPath?: string;
    header?: string[];
    gpu?: BuildGpu | "auto";
    minLayers: number;
    maxLayers?: number;
    minContextSize: number;
    maxContextSize?: number;
    flashAttention?: boolean;
    swaFullCache?: boolean;
    batchSize?: number;
    measures: number;
    memory: "vram" | "ram" | "all";
    noMmap: boolean;
    noDirectIo: boolean;
    printHeaderBeforeEachLayer?: boolean;
    evaluateText?: string;
    repeatEvaluateText?: number;
};
export declare const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>;
export {};
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectMeasureCommand.js (generated, vendored, new file: 828 lines)
@@ -0,0 +1,828 @@
|
||||
import path from "path";
|
||||
import process from "process";
|
||||
import { fileURLToPath } from "url";
|
||||
import { fork } from "node:child_process";
|
||||
import os from "os";
|
||||
import chalk from "chalk";
|
||||
import stripAnsi from "strip-ansi";
|
||||
import { readGgufFileInfo } from "../../../../gguf/readGgufFileInfo.js";
|
||||
import { resolveCommandGgufPath } from "../../../utils/resolveCommandGgufPath.js";
|
||||
import { getLlama } from "../../../../bindings/getLlama.js";
|
||||
import { LlamaLogLevel, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption } from "../../../../bindings/types.js";
|
||||
import { getConsoleLogPrefix } from "../../../../utils/getConsoleLogPrefix.js";
|
||||
import { ConsoleTable } from "../../../utils/ConsoleTable.js";
|
||||
import { GgufInsights } from "../../../../gguf/insights/GgufInsights.js";
|
||||
import { resolveHeaderFlag } from "../../../utils/resolveHeaderFlag.js";
|
||||
import { getPrettyBuildGpuName } from "../../../../bindings/consts.js";
|
||||
import { getReadablePath } from "../../../utils/getReadablePath.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { documentationPageUrls } from "../../../../config.js";
|
||||
import { toBytes } from "../../../utils/toBytes.js";
|
||||
import { padSafeContextSize } from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js";
|
||||
import { getPlatform } from "../../../../bindings/utils/getPlatform.js";
|
||||
export const InspectMeasureCommand = {
|
||||
command: "measure [modelPath]",
|
||||
describe: withCliCommandDescriptionDocsUrl("Measure VRAM consumption of a GGUF model file with all possible combinations of gpu layers and context sizes", documentationPageUrls.CLI.Inspect.Measure),
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.option("modelPath", {
|
||||
alias: ["m", "model", "path", "url", "uri"],
|
||||
type: "string",
|
||||
description: "Model file to use for the measurements. Can be a path to a local file or a URI of a model file to download. Leave empty to choose from a list of recommended models"
|
||||
})
|
||||
.option("header", {
|
||||
alias: ["H"],
|
||||
type: "string",
|
||||
array: true,
|
||||
description: "Headers to use when downloading a model from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers."
|
||||
})
|
||||
.option("gpu", {
|
||||
type: "string",
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: nodeLlamaCppGpuOptions,
|
||||
coerce: (value) => {
|
||||
if (value == null || value == "")
|
||||
return undefined;
|
||||
return parseNodeLlamaCppGpuOption(value);
|
||||
},
|
||||
defaultDescription: "Uses the latest local build, and fallbacks to \"auto\"",
|
||||
description: "Compute layer implementation type to use for llama.cpp. If omitted, uses the latest local build, and fallbacks to \"auto\""
|
||||
})
|
||||
.option("minLayers", {
|
||||
alias: "mnl",
|
||||
type: "number",
|
||||
default: 1,
|
||||
description: "Minimum number of layers to offload to the GPU"
|
||||
})
|
||||
.option("maxLayers", {
|
||||
alias: "mxl",
|
||||
type: "number",
|
||||
default: -1,
|
||||
defaultDescription: "All layers",
|
||||
description: "Maximum number of layers to offload to the GPU"
|
||||
})
|
||||
.option("minContextSize", {
|
||||
alias: "mncs",
|
||||
type: "number",
|
||||
default: 512,
|
||||
description: "Minimum context size"
|
||||
})
|
||||
.option("maxContextSize", {
|
||||
alias: "mxcs",
|
||||
type: "number",
|
||||
default: -1,
|
||||
defaultDescription: "Train context size",
|
||||
description: "Maximum context size"
|
||||
})
|
||||
.option("flashAttention", {
|
||||
alias: "fa",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Enable flash attention for the context"
|
||||
})
|
||||
.option("swaFullCache", {
|
||||
alias: "noSwa",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable SWA (Sliding Window Attention) on supported models"
|
||||
})
|
||||
.option("batchSize", {
|
||||
alias: "b",
|
||||
type: "number",
|
||||
description: "Batch size to use for the model context"
|
||||
})
|
||||
.option("measures", {
|
||||
alias: "n",
|
||||
type: "number",
|
||||
default: 10,
|
||||
description: "Number of context size measures to take for each gpu layers count"
|
||||
})
|
||||
.option("memory", {
|
||||
type: "string",
|
||||
choices: ["vram", "ram", "all"],
|
||||
default: "vram",
|
||||
description: "Type of memory to measure"
|
||||
})
|
||||
.option("noMmap", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable mmap (memory-mapped file) usage"
|
||||
})
|
||||
.option("noDirectIo", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable Direct I/O usage when available"
|
||||
})
|
||||
.option("printHeaderBeforeEachLayer", {
|
||||
alias: "ph",
|
||||
type: "boolean",
|
||||
default: true,
|
||||
description: "Print header before each layer's measures"
|
||||
})
|
||||
.option("evaluateText", {
|
||||
alias: ["evaluate", "et"],
|
||||
type: "string",
|
||||
description: "Text to evaluate with the model"
|
||||
})
|
||||
.option("repeatEvaluateText", {
|
||||
alias: ["repeatEvaluate", "ret"],
|
||||
type: "number",
|
||||
default: 1,
|
||||
description: "Number of times to repeat the evaluation text before sending it for evaluation, in order to make it longer"
|
||||
});
|
||||
},
|
||||
async handler({ modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, swaFullCache, batchSize, measures = 10, memory: measureMemoryType, noMmap, noDirectIo, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }) {
|
||||
if (maxLayers === -1)
|
||||
maxLayers = undefined;
|
||||
if (maxContextSize === -1)
|
||||
maxContextSize = undefined;
|
||||
if (minLayers < 1)
|
||||
minLayers = 1;
|
||||
const exitAfterEachMeasurement = measureMemoryType === "ram" || measureMemoryType === "all";
|
||||
const headers = resolveHeaderFlag(headerArg);
|
||||
// ensure a llama build is available
|
||||
const llama = gpu == null
|
||||
? await getLlama("lastBuild", {
|
||||
logLevel: LlamaLogLevel.error
|
||||
})
|
||||
: await getLlama({
|
||||
gpu,
|
||||
logLevel: LlamaLogLevel.error
|
||||
});
|
||||
const platform = getPlatform();
|
||||
const useMmap = !noMmap && llama.supportsMmap;
|
||||
const useDirectIo = !noDirectIo;
|
||||
const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, {
|
||||
flashAttention, swaFullCache, useMmap
|
||||
});
|
||||
console.info(`${chalk.yellow("File:")} ${getReadablePath(resolvedGgufPath)}`);
|
||||
console.info(`${chalk.yellow("GPU:")} ${getPrettyBuildGpuName(llama.gpu)}${gpu == null ? chalk.gray(" (last build)") : ""}`);
|
||||
console.info(chalk.yellow("mmap:") + " " + (!llama.supportsMmap
|
||||
? "unsupported"
|
||||
: useMmap
|
||||
? "enabled"
|
||||
: "disabled"));
|
||||
if (platform !== "mac") // Direct I/O is not supported on macOS
|
||||
console.info(chalk.yellow("Direct I/O:") + " " + (useDirectIo
|
||||
? "enabled"
|
||||
: "disabled"));
|
||||
if (measureMemoryType === "ram" || measureMemoryType === "all")
|
||||
console.warn(chalk.yellow("RAM measurements are greatly inaccurate due to OS optimizations that prevent released memory from being immediately available"));
|
||||
console.info();
|
||||
const ggufMetadata = await readGgufFileInfo(resolvedGgufPath, {
|
||||
sourceType: "filesystem"
|
||||
});
|
||||
const ggufInsights = await GgufInsights.from(ggufMetadata, llama);
|
||||
const totalVram = (await llama.getVramState()).total;
|
||||
const totalRam = os.totalmem();
|
||||
let lastGpuLayers = maxLayers ?? ggufInsights.totalLayers;
|
||||
let previousContextSizeCheck = undefined;
|
||||
const measureTable = getMeasureTable(measureMemoryType);
|
||||
measureTable.logHeader({ drawRowSeparator: !printHeaderBeforeEachLayer });
|
||||
while (lastGpuLayers >= (minLayers ?? 0)) {
|
||||
let printedAlreadyWithThisProcess = false;
|
||||
let hadSuccessInThisProcess = false;
|
||||
const getNewProccessValue = () => {
|
||||
if (printedAlreadyWithThisProcess)
|
||||
return undefined;
|
||||
printedAlreadyWithThisProcess = true;
|
||||
return chalk.green("*");
|
||||
};
|
||||
const done = await measureModel({
|
||||
modelPath: resolvedGgufPath,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
gpu: gpu == null
|
||||
? undefined
|
||||
: llama.gpu,
|
||||
maxGpuLayers: lastGpuLayers,
|
||||
minGpuLayers: minLayers,
|
||||
initialMaxContextSize: previousContextSizeCheck,
|
||||
maxContextSize,
|
||||
minContextSize,
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
batchSize,
|
||||
tests: measures,
|
||||
evaluateText: evaluateText == null
|
||||
? undefined
|
||||
: evaluateText.repeat(repeatEvaluateText ?? 1),
|
||||
exitAfterMeasurement: exitAfterEachMeasurement,
|
||||
onInfo({ gpuLayers, result }) {
|
||||
if (lastGpuLayers !== gpuLayers) {
|
||||
lastGpuLayers = gpuLayers;
|
||||
previousContextSizeCheck = undefined;
|
||||
measureTable.logLine({});
|
||||
if (printHeaderBeforeEachLayer)
|
||||
measureTable.logHeader({ drawRowSeparator: false });
|
||||
}
|
||||
if (result.type === "crash") {
|
||||
if (!hadSuccessInThisProcess) {
|
||||
measureTable.logLine({
|
||||
newProcess: getNewProccessValue(),
|
||||
type: chalk.redBright("Crash"),
|
||||
gpuLayers: String(lastGpuLayers),
|
||||
contextSize: previousContextSizeCheck != null
|
||||
? String(previousContextSizeCheck)
|
||||
: chalk.red(result.result),
|
||||
estimatedModelVram: previousContextSizeCheck == null
|
||||
? undefined
|
||||
: chalk.red(result.result)
|
||||
});
|
||||
lastGpuLayers--;
|
||||
}
|
||||
}
|
||||
else if (result.type === "error") {
|
||||
previousContextSizeCheck = result.contextSize;
|
||||
hadSuccessInThisProcess = true;
|
||||
measureTable.logLine({
|
||||
newProcess: getNewProccessValue(),
|
||||
type: chalk.red("Error"),
|
||||
gpuLayers: String(lastGpuLayers),
|
||||
contextSize: previousContextSizeCheck != null
|
||||
? String(previousContextSizeCheck)
|
||||
: chalk.red(result.error),
|
||||
estimatedModelVram: previousContextSizeCheck == null
|
||||
? undefined
|
||||
: chalk.red(result.error)
|
||||
});
|
||||
}
|
||||
else if (result.type === "success") {
|
||||
previousContextSizeCheck = result.contextSize;
|
||||
hadSuccessInThisProcess = true;
|
||||
const modelResourceEstimation = ggufInsights.estimateModelResourceRequirements({
|
||||
gpuLayers: lastGpuLayers,
|
||||
useMmap
|
||||
});
|
||||
const modelVramEstimation = modelResourceEstimation.gpuVram;
|
||||
const modelVramEstimationDiffBytes = (modelVramEstimation < result.modelVramUsage ? "-" : "") +
|
||||
toBytes(Math.abs(result.modelVramUsage - modelVramEstimation));
|
||||
const modelVramEstimationDiffText = modelVramEstimationDiffBytes.padEnd(9, " ") + " " +
|
||||
padStartAnsi("(" + renderDiffPercentageWithColors(((modelVramEstimation / result.modelVramUsage) - 1) * 100) + ")", 9);
|
||||
const modelRamEstimation = modelResourceEstimation.cpuRam;
|
||||
const modelRamEstimationDiffBytes = (modelRamEstimation < result.modelRamUsage ? "-" : "") +
|
||||
toBytes(Math.abs(result.modelRamUsage - modelRamEstimation));
|
||||
const modelRamEstimationDiffText = modelRamEstimationDiffBytes.padEnd(9, " ") + " " +
|
||||
padStartAnsi("(" + renderDiffPercentageWithColors(((modelRamEstimation / result.modelRamUsage) - 1) * 100) + ")", 9);
|
||||
const contextResourceEstimation = previousContextSizeCheck == null
|
||||
? undefined
|
||||
: ggufInsights.estimateContextResourceRequirements({
|
||||
contextSize: previousContextSizeCheck,
|
||||
modelGpuLayers: lastGpuLayers,
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
batchSize
|
||||
});
|
||||
const contextVramEstimation = contextResourceEstimation?.gpuVram;
|
||||
const contextVramEstimationDiffBytes = (result.contextVramUsage == null || contextVramEstimation == null)
|
||||
? undefined
|
||||
: ((contextVramEstimation < result.contextVramUsage ? "-" : "") +
|
||||
toBytes(Math.abs(result.contextVramUsage - contextVramEstimation)));
|
||||
const contextVramEstimationDiffText = (contextVramEstimation == null || contextVramEstimationDiffBytes == null || result.contextVramUsage == null)
|
||||
? undefined
|
||||
: (contextVramEstimationDiffBytes.padEnd(9, " ") + " " +
|
||||
padStartAnsi("(" + renderDiffPercentageWithColors(((contextVramEstimation / result.contextVramUsage) - 1) * 100) + ")", 9));
|
||||
const contextRamEstimation = contextResourceEstimation?.cpuRam;
|
||||
const contextRamEstimationDiffBytes = (result.contextRamUsage == null || contextRamEstimation == null)
|
||||
? undefined
|
||||
: ((contextRamEstimation < result.contextRamUsage ? "-" : "") +
|
||||
toBytes(Math.abs(result.contextRamUsage - contextRamEstimation)));
|
||||
const contextRamEstimationDiffText = (contextRamEstimation == null || contextRamEstimationDiffBytes == null || result.contextRamUsage == null)
|
||||
? undefined
|
||||
: (contextRamEstimationDiffBytes.padEnd(9, " ") + " " +
|
||||
padStartAnsi("(" + renderDiffPercentageWithColors(((contextRamEstimation / result.contextRamUsage) - 1) * 100) + ")", 9));
|
||||
measureTable.logLine({
|
||||
newProcess: getNewProccessValue(),
|
||||
type: previousContextSizeCheck == null
|
||||
? "Model"
|
||||
: "Context",
|
||||
gpuLayers: String(lastGpuLayers),
|
||||
contextSize: previousContextSizeCheck != null
|
||||
? String(previousContextSizeCheck)
|
||||
: undefined,
|
||||
estimatedModelVram: toBytes(modelVramEstimation),
|
||||
actualModelVram: toBytes(result.modelVramUsage),
|
||||
modelVramEstimationDiff: modelVramEstimationDiffText,
|
||||
estimatedModelRam: toBytes(modelRamEstimation),
|
||||
actualModelRam: toBytes(result.modelRamUsage),
|
||||
modelRamEstimationDiff: modelRamEstimationDiffText,
|
||||
estimatedContextVram: contextVramEstimation == null
|
||||
? undefined
|
||||
: toBytes(contextVramEstimation),
|
||||
actualContextVram: result.contextVramUsage == null
|
||||
? undefined
|
||||
: toBytes(result.contextVramUsage),
|
||||
contextVramEstimationDiff: contextVramEstimationDiffText,
|
||||
totalVramUsage: ((result.totalVramUsage / totalVram) * 100).toFixed(2).padStart(5, "0") + "% " +
|
||||
chalk.gray("(" + toBytes(result.totalVramUsage) + "/" + toBytes(totalVram) + ")"),
|
||||
estimatedContextRam: contextRamEstimation == null
|
||||
? undefined
|
||||
: toBytes(contextRamEstimation),
|
||||
actualContextRam: result.contextRamUsage == null
|
||||
? undefined
|
||||
: toBytes(result.contextRamUsage),
|
||||
contextRamEstimationDiff: contextRamEstimationDiffText,
|
||||
totalRamUsage: ((result.totalRamUsage / totalRam) * 100).toFixed(2).padStart(5, "0") + "% " +
|
||||
chalk.gray("(" + toBytes(result.totalRamUsage) + "/" + toBytes(totalRam) + ")")
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
if (done)
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
function getMeasureTable(memoryType) {
    return new ConsoleTable([{
        key: "newProcess",
        title: " ",
        width: 1
    }, {
        key: "type",
        title: "Type",
        width: Math.max("Type".length, "Model".length, "Context".length),
        canSpanOverEmptyColumns: true
    }, {
        key: "gpuLayers",
        title: "Layers",
        width: "Layers".length,
        canSpanOverEmptyColumns: true
    }, {
        key: "contextSize",
        title: "Context size",
        width: "Context size".length,
        canSpanOverEmptyColumns: true
    }, {
        key: "estimatedModelVram",
        visible: memoryType === "vram" || memoryType === "all",
        title: "Estimated model VRAM",
        width: "Estimated model VRAM".length,
        canSpanOverEmptyColumns: true
    }, {
        key: "actualModelVram",
        visible: memoryType === "vram" || memoryType === "all",
        title: "Model VRAM",
        width: "Model VRAM".length
    }, {
        key: "modelVramEstimationDiff",
        visible: memoryType === "vram" || memoryType === "all",
        title: "Diff",
        width: Math.max("Diff".length, 9 + 1 + 9)
    }, {
        key: "estimatedModelRam",
        visible: memoryType === "ram" || memoryType === "all",
        title: "Estimated model RAM",
        width: "Estimated model RAM".length,
        canSpanOverEmptyColumns: true
    }, {
        key: "actualModelRam",
        visible: memoryType === "ram" || memoryType === "all",
        title: "Model RAM",
        width: "Model RAM".length
    }, {
        key: "modelRamEstimationDiff",
        visible: memoryType === "ram" || memoryType === "all",
        title: "Diff",
        width: Math.max("Diff".length, 9 + 1 + 9)
    }, {
        key: "estimatedContextVram",
        visible: memoryType === "vram" || memoryType === "all",
        title: "Estimated context VRAM",
        width: "Estimated context VRAM".length
    }, {
        key: "actualContextVram",
        visible: memoryType === "vram" || memoryType === "all",
        title: "Context VRAM",
        width: "Context VRAM".length
    }, {
        key: "contextVramEstimationDiff",
        visible: memoryType === "vram" || memoryType === "all",
        title: "Diff",
        width: Math.max("Diff".length, 9 + 1 + 9)
    }, {
        key: "totalVramUsage",
        visible: memoryType === "vram" || memoryType === "all",
        title: "VRAM usage",
        width: Math.max("VRAM usage".length, 8 + 1 + 8 + 1 + 8)
    }, {
        key: "estimatedContextRam",
        visible: memoryType === "ram" || memoryType === "all",
        title: "Estimated context RAM",
        width: "Estimated context RAM".length
    }, {
        key: "actualContextRam",
        visible: memoryType === "ram" || memoryType === "all",
        title: "Context RAM",
        width: "Context RAM".length
    }, {
        key: "contextRamEstimationDiff",
        visible: memoryType === "ram" || memoryType === "all",
        title: "Diff",
        width: Math.max("Diff".length, 9 + 1 + 9)
    }, {
        key: "totalRamUsage",
        visible: memoryType === "ram" || memoryType === "all",
        title: "RAM usage",
        width: Math.max("RAM usage".length, 8 + 1 + 8 + 1 + 8)
    }]);
}
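// Colors a deviation percentage by its magnitude: bright green under 2%,
// shading through yellow, and red at 14% or more (thresholds are overridable).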
function renderDiffPercentageWithColors(percentage, { greenBright = 2, green = 6, yellow = 10, yellowBright = 14 } = {}) {
    const percentageText = percentage.toFixed(2).padStart(5, "0") + "%";
    const absPercentage = Math.abs(percentage);
    if (absPercentage < greenBright)
        return chalk.greenBright(percentageText);
    else if (absPercentage < green)
        return chalk.green(percentageText);
    else if (absPercentage < yellow)
        return chalk.yellow(percentageText);
    else if (absPercentage < yellowBright)
        return chalk.yellowBright(percentageText);
    return chalk.red(percentageText);
}
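// measureModel() forks this very file as a worker process and relays its
// measurements back through onInfo; the file-name check below guards against
// the file being bundled or renamed, which would break the fork.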
const __filename = fileURLToPath(import.meta.url);
const detectedFileName = path.basename(__filename);
const expectedFileName = "InspectMeasureCommand";
async function measureModel({ modelPath, useMmap, useDirectIo, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo }) {
    if (!detectedFileName.startsWith(expectedFileName)) {
        console.warn(getConsoleLogPrefix() +
            `"${expectedFileName}.js" file is not independent, so running sub-process tests cannot be done with it\n` +
            getConsoleLogPrefix() +
            'To resolve this issue, make sure that "node-llama-cpp" is not bundled together with other code.');
        throw new Error("Sub-process tests cannot be done with the current file");
    }
    const subProcess = fork(__filename, [], {
        detached: false,
        stdio: [null, null, null, "ipc"],
        env: {
            ...process.env,
            MEASURE_MODEL_CP: "true",
            MEASURE_MODEL_CP_GPU: gpu == null
                ? undefined
                : JSON.stringify(gpu)
        }
    });
    let isPlannedExit = false;
    let isDone = false;
    let forkSucceeded = false;
    let timeoutHandle = null;
    const processCreationTimeout = 1000 * 60 * 5;
    const stdTexts = [];
    let lastGpuLayers = maxGpuLayers;
    function cleanup() {
        if (subProcess.exitCode == null)
            subProcess.kill("SIGKILL");
        if (timeoutHandle != null)
            clearTimeout(timeoutHandle);
        process.off("exit", cleanup);
    }
    process.on("exit", cleanup);
    subProcess.stdout?.on("data", (data) => {
        stdTexts.push(data.toString());
    });
    subProcess.stderr?.on("data", (data) => {
        stdTexts.push(data.toString());
    });
    return Promise.race([
        new Promise((_, reject) => {
            timeoutHandle = setTimeout(() => {
                if (!forkSucceeded) {
                    reject(new Error("Measuring using a sub-process timed out"));
                    cleanup();
                }
            }, processCreationTimeout);
        }),
        new Promise((resolve, reject) => {
            function done() {
                if (!forkSucceeded)
                    reject(new Error(`Measuring a model failed to run a sub-process via file "${__filename}"`));
                else if (isPlannedExit)
                    resolve(isPlannedExit && isDone);
                cleanup();
            }
            subProcess.on("message", (message) => {
                if (message.type === "ready") {
                    forkSucceeded = true;
                    subProcess.send({
                        type: "start",
                        modelPath,
                        useMmap,
                        useDirectIo,
                        tests,
                        initialMaxContextSize,
                        maxContextSize,
                        minContextSize,
                        maxGpuLayers,
                        minGpuLayers,
                        flashAttention,
                        swaFullCache,
                        batchSize,
                        evaluateText,
                        exitAfterMeasurement
                    });
                    if (timeoutHandle != null) {
                        clearTimeout(timeoutHandle);
                        timeoutHandle = null;
                    }
                }
                else if (message.type === "done") {
                    isPlannedExit = true;
                    isDone = true;
                    subProcess.send({ type: "exit" });
                }
                else if (message.type === "exit") {
                    isPlannedExit = true;
                    subProcess.send({ type: "exit" });
                }
                else if (message.type === "error") {
                    lastGpuLayers = message.gpuLayers;
                    onInfo({
                        gpuLayers: lastGpuLayers,
                        result: {
                            type: "error",
                            error: message.error,
                            contextSize: message.contextSize
                        }
                    });
                }
                else if (message.type === "stats") {
                    lastGpuLayers = message.gpuLayers;
                    onInfo({
                        gpuLayers: message.gpuLayers,
                        result: {
                            type: "success",
                            modelVramUsage: message.modelVramUsage,
                            modelRamUsage: message.modelRamUsage,
                            contextSize: message.contextSize,
                            contextVramUsage: message.contextVramUsage,
                            contextRamUsage: message.contextRamUsage,
                            contextStateSize: message.contextStateSize,
                            totalVramUsage: message.totalVramUsage,
                            totalRamUsage: message.totalRamUsage
                        }
                    });
                }
            });
            subProcess.on("exit", (code) => {
                if (code !== 0 || !isPlannedExit)
                    onInfo({
                        gpuLayers: lastGpuLayers,
                        result: {
                            type: "crash",
                            result: stdTexts.join("")
                        }
                    });
                done();
            });
            if (subProcess.killed || subProcess.exitCode != null) {
                if (subProcess.exitCode !== 0 || !isPlannedExit)
                    onInfo({
                        gpuLayers: lastGpuLayers,
                        result: {
                            type: "crash",
                            result: stdTexts.join("")
                        }
                    });
                done();
            }
        })
    ]);
}
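// When forked with MEASURE_MODEL_CP set, this file runs as the measurement
// worker instead of registering the CLI command.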
if (process.env.MEASURE_MODEL_CP === "true" && process.send != null) {
    void runTestWorkerLogic();
}
async function runTestWorkerLogic() {
    const gpuEnvVar = process.env.MEASURE_MODEL_CP_GPU;
    const llama = (gpuEnvVar == null || gpuEnvVar === "")
        ? await getLlama("lastBuild", {
            logLevel: LlamaLogLevel.error
        })
        : await getLlama({
            gpu: JSON.parse(gpuEnvVar),
            logLevel: LlamaLogLevel.error
        });
    if (process.send == null)
        throw new Error("No IPC channel to parent process");
    function sendInfoBack(info) {
        if (process.send == null)
            process.exit(1);
        process.send(info);
    }
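    // Measures the VRAM/RAM cost of each context size in the check plan,
    // reporting stats (or errors) back to the parent process over IPC.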
    async function testContextSizes({ model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false }) {
        let measurementsDone = 0;
        const contextSizeCheckPlan = getContextSizesCheckPlan(maxContextSize != null
            ? Math.min(model.trainContextSize, maxContextSize)
            : model.trainContextSize, tests, minContextSize);
        let currentContextSizeCheck = startContextSize == null
            ? -1
            : getNextItemInCheckContextSizesPlan(contextSizeCheckPlan, startContextSize);
        while (currentContextSizeCheck != null) {
            if (currentContextSizeCheck === -1)
                currentContextSizeCheck = null;
            try {
                const preContextVramUsage = (await llama.getVramState()).used;
                const preContextRamUsage = getMemoryUsage(llama);
                const context = await model.createContext({
                    contextSize: currentContextSizeCheck ?? (maxContextSize != null
                        ? { max: maxContextSize }
                        : undefined),
                    ignoreMemorySafetyChecks: currentContextSizeCheck != null,
                    flashAttention,
                    swaFullCache,
                    batchSize,
                    failedCreationRemedy: false
                });
                if (evaluateText != null && evaluateText != "") {
                    const sequence = context.getSequence();
                    await sequence.evaluateWithoutGeneratingNewTokens(model.tokenize(evaluateText));
                }
                const postContextVramUsage = (await llama.getVramState()).used;
                const postContextRamUsage = getMemoryUsage(llama);
                measurementsDone++;
                sendInfoBack({
                    type: "stats",
                    gpuLayers: model.gpuLayers,
                    modelVramUsage,
                    modelRamUsage,
                    contextSize: context.contextSize,
                    contextVramUsage: postContextVramUsage - preContextVramUsage,
                    contextRamUsage: postContextRamUsage - preContextRamUsage,
                    contextStateSize: context.stateSize,
                    totalVramUsage: postContextVramUsage,
                    totalRamUsage: postContextRamUsage
                });
                currentContextSizeCheck = context.contextSize;
                await context.dispose();
            }
            catch (err) {
                sendInfoBack({
                    type: "error",
                    error: String(err),
                    gpuLayers: model.gpuLayers,
                    contextSize: currentContextSizeCheck == null
                        ? undefined
                        : currentContextSizeCheck
                });
                if (currentContextSizeCheck == null) {
                    currentContextSizeCheck = contextSizeCheckPlan[0];
                    continue;
                }
            }
            currentContextSizeCheck = getNextItemInCheckContextSizesPlan(contextSizeCheckPlan, currentContextSizeCheck);
            if (exitAfterMeasurement)
                return measurementsDone;
        }
        return measurementsDone;
    }
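    // Loads the model with the given number of GPU layers, reports the model's
    // own memory deltas, then measures the planned context sizes on top of it.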
    async function testWithGpuLayers({ modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false }) {
        try {
            const preModelVramUsage = (await llama.getVramState()).used;
            const preModelRamUsage = getMemoryUsage(llama);
            const model = await llama.loadModel({
                modelPath,
                useMmap,
                useDirectIo,
                gpuLayers,
                defaultContextFlashAttention: flashAttention,
                defaultContextSwaFullCache: swaFullCache,
                ignoreMemorySafetyChecks: true
            });
            const postModelVramUsage = (await llama.getVramState()).used;
            const postModelRamUsage = getMemoryUsage(llama);
            sendInfoBack({
                type: "stats",
                gpuLayers: model.gpuLayers,
                modelVramUsage: postModelVramUsage - preModelVramUsage,
                modelRamUsage: postModelRamUsage - preModelRamUsage,
                totalVramUsage: postModelVramUsage,
                totalRamUsage: postModelRamUsage
            });
            const measurementsDone = await testContextSizes({
                model,
                modelVramUsage: postModelVramUsage - preModelVramUsage,
                modelRamUsage: postModelRamUsage - preModelRamUsage,
                startContextSize,
                maxContextSize,
                minContextSize,
                flashAttention,
                swaFullCache,
                batchSize,
                tests,
                evaluateText,
                exitAfterMeasurement
            });
            await model.dispose();
            return measurementsDone;
        }
        catch (err) {
            sendInfoBack({
                type: "error",
                error: String(err),
                gpuLayers: gpuLayers
            });
        }
        return 0;
    }
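    // Worker IPC protocol: on "start", walk the GPU-layer counts from max down
    // to min and measure each; on "exit", dispose of the llama instance and quit.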
    process.on("message", async (message) => {
        if (message.type === "start") {
            for (let gpuLayers = message.maxGpuLayers; gpuLayers >= (message.minGpuLayers ?? 0); gpuLayers--) {
                if (gpuLayers == message.maxGpuLayers && message.initialMaxContextSize != null) {
                    const ggufInsights = await GgufInsights.from(await readGgufFileInfo(message.modelPath), llama);
                    const contextSizeCheckPlan = getContextSizesCheckPlan(message.maxContextSize != null
                        ? Math.min(ggufInsights.trainContextSize ?? 4096, message.maxContextSize)
                        : ggufInsights.trainContextSize ?? 4096, message.tests, message.minContextSize);
                    const firstContextSizeCheck = getNextItemInCheckContextSizesPlan(contextSizeCheckPlan, message.initialMaxContextSize);
                    if (firstContextSizeCheck == null)
                        continue;
                }
                const measurementsDone = await testWithGpuLayers({
                    modelPath: message.modelPath,
                    useMmap: message.useMmap,
                    useDirectIo: message.useDirectIo,
                    gpuLayers,
                    tests: message.tests,
                    startContextSize: gpuLayers == message.maxGpuLayers
                        ? message.initialMaxContextSize
                        : undefined,
                    maxContextSize: message.maxContextSize,
                    minContextSize: message.minContextSize,
                    flashAttention: message.flashAttention,
                    swaFullCache: message.swaFullCache,
                    batchSize: message.batchSize,
                    evaluateText: message.evaluateText,
                    exitAfterMeasurement: message.exitAfterMeasurement
                });
                if (measurementsDone > 0 && message.exitAfterMeasurement) {
                    sendInfoBack({ type: "exit" });
                    return;
                }
            }
            sendInfoBack({ type: "done" });
        }
        else if (message.type === "exit") {
            await llama.dispose();
            process.exit(0);
        }
    });
    process.send({ type: "ready" });
}
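// Builds a descending list of context sizes to test: common small sizes
// (256-4096) first, then evenly spaced steps up to the train context size.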
function getContextSizesCheckPlan(trainContextSize, tests = 10, minContextSize) {
    const res = [];
    let shouldStop = false;
    const attemptToCoverSizes = [256, 512, 1024, 2048, 4096];
    function addSize(size) {
        if (size > trainContextSize) {
            size = trainContextSize;
            shouldStop = true;
        }
        if (size < 2)
            size = 2;
        size = padSafeContextSize(size, "up");
        if (res[res.length - 1] === size) {
            shouldStop = true;
            return;
        }
        res.push(size);
    }
    while (!shouldStop && res.length < tests) {
        const lastSize = res[res.length - 1];
        if (lastSize == null) {
            addSize(Math.max(minContextSize ?? 0, Math.min(attemptToCoverSizes[0], trainContextSize / tests)));
            continue;
        }
        const stepSizesLeft = Math.floor((trainContextSize - Math.min(lastSize, attemptToCoverSizes[attemptToCoverSizes.length - 1])) / (tests - res.length));
        let stopAddingAttemptedSizes = false;
        for (const size of attemptToCoverSizes) {
            if (stepSizesLeft > lastSize && lastSize < size && size <= trainContextSize) {
                addSize(size);
                stopAddingAttemptedSizes = true;
                break;
            }
        }
        if (stopAddingAttemptedSizes)
            continue;
        addSize(lastSize + stepSizesLeft);
    }
    return res.reverse();
}
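// Returns the next (smaller) context size from the descending plan,
// or null once the plan is exhausted.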
function getNextItemInCheckContextSizesPlan(plan, currentSize) {
    for (const size of plan) {
        if (size < currentSize)
            return size;
    }
    return null;
}
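// Pads a string to a target visible width, ignoring ANSI escape codes
// when measuring its length.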
function padStartAnsi(text, length, padChar = " ") {
    const textWithoutAnsi = stripAnsi(text);
    return padChar.repeat(Math.max(0, length - textWithoutAnsi.length)) + text;
}
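// Approximates RAM usage by subtracting unified-memory VRAM usage from the
// total memory figure reported by the native bindings.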
function getMemoryUsage(llama) {
    const totalMemoryUsage = llama._bindings.getMemoryInfo().total;
    const vramUsage = llama._bindings.getGpuVramInfo();
    let memoryUsage = totalMemoryUsage;
    const unifiedMemoryVramUsage = Math.min(vramUsage.unifiedSize, vramUsage.used);
    if (unifiedMemoryVramUsage <= memoryUsage)
        memoryUsage -= unifiedMemoryVramUsage;
    return memoryUsage;
}
//# sourceMappingURL=InspectMeasureCommand.js.map
1
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long