First upload version 0.0.1
node_modules/node-llama-cpp/dist/cli/commands/ChatCommand.d.ts (generated, vendored, new file, 50 lines)
@@ -0,0 +1,50 @@
import { CommandModule } from "yargs";
import { LlamaGrammar } from "../../evaluator/LlamaGrammar.js";
import { BuildGpu, LlamaNuma } from "../../bindings/types.js";
import { SpecializedChatWrapperTypeName } from "../../chatWrappers/utils/resolveChatWrapper.js";
type ChatCommand = {
    modelPath?: string;
    header?: string[];
    gpu?: BuildGpu | "auto";
    systemInfo: boolean;
    systemPrompt?: string;
    systemPromptFile?: string;
    prompt?: string;
    promptFile?: string;
    wrapper: SpecializedChatWrapperTypeName | "auto";
    noJinja?: boolean;
    contextSize?: number;
    batchSize?: number;
    flashAttention?: boolean;
    swaFullCache?: boolean;
    noTrimWhitespace: boolean;
    grammar: "text" | Parameters<typeof LlamaGrammar.getFor>[1];
    jsonSchemaGrammarFile?: string;
    threads?: number;
    temperature: number;
    minP: number;
    topK: number;
    topP: number;
    seed?: number;
    gpuLayers?: number;
    repeatPenalty: number;
    lastTokensRepeatPenalty: number;
    penalizeRepeatingNewLine: boolean;
    repeatFrequencyPenalty?: number;
    repeatPresencePenalty?: number;
    maxTokens: number;
    reasoningBudget?: number;
    noHistory: boolean;
    environmentFunctions: boolean;
    tokenPredictionDraftModel?: string;
    tokenPredictionModelContextSize?: number;
    debug: boolean;
    numa?: LlamaNuma;
    meter: boolean;
    timing: boolean;
    noMmap: boolean;
    noDirectIo: boolean;
    printTimings: boolean;
};
export declare const ChatCommand: CommandModule<object, ChatCommand>;
export {};
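
ChatCommand is exported as a plain yargs CommandModule, so it can be registered on a yargs instance like any other command. The following is a minimal sketch of that wiring, not the package's actual CLI entry point (the entry file and the use of hideBin are assumptions for illustration):

// hypothetical entry point for illustration only; node-llama-cpp ships its own CLI wiring, which is not part of this diff
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import { ChatCommand } from "./commands/ChatCommand.js";

// register the command module; yargs parses argv and dispatches to ChatCommand.handler
await yargs(hideBin(process.argv))
    .command(ChatCommand)
    .demandCommand(1)
    .parseAsync();
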
node_modules/node-llama-cpp/dist/cli/commands/ChatCommand.js (generated, vendored, new file, 765 lines)
@@ -0,0 +1,765 @@
|
||||
import * as readline from "readline";
|
||||
import process from "process";
|
||||
import path from "path";
|
||||
import chalk from "chalk";
|
||||
import fs from "fs-extra";
|
||||
import prettyMilliseconds from "pretty-ms";
|
||||
import { chatCommandHistoryFilePath, defaultChatSystemPrompt, documentationPageUrls } from "../../config.js";
|
||||
import { getIsInDocumentationMode } from "../../state.js";
|
||||
import { ReplHistory } from "../../utils/ReplHistory.js";
|
||||
import { defineChatSessionFunction } from "../../evaluator/LlamaChatSession/utils/defineChatSessionFunction.js";
|
||||
import { getLlama } from "../../bindings/getLlama.js";
|
||||
import { LlamaGrammar } from "../../evaluator/LlamaGrammar.js";
|
||||
import { LlamaChatSession } from "../../evaluator/LlamaChatSession/LlamaChatSession.js";
|
||||
import { LlamaLogLevel, LlamaLogLevelGreaterThan, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption, parseNumaOption } from "../../bindings/types.js";
|
||||
import withOra from "../../utils/withOra.js";
|
||||
import { TokenMeter } from "../../evaluator/TokenMeter.js";
|
||||
import { printInfoLine } from "../utils/printInfoLine.js";
|
||||
import { resolveChatWrapper, specializedChatWrapperTypeNames } from "../../chatWrappers/utils/resolveChatWrapper.js";
|
||||
import { GeneralChatWrapper } from "../../chatWrappers/GeneralChatWrapper.js";
|
||||
import { printCommonInfoLines } from "../utils/printCommonInfoLines.js";
|
||||
import { resolveCommandGgufPath } from "../utils/resolveCommandGgufPath.js";
|
||||
import { withProgressLog } from "../../utils/withProgressLog.js";
|
||||
import { resolveHeaderFlag } from "../utils/resolveHeaderFlag.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { ConsoleInteraction, ConsoleInteractionKey } from "../utils/ConsoleInteraction.js";
|
||||
import { DraftSequenceTokenPredictor } from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
|
||||
export const ChatCommand = {
|
||||
command: "chat [modelPath]",
|
||||
describe: withCliCommandDescriptionDocsUrl("Chat with a model", documentationPageUrls.CLI.Chat),
|
||||
builder(yargs) {
|
||||
const isInDocumentationMode = getIsInDocumentationMode();
|
||||
return yargs
|
||||
.option("modelPath", {
|
||||
alias: ["m", "model", "path", "url", "uri"],
|
||||
type: "string",
|
||||
description: "Model file to use for the chat. Can be a path to a local file or a URI of a model file to download. Leave empty to choose from a list of recommended models"
|
||||
})
|
||||
.option("header", {
|
||||
alias: ["H"],
|
||||
type: "string",
|
||||
array: true,
|
||||
description: "Headers to use when downloading a model from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers."
|
||||
})
|
||||
.option("gpu", {
|
||||
type: "string",
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: nodeLlamaCppGpuOptions,
|
||||
coerce: (value) => {
|
||||
if (value == null || value == "")
|
||||
return undefined;
|
||||
return parseNodeLlamaCppGpuOption(value);
|
||||
},
|
||||
defaultDescription: "Uses the latest local build, and fallbacks to \"auto\"",
|
||||
description: "Compute layer implementation type to use for llama.cpp. If omitted, uses the latest local build, and fallbacks to \"auto\""
|
||||
})
|
||||
.option("systemInfo", {
|
||||
alias: "i",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print llama.cpp system info"
|
||||
})
|
||||
.option("systemPrompt", {
|
||||
alias: "s",
|
||||
type: "string",
|
||||
description: "System prompt to use against the model" +
|
||||
(isInDocumentationMode ? "" : (". [the default value is determined by the chat wrapper, but is usually: " + defaultChatSystemPrompt.split("\n").join(" ") + "]"))
|
||||
})
|
||||
.option("systemPromptFile", {
|
||||
type: "string",
|
||||
description: "Path to a file to load text from and use as as the model system prompt"
|
||||
})
|
||||
.option("prompt", {
|
||||
type: "string",
|
||||
description: "First prompt to automatically send to the model when starting the chat"
|
||||
})
|
||||
.option("promptFile", {
|
||||
type: "string",
|
||||
description: "Path to a file to load text from and use as a first prompt to automatically send to the model when starting the chat"
|
||||
})
|
||||
.option("wrapper", {
|
||||
alias: "w",
|
||||
type: "string",
|
||||
default: "auto",
|
||||
choices: ["auto", ...specializedChatWrapperTypeNames],
|
||||
description: "Chat wrapper to use. Use `auto` to automatically select a wrapper based on the model's metadata and tokenizer"
|
||||
})
|
||||
.option("noJinja", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Don't use a Jinja wrapper, even if it's the best option for the model"
|
||||
})
|
||||
.option("contextSize", {
|
||||
alias: "c",
|
||||
type: "number",
|
||||
description: "Context size to use for the model context",
|
||||
default: -1,
|
||||
defaultDescription: "Automatically determined based on the available VRAM"
|
||||
})
|
||||
.option("batchSize", {
|
||||
alias: "b",
|
||||
type: "number",
|
||||
description: "Batch size to use for the model context"
|
||||
})
|
||||
.option("flashAttention", {
|
||||
alias: "fa",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Enable flash attention"
|
||||
})
|
||||
.option("swaFullCache", {
|
||||
alias: "noSwa",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable SWA (Sliding Window Attention) on supported models"
|
||||
})
|
||||
.option("noTrimWhitespace", {
|
||||
type: "boolean",
|
||||
alias: ["noTrim"],
|
||||
default: false,
|
||||
description: "Don't trim whitespaces from the model response"
|
||||
})
|
||||
.option("grammar", {
|
||||
alias: "g",
|
||||
type: "string",
|
||||
default: "text",
|
||||
choices: ["text", "json", "list", "arithmetic", "japanese", "chess"],
|
||||
description: "Restrict the model response to a specific grammar, like JSON for example"
|
||||
})
|
||||
.option("jsonSchemaGrammarFile", {
|
||||
alias: ["jsgf"],
|
||||
type: "string",
|
||||
description: "File path to a JSON schema file, to restrict the model response to only generate output that conforms to the JSON schema"
|
||||
})
|
||||
.option("threads", {
|
||||
type: "number",
|
||||
defaultDescription: "Number of cores that are useful for math on the current machine",
|
||||
description: "Number of threads to use for the evaluation of tokens"
|
||||
})
|
||||
.option("temperature", {
|
||||
alias: "t",
|
||||
type: "number",
|
||||
default: 0,
|
||||
description: "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The suggested temperature is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. Set to `0` to disable."
|
||||
})
|
||||
.option("minP", {
|
||||
alias: "mp",
|
||||
type: "number",
|
||||
default: 0,
|
||||
description: "From the next token candidates, discard the percentage of tokens with the lowest probability. For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded. This is useful for generating more high-quality results when using a high temperature. Set to a value between `0` and `1` to enable. Only relevant when `temperature` is set to a value greater than `0`."
|
||||
})
|
||||
.option("topK", {
|
||||
alias: "k",
|
||||
type: "number",
|
||||
default: 40,
|
||||
description: "Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation. An integer number between `1` and the size of the vocabulary. Set to `0` to disable (which uses the full vocabulary). Only relevant when `temperature` is set to a value greater than 0."
|
||||
})
|
||||
.option("topP", {
|
||||
alias: "p",
|
||||
type: "number",
|
||||
default: 0.95,
|
||||
description: "Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P, and samples the next token only from this set. A float number between `0` and `1`. Set to `1` to disable. Only relevant when `temperature` is set to a value greater than `0`."
|
||||
})
|
||||
.option("seed", {
|
||||
type: "number",
|
||||
description: "Used to control the randomness of the generated text. Only relevant when using `temperature`.",
|
||||
defaultDescription: "The current epoch time"
|
||||
})
|
||||
.option("gpuLayers", {
|
||||
alias: "gl",
|
||||
type: "number",
|
||||
description: "number of layers to store in VRAM",
|
||||
default: -1,
|
||||
defaultDescription: "Automatically determined based on the available VRAM"
|
||||
})
|
||||
.option("repeatPenalty", {
|
||||
alias: "rp",
|
||||
type: "number",
|
||||
default: 1.1,
|
||||
description: "Prevent the model from repeating the same token too much. Set to `1` to disable."
|
||||
})
|
||||
.option("lastTokensRepeatPenalty", {
|
||||
alias: "rpn",
|
||||
type: "number",
|
||||
default: 64,
|
||||
description: "Number of recent tokens generated by the model to apply penalties to repetition of"
|
||||
})
|
||||
.option("penalizeRepeatingNewLine", {
|
||||
alias: "rpnl",
|
||||
type: "boolean",
|
||||
default: true,
|
||||
description: "Penalize new line tokens. set `--no-penalizeRepeatingNewLine` or `--no-rpnl` to disable"
|
||||
})
|
||||
.option("repeatFrequencyPenalty", {
|
||||
alias: "rfp",
|
||||
type: "number",
|
||||
description: "For n time a token is in the `punishTokens` array, lower its probability by `n * repeatFrequencyPenalty`. Set to a value between `0` and `1` to enable."
|
||||
})
|
||||
.option("repeatPresencePenalty", {
|
||||
alias: "rpp",
|
||||
type: "number",
|
||||
description: "Lower the probability of all the tokens in the `punishTokens` array by `repeatPresencePenalty`. Set to a value between `0` and `1` to enable."
|
||||
})
|
||||
.option("maxTokens", {
|
||||
alias: "mt",
|
||||
type: "number",
|
||||
default: 0,
|
||||
description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size"
|
||||
})
|
||||
.option("reasoningBudget", {
|
||||
alias: ["tb", "thinkingBudget", "thoughtsBudget"],
|
||||
type: "number",
|
||||
default: -1,
|
||||
defaultDescription: "Unlimited",
|
||||
description: "Maximum number of tokens the model can use for thoughts. Set to `0` to disable reasoning"
|
||||
})
|
||||
.option("noHistory", {
|
||||
alias: "nh",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Don't load or save chat history"
|
||||
})
|
||||
.option("environmentFunctions", {
|
||||
alias: "ef",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Provide access to environment functions like `getDate` and `getTime`"
|
||||
})
|
||||
.option("tokenPredictionDraftModel", {
|
||||
alias: ["dm", "draftModel"],
|
||||
type: "string",
|
||||
description: "Model file to use for draft sequence token prediction (speculative decoding). Can be a path to a local file or a URI of a model file to download"
|
||||
})
|
||||
.option("tokenPredictionModelContextSize", {
|
||||
alias: ["dc", "draftContextSize", "draftContext"],
|
||||
type: "number",
|
||||
description: "Max context size to use for the draft sequence token prediction model context",
|
||||
default: 4096
|
||||
})
|
||||
.option("debug", {
|
||||
alias: "d",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print llama.cpp info and debug logs"
|
||||
})
|
||||
.option("numa", {
|
||||
type: "string",
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: llamaNumaOptions,
|
||||
coerce: (value) => {
|
||||
if (value == null || value == "")
|
||||
return false;
|
||||
return parseNumaOption(value);
|
||||
},
|
||||
defaultDescription: "false",
|
||||
description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
|
||||
})
|
||||
.option("meter", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print how many tokens were used as input and output for each response"
|
||||
})
|
||||
.option("timing", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print how how long it took to generate each response"
|
||||
})
|
||||
.option("noMmap", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable mmap (memory-mapped file) usage"
|
||||
})
|
||||
.option("noDirectIo", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable Direct I/O usage when available"
|
||||
})
|
||||
.option("printTimings", {
|
||||
alias: "pt",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print llama.cpp's internal timings after each response"
|
||||
});
|
||||
},
|
||||
async handler({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }) {
|
||||
try {
|
||||
await RunChat({
|
||||
modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize,
|
||||
batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads,
|
||||
temperature, minP, topK, topP, seed,
|
||||
gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
|
||||
maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
|
||||
debug, numa, meter, timing, noMmap, noDirectIo, printTimings
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.error(err);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
};
|
||||
async function RunChat({ modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }) {
|
||||
if (contextSize === -1)
|
||||
contextSize = undefined;
|
||||
if (gpuLayers === -1)
|
||||
gpuLayers = undefined;
|
||||
if (reasoningBudget === -1)
|
||||
reasoningBudget = undefined;
|
||||
const headers = resolveHeaderFlag(headerArg);
|
||||
const trimWhitespace = !noTrimWhitespace;
|
||||
if (debug)
|
||||
console.info(`${chalk.yellow("Log level:")} debug`);
|
||||
const llamaLogLevel = debug
|
||||
? LlamaLogLevel.debug
|
||||
: LlamaLogLevel.warn;
|
||||
const llama = gpu == null
|
||||
? await getLlama("lastBuild", {
|
||||
logLevel: llamaLogLevel,
|
||||
numa
|
||||
})
|
||||
: await getLlama({
|
||||
gpu,
|
||||
logLevel: llamaLogLevel,
|
||||
numa
|
||||
});
|
||||
const logBatchSize = batchSize != null;
|
||||
const useMmap = !noMmap && llama.supportsMmap;
|
||||
const useDirectIo = !noDirectIo;
|
||||
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
useMmap
|
||||
});
|
||||
const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
|
||||
? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
useMmap,
|
||||
consoleTitle: "Draft model file"
|
||||
})
|
||||
: undefined;
|
||||
if (systemInfo)
|
||||
console.log(llama.systemInfo);
|
||||
if (systemPromptFile != null && systemPromptFile !== "") {
|
||||
if (systemPrompt != null && systemPrompt !== "" && systemPrompt !== defaultChatSystemPrompt)
|
||||
console.warn(chalk.yellow("Both `systemPrompt` and `systemPromptFile` were specified. `systemPromptFile` will be used."));
|
||||
systemPrompt = await fs.readFile(path.resolve(process.cwd(), systemPromptFile), "utf8");
|
||||
}
|
||||
if (promptFile != null && promptFile !== "") {
|
||||
if (prompt != null && prompt !== "")
|
||||
console.warn(chalk.yellow("Both `prompt` and `promptFile` were specified. `promptFile` will be used."));
|
||||
prompt = await fs.readFile(path.resolve(process.cwd(), promptFile), "utf8");
|
||||
}
|
||||
if (batchSize != null && contextSize != null && batchSize > contextSize) {
|
||||
console.warn(chalk.yellow("Batch size is greater than the context size. Batch size will be set to the context size."));
|
||||
batchSize = contextSize;
|
||||
}
|
||||
let initialPrompt = prompt ?? null;
|
||||
const model = await withProgressLog({
|
||||
loadingText: chalk.blue.bold("Loading model"),
|
||||
successText: chalk.blue("Model loaded"),
|
||||
failText: chalk.blue("Failed to load model"),
|
||||
liveUpdates: !debug,
|
||||
noProgress: debug,
|
||||
liveCtrlCSendsAbortSignal: true
|
||||
}, async (progressUpdater) => {
|
||||
try {
|
||||
return await llama.loadModel({
|
||||
modelPath: resolvedModelPath,
|
||||
gpuLayers: gpuLayers != null
|
||||
? gpuLayers
|
||||
: contextSize != null
|
||||
? { fitContext: { contextSize } }
|
||||
: undefined,
|
||||
defaultContextFlashAttention: flashAttention,
|
||||
defaultContextSwaFullCache: swaFullCache,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
ignoreMemorySafetyChecks: gpuLayers != null,
|
||||
onLoadProgress(loadProgress) {
|
||||
progressUpdater.setProgress(loadProgress);
|
||||
},
|
||||
loadSignal: progressUpdater.abortSignal
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (err === progressUpdater.abortSignal?.reason)
|
||||
process.exit(0);
|
||||
throw err;
|
||||
}
|
||||
finally {
|
||||
if (llama.logLevel === LlamaLogLevel.debug) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.info();
|
||||
}
|
||||
}
|
||||
});
|
||||
const draftModel = resolvedDraftModelPath == null
|
||||
? undefined
|
||||
: await withProgressLog({
|
||||
loadingText: chalk.blue.bold("Loading draft model"),
|
||||
successText: chalk.blue("Draft model loaded"),
|
||||
failText: chalk.blue("Failed to load draft model"),
|
||||
liveUpdates: !debug,
|
||||
noProgress: debug,
|
||||
liveCtrlCSendsAbortSignal: true
|
||||
}, async (progressUpdater) => {
|
||||
try {
|
||||
return await llama.loadModel({
|
||||
modelPath: resolvedDraftModelPath,
|
||||
defaultContextFlashAttention: flashAttention,
|
||||
defaultContextSwaFullCache: swaFullCache,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
onLoadProgress(loadProgress) {
|
||||
progressUpdater.setProgress(loadProgress);
|
||||
},
|
||||
loadSignal: progressUpdater.abortSignal
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (err === progressUpdater.abortSignal?.reason)
|
||||
process.exit(0);
|
||||
throw err;
|
||||
}
|
||||
finally {
|
||||
if (llama.logLevel === LlamaLogLevel.debug) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.info();
|
||||
}
|
||||
}
|
||||
});
|
||||
const draftContext = draftModel == null
|
||||
? undefined
|
||||
: await withOra({
|
||||
loading: chalk.blue("Creating draft context"),
|
||||
success: chalk.blue("Draft context created"),
|
||||
fail: chalk.blue("Failed to create draft context"),
|
||||
useStatusLogs: debug
|
||||
}, async () => {
|
||||
try {
|
||||
return await draftModel.createContext({
|
||||
contextSize: { max: tokenPredictionModelContextSize }
|
||||
});
|
||||
}
|
||||
finally {
|
||||
if (llama.logLevel === LlamaLogLevel.debug) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.info();
|
||||
}
|
||||
}
|
||||
});
|
||||
const context = await withOra({
|
||||
loading: chalk.blue("Creating context"),
|
||||
success: chalk.blue("Context created"),
|
||||
fail: chalk.blue("Failed to create context"),
|
||||
useStatusLogs: debug
|
||||
}, async () => {
|
||||
try {
|
||||
return await model.createContext({
|
||||
contextSize: contextSize != null ? contextSize : undefined,
|
||||
batchSize: batchSize != null ? batchSize : undefined,
|
||||
threads: threads === null ? undefined : threads,
|
||||
ignoreMemorySafetyChecks: gpuLayers != null || contextSize != null,
|
||||
performanceTracking: printTimings
|
||||
});
|
||||
}
|
||||
finally {
|
||||
if (llama.logLevel === LlamaLogLevel.debug) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.info();
|
||||
}
|
||||
}
|
||||
});
|
||||
const grammar = jsonSchemaGrammarFilePath != null
|
||||
? await llama.createGrammarForJsonSchema(await fs.readJson(path.resolve(process.cwd(), jsonSchemaGrammarFilePath)))
|
||||
: grammarArg !== "text"
|
||||
? await LlamaGrammar.getFor(llama, grammarArg)
|
||||
: undefined;
|
||||
const chatWrapper = resolveChatWrapper({
|
||||
type: wrapper,
|
||||
bosString: model.tokens.bosString,
|
||||
filename: model.filename,
|
||||
fileInfo: model.fileInfo,
|
||||
tokenizer: model.tokenizer,
|
||||
noJinja
|
||||
}) ?? new GeneralChatWrapper();
|
||||
const draftContextSequence = draftContext?.getSequence();
|
||||
const contextSequence = draftContextSequence != null
|
||||
? context.getSequence({
|
||||
tokenPredictor: new DraftSequenceTokenPredictor(draftContextSequence)
|
||||
})
|
||||
: context.getSequence();
|
||||
const session = new LlamaChatSession({
|
||||
contextSequence,
|
||||
systemPrompt,
|
||||
chatWrapper: chatWrapper
|
||||
});
|
||||
let lastDraftTokenMeterState = draftContextSequence?.tokenMeter.getState();
|
||||
let lastTokenMeterState = contextSequence.tokenMeter.getState();
|
||||
let lastTokenPredictionsStats = contextSequence.tokenPredictions;
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
if (grammarArg != "text" && jsonSchemaGrammarFilePath != null)
|
||||
console.warn(chalk.yellow("Both `grammar` and `jsonSchemaGrammarFile` were specified. `jsonSchemaGrammarFile` will be used."));
|
||||
if (environmentFunctions && grammar != null) {
|
||||
console.warn(chalk.yellow("Environment functions are disabled since a grammar is already specified"));
|
||||
environmentFunctions = false;
|
||||
}
|
||||
const padTitle = await printCommonInfoLines({
|
||||
context,
|
||||
draftContext,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
printBos: true,
|
||||
printEos: true,
|
||||
logBatchSize,
|
||||
tokenMeterEnabled: meter
|
||||
});
|
||||
printInfoLine({
|
||||
title: "Chat",
|
||||
padTitle: padTitle,
|
||||
info: [{
|
||||
title: "Wrapper",
|
||||
value: chatWrapper.wrapperName
|
||||
}, {
|
||||
title: "Repeat penalty",
|
||||
value: `${repeatPenalty} (apply to last ${lastTokensRepeatPenalty} tokens)`
|
||||
}, {
|
||||
show: repeatFrequencyPenalty != null,
|
||||
title: "Repeat frequency penalty",
|
||||
value: String(repeatFrequencyPenalty)
|
||||
}, {
|
||||
show: repeatPresencePenalty != null,
|
||||
title: "Repeat presence penalty",
|
||||
value: String(repeatPresencePenalty)
|
||||
}, {
|
||||
show: !penalizeRepeatingNewLine,
|
||||
title: "Penalize repeating new line",
|
||||
value: "disabled"
|
||||
}, {
|
||||
show: jsonSchemaGrammarFilePath != null,
|
||||
title: "JSON schema grammar file",
|
||||
value: () => path.relative(process.cwd(), path.resolve(process.cwd(), jsonSchemaGrammarFilePath ?? ""))
|
||||
}, {
|
||||
show: jsonSchemaGrammarFilePath == null && grammarArg !== "text",
|
||||
title: "Grammar",
|
||||
value: grammarArg
|
||||
}, {
|
||||
show: environmentFunctions,
|
||||
title: "Environment functions",
|
||||
value: "enabled"
|
||||
}, {
|
||||
show: timing,
|
||||
title: "Response timing",
|
||||
value: "enabled"
|
||||
}]
|
||||
});
|
||||
// this is for ora to not interfere with readline
|
||||
await new Promise((resolve) => setTimeout(resolve, 1));
|
||||
const replHistory = await ReplHistory.load(chatCommandHistoryFilePath, !noHistory);
|
||||
async function getPrompt() {
|
||||
const rl = readline.createInterface({
|
||||
input: process.stdin,
|
||||
output: process.stdout,
|
||||
history: replHistory.history.slice()
|
||||
});
|
||||
const res = await new Promise((accept) => rl.question(chalk.yellow("> "), accept));
|
||||
rl.close();
|
||||
return res;
|
||||
}
|
||||
if (prompt != null && prompt !== "" && !printTimings && (meter || timing)) {
|
||||
// warm up the context sequence before the first evaluation, to make the timings of the actual evaluations more accurate
|
||||
const contextFirstToken = session.chatWrapper.generateContextState({
|
||||
chatHistory: [
|
||||
...session.getChatHistory(),
|
||||
{ type: "user", text: "" }
|
||||
]
|
||||
}).contextText.tokenize(model.tokenizer)[0];
|
||||
if (contextFirstToken != null)
|
||||
await contextSequence.evaluateWithoutGeneratingNewTokens([contextFirstToken]);
|
||||
}
|
||||
else if (!printTimings && !meter)
|
||||
void session.preloadPrompt("")
|
||||
.catch(() => void 0); // don't throw an error if preloading fails because a real prompt is sent early
|
||||
while (true) {
|
||||
let hadTrimmedWhitespaceTextInThisIterationAndSegment = false;
|
||||
let nextPrintLeftovers = "";
|
||||
const input = initialPrompt != null
|
||||
? initialPrompt
|
||||
: await getPrompt();
|
||||
if (initialPrompt != null) {
|
||||
console.log(chalk.green("> ") + initialPrompt);
|
||||
initialPrompt = null;
|
||||
}
|
||||
else
|
||||
await replHistory.add(input);
|
||||
if (input === ".exit")
|
||||
break;
|
||||
process.stdout.write(chalk.yellow("AI: "));
|
||||
const [startColor, endColor] = chalk.blue("MIDDLE").split("MIDDLE");
|
||||
const [segmentStartColor, segmentEndColor] = chalk.gray("MIDDLE").split("MIDDLE");
|
||||
const abortController = new AbortController();
|
||||
const consoleInteraction = new ConsoleInteraction();
|
||||
consoleInteraction.onKey(ConsoleInteractionKey.ctrlC, async () => {
|
||||
abortController.abort();
|
||||
consoleInteraction.stop();
|
||||
});
|
||||
const timeBeforePrompt = Date.now();
|
||||
let currentSegmentType;
|
||||
try {
|
||||
process.stdout.write(startColor);
|
||||
consoleInteraction.start();
|
||||
await session.prompt(input, {
|
||||
grammar: grammar, // this is a workaround to allow passing both `functions` and `grammar`
|
||||
temperature,
|
||||
minP,
|
||||
topK,
|
||||
topP,
|
||||
seed: seed ?? undefined,
|
||||
signal: abortController.signal,
|
||||
stopOnAbortSignal: true,
|
||||
budgets: {
|
||||
thoughtTokens: reasoningBudget
|
||||
},
|
||||
repeatPenalty: {
|
||||
penalty: repeatPenalty,
|
||||
frequencyPenalty: repeatFrequencyPenalty != null ? repeatFrequencyPenalty : undefined,
|
||||
presencePenalty: repeatPresencePenalty != null ? repeatPresencePenalty : undefined,
|
||||
penalizeNewLine: penalizeRepeatingNewLine,
|
||||
lastTokens: lastTokensRepeatPenalty
|
||||
},
|
||||
maxTokens: maxTokens === -1
|
||||
? context.contextSize
|
||||
: maxTokens <= 0
|
||||
? undefined
|
||||
: maxTokens,
|
||||
onResponseChunk({ text: chunk, type: chunkType, segmentType }) {
|
||||
if (segmentType != currentSegmentType) {
|
||||
const printNewline = hadTrimmedWhitespaceTextInThisIterationAndSegment
|
||||
? "\n"
|
||||
: "";
|
||||
hadTrimmedWhitespaceTextInThisIterationAndSegment = false;
|
||||
if (chunkType !== "segment" || segmentType == null) {
|
||||
process.stdout.write(segmentEndColor);
|
||||
process.stdout.write(chalk.reset.whiteBright.bold(printNewline + "[response] "));
|
||||
process.stdout.write(startColor);
|
||||
}
|
||||
else if (currentSegmentType == null) {
|
||||
process.stdout.write(endColor);
|
||||
process.stdout.write(chalk.reset.whiteBright.bold(printNewline + `[segment: ${segmentType}] `));
|
||||
process.stdout.write(segmentStartColor);
|
||||
}
|
||||
else {
|
||||
process.stdout.write(segmentEndColor);
|
||||
process.stdout.write(chalk.reset.whiteBright.bold(printNewline + `[segment: ${segmentType}] `));
|
||||
process.stdout.write(segmentStartColor);
|
||||
}
|
||||
currentSegmentType = segmentType;
|
||||
}
|
||||
let text = nextPrintLeftovers + chunk;
|
||||
nextPrintLeftovers = "";
|
||||
if (trimWhitespace) {
|
||||
if (!hadTrimmedWhitespaceTextInThisIterationAndSegment) {
|
||||
text = text.trimStart();
|
||||
if (text.length > 0)
|
||||
hadTrimmedWhitespaceTextInThisIterationAndSegment = true;
|
||||
}
|
||||
const textWithTrimmedEnd = text.trimEnd();
|
||||
if (textWithTrimmedEnd.length < text.length) {
|
||||
nextPrintLeftovers = text.slice(textWithTrimmedEnd.length);
|
||||
text = textWithTrimmedEnd;
|
||||
}
|
||||
}
|
||||
process.stdout.write(text);
|
||||
},
|
||||
functions: (grammar == null && environmentFunctions)
|
||||
? defaultEnvironmentFunctions
|
||||
: undefined,
|
||||
trimWhitespaceSuffix: trimWhitespace
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (!(abortController.signal.aborted && err === abortController.signal.reason))
|
||||
throw err;
|
||||
}
|
||||
finally {
|
||||
consoleInteraction.stop();
|
||||
const currentEndColor = currentSegmentType != null
|
||||
? segmentEndColor
|
||||
: endColor;
|
||||
if (abortController.signal.aborted)
|
||||
process.stdout.write(currentEndColor + chalk.yellow("[generation aborted by user]"));
|
||||
else
|
||||
process.stdout.write(currentEndColor);
|
||||
console.log();
|
||||
}
|
||||
const timeAfterPrompt = Date.now();
|
||||
if (printTimings) {
|
||||
if (LlamaLogLevelGreaterThan(llama.logLevel, LlamaLogLevel.info))
|
||||
llama.logLevel = LlamaLogLevel.info;
|
||||
await context.printTimings();
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
llama.logLevel = llamaLogLevel;
|
||||
}
|
||||
if (timing)
|
||||
console.info(chalk.dim("Response duration: ") +
|
||||
prettyMilliseconds(timeAfterPrompt - timeBeforePrompt, {
|
||||
keepDecimalsOnWholeSeconds: true,
|
||||
secondsDecimalDigits: 2,
|
||||
separateMilliseconds: true,
|
||||
compact: false
|
||||
}));
|
||||
if (meter) {
|
||||
const newTokenMeterState = contextSequence.tokenMeter.getState();
|
||||
const tokenMeterDiff = TokenMeter.diff(newTokenMeterState, lastTokenMeterState);
|
||||
lastTokenMeterState = newTokenMeterState;
|
||||
const showDraftTokenMeterDiff = lastDraftTokenMeterState != null && draftContextSequence != null;
|
||||
const tokenPredictionsStats = contextSequence.tokenPredictions;
|
||||
const validatedTokenPredictions = tokenPredictionsStats.validated - lastTokenPredictionsStats.validated;
|
||||
const refutedTokenPredictions = tokenPredictionsStats.refuted - lastTokenPredictionsStats.refuted;
|
||||
const usedTokenPredictions = tokenPredictionsStats.used - lastTokenPredictionsStats.used;
|
||||
const unusedTokenPredictions = tokenPredictionsStats.unused - lastTokenPredictionsStats.unused;
|
||||
lastTokenPredictionsStats = tokenPredictionsStats;
|
||||
console.info([
|
||||
showDraftTokenMeterDiff && (chalk.yellow("Main".padEnd("Drafter".length))),
|
||||
chalk.dim("Input tokens:") + " " + String(tokenMeterDiff.usedInputTokens).padEnd(5, " "),
|
||||
chalk.dim("Output tokens:") + " " + String(tokenMeterDiff.usedOutputTokens).padEnd(5, " "),
|
||||
showDraftTokenMeterDiff && (chalk.dim("Validated predictions:") + " " + String(validatedTokenPredictions).padEnd(5, " ")),
|
||||
showDraftTokenMeterDiff && (chalk.dim("Refuted predictions:") + " " + String(refutedTokenPredictions).padEnd(5, " ")),
|
||||
showDraftTokenMeterDiff && (chalk.dim("Used predictions:") + " " + String(usedTokenPredictions).padEnd(5, " ")),
|
||||
showDraftTokenMeterDiff && (chalk.dim("Unused predictions:") + " " + String(unusedTokenPredictions).padEnd(5, " "))
|
||||
].filter(Boolean).join(" "));
|
||||
if (lastDraftTokenMeterState != null && draftContextSequence != null) {
|
||||
const newDraftTokenMeterState = draftContextSequence.tokenMeter.getState();
|
||||
const draftTokenMeterDiff = TokenMeter.diff(newDraftTokenMeterState, lastDraftTokenMeterState);
|
||||
lastDraftTokenMeterState = newDraftTokenMeterState;
|
||||
console.info([
|
||||
chalk.yellow("Drafter"),
|
||||
chalk.dim("Input tokens:") + " " + String(draftTokenMeterDiff.usedInputTokens).padEnd(5, " "),
|
||||
chalk.dim("Output tokens:") + " " + String(draftTokenMeterDiff.usedOutputTokens).padEnd(5, " ")
|
||||
].join(" "));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
const defaultEnvironmentFunctions = {
|
||||
getDate: defineChatSessionFunction({
|
||||
description: "Retrieve the current date",
|
||||
handler() {
|
||||
const date = new Date();
|
||||
return [
|
||||
date.getFullYear(),
|
||||
String(date.getMonth() + 1).padStart(2, "0"),
|
||||
String(date.getDate()).padStart(2, "0")
|
||||
].join("-");
|
||||
}
|
||||
}),
|
||||
getTime: defineChatSessionFunction({
|
||||
description: "Retrieve the current time",
|
||||
handler() {
|
||||
return new Date().toLocaleTimeString("en-US");
|
||||
}
|
||||
})
|
||||
};
|
||||
//# sourceMappingURL=ChatCommand.js.map
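
Stripped of option parsing, progress logging, and the optional draft-model (speculative decoding) path, RunChat above boils down to the following flow. This is a condensed sketch using the same APIs the file imports, but through the package's public "node-llama-cpp" entry point and with a placeholder model path; defaults and error handling are omitted:

import { getLlama, LlamaChatSession } from "node-llama-cpp";

// resolve a llama.cpp binding, load a model, create a context, and chat on one of its sequences
const llama = await getLlama();
const model = await llama.loadModel({ modelPath: "path/to/model.gguf" }); // placeholder path
const context = await model.createContext();
const session = new LlamaChatSession({ contextSequence: context.getSequence() });

// session.prompt() resolves to the full response text; the CLI above streams it via onResponseChunk instead
const response = await session.prompt("Hi there", { temperature: 0.8, maxTokens: 256 });
console.log(response);
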
node_modules/node-llama-cpp/dist/cli/commands/ChatCommand.js.map (generated, vendored, new file, 1 line)
Diff suppressed because one or more lines are too long.
node_modules/node-llama-cpp/dist/cli/commands/CompleteCommand.d.ts (generated, vendored, new file, 38 lines)
@@ -0,0 +1,38 @@
import { CommandModule } from "yargs";
import { BuildGpu, LlamaNuma } from "../../bindings/types.js";
type CompleteCommand = {
    modelPath?: string;
    header?: string[];
    gpu?: BuildGpu | "auto";
    systemInfo: boolean;
    text?: string;
    textFile?: string;
    contextSize?: number;
    batchSize?: number;
    flashAttention?: boolean;
    swaFullCache?: boolean;
    threads?: number;
    temperature: number;
    minP: number;
    topK: number;
    topP: number;
    seed?: number;
    gpuLayers?: number;
    repeatPenalty: number;
    lastTokensRepeatPenalty: number;
    penalizeRepeatingNewLine: boolean;
    repeatFrequencyPenalty?: number;
    repeatPresencePenalty?: number;
    maxTokens: number;
    tokenPredictionDraftModel?: string;
    tokenPredictionModelContextSize?: number;
    debug: boolean;
    numa?: LlamaNuma;
    meter: boolean;
    timing: boolean;
    noMmap: boolean;
    noDirectIo: boolean;
    printTimings: boolean;
};
export declare const CompleteCommand: CommandModule<object, CompleteCommand>;
export {};
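
CompleteCommand drives the same model-loading flow as ChatCommand but feeds the context sequence to LlamaCompletion for raw text completion instead of a chat session. A condensed sketch under the same assumptions as above (public "node-llama-cpp" entry point, placeholder model path, defaults and error handling omitted):

import { getLlama, LlamaCompletion } from "node-llama-cpp";

// load a model and generate a plain completion for a text prefix
const llama = await getLlama();
const model = await llama.loadModel({ modelPath: "path/to/model.gguf" }); // placeholder path
const context = await model.createContext();
const completion = new LlamaCompletion({ contextSequence: context.getSequence() });

// generateCompletion() resolves to the generated text; the CLI below streams it via onTextChunk instead
const text = await completion.generateCompletion("The quick brown fox", { maxTokens: 64 });
console.log(text);
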
node_modules/node-llama-cpp/dist/cli/commands/CompleteCommand.js (generated, vendored, new file, 567 lines)
@@ -0,0 +1,567 @@
|
||||
import * as readline from "readline";
|
||||
import process from "process";
|
||||
import path from "path";
|
||||
import chalk from "chalk";
|
||||
import fs from "fs-extra";
|
||||
import prettyMilliseconds from "pretty-ms";
|
||||
import { getLlama } from "../../bindings/getLlama.js";
|
||||
import { LlamaLogLevel, LlamaLogLevelGreaterThan, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption, parseNumaOption } from "../../bindings/types.js";
|
||||
import { LlamaCompletion } from "../../evaluator/LlamaCompletion.js";
|
||||
import withOra from "../../utils/withOra.js";
|
||||
import { TokenMeter } from "../../evaluator/TokenMeter.js";
|
||||
import { printInfoLine } from "../utils/printInfoLine.js";
|
||||
import { printCommonInfoLines } from "../utils/printCommonInfoLines.js";
|
||||
import { resolveCommandGgufPath } from "../utils/resolveCommandGgufPath.js";
|
||||
import { withProgressLog } from "../../utils/withProgressLog.js";
|
||||
import { resolveHeaderFlag } from "../utils/resolveHeaderFlag.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { documentationPageUrls } from "../../config.js";
|
||||
import { ConsoleInteraction, ConsoleInteractionKey } from "../utils/ConsoleInteraction.js";
|
||||
import { DraftSequenceTokenPredictor } from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
|
||||
export const CompleteCommand = {
|
||||
command: "complete [modelPath]",
|
||||
describe: withCliCommandDescriptionDocsUrl("Generate a completion for a given text", documentationPageUrls.CLI.Complete),
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.option("modelPath", {
|
||||
alias: ["m", "model", "path", "url", "uri"],
|
||||
type: "string",
|
||||
description: "Model file to use for the completion. Can be a path to a local file or a URI of a model file to download. Leave empty to choose from a list of recommended models"
|
||||
})
|
||||
.option("header", {
|
||||
alias: ["H"],
|
||||
type: "string",
|
||||
array: true,
|
||||
description: "Headers to use when downloading a model from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers."
|
||||
})
|
||||
.option("gpu", {
|
||||
type: "string",
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: nodeLlamaCppGpuOptions,
|
||||
coerce: (value) => {
|
||||
if (value == null || value == "")
|
||||
return undefined;
|
||||
return parseNodeLlamaCppGpuOption(value);
|
||||
},
|
||||
defaultDescription: "Uses the latest local build, and fallbacks to \"auto\"",
|
||||
description: "Compute layer implementation type to use for llama.cpp. If omitted, uses the latest local build, and fallbacks to \"auto\""
|
||||
})
|
||||
.option("systemInfo", {
|
||||
alias: "i",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print llama.cpp system info"
|
||||
})
|
||||
.option("text", {
|
||||
type: "string",
|
||||
description: "First text to automatically start generating completion for"
|
||||
})
|
||||
.option("textFile", {
|
||||
type: "string",
|
||||
description: "Path to a file to load text from and use as the first text to automatically start generating completion for"
|
||||
})
|
||||
.option("contextSize", {
|
||||
alias: "c",
|
||||
type: "number",
|
||||
description: "Context size to use for the model context",
|
||||
default: -1,
|
||||
defaultDescription: "Automatically determined based on the available VRAM"
|
||||
})
|
||||
.option("batchSize", {
|
||||
alias: "b",
|
||||
type: "number",
|
||||
description: "Batch size to use for the model context"
|
||||
})
|
||||
.option("flashAttention", {
|
||||
alias: "fa",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Enable flash attention"
|
||||
})
|
||||
.option("swaFullCache", {
|
||||
alias: "noSwa",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable SWA (Sliding Window Attention) on supported models"
|
||||
})
|
||||
.option("threads", {
|
||||
type: "number",
|
||||
defaultDescription: "Number of cores that are useful for math on the current machine",
|
||||
description: "Number of threads to use for the evaluation of tokens"
|
||||
})
|
||||
.option("temperature", {
|
||||
alias: "t",
|
||||
type: "number",
|
||||
default: 0,
|
||||
description: "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The suggested temperature is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. Set to `0` to disable."
|
||||
})
|
||||
.option("minP", {
|
||||
alias: "mp",
|
||||
type: "number",
|
||||
default: 0,
|
||||
description: "From the next token candidates, discard the percentage of tokens with the lowest probability. For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded. This is useful for generating more high-quality results when using a high temperature. Set to a value between `0` and `1` to enable. Only relevant when `temperature` is set to a value greater than `0`."
|
||||
})
|
||||
.option("topK", {
|
||||
alias: "k",
|
||||
type: "number",
|
||||
default: 40,
|
||||
description: "Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation. An integer number between `1` and the size of the vocabulary. Set to `0` to disable (which uses the full vocabulary). Only relevant when `temperature` is set to a value greater than 0."
|
||||
})
|
||||
.option("topP", {
|
||||
alias: "p",
|
||||
type: "number",
|
||||
default: 0.95,
|
||||
description: "Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P, and samples the next token only from this set. A float number between `0` and `1`. Set to `1` to disable. Only relevant when `temperature` is set to a value greater than `0`."
|
||||
})
|
||||
.option("seed", {
|
||||
type: "number",
|
||||
description: "Used to control the randomness of the generated text. Only relevant when using `temperature`.",
|
||||
defaultDescription: "The current epoch time"
|
||||
})
|
||||
.option("gpuLayers", {
|
||||
alias: "gl",
|
||||
type: "number",
|
||||
description: "number of layers to store in VRAM",
|
||||
default: -1,
|
||||
defaultDescription: "Automatically determined based on the available VRAM"
|
||||
})
|
||||
.option("repeatPenalty", {
|
||||
alias: "rp",
|
||||
type: "number",
|
||||
default: 1.1,
|
||||
description: "Prevent the model from repeating the same token too much. Set to `1` to disable."
|
||||
})
|
||||
.option("lastTokensRepeatPenalty", {
|
||||
alias: "rpn",
|
||||
type: "number",
|
||||
default: 64,
|
||||
description: "Number of recent tokens generated by the model to apply penalties to repetition of"
|
||||
})
|
||||
.option("penalizeRepeatingNewLine", {
|
||||
alias: "rpnl",
|
||||
type: "boolean",
|
||||
default: true,
|
||||
description: "Penalize new line tokens. set `--no-penalizeRepeatingNewLine` or `--no-rpnl` to disable"
|
||||
})
|
||||
.option("repeatFrequencyPenalty", {
|
||||
alias: "rfp",
|
||||
type: "number",
|
||||
description: "For n time a token is in the `punishTokens` array, lower its probability by `n * repeatFrequencyPenalty`. Set to a value between `0` and `1` to enable."
|
||||
})
|
||||
.option("repeatPresencePenalty", {
|
||||
alias: "rpp",
|
||||
type: "number",
|
||||
description: "Lower the probability of all the tokens in the `punishTokens` array by `repeatPresencePenalty`. Set to a value between `0` and `1` to enable."
|
||||
})
|
||||
.option("maxTokens", {
|
||||
alias: "mt",
|
||||
type: "number",
|
||||
default: 0,
|
||||
description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size"
|
||||
})
|
||||
.option("tokenPredictionDraftModel", {
|
||||
alias: ["dm", "draftModel"],
|
||||
type: "string",
|
||||
description: "Model file to use for draft sequence token prediction (speculative decoding). Can be a path to a local file or a URI of a model file to download"
|
||||
})
|
||||
.option("tokenPredictionModelContextSize", {
|
||||
alias: ["dc", "draftContextSize", "draftContext"],
|
||||
type: "number",
|
||||
description: "Max context size to use for the draft sequence token prediction model context",
|
||||
default: 4096
|
||||
})
|
||||
.option("debug", {
|
||||
alias: "d",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print llama.cpp info and debug logs"
|
||||
})
|
||||
.option("numa", {
|
||||
type: "string",
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: llamaNumaOptions,
|
||||
coerce: (value) => {
|
||||
if (value == null || value == "")
|
||||
return false;
|
||||
return parseNumaOption(value);
|
||||
},
|
||||
defaultDescription: "false",
|
||||
description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
|
||||
})
|
||||
.option("meter", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Log how many tokens were used as input and output for each response"
|
||||
})
|
||||
.option("timing", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print how how long it took to generate each response"
|
||||
})
|
||||
.option("noMmap", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable mmap (memory-mapped file) usage"
|
||||
})
|
||||
.option("noDirectIo", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable Direct I/O usage when available"
|
||||
})
|
||||
.option("printTimings", {
|
||||
alias: "pt",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print llama.cpp's internal timings after each response"
|
||||
});
|
||||
},
|
||||
async handler({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }) {
|
||||
try {
|
||||
await RunCompletion({
|
||||
modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
|
||||
threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
|
||||
repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
|
||||
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.error(err);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
};
|
||||
async function RunCompletion({ modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }) {
|
||||
if (contextSize === -1)
|
||||
contextSize = undefined;
|
||||
if (gpuLayers === -1)
|
||||
gpuLayers = undefined;
|
||||
const headers = resolveHeaderFlag(headerArg);
|
||||
if (debug)
|
||||
console.info(`${chalk.yellow("Log level:")} debug`);
|
||||
const llamaLogLevel = debug
|
||||
? LlamaLogLevel.debug
|
||||
: LlamaLogLevel.warn;
|
||||
const llama = gpu == null
|
||||
? await getLlama("lastBuild", {
|
||||
logLevel: llamaLogLevel,
|
||||
numa
|
||||
})
|
||||
: await getLlama({
|
||||
gpu,
|
||||
logLevel: llamaLogLevel,
|
||||
numa
|
||||
});
|
||||
const logBatchSize = batchSize != null;
|
||||
const useMmap = !noMmap && llama.supportsMmap;
|
||||
const useDirectIo = !noDirectIo;
|
||||
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
useMmap
|
||||
});
|
||||
const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
|
||||
? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
useMmap,
|
||||
consoleTitle: "Draft model file"
|
||||
})
|
||||
: undefined;
|
||||
if (systemInfo)
|
||||
console.log(llama.systemInfo);
|
||||
if (textFile != null && textFile !== "") {
|
||||
if (text != null && text !== "")
|
||||
console.warn(chalk.yellow("Both `text` and `textFile` were specified. `textFile` will be used."));
|
||||
text = await fs.readFile(path.resolve(process.cwd(), textFile), "utf8");
|
||||
}
|
||||
if (batchSize != null && contextSize != null && batchSize > contextSize) {
|
||||
console.warn(chalk.yellow("Batch size is greater than the context size. Batch size will be set to the context size."));
|
||||
batchSize = contextSize;
|
||||
}
|
||||
let initialText = text ?? null;
|
||||
const model = await withProgressLog({
|
||||
loadingText: chalk.blue.bold("Loading model"),
|
||||
successText: chalk.blue("Model loaded"),
|
||||
failText: chalk.blue("Failed to load model"),
|
||||
liveUpdates: !debug,
|
||||
noProgress: debug,
|
||||
liveCtrlCSendsAbortSignal: true
|
||||
}, async (progressUpdater) => {
|
||||
try {
|
||||
return await llama.loadModel({
|
||||
modelPath: resolvedModelPath,
|
||||
gpuLayers: gpuLayers != null
|
||||
? gpuLayers
|
||||
: contextSize != null
|
||||
? { fitContext: { contextSize } }
|
||||
: undefined,
|
||||
defaultContextFlashAttention: flashAttention,
|
||||
defaultContextSwaFullCache: swaFullCache,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
ignoreMemorySafetyChecks: gpuLayers != null,
|
||||
onLoadProgress(loadProgress) {
|
||||
progressUpdater.setProgress(loadProgress);
|
||||
},
|
||||
loadSignal: progressUpdater.abortSignal
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (err === progressUpdater.abortSignal?.reason)
|
||||
process.exit(0);
|
||||
throw err;
|
||||
}
|
||||
finally {
|
||||
if (llama.logLevel === LlamaLogLevel.debug) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.info();
|
||||
}
|
||||
}
|
||||
});
|
||||
const draftModel = resolvedDraftModelPath == null
|
||||
? undefined
|
||||
: await withProgressLog({
|
||||
loadingText: chalk.blue.bold("Loading draft model"),
|
||||
successText: chalk.blue("Draft model loaded"),
|
||||
failText: chalk.blue("Failed to load draft model"),
|
||||
liveUpdates: !debug,
|
||||
noProgress: debug,
|
||||
liveCtrlCSendsAbortSignal: true
|
||||
}, async (progressUpdater) => {
|
||||
try {
|
||||
return await llama.loadModel({
|
||||
modelPath: resolvedDraftModelPath,
|
||||
defaultContextFlashAttention: flashAttention,
|
||||
defaultContextSwaFullCache: swaFullCache,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
onLoadProgress(loadProgress) {
|
||||
progressUpdater.setProgress(loadProgress);
|
||||
},
|
||||
loadSignal: progressUpdater.abortSignal
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (err === progressUpdater.abortSignal?.reason)
|
||||
process.exit(0);
|
||||
throw err;
|
||||
}
|
||||
finally {
|
||||
if (llama.logLevel === LlamaLogLevel.debug) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.info();
|
||||
}
|
||||
}
|
||||
});
|
||||
const draftContext = draftModel == null
|
||||
? undefined
|
||||
: await withOra({
|
||||
loading: chalk.blue("Creating draft context"),
|
||||
success: chalk.blue("Draft context created"),
|
||||
fail: chalk.blue("Failed to create draft context"),
|
||||
useStatusLogs: debug
|
||||
}, async () => {
|
||||
try {
|
||||
return await draftModel.createContext({
|
||||
contextSize: { max: tokenPredictionModelContextSize }
|
||||
});
|
||||
}
|
||||
finally {
|
||||
if (llama.logLevel === LlamaLogLevel.debug) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.info();
|
||||
}
|
||||
}
|
||||
});
|
||||
const context = await withOra({
|
||||
loading: chalk.blue("Creating context"),
|
||||
success: chalk.blue("Context created"),
|
||||
fail: chalk.blue("Failed to create context"),
|
||||
useStatusLogs: debug
|
||||
}, async () => {
|
||||
try {
|
||||
return await model.createContext({
|
||||
contextSize: contextSize != null ? contextSize : undefined,
|
||||
batchSize: batchSize != null ? batchSize : undefined,
|
||||
threads: threads === null ? undefined : threads,
|
||||
ignoreMemorySafetyChecks: gpuLayers != null || contextSize != null,
|
||||
performanceTracking: printTimings
|
||||
});
|
||||
}
|
||||
finally {
|
||||
if (llama.logLevel === LlamaLogLevel.debug) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.info();
|
||||
}
|
||||
}
|
||||
});
|
||||
const draftContextSequence = draftContext?.getSequence();
|
||||
const contextSequence = draftContextSequence != null
|
||||
? context.getSequence({
|
||||
tokenPredictor: new DraftSequenceTokenPredictor(draftContextSequence)
|
||||
})
|
||||
: context.getSequence();
|
||||
const completion = new LlamaCompletion({
|
||||
contextSequence
|
||||
});
|
||||
let lastDraftTokenMeterState = draftContextSequence?.tokenMeter.getState();
|
||||
let lastTokenMeterState = contextSequence.tokenMeter.getState();
|
||||
let lastTokenPredictionsStats = contextSequence.tokenPredictions;
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
const padTitle = await printCommonInfoLines({
|
||||
context,
|
||||
draftContext,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
minTitleLength: "Complete".length + 1,
|
||||
logBatchSize,
|
||||
tokenMeterEnabled: meter
|
||||
});
|
||||
printInfoLine({
|
||||
title: "Complete",
|
||||
padTitle: padTitle,
|
||||
info: [{
|
||||
title: "Repeat penalty",
|
||||
value: `${repeatPenalty} (apply to last ${lastTokensRepeatPenalty} tokens)`
|
||||
}, {
|
||||
show: repeatFrequencyPenalty != null,
|
||||
title: "Repeat frequency penalty",
|
||||
value: String(repeatFrequencyPenalty)
|
||||
}, {
|
||||
show: repeatPresencePenalty != null,
|
||||
title: "Repeat presence penalty",
|
||||
value: String(repeatPresencePenalty)
|
||||
}, {
|
||||
show: !penalizeRepeatingNewLine,
|
||||
title: "Penalize repeating new line",
|
||||
value: "disabled"
|
||||
}, {
|
||||
show: timing,
|
||||
title: "Response timing",
|
||||
value: "enabled"
|
||||
}]
|
||||
});
|
||||
// this is for ora to not interfere with readline
|
||||
await new Promise((resolve) => setTimeout(resolve, 1));
|
||||
const replHistory = [];
|
||||
async function getPrompt() {
|
||||
const rl = readline.createInterface({
|
||||
input: process.stdin,
|
||||
output: process.stdout,
|
||||
history: replHistory.slice()
|
||||
});
|
||||
const res = await new Promise((accept) => rl.question(chalk.yellow("> "), accept));
|
||||
rl.close();
|
||||
return res;
|
||||
}
|
||||
while (true) {
|
||||
const input = initialText != null
|
||||
? initialText
|
||||
: await getPrompt();
|
||||
if (initialText != null) {
|
||||
console.log(chalk.green("> ") + initialText);
|
||||
initialText = null;
|
||||
}
|
||||
else
|
||||
await replHistory.push(input);
|
||||
if (input === ".exit")
|
||||
break;
|
||||
process.stdout.write(chalk.yellow("Completion: "));
|
||||
const [startColor, endColor] = chalk.blue("MIDDLE").split("MIDDLE");
|
||||
const abortController = new AbortController();
|
||||
const consoleInteraction = new ConsoleInteraction();
|
||||
consoleInteraction.onKey(ConsoleInteractionKey.ctrlC, async () => {
|
||||
abortController.abort();
|
||||
consoleInteraction.stop();
|
||||
});
|
||||
const timeBeforePrompt = Date.now();
|
||||
try {
|
||||
process.stdout.write(startColor);
|
||||
consoleInteraction.start();
|
||||
await completion.generateCompletion(input, {
|
||||
temperature,
|
||||
minP,
|
||||
topK,
|
||||
topP,
|
||||
seed: seed ?? undefined,
|
||||
signal: abortController.signal,
|
||||
repeatPenalty: {
|
||||
penalty: repeatPenalty,
|
||||
frequencyPenalty: repeatFrequencyPenalty != null ? repeatFrequencyPenalty : undefined,
|
||||
presencePenalty: repeatPresencePenalty != null ? repeatPresencePenalty : undefined,
|
||||
penalizeNewLine: penalizeRepeatingNewLine,
|
||||
lastTokens: lastTokensRepeatPenalty
|
||||
},
|
||||
maxTokens: maxTokens === -1
|
||||
? context.contextSize
|
||||
: maxTokens <= 0
|
||||
? undefined
|
||||
: maxTokens,
|
||||
onTextChunk(chunk) {
|
||||
process.stdout.write(chunk);
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (!(abortController.signal.aborted && err === abortController.signal.reason))
|
||||
throw err;
|
||||
}
|
||||
finally {
|
||||
consoleInteraction.stop();
|
||||
if (abortController.signal.aborted)
|
||||
process.stdout.write(endColor + chalk.yellow("[generation aborted by user]"));
|
||||
else
|
||||
process.stdout.write(endColor);
|
||||
console.log();
|
||||
}
|
||||
const timeAfterPrompt = Date.now();
|
||||
if (printTimings) {
|
||||
if (LlamaLogLevelGreaterThan(llama.logLevel, LlamaLogLevel.info))
|
||||
llama.logLevel = LlamaLogLevel.info;
|
||||
await context.printTimings();
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
llama.logLevel = llamaLogLevel;
|
||||
}
|
||||
if (timing)
|
||||
console.info(chalk.dim("Response duration: ") +
|
||||
prettyMilliseconds(timeAfterPrompt - timeBeforePrompt, {
|
||||
keepDecimalsOnWholeSeconds: true,
|
||||
secondsDecimalDigits: 2,
|
||||
separateMilliseconds: true,
|
||||
compact: false
|
||||
}));
|
||||
if (meter) {
|
||||
const newTokenMeterState = contextSequence.tokenMeter.getState();
|
||||
const tokenMeterDiff = TokenMeter.diff(newTokenMeterState, lastTokenMeterState);
|
||||
lastTokenMeterState = newTokenMeterState;
|
||||
const showDraftTokenMeterDiff = lastDraftTokenMeterState != null && draftContextSequence != null;
|
||||
const tokenPredictionsStats = contextSequence.tokenPredictions;
|
||||
const validatedTokenPredictions = tokenPredictionsStats.validated - lastTokenPredictionsStats.validated;
|
||||
const refutedTokenPredictions = tokenPredictionsStats.refuted - lastTokenPredictionsStats.refuted;
|
||||
const usedTokenPredictions = tokenPredictionsStats.used - lastTokenPredictionsStats.used;
|
||||
const unusedTokenPredictions = tokenPredictionsStats.unused - lastTokenPredictionsStats.unused;
|
||||
lastTokenPredictionsStats = tokenPredictionsStats;
|
||||
console.info([
|
||||
showDraftTokenMeterDiff && (chalk.yellow("Main".padEnd("Drafter".length))),
|
||||
chalk.dim("Input tokens:") + " " + String(tokenMeterDiff.usedInputTokens).padEnd(5, " "),
|
||||
chalk.dim("Output tokens:") + " " + String(tokenMeterDiff.usedOutputTokens).padEnd(5, " "),
|
||||
showDraftTokenMeterDiff && (chalk.dim("Validated predictions:") + " " + String(validatedTokenPredictions).padEnd(5, " ")),
|
||||
showDraftTokenMeterDiff && (chalk.dim("Refuted predictions:") + " " + String(refutedTokenPredictions).padEnd(5, " ")),
|
||||
showDraftTokenMeterDiff && (chalk.dim("Used predictions:") + " " + String(usedTokenPredictions).padEnd(5, " ")),
|
||||
showDraftTokenMeterDiff && (chalk.dim("Unused predictions:") + " " + String(unusedTokenPredictions).padEnd(5, " "))
|
||||
].filter(Boolean).join(" "));
|
||||
if (lastDraftTokenMeterState != null && draftContextSequence != null) {
|
||||
const newDraftTokenMeterState = draftContextSequence.tokenMeter.getState();
|
||||
const draftTokenMeterDiff = TokenMeter.diff(newDraftTokenMeterState, lastDraftTokenMeterState);
|
||||
lastDraftTokenMeterState = newDraftTokenMeterState;
|
||||
console.info([
|
||||
chalk.yellow("Drafter"),
|
||||
chalk.dim("Input tokens:") + " " + String(draftTokenMeterDiff.usedInputTokens).padEnd(5, " "),
|
||||
chalk.dim("Output tokens:") + " " + String(draftTokenMeterDiff.usedOutputTokens).padEnd(5, " ")
|
||||
].join(" "));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=CompleteCommand.js.map
|
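The command above strings together the library's completion primitives: getLlama → loadModel → createContext → getSequence → LlamaCompletion → generateCompletion, with TokenMeter.diff providing the `--meter` accounting. Below is a minimal sketch of that same flow in application code, assuming getLlama, LlamaCompletion and TokenMeter are re-exported from the package root (the CLI imports them from internal dist paths); the model path and prompt are placeholders.

// Minimal sketch of the CompleteCommand flow above; model path and prompt are placeholders.
import {getLlama, LlamaCompletion, TokenMeter} from "node-llama-cpp"; // assumed root re-exports

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "./model.gguf"}); // placeholder path
const context = await model.createContext();
const contextSequence = context.getSequence(); // pass {tokenPredictor} here for speculative decoding, as the CLI does above
const completion = new LlamaCompletion({contextSequence});

const meterBefore = contextSequence.tokenMeter.getState();
await completion.generateCompletion("The quick brown fox", {
    temperature: 0.8,
    maxTokens: 128,
    onTextChunk(chunk) {
        process.stdout.write(chunk); // stream the completion as it is generated
    }
});
console.log();

// the same accounting the --meter flag prints above
const diff = TokenMeter.diff(contextSequence.tokenMeter.getState(), meterBefore);
console.info(`Input tokens: ${diff.usedInputTokens}, output tokens: ${diff.usedOutputTokens}`);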
||||
1
node_modules/node-llama-cpp/dist/cli/commands/CompleteCommand.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/cli/commands/CompleteCommand.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
7
node_modules/node-llama-cpp/dist/cli/commands/DebugCommand.d.ts
generated
vendored
Normal file
7
node_modules/node-llama-cpp/dist/cli/commands/DebugCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
import { CommandModule } from "yargs";
|
||||
declare const debugFunctions: readonly ["vram", "cmakeOptions"];
|
||||
type DebugCommand = {
|
||||
function: (typeof debugFunctions)[number];
|
||||
};
|
||||
export declare const DebugCommand: CommandModule<object, DebugCommand>;
|
||||
export {};
|
||||
55
node_modules/node-llama-cpp/dist/cli/commands/DebugCommand.js
generated
vendored
Normal file
55
node_modules/node-llama-cpp/dist/cli/commands/DebugCommand.js
generated
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
import os from "os";
|
||||
import chalk from "chalk";
|
||||
import { getLlama } from "../../bindings/getLlama.js";
|
||||
import { prettyPrintObject } from "../../utils/prettyPrintObject.js";
|
||||
import { logUsedGpuTypeOption } from "../utils/logUsedGpuTypeOption.js";
|
||||
import { toBytes } from "../utils/toBytes.js";
|
||||
const debugFunctions = ["vram", "cmakeOptions"];
|
||||
export const DebugCommand = {
|
||||
command: "debug [function]",
|
||||
describe: false,
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.option("function", {
|
||||
type: "string",
|
||||
choices: debugFunctions,
|
||||
demandOption: true,
|
||||
description: "debug function to run"
|
||||
});
|
||||
},
|
||||
async handler({ function: func }) {
|
||||
if (func === "vram")
|
||||
await DebugVramFunction();
|
||||
else if (func === "cmakeOptions")
|
||||
await DebugCmakeOptionsFunction();
|
||||
else
|
||||
void func;
|
||||
}
|
||||
};
|
||||
async function DebugVramFunction() {
|
||||
const llama = await getLlama("lastBuild");
|
||||
const vramStatus = await llama.getVramState();
|
||||
const totalMemory = os.totalmem();
|
||||
const freeMemory = os.freemem();
|
||||
const usedMemory = totalMemory - freeMemory;
|
||||
const getPercentageString = (amount, total) => {
|
||||
if (total === 0)
|
||||
return "0";
|
||||
return String(Math.floor((amount / total) * 100 * 100) / 100);
|
||||
};
|
||||
logUsedGpuTypeOption(llama.gpu);
|
||||
console.info();
|
||||
console.info(`${chalk.yellow("Used VRAM:")} ${getPercentageString(vramStatus.used, vramStatus.total)}% ${chalk.gray("(" + toBytes(vramStatus.used) + "/" + toBytes(vramStatus.total) + ")")}`);
|
||||
console.info(`${chalk.yellow("Free VRAM:")} ${getPercentageString(vramStatus.free, vramStatus.total)}% ${chalk.gray("(" + toBytes(vramStatus.free) + "/" + toBytes(vramStatus.total) + ")")}`);
|
||||
console.info();
|
||||
console.info(`${chalk.yellow("Used RAM:")} ${getPercentageString(usedMemory, totalMemory)}% ${chalk.gray("(" + toBytes(usedMemory) + "/" + toBytes(totalMemory) + ")")}`);
|
||||
console.info(`${chalk.yellow("Free RAM:")} ${getPercentageString(freeMemory, totalMemory)}% ${chalk.gray("(" + toBytes(freeMemory) + "/" + toBytes(totalMemory) + ")")}`);
|
||||
}
|
||||
async function DebugCmakeOptionsFunction() {
|
||||
const llama = await getLlama("lastBuild");
|
||||
logUsedGpuTypeOption(llama.gpu);
|
||||
console.info();
|
||||
console.info(`${chalk.yellow("CMake options:")} ${prettyPrintObject(llama.cmakeOptions)}`);
|
||||
console.info(`${chalk.yellow("Release:")} ${prettyPrintObject(llama.llamaCppRelease)}`);
|
||||
}
|
||||
//# sourceMappingURL=DebugCommand.js.map
|
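The vram debug function above boils down to two queries: llama.getVramState() for GPU memory and os.totalmem()/os.freemem() for system memory. A small sketch of the same checks, assuming getLlama is re-exported from the package root; it prints raw byte counts instead of the formatted percentages used above.

// Sketch of the `debug vram` checks above.
import os from "os";
import {getLlama} from "node-llama-cpp"; // assumed root re-export

const llama = await getLlama("lastBuild"); // reuse the last local build, like the debug command
const vramStatus = await llama.getVramState(); // {used, free, total}

console.info(`GPU type: ${llama.gpu}`);
console.info(`VRAM: ${vramStatus.used} used / ${vramStatus.total} total bytes`);
console.info(`RAM:  ${os.totalmem() - os.freemem()} used / ${os.totalmem()} total bytes`);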
||||
1
node_modules/node-llama-cpp/dist/cli/commands/DebugCommand.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/cli/commands/DebugCommand.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"DebugCommand.js","sourceRoot":"","sources":["../../../src/cli/commands/DebugCommand.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,IAAI,CAAC;AAEpB,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAC,QAAQ,EAAC,MAAM,4BAA4B,CAAC;AACpD,OAAO,EAAC,iBAAiB,EAAC,MAAM,kCAAkC,CAAC;AACnE,OAAO,EAAC,oBAAoB,EAAC,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAC,OAAO,EAAC,MAAM,qBAAqB,CAAC;AAE5C,MAAM,cAAc,GAAG,CAAC,MAAM,EAAE,cAAc,CAAU,CAAC;AAKzD,MAAM,CAAC,MAAM,YAAY,GAAwC;IAC7D,OAAO,EAAE,kBAAkB;IAC3B,QAAQ,EAAE,KAAK;IACf,OAAO,CAAC,KAAK;QACT,OAAO,KAAK;aACP,MAAM,CAAC,UAAU,EAAE;YAChB,IAAI,EAAE,QAAQ;YACd,OAAO,EAAE,cAAc;YACvB,YAAY,EAAE,IAAI;YAClB,WAAW,EAAE,uBAAuB;SACvC,CAAC,CAAC;IACX,CAAC;IACD,KAAK,CAAC,OAAO,CAAC,EAAC,QAAQ,EAAE,IAAI,EAAe;QACxC,IAAI,IAAI,KAAK,MAAM;YACf,MAAM,iBAAiB,EAAE,CAAC;aACzB,IAAI,IAAI,KAAK,cAAc;YAC5B,MAAM,yBAAyB,EAAE,CAAC;;YAElC,KAAM,IAAqB,CAAC;IACpC,CAAC;CACJ,CAAC;AAEF,KAAK,UAAU,iBAAiB;IAC5B,MAAM,KAAK,GAAG,MAAM,QAAQ,CAAC,WAAW,CAAC,CAAC;IAE1C,MAAM,UAAU,GAAG,MAAM,KAAK,CAAC,YAAY,EAAE,CAAC;IAC9C,MAAM,WAAW,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC;IAClC,MAAM,UAAU,GAAG,EAAE,CAAC,OAAO,EAAE,CAAC;IAChC,MAAM,UAAU,GAAG,WAAW,GAAG,UAAU,CAAC;IAE5C,MAAM,mBAAmB,GAAG,CAAC,MAAc,EAAE,KAAa,EAAE,EAAE;QAC1D,IAAI,KAAK,KAAK,CAAC;YACX,OAAO,GAAG,CAAC;QAEf,OAAO,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,GAAG,GAAG,GAAG,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC;IAClE,CAAC,CAAC;IAEF,oBAAoB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAChC,OAAO,CAAC,IAAI,EAAE,CAAC;IAEf,OAAO,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC,YAAY,CAAC,IAAI,mBAAmB,CAAC,UAAU,CAAC,IAAI,EAAE,UAAU,CAAC,KAAK,CAAC,KAAK,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,GAAG,GAAG,GAAG,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC;IAC/L,OAAO,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC,YAAY,CAAC,IAAI,mBAAmB,CAAC,UAAU,CAAC,IAAI,EAAE,UAAU,CAAC,KAAK,CAAC,KAAK,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,GAAG,GAAG,GAAG,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC;IAC/L,OAAO,CAAC,IAAI,EAAE,CAAC;IACf,OAAO,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC,WAAW,CAAC,IAAI,mBAAmB,CAAC,UAAU,EAAE,WAAW,CAAC,KAAK,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,UAAU,CAAC,GAAG,GAAG,GAAG,OAAO,CAAC,WAAW,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC;IAC1K,OAAO,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC,WAAW,CAAC,IAAI,mBAAmB,CAAC,UAAU,EAAE,WAAW,CAAC,KAAK,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,UAAU,CAAC,GAAG,GAAG,GAAG,OAAO,CAAC,WAAW,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC;AAC9K,CAAC;AAED,KAAK,UAAU,yBAAyB;IACpC,MAAM,KAAK,GAAG,MAAM,QAAQ,CAAC,WAAW,CAAC,CAAC;IAE1C,oBAAoB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAChC,OAAO,CAAC,IAAI,EAAE,CAAC;IAEf,OAAO,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC,gBAAgB,CAAC,IAAI,iBAAiB,CAAC,KAAK,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC;IAC3F,OAAO,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC,UAAU,CAAC,IAAI,iBAAiB,CAAC,KAAK,CAAC,eAAe,CAAC,EAAE,CAAC,CAAC;AAC5F,CAAC"}
|
||||
40
node_modules/node-llama-cpp/dist/cli/commands/InfillCommand.d.ts
generated
vendored
Normal file
40
node_modules/node-llama-cpp/dist/cli/commands/InfillCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
import { CommandModule } from "yargs";
|
||||
import { BuildGpu, LlamaNuma } from "../../bindings/types.js";
|
||||
type InfillCommand = {
|
||||
modelPath?: string;
|
||||
header?: string[];
|
||||
gpu?: BuildGpu | "auto";
|
||||
systemInfo: boolean;
|
||||
prefix?: string;
|
||||
prefixFile?: string;
|
||||
suffix?: string;
|
||||
suffixFile?: string;
|
||||
contextSize?: number;
|
||||
batchSize?: number;
|
||||
flashAttention?: boolean;
|
||||
swaFullCache?: boolean;
|
||||
threads?: number;
|
||||
temperature: number;
|
||||
minP: number;
|
||||
topK: number;
|
||||
topP: number;
|
||||
seed?: number;
|
||||
gpuLayers?: number;
|
||||
repeatPenalty: number;
|
||||
lastTokensRepeatPenalty: number;
|
||||
penalizeRepeatingNewLine: boolean;
|
||||
repeatFrequencyPenalty?: number;
|
||||
repeatPresencePenalty?: number;
|
||||
maxTokens: number;
|
||||
tokenPredictionDraftModel?: string;
|
||||
tokenPredictionModelContextSize?: number;
|
||||
debug: boolean;
|
||||
numa?: LlamaNuma;
|
||||
meter: boolean;
|
||||
timing: boolean;
|
||||
noMmap: boolean;
|
||||
noDirectIo: boolean;
|
||||
printTimings: boolean;
|
||||
};
|
||||
export declare const InfillCommand: CommandModule<object, InfillCommand>;
|
||||
export {};
|
||||
602
node_modules/node-llama-cpp/dist/cli/commands/InfillCommand.js
generated
vendored
Normal file
602
node_modules/node-llama-cpp/dist/cli/commands/InfillCommand.js
generated
vendored
Normal file
@@ -0,0 +1,602 @@
|
||||
import * as readline from "readline";
|
||||
import process from "process";
|
||||
import path from "path";
|
||||
import chalk from "chalk";
|
||||
import fs from "fs-extra";
|
||||
import prettyMilliseconds from "pretty-ms";
|
||||
import { getLlama } from "../../bindings/getLlama.js";
|
||||
import { LlamaLogLevel, LlamaLogLevelGreaterThan, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption, parseNumaOption } from "../../bindings/types.js";
|
||||
import { LlamaCompletion } from "../../evaluator/LlamaCompletion.js";
|
||||
import withOra from "../../utils/withOra.js";
|
||||
import { TokenMeter } from "../../evaluator/TokenMeter.js";
|
||||
import { printInfoLine } from "../utils/printInfoLine.js";
|
||||
import { printCommonInfoLines } from "../utils/printCommonInfoLines.js";
|
||||
import { resolveCommandGgufPath } from "../utils/resolveCommandGgufPath.js";
|
||||
import { withProgressLog } from "../../utils/withProgressLog.js";
|
||||
import { resolveHeaderFlag } from "../utils/resolveHeaderFlag.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { documentationPageUrls } from "../../config.js";
|
||||
import { ConsoleInteraction, ConsoleInteractionKey } from "../utils/ConsoleInteraction.js";
|
||||
import { DraftSequenceTokenPredictor } from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
|
||||
export const InfillCommand = {
|
||||
command: "infill [modelPath]",
|
||||
describe: withCliCommandDescriptionDocsUrl("Generate an infill completion for a given suffix and prefix texts", documentationPageUrls.CLI.Infill),
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.option("modelPath", {
|
||||
alias: ["m", "model", "path", "url", "uri"],
|
||||
type: "string",
|
||||
description: "Model file to use for the infill. Can be a path to a local file or a URI of a model file to download. Leave empty to choose from a list of recommended models"
|
||||
})
|
||||
.option("header", {
|
||||
alias: ["H"],
|
||||
type: "string",
|
||||
array: true,
|
||||
description: "Headers to use when downloading a model from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers."
|
||||
})
|
||||
.option("gpu", {
|
||||
type: "string",
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: nodeLlamaCppGpuOptions,
|
||||
coerce: (value) => {
|
||||
if (value == null || value == "")
|
||||
return undefined;
|
||||
return parseNodeLlamaCppGpuOption(value);
|
||||
},
|
||||
defaultDescription: "Uses the latest local build, and fallbacks to \"auto\"",
|
||||
description: "Compute layer implementation type to use for llama.cpp. If omitted, uses the latest local build, and fallbacks to \"auto\""
|
||||
})
|
||||
.option("systemInfo", {
|
||||
alias: "i",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print llama.cpp system info"
|
||||
})
|
||||
.option("prefix", {
|
||||
type: "string",
|
||||
description: "First prefix text to automatically load"
|
||||
})
|
||||
.option("prefixFile", {
|
||||
type: "string",
|
||||
description: "Path to a file to load prefix text from automatically"
|
||||
})
|
||||
.option("suffix", {
|
||||
type: "string",
|
||||
description: "First suffix text to automatically load. Requires `prefix` or `prefixFile` to be set"
|
||||
})
|
||||
.option("suffixFile", {
|
||||
type: "string",
|
||||
description: "Path to a file to load suffix text from automatically. Requires `prefix` or `prefixFile` to be set"
|
||||
})
|
||||
.option("contextSize", {
|
||||
alias: "c",
|
||||
type: "number",
|
||||
description: "Context size to use for the model context",
|
||||
default: -1,
|
||||
defaultDescription: "Automatically determined based on the available VRAM"
|
||||
})
|
||||
.option("batchSize", {
|
||||
alias: "b",
|
||||
type: "number",
|
||||
description: "Batch size to use for the model context"
|
||||
})
|
||||
.option("flashAttention", {
|
||||
alias: "fa",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Enable flash attention"
|
||||
})
|
||||
.option("swaFullCache", {
|
||||
alias: "noSwa",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable SWA (Sliding Window Attention) on supported models"
|
||||
})
|
||||
.option("threads", {
|
||||
type: "number",
|
||||
defaultDescription: "Number of cores that are useful for math on the current machine",
|
||||
description: "Number of threads to use for the evaluation of tokens"
|
||||
})
|
||||
.option("temperature", {
|
||||
alias: "t",
|
||||
type: "number",
|
||||
default: 0,
|
||||
description: "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The suggested temperature is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. Set to `0` to disable."
|
||||
})
|
||||
.option("minP", {
|
||||
alias: "mp",
|
||||
type: "number",
|
||||
default: 0,
|
||||
description: "From the next token candidates, discard the percentage of tokens with the lowest probability. For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded. This is useful for generating more high-quality results when using a high temperature. Set to a value between `0` and `1` to enable. Only relevant when `temperature` is set to a value greater than `0`."
|
||||
})
|
||||
.option("topK", {
|
||||
alias: "k",
|
||||
type: "number",
|
||||
default: 40,
|
||||
description: "Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation. An integer number between `1` and the size of the vocabulary. Set to `0` to disable (which uses the full vocabulary). Only relevant when `temperature` is set to a value greater than 0."
|
||||
})
|
||||
.option("topP", {
|
||||
alias: "p",
|
||||
type: "number",
|
||||
default: 0.95,
|
||||
description: "Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P, and samples the next token only from this set. A float number between `0` and `1`. Set to `1` to disable. Only relevant when `temperature` is set to a value greater than `0`."
|
||||
})
|
||||
.option("seed", {
|
||||
type: "number",
|
||||
description: "Used to control the randomness of the generated text. Only relevant when using `temperature`.",
|
||||
defaultDescription: "The current epoch time"
|
||||
})
|
||||
.option("gpuLayers", {
|
||||
alias: "gl",
|
||||
type: "number",
|
||||
description: "number of layers to store in VRAM",
|
||||
default: -1,
|
||||
defaultDescription: "Automatically determined based on the available VRAM"
|
||||
})
|
||||
.option("repeatPenalty", {
|
||||
alias: "rp",
|
||||
type: "number",
|
||||
default: 1.1,
|
||||
description: "Prevent the model from repeating the same token too much. Set to `1` to disable."
|
||||
})
|
||||
.option("lastTokensRepeatPenalty", {
|
||||
alias: "rpn",
|
||||
type: "number",
|
||||
default: 64,
|
||||
description: "Number of recent tokens generated by the model to apply penalties to repetition of"
|
||||
})
|
||||
.option("penalizeRepeatingNewLine", {
|
||||
alias: "rpnl",
|
||||
type: "boolean",
|
||||
default: true,
|
||||
description: "Penalize new line tokens. set `--no-penalizeRepeatingNewLine` or `--no-rpnl` to disable"
|
||||
})
|
||||
.option("repeatFrequencyPenalty", {
|
||||
alias: "rfp",
|
||||
type: "number",
|
||||
description: "For n time a token is in the `punishTokens` array, lower its probability by `n * repeatFrequencyPenalty`. Set to a value between `0` and `1` to enable."
|
||||
})
|
||||
.option("repeatPresencePenalty", {
|
||||
alias: "rpp",
|
||||
type: "number",
|
||||
description: "Lower the probability of all the tokens in the `punishTokens` array by `repeatPresencePenalty`. Set to a value between `0` and `1` to enable."
|
||||
})
|
||||
.option("maxTokens", {
|
||||
alias: "mt",
|
||||
type: "number",
|
||||
default: 0,
|
||||
description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size"
|
||||
})
|
||||
.option("tokenPredictionDraftModel", {
|
||||
alias: ["dm", "draftModel"],
|
||||
type: "string",
|
||||
description: "Model file to use for draft sequence token prediction (speculative decoding). Can be a path to a local file or a URI of a model file to download"
|
||||
})
|
||||
.option("tokenPredictionModelContextSize", {
|
||||
alias: ["dc", "draftContextSize", "draftContext"],
|
||||
type: "number",
|
||||
description: "Max context size to use for the draft sequence token prediction model context",
|
||||
default: 4096
|
||||
})
|
||||
.option("debug", {
|
||||
alias: "d",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print llama.cpp info and debug logs"
|
||||
})
|
||||
.option("numa", {
|
||||
type: "string",
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: llamaNumaOptions,
|
||||
coerce: (value) => {
|
||||
if (value == null || value == "")
|
||||
return false;
|
||||
return parseNumaOption(value);
|
||||
},
|
||||
defaultDescription: "false",
|
||||
description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
|
||||
})
|
||||
.option("meter", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Log how many tokens were used as input and output for each response"
|
||||
})
|
||||
.option("timing", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print how how long it took to generate each response"
|
||||
})
|
||||
.option("noMmap", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable mmap (memory-mapped file) usage"
|
||||
})
|
||||
.option("noDirectIo", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable Direct I/O usage when available"
|
||||
})
|
||||
.option("printTimings", {
|
||||
alias: "pt",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print llama.cpp's internal timings after each response"
|
||||
});
|
||||
},
|
||||
async handler({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }) {
|
||||
try {
|
||||
await RunInfill({
|
||||
modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
|
||||
swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
|
||||
repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
|
||||
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.error(err);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
};
|
||||
async function RunInfill({ modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }) {
|
||||
if (contextSize === -1)
|
||||
contextSize = undefined;
|
||||
if (gpuLayers === -1)
|
||||
gpuLayers = undefined;
|
||||
const headers = resolveHeaderFlag(headerArg);
|
||||
if (debug)
|
||||
console.info(`${chalk.yellow("Log level:")} debug`);
|
||||
const llamaLogLevel = debug
|
||||
? LlamaLogLevel.debug
|
||||
: LlamaLogLevel.warn;
|
||||
const llama = gpu == null
|
||||
? await getLlama("lastBuild", {
|
||||
logLevel: llamaLogLevel,
|
||||
numa
|
||||
})
|
||||
: await getLlama({
|
||||
gpu,
|
||||
logLevel: llamaLogLevel,
|
||||
numa
|
||||
});
|
||||
const logBatchSize = batchSize != null;
|
||||
const useMmap = !noMmap && llama.supportsMmap;
|
||||
const useDirectIo = !noDirectIo;
|
||||
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
useMmap
|
||||
});
|
||||
const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
|
||||
? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
useMmap,
|
||||
consoleTitle: "Draft model file"
|
||||
})
|
||||
: undefined;
|
||||
if (systemInfo)
|
||||
console.log(llama.systemInfo);
|
||||
if (prefixFile != null && prefixFile !== "") {
|
||||
if (prefix != null && prefix !== "")
|
||||
console.warn(chalk.yellow("Both `prefix` and `prefixFile` were specified. `prefixFile` will be used."));
|
||||
prefix = await fs.readFile(path.resolve(process.cwd(), prefixFile), "utf8");
|
||||
}
|
||||
if (suffixFile != null && suffixFile !== "") {
|
||||
if (suffix != null && suffix !== "")
|
||||
console.warn(chalk.yellow("Both `suffix` and `suffixFile` were specified. `suffixFile` will be used."));
|
||||
suffix = await fs.readFile(path.resolve(process.cwd(), suffixFile), "utf8");
|
||||
}
|
||||
if (suffix != null && prefix == null) {
|
||||
console.warn(chalk.yellow("Suffix was specified but no prefix was specified. Suffix will be ignored."));
|
||||
suffix = undefined;
|
||||
}
|
||||
if (batchSize != null && contextSize != null && batchSize > contextSize) {
|
||||
console.warn(chalk.yellow("Batch size is greater than the context size. Batch size will be set to the context size."));
|
||||
batchSize = contextSize;
|
||||
}
|
||||
let initialPrefix = prefix ?? null;
|
||||
let initialSuffix = suffix ?? null;
|
||||
const model = await withProgressLog({
|
||||
loadingText: chalk.blue.bold("Loading model"),
|
||||
successText: chalk.blue("Model loaded"),
|
||||
failText: chalk.blue("Failed to load model"),
|
||||
liveUpdates: !debug,
|
||||
noProgress: debug,
|
||||
liveCtrlCSendsAbortSignal: true
|
||||
}, async (progressUpdater) => {
|
||||
try {
|
||||
return await llama.loadModel({
|
||||
modelPath: resolvedModelPath,
|
||||
gpuLayers: gpuLayers != null
|
||||
? gpuLayers
|
||||
: contextSize != null
|
||||
? { fitContext: { contextSize } }
|
||||
: undefined,
|
||||
defaultContextFlashAttention: flashAttention,
|
||||
defaultContextSwaFullCache: swaFullCache,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
ignoreMemorySafetyChecks: gpuLayers != null,
|
||||
onLoadProgress(loadProgress) {
|
||||
progressUpdater.setProgress(loadProgress);
|
||||
},
|
||||
loadSignal: progressUpdater.abortSignal
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (err === progressUpdater.abortSignal?.reason)
|
||||
process.exit(0);
|
||||
throw err;
|
||||
}
|
||||
finally {
|
||||
if (llama.logLevel === LlamaLogLevel.debug) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.info();
|
||||
}
|
||||
}
|
||||
});
|
||||
const draftModel = resolvedDraftModelPath == null
|
||||
? undefined
|
||||
: await withProgressLog({
|
||||
loadingText: chalk.blue.bold("Loading draft model"),
|
||||
successText: chalk.blue("Draft model loaded"),
|
||||
failText: chalk.blue("Failed to load draft model"),
|
||||
liveUpdates: !debug,
|
||||
noProgress: debug,
|
||||
liveCtrlCSendsAbortSignal: true
|
||||
}, async (progressUpdater) => {
|
||||
try {
|
||||
return await llama.loadModel({
|
||||
modelPath: resolvedDraftModelPath,
|
||||
defaultContextFlashAttention: flashAttention,
|
||||
defaultContextSwaFullCache: swaFullCache,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
onLoadProgress(loadProgress) {
|
||||
progressUpdater.setProgress(loadProgress);
|
||||
},
|
||||
loadSignal: progressUpdater.abortSignal
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (err === progressUpdater.abortSignal?.reason)
|
||||
process.exit(0);
|
||||
throw err;
|
||||
}
|
||||
finally {
|
||||
if (llama.logLevel === LlamaLogLevel.debug) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.info();
|
||||
}
|
||||
}
|
||||
});
|
||||
const draftContext = draftModel == null
|
||||
? undefined
|
||||
: await withOra({
|
||||
loading: chalk.blue("Creating draft context"),
|
||||
success: chalk.blue("Draft context created"),
|
||||
fail: chalk.blue("Failed to create draft context"),
|
||||
useStatusLogs: debug
|
||||
}, async () => {
|
||||
try {
|
||||
return await draftModel.createContext({
|
||||
contextSize: { max: tokenPredictionModelContextSize }
|
||||
});
|
||||
}
|
||||
finally {
|
||||
if (llama.logLevel === LlamaLogLevel.debug) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.info();
|
||||
}
|
||||
}
|
||||
});
|
||||
const context = await withOra({
|
||||
loading: chalk.blue("Creating context"),
|
||||
success: chalk.blue("Context created"),
|
||||
fail: chalk.blue("Failed to create context"),
|
||||
useStatusLogs: debug
|
||||
}, async () => {
|
||||
try {
|
||||
return await model.createContext({
|
||||
contextSize: contextSize != null ? contextSize : undefined,
|
||||
batchSize: batchSize != null ? batchSize : undefined,
|
||||
threads: threads === null ? undefined : threads,
|
||||
ignoreMemorySafetyChecks: gpuLayers != null || contextSize != null,
|
||||
performanceTracking: printTimings
|
||||
});
|
||||
}
|
||||
finally {
|
||||
if (llama.logLevel === LlamaLogLevel.debug) {
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
console.info();
|
||||
}
|
||||
}
|
||||
});
|
||||
const draftContextSequence = draftContext?.getSequence();
|
||||
const contextSequence = draftContextSequence != null
|
||||
? context.getSequence({
|
||||
tokenPredictor: new DraftSequenceTokenPredictor(draftContextSequence)
|
||||
})
|
||||
: context.getSequence();
|
||||
const completion = new LlamaCompletion({
|
||||
contextSequence
|
||||
});
|
||||
let lastDraftTokenMeterState = draftContextSequence?.tokenMeter.getState();
|
||||
let lastTokenMeterState = contextSequence.tokenMeter.getState();
|
||||
let lastTokenPredictionsStats = contextSequence.tokenPredictions;
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
const padTitle = await printCommonInfoLines({
|
||||
context,
|
||||
draftContext,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
logBatchSize,
|
||||
tokenMeterEnabled: meter
|
||||
});
|
||||
printInfoLine({
|
||||
title: "Infill",
|
||||
padTitle: padTitle,
|
||||
info: [{
|
||||
title: "Repeat penalty",
|
||||
value: `${repeatPenalty} (apply to last ${lastTokensRepeatPenalty} tokens)`
|
||||
}, {
|
||||
show: repeatFrequencyPenalty != null,
|
||||
title: "Repeat frequency penalty",
|
||||
value: String(repeatFrequencyPenalty)
|
||||
}, {
|
||||
show: repeatPresencePenalty != null,
|
||||
title: "Repeat presence penalty",
|
||||
value: String(repeatPresencePenalty)
|
||||
}, {
|
||||
show: !penalizeRepeatingNewLine,
|
||||
title: "Penalize repeating new line",
|
||||
value: "disabled"
|
||||
}, {
|
||||
show: timing,
|
||||
title: "Response timing",
|
||||
value: "enabled"
|
||||
}]
|
||||
});
|
||||
// this is for ora to not interfere with readline
|
||||
await new Promise((resolve) => setTimeout(resolve, 1));
|
||||
if (!completion.infillSupported) {
|
||||
console.log(chalk.red("Infill is not supported for this model"));
|
||||
process.exit(1);
|
||||
}
|
||||
const replPrefixHistory = [];
|
||||
const replSuffixHistory = [];
|
||||
async function getInput(name) {
|
||||
const rl = readline.createInterface({
|
||||
input: process.stdin,
|
||||
output: process.stdout,
|
||||
history: name === "Prefix"
|
||||
? replPrefixHistory.slice()
|
||||
: replSuffixHistory.slice()
|
||||
});
|
||||
const res = await new Promise((accept) => rl.question(chalk.yellow(name + "> "), accept));
|
||||
rl.close();
|
||||
return res;
|
||||
}
|
||||
while (true) {
|
||||
const prefixInput = initialPrefix != null
|
||||
? initialPrefix
|
||||
: await getInput("Prefix");
|
||||
if (initialPrefix != null) {
|
||||
console.log(chalk.green("Prefix> ") + initialPrefix);
|
||||
initialPrefix = null;
|
||||
}
|
||||
else
|
||||
await replPrefixHistory.push(prefixInput);
|
||||
if (prefixInput === ".exit")
|
||||
break;
|
||||
const suffixInput = initialSuffix != null
|
||||
? initialSuffix
|
||||
: await getInput("Suffix");
|
||||
if (initialSuffix != null) {
|
||||
console.log(chalk.green("Suffix> ") + initialSuffix);
|
||||
initialSuffix = null;
|
||||
}
|
||||
else
|
||||
await replSuffixHistory.push(suffixInput);
|
||||
if (suffixInput === ".exit")
|
||||
break;
|
||||
process.stdout.write(chalk.yellow("Infill: "));
|
||||
const [startColor, endColor] = chalk.blue("MIDDLE").split("MIDDLE");
|
||||
const abortController = new AbortController();
|
||||
const consoleInteraction = new ConsoleInteraction();
|
||||
consoleInteraction.onKey(ConsoleInteractionKey.ctrlC, async () => {
|
||||
abortController.abort();
|
||||
consoleInteraction.stop();
|
||||
});
|
||||
const timeBeforePrompt = Date.now();
|
||||
try {
|
||||
process.stdout.write(startColor);
|
||||
consoleInteraction.start();
|
||||
await completion.generateInfillCompletion(prefixInput, suffixInput, {
|
||||
temperature,
|
||||
minP,
|
||||
topK,
|
||||
topP,
|
||||
seed: seed ?? undefined,
|
||||
signal: abortController.signal,
|
||||
repeatPenalty: {
|
||||
penalty: repeatPenalty,
|
||||
frequencyPenalty: repeatFrequencyPenalty != null ? repeatFrequencyPenalty : undefined,
|
||||
presencePenalty: repeatPresencePenalty != null ? repeatPresencePenalty : undefined,
|
||||
penalizeNewLine: penalizeRepeatingNewLine,
|
||||
lastTokens: lastTokensRepeatPenalty
|
||||
},
|
||||
maxTokens: maxTokens === -1
|
||||
? context.contextSize
|
||||
: maxTokens <= 0
|
||||
? undefined
|
||||
: maxTokens,
|
||||
onTextChunk(chunk) {
|
||||
process.stdout.write(chunk);
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (!(abortController.signal.aborted && err === abortController.signal.reason))
|
||||
throw err;
|
||||
}
|
||||
finally {
|
||||
consoleInteraction.stop();
|
||||
if (abortController.signal.aborted)
|
||||
process.stdout.write(endColor + chalk.yellow("[generation aborted by user]"));
|
||||
else
|
||||
process.stdout.write(endColor);
|
||||
console.log();
|
||||
}
|
||||
const timeAfterPrompt = Date.now();
|
||||
if (printTimings) {
|
||||
if (LlamaLogLevelGreaterThan(llama.logLevel, LlamaLogLevel.info))
|
||||
llama.logLevel = LlamaLogLevel.info;
|
||||
await context.printTimings();
|
||||
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
|
||||
llama.logLevel = llamaLogLevel;
|
||||
}
|
||||
if (timing)
|
||||
console.info(chalk.dim("Response duration: ") +
|
||||
prettyMilliseconds(timeAfterPrompt - timeBeforePrompt, {
|
||||
keepDecimalsOnWholeSeconds: true,
|
||||
secondsDecimalDigits: 2,
|
||||
separateMilliseconds: true,
|
||||
compact: false
|
||||
}));
|
||||
if (meter) {
|
||||
const newTokenMeterState = contextSequence.tokenMeter.getState();
|
||||
const tokenMeterDiff = TokenMeter.diff(newTokenMeterState, lastTokenMeterState);
|
||||
lastTokenMeterState = newTokenMeterState;
|
||||
const showDraftTokenMeterDiff = lastDraftTokenMeterState != null && draftContextSequence != null;
|
||||
const tokenPredictionsStats = contextSequence.tokenPredictions;
|
||||
const validatedTokenPredictions = tokenPredictionsStats.validated - lastTokenPredictionsStats.validated;
|
||||
const refutedTokenPredictions = tokenPredictionsStats.refuted - lastTokenPredictionsStats.refuted;
|
||||
const usedTokenPredictions = tokenPredictionsStats.used - lastTokenPredictionsStats.used;
|
||||
const unusedTokenPredictions = tokenPredictionsStats.unused - lastTokenPredictionsStats.unused;
|
||||
lastTokenPredictionsStats = tokenPredictionsStats;
|
||||
console.info([
|
||||
showDraftTokenMeterDiff && (chalk.yellow("Main".padEnd("Drafter".length))),
|
||||
chalk.dim("Input tokens:") + " " + String(tokenMeterDiff.usedInputTokens).padEnd(5, " "),
|
||||
chalk.dim("Output tokens:") + " " + String(tokenMeterDiff.usedOutputTokens).padEnd(5, " "),
|
||||
showDraftTokenMeterDiff && (chalk.dim("Validated predictions:") + " " + String(validatedTokenPredictions).padEnd(5, " ")),
|
||||
showDraftTokenMeterDiff && (chalk.dim("Refuted predictions:") + " " + String(refutedTokenPredictions).padEnd(5, " ")),
|
||||
showDraftTokenMeterDiff && (chalk.dim("Used predictions:") + " " + String(usedTokenPredictions).padEnd(5, " ")),
|
||||
showDraftTokenMeterDiff && (chalk.dim("Unused predictions:") + " " + String(unusedTokenPredictions).padEnd(5, " "))
|
||||
].filter(Boolean).join(" "));
|
||||
if (lastDraftTokenMeterState != null && draftContextSequence != null) {
|
||||
const newDraftTokenMeterState = draftContextSequence.tokenMeter.getState();
|
||||
const draftTokenMeterDiff = TokenMeter.diff(newDraftTokenMeterState, lastDraftTokenMeterState);
|
||||
lastDraftTokenMeterState = newDraftTokenMeterState;
|
||||
console.info([
|
||||
chalk.yellow("Drafter"),
|
||||
chalk.dim("Input tokens:") + " " + String(draftTokenMeterDiff.usedInputTokens).padEnd(5, " "),
|
||||
chalk.dim("Output tokens:") + " " + String(draftTokenMeterDiff.usedOutputTokens).padEnd(5, " ")
|
||||
].join(" "));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=InfillCommand.js.map
|
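The infill loop above reduces to a single call, generateInfillCompletion(prefix, suffix, options), guarded by completion.infillSupported. A minimal sketch, assuming getLlama and LlamaCompletion are re-exported from the package root and using a placeholder path to an infill-capable (fill-in-the-middle) model.

// Minimal sketch of the InfillCommand flow above; model path and prefix/suffix are placeholders.
import {getLlama, LlamaCompletion} from "node-llama-cpp"; // assumed root re-exports

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "./code-model.gguf"}); // placeholder; must support infill
const context = await model.createContext();
const completion = new LlamaCompletion({contextSequence: context.getSequence()});

if (!completion.infillSupported)
    throw new Error("Infill is not supported for this model");

// generate the text that belongs between the prefix and the suffix
await completion.generateInfillCompletion("function add(a, b) {\n    return ", ";\n}\n", {
    maxTokens: 64,
    onTextChunk(chunk) {
        process.stdout.write(chunk);
    }
});
console.log();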
||||
1
node_modules/node-llama-cpp/dist/cli/commands/InfillCommand.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/cli/commands/InfillCommand.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
12
node_modules/node-llama-cpp/dist/cli/commands/InitCommand.d.ts
generated
vendored
Normal file
12
node_modules/node-llama-cpp/dist/cli/commands/InitCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
import { CommandModule } from "yargs";
|
||||
import { BuildGpu } from "../../bindings/types.js";
|
||||
type InitCommand = {
|
||||
name?: string;
|
||||
template?: string;
|
||||
model?: string;
|
||||
gpu?: BuildGpu | "auto";
|
||||
};
|
||||
export declare const InitCommand: CommandModule<object, InitCommand>;
|
||||
export declare const CreateCliCommand: CommandModule<object, InitCommand>;
|
||||
export declare function InitCommandHandler({ name, template, model, gpu }: InitCommand): Promise<void>;
|
||||
export {};
|
||||
230
node_modules/node-llama-cpp/dist/cli/commands/InitCommand.js
generated
vendored
Normal file
230
node_modules/node-llama-cpp/dist/cli/commands/InitCommand.js
generated
vendored
Normal file
@@ -0,0 +1,230 @@
|
||||
import process from "process";
|
||||
import path from "path";
|
||||
import chalk from "chalk";
|
||||
import logSymbols from "log-symbols";
|
||||
import validateNpmPackageName from "validate-npm-package-name";
|
||||
import fs from "fs-extra";
|
||||
import { consolePromptQuestion } from "../utils/consolePromptQuestion.js";
|
||||
import { basicChooseFromListConsoleInteraction } from "../utils/basicChooseFromListConsoleInteraction.js";
|
||||
import { splitAnsiToLines } from "../utils/splitAnsiToLines.js";
|
||||
import { arrowChar } from "../../consts.js";
|
||||
import { interactivelyAskForModel } from "../utils/interactivelyAskForModel.js";
|
||||
import { LlamaLogLevel, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption } from "../../bindings/types.js";
|
||||
import { getLlama } from "../../bindings/getLlama.js";
|
||||
import { ProjectTemplateParameter, scaffoldProjectTemplate } from "../utils/projectTemplates.js";
|
||||
import { documentationPageUrls, packedProjectTemplatesDirectory } from "../../config.js";
|
||||
import { getModuleVersion } from "../../utils/getModuleVersion.js";
|
||||
import withOra from "../../utils/withOra.js";
|
||||
import { projectTemplates } from "../projectTemplates.js";
|
||||
import { getReadablePath } from "../utils/getReadablePath.js";
|
||||
import { createModelDownloader } from "../../utils/createModelDownloader.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { resolveModelDestination } from "../../utils/resolveModelDestination.js";
|
||||
export const InitCommand = {
|
||||
command: "init [name]",
|
||||
describe: withCliCommandDescriptionDocsUrl("Generate a new `node-llama-cpp` project from a template", documentationPageUrls.CLI.Init),
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.option("name", {
|
||||
type: "string",
|
||||
description: "Project name"
|
||||
})
|
||||
.option("template", {
|
||||
type: "string",
|
||||
choices: projectTemplates.map((template) => template.name),
|
||||
description: "Template to use. If omitted, you will be prompted to select one"
|
||||
})
|
||||
.option("model", {
|
||||
type: "string",
|
||||
description: "Model URI to use. If omitted, you will be prompted to select one interactively"
|
||||
})
|
||||
.option("gpu", {
|
||||
type: "string",
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: nodeLlamaCppGpuOptions,
|
||||
coerce: (value) => {
|
||||
if (value == null || value == "")
|
||||
return undefined;
|
||||
return parseNodeLlamaCppGpuOption(value);
|
||||
},
|
||||
defaultDescription: "Uses the latest local build, and fallbacks to \"auto\"",
|
||||
description: "Compute layer implementation type to use for llama.cpp"
|
||||
});
|
||||
},
|
||||
handler: InitCommandHandler
|
||||
};
|
||||
export const CreateCliCommand = {
|
||||
command: "$0",
|
||||
describe: withCliCommandDescriptionDocsUrl("Scaffold a new `node-llama-cpp` project from a template", documentationPageUrls.CLI.Init),
|
||||
builder: InitCommand.builder,
|
||||
handler: InitCommandHandler
|
||||
};
|
||||
export async function InitCommandHandler({ name, template, model, gpu }) {
|
||||
const currentDirectory = path.resolve(process.cwd());
|
||||
const projectName = (name != null && validateNpmPackageName(name ?? "").validForNewPackages)
|
||||
? name
|
||||
: await askForProjectName(currentDirectory);
|
||||
const selectedTemplateOption = ((template != null && template !== "")
|
||||
? projectTemplates.find((item) => item.name === template)
|
||||
: undefined) ?? await askForTemplate();
|
||||
async function resolveModelUri() {
|
||||
if (model != null && model !== "") {
|
||||
try {
|
||||
const resolvedModelDestination = resolveModelDestination(model, true);
|
||||
if (resolvedModelDestination.type === "uri")
|
||||
return resolvedModelDestination.uri;
|
||||
else if (resolvedModelDestination.type === "url")
|
||||
return resolvedModelDestination.url;
|
||||
}
|
||||
catch (err) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
const llama = gpu == null
|
||||
? await getLlama("lastBuild", {
|
||||
logLevel: LlamaLogLevel.error
|
||||
})
|
||||
: await getLlama({
|
||||
gpu,
|
||||
logLevel: LlamaLogLevel.error
|
||||
});
|
||||
return await interactivelyAskForModel({
|
||||
llama,
|
||||
allowLocalModels: false,
|
||||
downloadIntent: false
|
||||
});
|
||||
}
|
||||
const modelUri = await resolveModelUri();
|
||||
const targetDirectory = path.join(currentDirectory, projectName);
|
||||
const readableTargetDirectoryPath = getReadablePath(targetDirectory);
|
||||
await withOra({
|
||||
loading: `Scaffolding a ${chalk.yellow(selectedTemplateOption.title)} project to ${chalk.yellow(readableTargetDirectoryPath)}`,
|
||||
success: `Scaffolded a ${chalk.yellow(selectedTemplateOption.title)} project to ${chalk.yellow(readableTargetDirectoryPath)}`,
|
||||
fail: `Failed to scaffold a ${chalk.yellow(selectedTemplateOption.title)} project to ${chalk.yellow(readableTargetDirectoryPath)}`
|
||||
}, async () => {
|
||||
const startTime = Date.now();
|
||||
const minScaffoldTime = 1000 * 2; // ensure the IDE has enough time to refresh and show some progress
|
||||
const template = await loadTemplate(selectedTemplateOption);
|
||||
await fs.ensureDir(targetDirectory);
|
||||
async function resolveModelInfo() {
|
||||
const resolvedModelDestination = resolveModelDestination(modelUri);
|
||||
if (resolvedModelDestination.type === "uri")
|
||||
return {
|
||||
modelUriOrUrl: resolvedModelDestination.uri,
|
||||
modelUriOrFilename: resolvedModelDestination.uri,
|
||||
cancelDownloader: async () => void 0
|
||||
};
|
||||
if (resolvedModelDestination.type === "file")
|
||||
throw new Error("Unexpected file model destination");
|
||||
const modelDownloader = await createModelDownloader({
|
||||
modelUri: resolvedModelDestination.url,
|
||||
showCliProgress: false,
|
||||
deleteTempFileOnCancel: false
|
||||
});
|
||||
const modelEntrypointFilename = modelDownloader.entrypointFilename;
|
||||
return {
|
||||
modelUriOrUrl: resolvedModelDestination.url,
|
||||
modelUriOrFilename: modelEntrypointFilename,
|
||||
async cancelDownloader() {
|
||||
try {
|
||||
await modelDownloader.cancel();
|
||||
}
|
||||
catch (err) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
const { modelUriOrFilename, modelUriOrUrl, cancelDownloader } = await resolveModelInfo();
|
||||
await scaffoldProjectTemplate({
|
||||
template,
|
||||
directoryPath: targetDirectory,
|
||||
parameters: {
|
||||
[ProjectTemplateParameter.ProjectName]: projectName,
|
||||
[ProjectTemplateParameter.ModelUriOrUrl]: modelUriOrUrl,
|
||||
[ProjectTemplateParameter.ModelUriOrFilename]: modelUriOrFilename,
|
||||
[ProjectTemplateParameter.CurrentModuleVersion]: await getModuleVersion()
|
||||
}
|
||||
});
|
||||
await cancelDownloader();
|
||||
await new Promise((resolve) => setTimeout(resolve, Math.max(0, minScaffoldTime - (Date.now() - startTime))));
|
||||
});
|
||||
console.info(chalk.green("Done."));
|
||||
console.info();
|
||||
console.info("Now run these commands:");
|
||||
console.info();
|
||||
console.info(chalk.greenBright("cd") + " " + projectName);
|
||||
console.info(chalk.greenBright("npm") + " install");
|
||||
console.info(chalk.greenBright("npm") + " start");
|
||||
console.info();
|
||||
console.info(chalk.gray("Note: running \"npm install\" may take a little while since it also downloads the model you selected"));
|
||||
process.exit(0);
|
||||
}
|
||||
async function askForTemplate() {
|
||||
const selectedTemplateOption = await basicChooseFromListConsoleInteraction({
|
||||
title: chalk.bold("Select a template:"),
|
||||
footer(item) {
|
||||
if (item.description == null)
|
||||
return undefined;
|
||||
const leftPad = 3;
|
||||
const maxWidth = Math.max(1, process.stdout.columns - 2 - leftPad);
|
||||
const lines = splitAnsiToLines(item.description, maxWidth);
|
||||
return " \n" +
|
||||
" ".repeat(leftPad) + chalk.bold.gray("Template description") + "\n" +
|
||||
lines.map((line) => (" ".repeat(leftPad) + line)).join("\n");
|
||||
},
|
||||
items: projectTemplates,
|
||||
renderItem(item, focused) {
|
||||
return renderSelectableItem(item.titleFormat != null
|
||||
? item.titleFormat(item.title)
|
||||
: item.title, focused);
|
||||
},
|
||||
aboveItemsPadding: 1,
|
||||
belowItemsPadding: 1,
|
||||
renderSummaryOnExit(item) {
|
||||
if (item == null)
|
||||
return "";
|
||||
return logSymbols.success + " Selected template " + chalk.blue(item.title);
|
||||
},
|
||||
exitOnCtrlC: true
|
||||
});
|
||||
if (selectedTemplateOption == null)
|
||||
throw new Error("No template selected");
|
||||
return selectedTemplateOption;
|
||||
}
|
||||
async function askForProjectName(currentDirectory) {
|
||||
console.info();
|
||||
const projectName = await consolePromptQuestion(chalk.bold("Enter a project name:") + chalk.dim(" (node-llama-cpp-project) "), {
|
||||
defaultValue: "node-llama-cpp-project",
|
||||
exitOnCtrlC: true,
|
||||
async validate(input) {
|
||||
const { validForNewPackages, errors } = validateNpmPackageName(input);
|
||||
if (!validForNewPackages)
|
||||
return (errors ?? ["The given project name cannot be used in a package.json file"]).join("\n");
|
||||
if (await fs.pathExists(path.join(currentDirectory, input)))
|
||||
return "A directory with the given project name already exists";
|
||||
return null;
|
||||
},
|
||||
renderSummaryOnExit(item) {
|
||||
if (item == null)
|
||||
return "";
|
||||
return logSymbols.success + " Entered project name " + chalk.blue(item);
|
||||
}
|
||||
});
|
||||
if (projectName == null)
|
||||
throw new Error("No project name entered");
|
||||
return projectName;
|
||||
}
|
||||
function renderSelectableItem(text, focused) {
|
||||
if (focused)
|
||||
return " " + chalk.cyan(arrowChar) + " " + chalk.cyan(text);
|
||||
return " * " + text;
|
||||
}
|
||||
async function loadTemplate(templateOption) {
|
||||
const templateFilePath = path.join(packedProjectTemplatesDirectory, `${templateOption.name}.json`);
|
||||
if (!(await fs.pathExists(templateFilePath)))
|
||||
throw new Error(`Template file was not found for template "${templateOption.title}" ("${templateOption.name}")`);
|
||||
const template = await fs.readJSON(templateFilePath);
|
||||
return template;
|
||||
}
|
||||
//# sourceMappingURL=InitCommand.js.map
|
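InitCommand only instantiates a downloader to learn the model's entrypoint filename and leaves the actual transfer to the scaffolded project's `npm install`. A sketch of using the same helper to fetch a model directly, assuming createModelDownloader is re-exported from the package root; the URL is a placeholder, and the download() call is an assumption, since this file never starts the transfer itself.

// Sketch only: fetch a model with the downloader helper used above.
import {createModelDownloader} from "node-llama-cpp"; // assumed root re-export

const downloader = await createModelDownloader({
    modelUri: "https://example.com/models/model.gguf", // placeholder URL
    showCliProgress: true // the init command passes false and defers the download
});
console.info(`Entrypoint file: ${downloader.entrypointFilename}`);

// Assumption: the transfer is started with a download() method (not shown in this file).
const modelPath = await downloader.download();
console.info(`Downloaded to: ${modelPath}`);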
||||
1
node_modules/node-llama-cpp/dist/cli/commands/InitCommand.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/cli/commands/InitCommand.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
4
node_modules/node-llama-cpp/dist/cli/commands/OnPostInstallCommand.d.ts
generated
vendored
Normal file
4
node_modules/node-llama-cpp/dist/cli/commands/OnPostInstallCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
import { CommandModule } from "yargs";
|
||||
type OnPostInstallCommand = null;
|
||||
export declare const OnPostInstallCommand: CommandModule<object, OnPostInstallCommand>;
|
||||
export {};
|
||||
35
node_modules/node-llama-cpp/dist/cli/commands/OnPostInstallCommand.js
generated
vendored
Normal file
35
node_modules/node-llama-cpp/dist/cli/commands/OnPostInstallCommand.js
generated
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
import chalk from "chalk";
|
||||
import { defaultSkipDownload, documentationPageUrls } from "../../config.js";
|
||||
import { getLlamaForOptions } from "../../bindings/getLlama.js";
|
||||
import { setForceShowConsoleLogPrefix } from "../../state.js";
|
||||
import { isRunningUnderRosetta } from "../utils/isRunningUnderRosetta.js";
|
||||
import { getConsoleLogPrefix } from "../../utils/getConsoleLogPrefix.js";
|
||||
export const OnPostInstallCommand = {
|
||||
command: "postinstall",
|
||||
describe: false,
|
||||
async handler() {
|
||||
if (defaultSkipDownload)
|
||||
return;
|
||||
setForceShowConsoleLogPrefix(true);
|
||||
if (await isRunningUnderRosetta()) {
|
||||
console.error(getConsoleLogPrefix(false, false), chalk.red("llama.cpp is not supported under Rosetta on Apple Silicone Macs. " +
|
||||
"Ensure that you're using a native arm64 node.js installation."));
|
||||
console.error(getConsoleLogPrefix(false, false), "process.platform: " + process.platform + ", process.arch: " + process.arch);
|
||||
console.error(getConsoleLogPrefix(false, false), "troubleshooting: " + documentationPageUrls.troubleshooting.RosettaIllegalHardwareInstruction);
|
||||
process.exit(1);
|
||||
}
|
||||
try {
|
||||
await getLlamaForOptions({
|
||||
progressLogs: true
|
||||
}, {
|
||||
updateLastBuildInfoOnCompile: true
|
||||
});
|
||||
process.exit(0);
|
||||
}
|
||||
catch (err) {
|
||||
console.error(err);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
};
|
||||
//# sourceMappingURL=OnPostInstallCommand.js.map
|
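The postinstall hook above refuses to build under Rosetta and prints process.platform/process.arch for troubleshooting. The sketch below is a coarse approximation of that guard for application code: it only inspects what Node.js reports about itself, whereas the CLI's internal isRunningUnderRosetta() helper detects actual Rosetta translation.

// Rough sketch of the Rosetta guard above; checks only the reported platform/architecture.
if (process.platform === "darwin" && process.arch !== "arm64") {
    console.warn(
        "Running a non-arm64 Node.js on macOS; if this machine is an Apple Silicon Mac, " +
        "the process may be translated by Rosetta and llama.cpp may fail to load."
    );
    console.warn(`process.platform: ${process.platform}, process.arch: ${process.arch}`);
}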
||||
1
node_modules/node-llama-cpp/dist/cli/commands/OnPostInstallCommand.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/cli/commands/OnPostInstallCommand.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"OnPostInstallCommand.js","sourceRoot":"","sources":["../../../src/cli/commands/OnPostInstallCommand.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAC,mBAAmB,EAAE,qBAAqB,EAAC,MAAM,iBAAiB,CAAC;AAC3E,OAAO,EAAC,kBAAkB,EAAC,MAAM,4BAA4B,CAAC;AAC9D,OAAO,EAAC,4BAA4B,EAAC,MAAM,gBAAgB,CAAC;AAC5D,OAAO,EAAC,qBAAqB,EAAC,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAC,mBAAmB,EAAC,MAAM,oCAAoC,CAAC;AAIvE,MAAM,CAAC,MAAM,oBAAoB,GAAgD;IAC7E,OAAO,EAAE,aAAa;IACtB,QAAQ,EAAE,KAAK;IACf,KAAK,CAAC,OAAO;QACT,IAAI,mBAAmB;YACnB,OAAO;QAEX,4BAA4B,CAAC,IAAI,CAAC,CAAC;QAEnC,IAAI,MAAM,qBAAqB,EAAE,EAAE,CAAC;YAChC,OAAO,CAAC,KAAK,CACT,mBAAmB,CAAC,KAAK,EAAE,KAAK,CAAC,EACjC,KAAK,CAAC,GAAG,CACL,mEAAmE;gBACnE,+DAA+D,CAClE,CACJ,CAAC;YACF,OAAO,CAAC,KAAK,CACT,mBAAmB,CAAC,KAAK,EAAE,KAAK,CAAC,EACjC,oBAAoB,GAAG,OAAO,CAAC,QAAQ,GAAG,kBAAkB,GAAG,OAAO,CAAC,IAAI,CAC9E,CAAC;YACF,OAAO,CAAC,KAAK,CACT,mBAAmB,CAAC,KAAK,EAAE,KAAK,CAAC,EACjC,mBAAmB,GAAG,qBAAqB,CAAC,eAAe,CAAC,iCAAiC,CAChG,CAAC;YAEF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC;QAED,IAAI,CAAC;YACD,MAAM,kBAAkB,CAAC;gBACrB,YAAY,EAAE,IAAI;aACrB,EAAE;gBACC,4BAA4B,EAAE,IAAI;aACrC,CAAC,CAAC;YAEH,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YACnB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC;IACL,CAAC;CACJ,CAAC"}
|
||||
13
node_modules/node-llama-cpp/dist/cli/commands/PullCommand.d.ts
generated
vendored
Normal file
13
node_modules/node-llama-cpp/dist/cli/commands/PullCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,13 @@
import { CommandModule } from "yargs";
type PullCommand = {
    urls: string[];
    header?: string[];
    override: boolean;
    noProgress: boolean;
    noTempFile: boolean;
    directory: string;
    filename?: string;
    parallel?: number;
};
export declare const PullCommand: CommandModule<object, PullCommand>;
export {};
158
node_modules/node-llama-cpp/dist/cli/commands/PullCommand.js
generated
vendored
Normal file
158
node_modules/node-llama-cpp/dist/cli/commands/PullCommand.js
generated
vendored
Normal file
@@ -0,0 +1,158 @@
|
||||
import process from "process";
|
||||
import fs from "fs-extra";
|
||||
import chalk from "chalk";
|
||||
import { cliModelsDirectory, documentationPageUrls } from "../../config.js";
|
||||
import { combineModelDownloaders, createModelDownloader } from "../../utils/createModelDownloader.js";
|
||||
import { getReadablePath } from "../utils/getReadablePath.js";
|
||||
import { ConsoleInteraction, ConsoleInteractionKey } from "../utils/ConsoleInteraction.js";
|
||||
import { getIsInDocumentationMode } from "../../state.js";
|
||||
import { resolveHeaderFlag } from "../utils/resolveHeaderFlag.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
export const PullCommand = {
|
||||
command: "pull [urls..]",
|
||||
aliases: ["get"],
|
||||
describe: withCliCommandDescriptionDocsUrl("Download models from URLs", documentationPageUrls.CLI.Pull),
|
||||
builder(yargs) {
|
||||
const isInDocumentationMode = getIsInDocumentationMode();
|
||||
return yargs
|
||||
.option("urls", {
|
||||
type: "string",
|
||||
alias: ["url", "uris", "uri"],
|
||||
array: true,
|
||||
description: [
|
||||
"A `.gguf` model URI to pull.",
|
||||
!isInDocumentationMode && "Automatically handles split and binary-split models files, so only pass the URI to the first file of a model.",
|
||||
!isInDocumentationMode && "If a file already exists and its size matches the expected size, it will not be downloaded again unless the `--override` flag is used.",
|
||||
"Pass multiple URIs to download multiple models at once."
|
||||
].filter(Boolean).join(isInDocumentationMode
|
||||
? "\n"
|
||||
: " "),
|
||||
demandOption: true,
|
||||
group: "Required:"
|
||||
})
|
||||
.option("header", {
|
||||
alias: ["H"],
|
||||
type: "string",
|
||||
array: true,
|
||||
description: "Headers to use when downloading a model from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers.",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("override", {
|
||||
alias: ["o"],
|
||||
type: "boolean",
|
||||
description: "Override existing model files",
|
||||
default: false,
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("noProgress", {
|
||||
type: "boolean",
|
||||
description: "Do not show a progress bar while downloading",
|
||||
default: false,
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("noTempFile", {
|
||||
alias: ["noTemp"],
|
||||
type: "boolean",
|
||||
description: "Delete the temporary file when canceling the download",
|
||||
default: false,
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("directory", {
|
||||
alias: ["d", "dir"],
|
||||
type: "string",
|
||||
description: "Directory to save the model to",
|
||||
default: cliModelsDirectory,
|
||||
defaultDescription: isInDocumentationMode
|
||||
? "`" + getReadablePath(cliModelsDirectory) + "`"
|
||||
: getReadablePath(cliModelsDirectory),
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("filename", {
|
||||
alias: ["n", "name"],
|
||||
type: "string",
|
||||
description: "Filename to save the model as. Can only be used if a single URL is passed",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("parallel", {
|
||||
alias: ["p"],
|
||||
type: "number",
|
||||
description: "Maximum parallel downloads",
|
||||
default: 4,
|
||||
group: "Optional:"
|
||||
});
|
||||
},
|
||||
async handler({ urls, header: headerArg, override, noProgress, noTempFile, directory, filename, parallel }) {
|
||||
const headers = resolveHeaderFlag(headerArg);
|
||||
if (urls.length === 0)
|
||||
throw new Error("At least one URI must be provided");
|
||||
else if (urls.length > 1 && filename != null)
|
||||
throw new Error("The `--filename` flag can only be used when a single URI is passed");
|
||||
if (urls.length === 1) {
|
||||
const downloader = await createModelDownloader({
|
||||
modelUri: urls[0],
|
||||
dirPath: directory,
|
||||
headers,
|
||||
showCliProgress: !noProgress,
|
||||
deleteTempFileOnCancel: noTempFile,
|
||||
skipExisting: !override,
|
||||
fileName: filename || undefined,
|
||||
parallelDownloads: parallel,
|
||||
_showUriResolvingProgress: !noProgress
|
||||
});
|
||||
if (!override && downloader.totalFiles === 1 && await fs.pathExists(downloader.entrypointFilePath)) {
|
||||
const fileStats = await fs.stat(downloader.entrypointFilePath);
|
||||
if (downloader.totalSize === fileStats.size) {
|
||||
console.info(`${chalk.yellow("File:")} ${getReadablePath(downloader.entrypointFilePath)}`);
|
||||
console.info("Skipping download of an existing file: " + chalk.yellow(getReadablePath(downloader.entrypointFilePath)));
|
||||
process.exit(0);
|
||||
}
|
||||
}
|
||||
const consoleInteraction = new ConsoleInteraction();
|
||||
consoleInteraction.onKey(ConsoleInteractionKey.ctrlC, async () => {
|
||||
await downloader.cancel();
|
||||
consoleInteraction.stop();
|
||||
process.exit(0);
|
||||
});
|
||||
if (!noProgress) {
|
||||
console.info(`Downloading to ${chalk.yellow(getReadablePath(directory))}${downloader.splitBinaryParts != null
|
||||
? chalk.gray(` (combining ${downloader.splitBinaryParts} parts into a single file)`)
|
||||
: ""}`);
|
||||
consoleInteraction.start();
|
||||
}
|
||||
await downloader.download();
|
||||
if (!noProgress)
|
||||
consoleInteraction.stop();
|
||||
console.info(`Downloaded to ${chalk.yellow(getReadablePath(downloader.entrypointFilePath))}`);
|
||||
}
|
||||
else {
|
||||
const downloader = await combineModelDownloaders(urls.map((uri) => createModelDownloader({
|
||||
modelUri: uri,
|
||||
dirPath: directory,
|
||||
headers,
|
||||
showCliProgress: false,
|
||||
deleteTempFileOnCancel: noTempFile,
|
||||
skipExisting: !override
|
||||
})), {
|
||||
showCliProgress: !noProgress,
|
||||
parallelDownloads: parallel
|
||||
});
|
||||
const consoleInteraction = new ConsoleInteraction();
|
||||
consoleInteraction.onKey(ConsoleInteractionKey.ctrlC, async () => {
|
||||
await downloader.cancel();
|
||||
consoleInteraction.stop();
|
||||
process.exit(0);
|
||||
});
|
||||
if (!noProgress) {
|
||||
console.info(`Downloading to ${chalk.yellow(getReadablePath(directory))}`);
|
||||
consoleInteraction.start();
|
||||
}
|
||||
await downloader.download();
|
||||
if (!noProgress)
|
||||
consoleInteraction.stop();
|
||||
console.info(`Downloaded ${downloader.modelDownloaders.length} models to ${chalk.yellow(getReadablePath(directory))}\n${chalk.gray("*")} ` +
|
||||
downloader.modelDownloaders.map((downloader) => chalk.yellow(downloader.entrypointFilename))
|
||||
.join(`\n${chalk.gray("*")} `));
|
||||
}
|
||||
}
|
||||
};
|
||||
//# sourceMappingURL=PullCommand.js.map
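// --- Editor's note (hedged sketch, not part of the vendored file above) ---
// The `pull` command above is a thin CLI layer over `createModelDownloader()`. A minimal
// programmatic equivalent, assuming the helper is re-exported from the package root; the model
// URI and target directory below are placeholders rather than values taken from this diff.
import {createModelDownloader} from "node-llama-cpp";

const downloader = await createModelDownloader({
    modelUri: "hf:example-user/example-model-GGUF", // hypothetical URI, for illustration only
    dirPath: "./models",
    showCliProgress: true // same progress bar the CLI shows unless `--noProgress` is passed
});
await downloader.download();
console.info("Saved to:", downloader.entrypointFilePath);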
1
node_modules/node-llama-cpp/dist/cli/commands/PullCommand.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/cli/commands/PullCommand.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
4
node_modules/node-llama-cpp/dist/cli/commands/inspect/InspectCommand.d.ts
generated
vendored
Normal file
4
node_modules/node-llama-cpp/dist/cli/commands/inspect/InspectCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,4 @@
import { CommandModule } from "yargs";
type InspectCommand = {};
export declare const InspectCommand: CommandModule<object, InspectCommand>;
export {};
21
node_modules/node-llama-cpp/dist/cli/commands/inspect/InspectCommand.js
generated
vendored
Normal file
21
node_modules/node-llama-cpp/dist/cli/commands/inspect/InspectCommand.js
generated
vendored
Normal file
@@ -0,0 +1,21 @@
import { withCliCommandDescriptionDocsUrl } from "../../utils/withCliCommandDescriptionDocsUrl.js";
import { documentationPageUrls } from "../../../config.js";
import { InspectGgufCommand } from "./commands/InspectGgufCommand.js";
import { InspectGpuCommand } from "./commands/InspectGpuCommand.js";
import { InspectMeasureCommand } from "./commands/InspectMeasureCommand.js";
import { InspectEstimateCommand } from "./commands/InspectEstimateCommand.js";
export const InspectCommand = {
    command: "inspect <command>",
    describe: withCliCommandDescriptionDocsUrl("Inspect the inner workings of `node-llama-cpp`", documentationPageUrls.CLI.Inspect.index),
    builder(yargs) {
        return yargs
            .command(InspectGpuCommand)
            .command(InspectGgufCommand)
            .command(InspectMeasureCommand)
            .command(InspectEstimateCommand);
    },
    async handler() {
        // this function must exist, even though we do nothing here
    }
};
//# sourceMappingURL=InspectCommand.js.map
1
node_modules/node-llama-cpp/dist/cli/commands/inspect/InspectCommand.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/cli/commands/inspect/InspectCommand.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
{"version":3,"file":"InspectCommand.js","sourceRoot":"","sources":["../../../../src/cli/commands/inspect/InspectCommand.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,gCAAgC,EAAC,MAAM,iDAAiD,CAAC;AACjG,OAAO,EAAC,qBAAqB,EAAC,MAAM,oBAAoB,CAAC;AACzD,OAAO,EAAC,kBAAkB,EAAC,MAAM,kCAAkC,CAAC;AACpE,OAAO,EAAC,iBAAiB,EAAC,MAAM,iCAAiC,CAAC;AAClE,OAAO,EAAC,qBAAqB,EAAC,MAAM,qCAAqC,CAAC;AAC1E,OAAO,EAAC,sBAAsB,EAAC,MAAM,sCAAsC,CAAC;AAM5E,MAAM,CAAC,MAAM,cAAc,GAA0C;IACjE,OAAO,EAAE,mBAAmB;IAC5B,QAAQ,EAAE,gCAAgC,CACtC,gDAAgD,EAChD,qBAAqB,CAAC,GAAG,CAAC,OAAO,CAAC,KAAK,CAC1C;IACD,OAAO,CAAC,KAAK;QACT,OAAO,KAAK;aACP,OAAO,CAAC,iBAAiB,CAAC;aAC1B,OAAO,CAAC,kBAAkB,CAAC;aAC3B,OAAO,CAAC,qBAAqB,CAAC;aAC9B,OAAO,CAAC,sBAAsB,CAAC,CAAC;IACzC,CAAC;IACD,KAAK,CAAC,OAAO;QACT,2DAA2D;IAC/D,CAAC;CACJ,CAAC"}
14
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectEstimateCommand.d.ts
generated
vendored
Normal file
14
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectEstimateCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,14 @@
import { CommandModule } from "yargs";
import { BuildGpu } from "../../../../bindings/types.js";
type InspectEstimateCommand = {
    modelPath: string;
    header?: string[];
    gpu?: BuildGpu | "auto";
    gpuLayers?: number | "max";
    contextSize?: number | "train";
    embedding?: boolean;
    noMmap?: boolean;
    swaFullCache?: boolean;
};
export declare const InspectEstimateCommand: CommandModule<object, InspectEstimateCommand>;
export {};
248
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectEstimateCommand.js
generated
vendored
Normal file
248
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectEstimateCommand.js
generated
vendored
Normal file
@@ -0,0 +1,248 @@
|
||||
import process from "process";
|
||||
import chalk from "chalk";
|
||||
import fs from "fs-extra";
|
||||
import { readGgufFileInfo } from "../../../../gguf/readGgufFileInfo.js";
|
||||
import { resolveHeaderFlag } from "../../../utils/resolveHeaderFlag.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { documentationPageUrls } from "../../../../config.js";
|
||||
import { printInfoLine } from "../../../utils/printInfoLine.js";
|
||||
import { renderModelCompatibilityPercentageWithColors } from "../../../utils/renderModelCompatibilityPercentageWithColors.js";
|
||||
import { getReadableContextSize } from "../../../../utils/getReadableContextSize.js";
|
||||
import { GgufInsights } from "../../../../gguf/insights/GgufInsights.js";
|
||||
import { getLlama } from "../../../../bindings/getLlama.js";
|
||||
import { LlamaLogLevel, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption } from "../../../../bindings/types.js";
|
||||
import { defaultTrainContextSizeForEstimationPurposes } from "../../../../gguf/insights/GgufInsightsConfigurationResolver.js";
|
||||
import { getGgufFileTypeName } from "../../../../gguf/utils/getGgufFileTypeName.js";
|
||||
import { getPrettyBuildGpuName } from "../../../../bindings/consts.js";
|
||||
import withOra from "../../../../utils/withOra.js";
|
||||
import { resolveModelArgToFilePathOrUrl } from "../../../../utils/resolveModelDestination.js";
|
||||
import { printModelDestination } from "../../../utils/printModelDestination.js";
|
||||
import { toBytes } from "../../../utils/toBytes.js";
|
||||
import { printDidYouMeanUri } from "../../../utils/resolveCommandGgufPath.js";
|
||||
import { isModelUri } from "../../../../utils/parseModelUri.js";
|
||||
export const InspectEstimateCommand = {
|
||||
command: "estimate [modelPath]",
|
||||
describe: withCliCommandDescriptionDocsUrl("Estimate the compatibility of a model with the current hardware", documentationPageUrls.CLI.Inspect.Estimate),
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.option("modelPath", {
|
||||
alias: ["m", "model", "path", "url", "uri"],
|
||||
type: "string",
|
||||
demandOption: true,
|
||||
description: "The path or URI of the GGUF file to use. If a URI is provided, the metadata will be read from the remote file without downloading the entire file.",
|
||||
group: "Required:"
|
||||
})
|
||||
.option("header", {
|
||||
alias: ["H"],
|
||||
type: "string",
|
||||
array: true,
|
||||
description: "Headers to use when reading a model file from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers.",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("gpu", {
|
||||
type: "string",
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: nodeLlamaCppGpuOptions,
|
||||
coerce: (value) => {
|
||||
if (value == null || value == "")
|
||||
return undefined;
|
||||
return parseNodeLlamaCppGpuOption(value);
|
||||
},
|
||||
defaultDescription: "Uses the latest local build, and fallbacks to \"auto\"",
|
||||
description: "Compute layer implementation type to use for llama.cpp. If omitted, uses the latest local build, and fallbacks to \"auto\"",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("gpuLayers", {
|
||||
alias: "gl",
|
||||
type: "number",
|
||||
description: "number of layers to store in VRAM. Set to `max` to use all the layers the model has",
|
||||
string: true,
|
||||
coerce: (value) => {
|
||||
if (value === "max")
|
||||
return -2;
|
||||
return parseInt(value);
|
||||
},
|
||||
default: -1,
|
||||
defaultDescription: "Automatically determined based on the available VRAM",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("contextSize", {
|
||||
alias: "c",
|
||||
type: "number",
|
||||
description: "Context size to use for the model context. Set to `max` or `train` to use the training context size. " +
|
||||
"Note that the train context size is not necessarily what you should use for inference, " +
|
||||
"and a big context size will use a lot of memory",
|
||||
string: true,
|
||||
coerce: (value) => {
|
||||
if (value === "max" || value === "train")
|
||||
return -2;
|
||||
return parseInt(value);
|
||||
},
|
||||
default: -1,
|
||||
defaultDescription: "Automatically determined based on the available VRAM",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("embedding", {
|
||||
alias: "e",
|
||||
type: "boolean",
|
||||
description: "Whether to estimate for creating an embedding context",
|
||||
default: false,
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("noMmap", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable mmap (memory-mapped file) usage"
|
||||
})
|
||||
.option("swaFullCache", {
|
||||
alias: "noSwa",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable SWA (Sliding Window Attention) on supported models"
|
||||
});
|
||||
},
|
||||
async handler({ modelPath: ggufPath, header: headerArg, gpu, gpuLayers, contextSize: contextSizeArg, embedding, noMmap, swaFullCache }) {
|
||||
if (gpuLayers === -1)
|
||||
gpuLayers = undefined;
|
||||
if (gpuLayers === -2)
|
||||
gpuLayers = "max";
|
||||
if (contextSizeArg === -1)
|
||||
contextSizeArg = undefined;
|
||||
if (contextSizeArg === -2)
|
||||
contextSizeArg = "train";
|
||||
const headers = resolveHeaderFlag(headerArg);
|
||||
const [resolvedModelDestination, resolvedGgufPath] = isModelUri(ggufPath)
|
||||
? await withOra({
|
||||
loading: chalk.blue("Resolving model URI"),
|
||||
success: chalk.blue("Resolved model URI"),
|
||||
fail: chalk.blue("Failed to resolve model URI"),
|
||||
noSuccessLiveStatus: true
|
||||
}, () => resolveModelArgToFilePathOrUrl(ggufPath, headers))
|
||||
: await resolveModelArgToFilePathOrUrl(ggufPath, headers);
|
||||
if (resolvedModelDestination.type === "file" && !await fs.pathExists(resolvedGgufPath)) {
|
||||
console.error(`${chalk.red("File does not exist:")} ${resolvedGgufPath}`);
|
||||
printDidYouMeanUri(ggufPath);
|
||||
process.exit(1);
|
||||
}
|
||||
const llama = gpu == null
|
||||
? await getLlama("lastBuild", {
|
||||
logLevel: LlamaLogLevel.error
|
||||
})
|
||||
: await getLlama({
|
||||
gpu,
|
||||
logLevel: LlamaLogLevel.error
|
||||
});
|
||||
const useMmap = !noMmap && llama.supportsMmap;
|
||||
printModelDestination(resolvedModelDestination);
|
||||
if (embedding)
|
||||
console.info(`${chalk.yellow("Estimating for an embedding context")}`);
|
||||
const ggufFileInfo = await withOra({
|
||||
loading: chalk.blue("Reading model metadata"),
|
||||
success: chalk.blue("Read model metadata"),
|
||||
fail: chalk.blue("Failed to read model metadata"),
|
||||
noSuccessLiveStatus: true
|
||||
}, async () => {
|
||||
return await readGgufFileInfo(resolvedGgufPath, {
|
||||
fetchHeaders: resolvedModelDestination.type === "file"
|
||||
? undefined
|
||||
: headers
|
||||
});
|
||||
});
|
||||
const ggufInsights = await GgufInsights.from(ggufFileInfo, llama);
|
||||
const contextSize = contextSizeArg === "train"
|
||||
? ggufInsights.trainContextSize ?? defaultTrainContextSizeForEstimationPurposes
|
||||
: contextSizeArg;
|
||||
async function resolveCompatibilityScore(flashAttention) {
|
||||
return await ggufInsights.configurationResolver.resolveAndScoreConfig({
|
||||
flashAttention,
|
||||
targetContextSize: contextSize,
|
||||
targetGpuLayers: gpuLayers,
|
||||
embeddingContext: embedding,
|
||||
useMmap,
|
||||
swaFullCache
|
||||
});
|
||||
}
|
||||
const [compatibilityScore, compatibilityScoreWithFlashAttention] = await Promise.all([
|
||||
resolveCompatibilityScore(false),
|
||||
resolveCompatibilityScore(true)
|
||||
]);
|
||||
const longestTitle = Math.max("GPU info".length, "Model info".length, "Resolved config".length, "With flash attention".length) + 1;
|
||||
if (llama.gpu !== false) {
|
||||
const [vramState, deviceNames] = await Promise.all([
|
||||
llama.getVramState(),
|
||||
llama.getGpuDeviceNames()
|
||||
]);
|
||||
printInfoLine({
|
||||
title: "GPU info",
|
||||
padTitle: longestTitle,
|
||||
info: [{
|
||||
title: "Type",
|
||||
value: getPrettyBuildGpuName(llama.gpu)
|
||||
}, {
|
||||
title: "VRAM",
|
||||
value: toBytes(vramState.total)
|
||||
}, {
|
||||
title: "Name",
|
||||
value: toOneLine(deviceNames.join(", "))
|
||||
}]
|
||||
});
|
||||
}
|
||||
printInfoLine({
|
||||
title: "Model info",
|
||||
padTitle: longestTitle,
|
||||
info: [{
|
||||
title: "Type",
|
||||
value: toOneLine([
|
||||
ggufFileInfo.metadata?.general?.architecture,
|
||||
ggufFileInfo.metadata?.general?.size_label,
|
||||
getGgufFileTypeName(ggufFileInfo.metadata.general?.file_type)
|
||||
].filter(Boolean).join(" "))
|
||||
}, {
|
||||
title: "Size",
|
||||
value: toBytes(ggufInsights.modelSize)
|
||||
}, {
|
||||
show: ggufInsights.trainContextSize != null,
|
||||
title: "Train context size",
|
||||
value: getReadableContextSize(ggufInsights.trainContextSize ?? 0)
|
||||
}]
|
||||
});
|
||||
console.info();
|
||||
logCompatibilityScore("Resolved config", longestTitle, compatibilityScore, ggufInsights, llama, false);
|
||||
logCompatibilityScore("With flash attention", longestTitle, compatibilityScoreWithFlashAttention, ggufInsights, llama, true);
|
||||
}
|
||||
};
|
||||
function logCompatibilityScore(title, padTitle, compatibilityScore, ggufInsights, llama, flashAttention) {
|
||||
printInfoLine({
|
||||
title,
|
||||
padTitle,
|
||||
separateLines: false,
|
||||
info: [{
|
||||
title: "",
|
||||
value: renderModelCompatibilityPercentageWithColors(compatibilityScore.compatibilityScore * 100) + " compatibility"
|
||||
}, {
|
||||
show: ggufInsights.trainContextSize != null,
|
||||
title: "Context size",
|
||||
value: getReadableContextSize(compatibilityScore.resolvedValues.contextSize)
|
||||
}, {
|
||||
show: llama.gpu !== false,
|
||||
title: "GPU layers",
|
||||
value: () => (compatibilityScore.resolvedValues.gpuLayers + "/" + ggufInsights.totalLayers + " " +
|
||||
chalk.dim(`(${Math.floor((compatibilityScore.resolvedValues.gpuLayers / ggufInsights.totalLayers) * 100)}%)`))
|
||||
}, {
|
||||
show: llama.gpu !== false,
|
||||
title: "VRAM usage",
|
||||
value: () => toBytes(compatibilityScore.resolvedValues.totalVramUsage)
|
||||
}, {
|
||||
title: "RAM usage",
|
||||
value: () => toBytes(compatibilityScore.resolvedValues.totalRamUsage)
|
||||
}, {
|
||||
show: flashAttention,
|
||||
title: "Flash attention",
|
||||
value: "enabled"
|
||||
}]
|
||||
});
|
||||
}
|
||||
function toOneLine(text) {
|
||||
return text.replaceAll("\n", chalk.gray("\\n"));
|
||||
}
|
||||
//# sourceMappingURL=InspectEstimateCommand.js.map
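// --- Editor's note (hedged sketch, not part of the vendored file above) ---
// Stripped of CLI parsing, `inspect estimate` boils down to three steps: read the GGUF metadata,
// wrap it in GgufInsights, and score a configuration against the current hardware. Importing these
// helpers from the package root and the model path/context size below are assumptions for
// illustration; the option and result field names are the ones used by the handler above.
import {getLlama, readGgufFileInfo, GgufInsights} from "node-llama-cpp";

const llama = await getLlama();
const fileInfo = await readGgufFileInfo("./models/example.gguf"); // hypothetical local path
const insights = await GgufInsights.from(fileInfo, llama);
const score = await insights.configurationResolver.resolveAndScoreConfig({
    targetContextSize: 8192, // hypothetical target
    flashAttention: false
});
console.info("Compatibility:", Math.round(score.compatibilityScore * 100) + "%");
console.info("Resolved GPU layers:", score.resolvedValues.gpuLayers);
console.info("Resolved context size:", score.resolvedValues.contextSize);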
1
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectEstimateCommand.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectEstimateCommand.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
13
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGgufCommand.d.ts
generated
vendored
Normal file
13
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGgufCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,13 @@
import { CommandModule } from "yargs";
type InspectGgufCommand = {
    modelPath: string;
    header?: string[];
    key?: string;
    noSplice: boolean;
    fullTensorInfo: boolean;
    fullMetadataArrays: boolean;
    plainJson: boolean;
    outputToJsonFile?: string;
};
export declare const InspectGgufCommand: CommandModule<object, InspectGgufCommand>;
export {};
225
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGgufCommand.js
generated
vendored
Normal file
225
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGgufCommand.js
generated
vendored
Normal file
@@ -0,0 +1,225 @@
|
||||
import path from "path";
|
||||
import process from "process";
|
||||
import chalk from "chalk";
|
||||
import fs from "fs-extra";
|
||||
import { Template } from "@huggingface/jinja";
|
||||
import { readGgufFileInfo } from "../../../../gguf/readGgufFileInfo.js";
|
||||
import { prettyPrintObject } from "../../../../utils/prettyPrintObject.js";
|
||||
import { getGgufFileTypeName } from "../../../../gguf/utils/getGgufFileTypeName.js";
|
||||
import { resolveHeaderFlag } from "../../../utils/resolveHeaderFlag.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { documentationPageUrls } from "../../../../config.js";
|
||||
import withOra from "../../../../utils/withOra.js";
|
||||
import { resolveModelArgToFilePathOrUrl } from "../../../../utils/resolveModelDestination.js";
|
||||
import { printModelDestination } from "../../../utils/printModelDestination.js";
|
||||
import { getGgufMetadataKeyValue } from "../../../../gguf/utils/getGgufMetadataKeyValue.js";
|
||||
import { toBytes } from "../../../utils/toBytes.js";
|
||||
import { printDidYouMeanUri } from "../../../utils/resolveCommandGgufPath.js";
|
||||
import { isModelUri } from "../../../../utils/parseModelUri.js";
|
||||
const chatTemplateKey = ".chatTemplate";
|
||||
export const InspectGgufCommand = {
|
||||
command: "gguf [modelPath]",
|
||||
describe: withCliCommandDescriptionDocsUrl("Inspect a GGUF file", documentationPageUrls.CLI.Inspect.GGUF),
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.option("modelPath", {
|
||||
alias: ["m", "model", "path", "url", "uri"],
|
||||
type: "string",
|
||||
demandOption: true,
|
||||
description: "The path or URI of the GGUF file to inspect. If a URI is provided, the metadata will be read from the remote file without downloading the entire file.",
|
||||
group: "Required:"
|
||||
})
|
||||
.option("header", {
|
||||
alias: ["H"],
|
||||
type: "string",
|
||||
array: true,
|
||||
description: "Headers to use when reading a model file from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers.",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("key", {
|
||||
alias: ["k"],
|
||||
type: "string",
|
||||
description: "A single metadata key to print the value of. If not provided, all metadata will be printed. " +
|
||||
"If the key is `" + chatTemplateKey + "` then the chat template of the model will be formatted and printed.",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("noSplice", {
|
||||
alias: "s",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "When split files are detected, it reads the metadata of the first file and splices the tensorInfo from all the parts. Use this flag to disable that behavior and read only the given file",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("fullTensorInfo", {
|
||||
alias: "t",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Show the full tensor info",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("fullMetadataArrays", {
|
||||
alias: "ma",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print the full arrays in the metadata. Caution: those arrays can be extremely large and cover the entire terminal screen. Use with caution.",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("plainJson", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Print the output as plain JSON with no formatting. Useful for piping the output to other commands. The output won't truncate any values, so it may be extremely large. Use with caution.",
|
||||
group: "Optional:"
|
||||
})
|
||||
.option("outputToJsonFile", {
|
||||
type: "string",
|
||||
description: "Path to a file to write the output to as JSON. The output won't truncate any values. The output won't be printed to the console",
|
||||
group: "Optional:"
|
||||
});
|
||||
},
|
||||
async handler({ modelPath: ggufPath, header: headerArg, key, noSplice, fullTensorInfo, fullMetadataArrays, plainJson, outputToJsonFile }) {
|
||||
const headers = resolveHeaderFlag(headerArg);
|
||||
const [resolvedModelDestination, resolvedGgufPath] = (!plainJson && isModelUri(ggufPath))
|
||||
? await withOra({
|
||||
loading: chalk.blue("Resolving model URI"),
|
||||
success: chalk.blue("Resolved model URI"),
|
||||
fail: chalk.blue("Failed to resolve model URI"),
|
||||
noSuccessLiveStatus: true
|
||||
}, () => resolveModelArgToFilePathOrUrl(ggufPath, headers))
|
||||
: await resolveModelArgToFilePathOrUrl(ggufPath, headers);
|
||||
if (resolvedModelDestination.type === "file" && !await fs.pathExists(resolvedGgufPath)) {
|
||||
console.error(`${chalk.red("File does not exist:")} ${resolvedGgufPath}`);
|
||||
printDidYouMeanUri(ggufPath);
|
||||
process.exit(1);
|
||||
}
|
||||
if (!plainJson)
|
||||
printModelDestination(resolvedModelDestination);
|
||||
const parsedMetadata = plainJson
|
||||
? await readGgufFileInfo(resolvedGgufPath, {
|
||||
fetchHeaders: resolvedModelDestination.type === "file"
|
||||
? undefined
|
||||
: headers,
|
||||
spliceSplitFiles: !noSplice
|
||||
})
|
||||
: await withOra({
|
||||
loading: chalk.blue("Reading model metadata"),
|
||||
success: chalk.blue("Read model metadata"),
|
||||
fail: chalk.blue("Failed to read model metadata"),
|
||||
noSuccessLiveStatus: true
|
||||
}, async () => {
|
||||
return await readGgufFileInfo(resolvedGgufPath, {
|
||||
fetchHeaders: resolvedModelDestination.type === "file"
|
||||
? undefined
|
||||
: headers,
|
||||
spliceSplitFiles: !noSplice
|
||||
});
|
||||
});
|
||||
removeAdditionalTensorInfoFields(parsedMetadata.fullTensorInfo);
|
||||
const fileTypeName = getGgufFileTypeName(parsedMetadata.metadata.general?.file_type);
|
||||
if (plainJson || outputToJsonFile != null) {
|
||||
const getOutputJson = () => {
|
||||
if (key != null) {
|
||||
const keyValue = key === chatTemplateKey
|
||||
? tryFormattingJinja(getGgufMetadataKeyValue(parsedMetadata.metadata, "tokenizer.chat_template"))
|
||||
: getGgufMetadataKeyValue(parsedMetadata.metadata, key);
|
||||
if (keyValue === undefined) {
|
||||
console.log(`Key not found: ${key}`);
|
||||
process.exit(1);
|
||||
}
|
||||
return JSON.stringify(keyValue, undefined, 4);
|
||||
}
|
||||
return JSON.stringify({
|
||||
splicedParts: parsedMetadata.splicedParts,
|
||||
version: parsedMetadata.version,
|
||||
fileType: fileTypeName,
|
||||
tensorCount: parsedMetadata.totalTensorCount,
|
||||
metadataSize: parsedMetadata.totalMetadataSize,
|
||||
tensorInfoSize: parsedMetadata.totalTensorInfoSize,
|
||||
metadata: parsedMetadata.metadata,
|
||||
tensorInfo: parsedMetadata.fullTensorInfo
|
||||
}, undefined, 4);
|
||||
};
|
||||
const outputJson = getOutputJson();
|
||||
if (outputToJsonFile != null) {
|
||||
const filePath = path.resolve(process.cwd(), outputToJsonFile);
|
||||
await fs.writeFile(filePath, outputJson, "utf8");
|
||||
console.info(`${chalk.yellow("JSON written to file:")} ${filePath}`);
|
||||
}
|
||||
else {
|
||||
console.info(outputJson);
|
||||
}
|
||||
}
|
||||
else if (key != null) {
|
||||
const keyValue = key === chatTemplateKey
|
||||
? tryFormattingJinja(getGgufMetadataKeyValue(parsedMetadata.metadata, "tokenizer.chat_template"))
|
||||
: getGgufMetadataKeyValue(parsedMetadata.metadata, key);
|
||||
if (keyValue === undefined) {
|
||||
console.log(`${chalk.red("Metadata key not found:")} ${key}`);
|
||||
process.exit(1);
|
||||
}
|
||||
const metadataPrettyPrintOptions = {
|
||||
maxArrayValues: fullMetadataArrays
|
||||
? undefined
|
||||
: 10,
|
||||
useNumberGrouping: true,
|
||||
maxArrayItemsWidth: process.stdout.columns - 1
|
||||
};
|
||||
console.info(`${chalk.yellow("Metadata key:")} ${prettyPrintObject(key)}`);
|
||||
console.info(`${chalk.yellow("Metadata:")} ${typeof keyValue === "string"
|
||||
? keyValue
|
||||
: prettyPrintObject(keyValue, undefined, metadataPrettyPrintOptions)}`);
|
||||
}
|
||||
else {
|
||||
const metadataPrettyPrintOptions = {
|
||||
maxArrayValues: fullMetadataArrays
|
||||
? undefined
|
||||
: 10,
|
||||
useNumberGrouping: true,
|
||||
maxArrayItemsWidth: process.stdout.columns - 1
|
||||
};
|
||||
const tensorInfoPrettyPrintOptions = {
|
||||
maxArrayValues: fullTensorInfo
|
||||
? undefined
|
||||
: 4,
|
||||
useNumberGrouping: true,
|
||||
maxArrayItemsWidth: process.stdout.columns - 1,
|
||||
multilineObjects: false
|
||||
};
|
||||
const numberLocaleFormattingOptions = {
|
||||
style: "decimal",
|
||||
useGrouping: true
|
||||
};
|
||||
if (parsedMetadata.splicedParts > 1)
|
||||
console.info(`${chalk.yellow("Spliced parts:")} ${parsedMetadata.splicedParts}`);
|
||||
console.info(`${chalk.yellow("GGUF version:")} ${parsedMetadata.version}`);
|
||||
console.info(`${chalk.yellow("Tensor count:")} ${parsedMetadata.totalTensorCount.toLocaleString("en-US", numberLocaleFormattingOptions)}`);
|
||||
console.info(`${chalk.yellow("Metadata size:")} ${toBytes(parsedMetadata.totalMetadataSize)}`);
|
||||
console.info(`${chalk.yellow("Tensor info size:")} ${toBytes(parsedMetadata.totalTensorInfoSize)}`);
|
||||
console.info(`${chalk.yellow("File type:")} ${fileTypeName ?? ""} ${chalk.white(`(${parsedMetadata.metadata.general?.file_type})`)}`);
|
||||
console.info(`${chalk.yellow("Metadata:")} ${prettyPrintObject(parsedMetadata.metadata, undefined, metadataPrettyPrintOptions)}`);
|
||||
console.info(`${chalk.yellow("Tensor info:")} ${prettyPrintObject(parsedMetadata.fullTensorInfo, undefined, tensorInfoPrettyPrintOptions)}`);
|
||||
}
|
||||
}
|
||||
};
|
||||
// these fields are added by the parser for ease of use and are not found in the gguf file itself
|
||||
function removeAdditionalTensorInfoFields(tensorInfo) {
|
||||
if (tensorInfo == null)
|
||||
return;
|
||||
for (const tensor of tensorInfo) {
|
||||
delete tensor.fileOffset;
|
||||
delete tensor.filePart;
|
||||
}
|
||||
}
|
||||
function tryFormattingJinja(template) {
|
||||
if (typeof template !== "string")
|
||||
return template;
|
||||
try {
|
||||
const parsedTemplate = new Template(template);
|
||||
return parsedTemplate.format({
|
||||
indent: 4
|
||||
}) ?? template;
|
||||
}
|
||||
catch (err) {
|
||||
return template;
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=InspectGgufCommand.js.map
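// --- Editor's note (hedged sketch, not part of the vendored file above) ---
// The `inspect gguf` command is mostly presentation around `readGgufFileInfo()`, which accepts a
// local path or a URL (for URLs, the metadata is read without downloading the whole file, as the
// option description above states). Importing it from the package root and the path below are
// assumptions for illustration; the fields printed are the same ones the handler above reads.
import {readGgufFileInfo} from "node-llama-cpp";

const info = await readGgufFileInfo("./models/example.gguf"); // hypothetical local path
console.info("GGUF version:", info.version);
console.info("Tensor count:", info.totalTensorCount);
console.info("Architecture:", info.metadata?.general?.architecture);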
1
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
4
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGpuCommand.d.ts
generated
vendored
Normal file
4
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGpuCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,4 @@
import { CommandModule } from "yargs";
type InspectGpuCommand = {};
export declare const InspectGpuCommand: CommandModule<object, InspectGpuCommand>;
export {};
249
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGpuCommand.js
generated
vendored
Normal file
249
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGpuCommand.js
generated
vendored
Normal file
@@ -0,0 +1,249 @@
|
||||
import os from "os";
|
||||
import chalk from "chalk";
|
||||
import { getLlamaForOptions } from "../../../../bindings/getLlama.js";
|
||||
import { detectAvailableComputeLayers } from "../../../../bindings/utils/detectAvailableComputeLayers.js";
|
||||
import { getPlatform } from "../../../../bindings/utils/getPlatform.js";
|
||||
import { LlamaLogLevel } from "../../../../bindings/types.js";
|
||||
import { getPrettyBuildGpuName } from "../../../../bindings/consts.js";
|
||||
import { getModuleVersion } from "../../../../utils/getModuleVersion.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { builtinLlamaCppGitHubRepo, documentationPageUrls } from "../../../../config.js";
|
||||
import { getPlatformInfo } from "../../../../bindings/utils/getPlatformInfo.js";
|
||||
import { getLinuxDistroInfo } from "../../../../bindings/utils/getLinuxDistroInfo.js";
|
||||
import { isRunningUnderRosetta } from "../../../utils/isRunningUnderRosetta.js";
|
||||
import { toBytes } from "../../../utils/toBytes.js";
|
||||
import { getBinariesGithubRelease } from "../../../../bindings/utils/binariesGithubRelease.js";
|
||||
import { getClonedLlamaCppRepoReleaseInfo } from "../../../../bindings/utils/cloneLlamaCppRepo.js";
|
||||
export const InspectGpuCommand = {
|
||||
command: "gpu",
|
||||
describe: withCliCommandDescriptionDocsUrl("Show the detected GPU types and their VRAM usage", documentationPageUrls.CLI.Inspect.GPU),
|
||||
async handler() {
|
||||
const platform = getPlatform();
|
||||
const arch = process.arch;
|
||||
const availableComputeLayers = await detectAvailableComputeLayers({ platform });
|
||||
const gpusToLogVramUsageOf = [];
|
||||
const gpuToLlama = new Map();
|
||||
let lastLlama;
|
||||
async function loadLlamaForGpu(gpu) {
|
||||
if (!gpuToLlama.has(gpu)) {
|
||||
const loadedLlama = await getLlamaForGpu(gpu);
|
||||
gpuToLlama.set(gpu, loadedLlama);
|
||||
if (loadedLlama != null)
|
||||
lastLlama = loadedLlama;
|
||||
}
|
||||
return gpuToLlama.get(gpu);
|
||||
}
|
||||
if (platform === "linux") {
|
||||
const linuxDistroInfo = await getLinuxDistroInfo();
|
||||
if (linuxDistroInfo.prettyName !== "")
|
||||
console.info(`${chalk.yellow("OS:")} ${linuxDistroInfo.prettyName} ${chalk.dim("(" + os.arch() + ")")}`);
|
||||
else
|
||||
console.info(`${chalk.yellow("OS:")} ${linuxDistroInfo.name || os.type()} ${linuxDistroInfo.version || os.release()} ${chalk.dim("(" + os.arch() + ")")}`);
|
||||
}
|
||||
else {
|
||||
const platformInfo = await getPlatformInfo();
|
||||
const osName = platformInfo.name === "Unknown"
|
||||
? os.type()
|
||||
: platformInfo.name;
|
||||
console.info(`${chalk.yellow("OS:")} ${osName} ${platformInfo.version} ${chalk.dim("(" + os.arch() + ")")}`);
|
||||
}
|
||||
if (process.versions.node != null)
|
||||
console.info(`${chalk.yellow("Node:")} ${process.versions.node} ${chalk.dim("(" + arch + ")")}`);
|
||||
if (process.versions.bun != null)
|
||||
console.info(`${chalk.yellow("Bun:")} ${process.versions.bun}`);
|
||||
const typeScriptVersion = await getInstalledTypescriptVersion();
|
||||
if (typeScriptVersion != null)
|
||||
console.info(`${chalk.yellow("TypeScript:")} ${typeScriptVersion}`);
|
||||
try {
|
||||
const moduleVersion = await getModuleVersion();
|
||||
if (moduleVersion != null) {
|
||||
console.info();
|
||||
console.info(`${chalk.yellow("node-llama-cpp:")} ${moduleVersion}`);
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
// do nothing
|
||||
}
|
||||
try {
|
||||
const prebuiltBinariesRelease = await getBinariesGithubRelease();
|
||||
console.info(`${chalk.yellow("Prebuilt binaries:")} ${prebuiltBinariesRelease}`);
|
||||
}
|
||||
catch (err) {
|
||||
// do nothing
|
||||
}
|
||||
try {
|
||||
const clonedLlamaCppRelease = await getClonedLlamaCppRepoReleaseInfo();
|
||||
if (clonedLlamaCppRelease != null)
|
||||
console.info(`${chalk.yellow("Cloned source:")} ${clonedLlamaCppRelease.tag}` + (clonedLlamaCppRelease.llamaCppGithubRepo !== builtinLlamaCppGitHubRepo
|
||||
? ` (${clonedLlamaCppRelease.llamaCppGithubRepo})`
|
||||
: ""));
|
||||
}
|
||||
catch (err) {
|
||||
// do nothing
|
||||
}
|
||||
console.info();
|
||||
if (platform === "mac" && arch === "arm64") {
|
||||
const llama = await loadLlamaForGpu("metal");
|
||||
if (llama == null) {
|
||||
console.info(`${chalk.yellow("Metal:")} ${chalk.red("Metal is detected, but using it failed")}`);
|
||||
}
|
||||
else {
|
||||
console.info(`${chalk.yellow("Metal:")} ${chalk.green("available")}`);
|
||||
gpusToLogVramUsageOf.push("metal");
|
||||
}
|
||||
}
|
||||
else if (platform === "mac") {
|
||||
if (await isRunningUnderRosetta()) {
|
||||
console.error(chalk.red("llama.cpp is not supported under Rosetta on Apple Silicone Macs. " +
|
||||
"Ensure that you're using a native arm64 node.js installation."));
|
||||
console.error("process.platform: " + process.platform + ", process.arch: " + process.arch);
|
||||
console.error("troubleshooting: " + documentationPageUrls.troubleshooting.RosettaIllegalHardwareInstruction);
|
||||
}
|
||||
console.info(`${chalk.yellow("Metal:")} ${chalk.red("not supported by llama.cpp on Intel Macs")}`);
|
||||
const llama = await loadLlamaForGpu(false);
|
||||
if (llama == null) {
|
||||
console.info(`${chalk.yellow("CPU:")} ${chalk.red("Loading a binding with only CPU support failed")}`);
|
||||
}
|
||||
}
|
||||
if (availableComputeLayers.cuda.hasNvidiaDriver && !availableComputeLayers.cuda.hasCudaRuntime) {
|
||||
console.info(`${chalk.yellow("CUDA:")} ${chalk.red("NVIDIA driver is installed, but CUDA runtime is not")}`);
|
||||
console.info(chalk.yellow("To resolve errors related to CUDA, see the CUDA guide: ") + documentationPageUrls.CUDA);
|
||||
}
|
||||
else if (availableComputeLayers.cuda.hasCudaRuntime && !availableComputeLayers.cuda.hasNvidiaDriver) {
|
||||
console.info(`${chalk.yellow("CUDA:")} ${chalk.red("CUDA runtime is installed, but NVIDIA driver is not")}`);
|
||||
console.info(chalk.yellow("To resolve errors related to CUDA, see the CUDA guide: ") + documentationPageUrls.CUDA);
|
||||
}
|
||||
else if (availableComputeLayers.cuda.hasCudaRuntime && availableComputeLayers.cuda.hasNvidiaDriver) {
|
||||
const llama = await loadLlamaForGpu("cuda");
|
||||
if (llama == null) {
|
||||
console.info(`${chalk.yellow("CUDA:")} ${chalk.red("CUDA is detected, but using it failed")}`);
|
||||
console.info(chalk.yellow("To resolve errors related to CUDA, see the CUDA guide: ") + documentationPageUrls.CUDA);
|
||||
}
|
||||
else {
|
||||
console.info(`${chalk.yellow("CUDA:")} ${chalk.green("available")}`);
|
||||
gpusToLogVramUsageOf.push("cuda");
|
||||
if (llama._hadErrorLogs)
|
||||
console.info(chalk.yellow("To resolve errors related to CUDA, see the CUDA guide: ") + documentationPageUrls.CUDA);
|
||||
}
|
||||
}
|
||||
if (availableComputeLayers.vulkan) {
|
||||
const llama = await loadLlamaForGpu("vulkan");
|
||||
if (llama == null) {
|
||||
console.info(`${chalk.yellow("Vulkan:")} ${chalk.red("Vulkan is detected, but using it failed")}`);
|
||||
console.info(chalk.yellow("To resolve errors related to Vulkan, see the Vulkan guide: ") + documentationPageUrls.Vulkan);
|
||||
}
|
||||
else {
|
||||
console.info(`${chalk.yellow("Vulkan:")} ${chalk.green("available")}`);
|
||||
gpusToLogVramUsageOf.push("vulkan");
|
||||
if (llama._hadErrorLogs)
|
||||
console.info(chalk.yellow("To resolve errors related to Vulkan, see the Vulkan guide: ") + documentationPageUrls.Vulkan);
|
||||
}
|
||||
}
|
||||
if (lastLlama == null)
|
||||
await loadLlamaForGpu(false);
|
||||
for (const gpu of gpusToLogVramUsageOf) {
|
||||
const llama = gpuToLlama.get(gpu);
|
||||
if (llama == null || llama.gpu !== gpu)
|
||||
continue;
|
||||
console.info();
|
||||
await logGpuVramUsage(llama);
|
||||
}
|
||||
console.info();
|
||||
await logRamUsage(lastLlama?.cpuMathCores);
|
||||
if (lastLlama != null) {
|
||||
await logSwapUsage(lastLlama);
|
||||
console.info(`${chalk.yellow("mmap:")} ${lastLlama.supportsMmap ? "supported" : "unsupported"}`);
|
||||
}
|
||||
}
|
||||
};
|
||||
async function getLlamaForGpu(gpu) {
|
||||
try {
|
||||
// if you're reading this line, then you're probably looking for the `dryRun` option on `getLlama`
|
||||
return await getLlamaForOptions({
|
||||
gpu: gpu,
|
||||
build: "never",
|
||||
progressLogs: false,
|
||||
logLevel: LlamaLogLevel.warn,
|
||||
vramPadding: 0
|
||||
}, {
|
||||
skipLlamaInit: true,
|
||||
pipeBinaryTestErrorLogs: true
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
async function logGpuVramUsage(llama) {
|
||||
try {
|
||||
const gpuName = getPrettyBuildGpuName(llama.gpu);
|
||||
const vramState = await llama.getVramState();
|
||||
const gpuDeviceNames = await llama.getGpuDeviceNames();
|
||||
if (gpuDeviceNames.length > 0)
|
||||
console.info(`${chalk.yellow(`${gpuName} device${gpuDeviceNames.length > 1 ? "s" : ""}:`)} ${gpuDeviceNames.join(", ")}`);
|
||||
console.info(`${chalk.yellow(`${gpuName} used VRAM:`)} ${getPercentageString(vramState.used, vramState.total)}% ${chalk.gray("(" + toBytes(vramState.used) + "/" + toBytes(vramState.total) + ")")}`);
|
||||
console.info(`${chalk.yellow(`${gpuName} free VRAM:`)} ${getPercentageString(vramState.free, vramState.total)}% ${chalk.gray("(" + toBytes(vramState.free) + "/" + toBytes(vramState.total) + ")")}`);
|
||||
if (vramState.unifiedSize > 0)
|
||||
console.info(`${chalk.yellow(`${gpuName} unified memory:`)} ${toBytes(vramState.unifiedSize)} ${chalk.gray("(" + getPercentageString(vramState.unifiedSize, vramState.total) + "%)")}`);
|
||||
}
|
||||
catch (err) { }
|
||||
}
|
||||
async function logRamUsage(cpuMathCores) {
|
||||
const totalMemory = os.totalmem();
|
||||
const freeMemory = os.freemem();
|
||||
const usedMemory = totalMemory - freeMemory;
|
||||
const cpuDeviceNames = Array.from(new Set(os.cpus()
|
||||
.map((cpu) => (cpu.model?.trim?.() ?? ""))
|
||||
.filter((deviceName) => deviceName.length > 0)));
|
||||
if (cpuDeviceNames.length > 0)
|
||||
console.info(`${chalk.yellow("CPU model" + (cpuDeviceNames.length > 1 ? "s" : "") + ":")} ${cpuDeviceNames.join(", ")}`);
|
||||
if (cpuMathCores != null)
|
||||
console.info(`${chalk.yellow("Math cores:")} ${cpuMathCores}`);
|
||||
console.info(`${chalk.yellow("Used RAM:")} ${getPercentageString(usedMemory, totalMemory)}% ${chalk.gray("(" + toBytes(usedMemory) + "/" + toBytes(totalMemory) + ")")}`);
|
||||
console.info(`${chalk.yellow("Free RAM:")} ${getPercentageString(freeMemory, totalMemory)}% ${chalk.gray("(" + toBytes(freeMemory) + "/" + toBytes(totalMemory) + ")")}`);
|
||||
}
|
||||
async function logSwapUsage(llama) {
|
||||
const swapState = await llama.getSwapState();
|
||||
console.info(`${chalk.yellow("Used swap:")} ${getPercentageString(swapState.used, swapState.allocated)}% ${chalk.gray("(" + toBytes(swapState.used) + "/" + toBytes(swapState.allocated) + ")")}`);
|
||||
console.info(`${chalk.yellow("Max swap size:")} ${swapState.maxSize === Infinity ? "dynamic" : toBytes(swapState.maxSize)}`);
|
||||
}
|
||||
function getPercentageString(amount, total) {
|
||||
if (total === 0)
|
||||
return "0";
|
||||
return String(Math.floor((amount / total) * 100 * 100) / 100);
|
||||
}
|
||||
async function getInstalledTypescriptVersion() {
|
||||
try {
|
||||
const ts = await import("typescript");
|
||||
const version = ts?.version ?? ts?.default?.version;
|
||||
if (version != null && typeof version === "string" && version.length > 0)
|
||||
return version;
|
||||
return null;
|
||||
}
|
||||
catch (err) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
// // simple script to copy console logs as ansi to clipboard. Used to update the documentation
|
||||
// import {spawn} from "child_process";
|
||||
// const pendingLog: string[] = [];
|
||||
// const originalConsoleInfo = console.info;
|
||||
// console.info = function info(...args: any[]) {
|
||||
// originalConsoleInfo.call(console, ...args);
|
||||
// pendingLog.push(args.join(" "));
|
||||
// };
|
||||
//
|
||||
// function copyLogs() {
|
||||
// const res = pendingLog.join("\n");
|
||||
//
|
||||
// pbcopy(res);
|
||||
// originalConsoleInfo.call(console, "Copied logs to clipboard");
|
||||
// }
|
||||
// function pbcopy(text: string) {
|
||||
// const pbcopyProcess = spawn("pbcopy");
|
||||
// pbcopyProcess.stdin.write(text);
|
||||
// pbcopyProcess.stdin.end();
|
||||
// }
|
||||
//
|
||||
// process.on("exit", copyLogs);
|
||||
//# sourceMappingURL=InspectGpuCommand.js.map
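// --- Editor's note (hedged sketch, not part of the vendored file above) ---
// The GPU/VRAM report printed by `inspect gpu` comes from a few methods on a loaded Llama
// instance; the method names below are taken from the code above, while importing `getLlama`
// from the package root is an assumption for illustration.
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const vramState = await llama.getVramState();
console.info("GPU devices:", (await llama.getGpuDeviceNames()).join(", "));
console.info(`VRAM: ${vramState.used} used / ${vramState.total} total (${vramState.free} free, in bytes)`);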
1
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map
generated
vendored
Normal file
1
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
23
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts
generated
vendored
Normal file
23
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,23 @@
import { CommandModule } from "yargs";
import { BuildGpu } from "../../../../bindings/types.js";
type InspectMeasureCommand = {
    modelPath?: string;
    header?: string[];
    gpu?: BuildGpu | "auto";
    minLayers: number;
    maxLayers?: number;
    minContextSize: number;
    maxContextSize?: number;
    flashAttention?: boolean;
    swaFullCache?: boolean;
    batchSize?: number;
    measures: number;
    memory: "vram" | "ram" | "all";
    noMmap: boolean;
    noDirectIo: boolean;
    printHeaderBeforeEachLayer?: boolean;
    evaluateText?: string;
    repeatEvaluateText?: number;
};
export declare const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>;
export {};
828
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectMeasureCommand.js
generated
vendored
Normal file
828
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectMeasureCommand.js
generated
vendored
Normal file
@@ -0,0 +1,828 @@
|
||||
import path from "path";
|
||||
import process from "process";
|
||||
import { fileURLToPath } from "url";
|
||||
import { fork } from "node:child_process";
|
||||
import os from "os";
|
||||
import chalk from "chalk";
|
||||
import stripAnsi from "strip-ansi";
|
||||
import { readGgufFileInfo } from "../../../../gguf/readGgufFileInfo.js";
|
||||
import { resolveCommandGgufPath } from "../../../utils/resolveCommandGgufPath.js";
|
||||
import { getLlama } from "../../../../bindings/getLlama.js";
|
||||
import { LlamaLogLevel, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption } from "../../../../bindings/types.js";
|
||||
import { getConsoleLogPrefix } from "../../../../utils/getConsoleLogPrefix.js";
|
||||
import { ConsoleTable } from "../../../utils/ConsoleTable.js";
|
||||
import { GgufInsights } from "../../../../gguf/insights/GgufInsights.js";
|
||||
import { resolveHeaderFlag } from "../../../utils/resolveHeaderFlag.js";
|
||||
import { getPrettyBuildGpuName } from "../../../../bindings/consts.js";
|
||||
import { getReadablePath } from "../../../utils/getReadablePath.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { documentationPageUrls } from "../../../../config.js";
|
||||
import { toBytes } from "../../../utils/toBytes.js";
|
||||
import { padSafeContextSize } from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js";
|
||||
import { getPlatform } from "../../../../bindings/utils/getPlatform.js";
|
||||
export const InspectMeasureCommand = {
|
||||
command: "measure [modelPath]",
|
||||
describe: withCliCommandDescriptionDocsUrl("Measure VRAM consumption of a GGUF model file with all possible combinations of gpu layers and context sizes", documentationPageUrls.CLI.Inspect.Measure),
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.option("modelPath", {
|
||||
alias: ["m", "model", "path", "url", "uri"],
|
||||
type: "string",
|
||||
description: "Model file to use for the measurements. Can be a path to a local file or a URI of a model file to download. Leave empty to choose from a list of recommended models"
|
||||
})
|
||||
.option("header", {
|
||||
alias: ["H"],
|
||||
type: "string",
|
||||
array: true,
|
||||
description: "Headers to use when downloading a model from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers."
|
||||
})
|
||||
.option("gpu", {
|
||||
type: "string",
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: nodeLlamaCppGpuOptions,
|
||||
coerce: (value) => {
|
||||
if (value == null || value == "")
|
||||
return undefined;
|
||||
return parseNodeLlamaCppGpuOption(value);
|
||||
},
|
||||
defaultDescription: "Uses the latest local build, and fallbacks to \"auto\"",
|
||||
description: "Compute layer implementation type to use for llama.cpp. If omitted, uses the latest local build, and fallbacks to \"auto\""
|
||||
})
|
||||
.option("minLayers", {
|
||||
alias: "mnl",
|
||||
type: "number",
|
||||
default: 1,
|
||||
description: "Minimum number of layers to offload to the GPU"
|
||||
})
|
||||
.option("maxLayers", {
|
||||
alias: "mxl",
|
||||
type: "number",
|
||||
default: -1,
|
||||
defaultDescription: "All layers",
|
||||
description: "Maximum number of layers to offload to the GPU"
|
||||
})
|
||||
.option("minContextSize", {
|
||||
alias: "mncs",
|
||||
type: "number",
|
||||
default: 512,
|
||||
description: "Minimum context size"
|
||||
})
|
||||
.option("maxContextSize", {
|
||||
alias: "mxcs",
|
||||
type: "number",
|
||||
default: -1,
|
||||
defaultDescription: "Train context size",
|
||||
description: "Maximum context size"
|
||||
})
|
||||
.option("flashAttention", {
|
||||
alias: "fa",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Enable flash attention for the context"
|
||||
})
|
||||
.option("swaFullCache", {
|
||||
alias: "noSwa",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable SWA (Sliding Window Attention) on supported models"
|
||||
})
|
||||
.option("batchSize", {
|
||||
alias: "b",
|
||||
type: "number",
|
||||
description: "Batch size to use for the model context"
|
||||
})
|
||||
.option("measures", {
|
||||
alias: "n",
|
||||
type: "number",
|
||||
default: 10,
|
||||
description: "Number of context size measures to take for each gpu layers count"
|
||||
})
|
||||
.option("memory", {
|
||||
type: "string",
|
||||
choices: ["vram", "ram", "all"],
|
||||
default: "vram",
|
||||
description: "Type of memory to measure"
|
||||
})
|
||||
.option("noMmap", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable mmap (memory-mapped file) usage"
|
||||
})
|
||||
.option("noDirectIo", {
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Disable Direct I/O usage when available"
|
||||
})
|
||||
.option("printHeaderBeforeEachLayer", {
|
||||
alias: "ph",
|
||||
type: "boolean",
|
||||
default: true,
|
||||
description: "Print header before each layer's measures"
|
||||
})
|
||||
.option("evaluateText", {
|
||||
alias: ["evaluate", "et"],
|
||||
type: "string",
|
||||
description: "Text to evaluate with the model"
|
||||
})
|
||||
.option("repeatEvaluateText", {
|
||||
alias: ["repeatEvaluate", "ret"],
|
||||
type: "number",
|
||||
default: 1,
|
||||
description: "Number of times to repeat the evaluation text before sending it for evaluation, in order to make it longer"
|
||||
});
|
||||
},
|
||||
async handler({ modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, swaFullCache, batchSize, measures = 10, memory: measureMemoryType, noMmap, noDirectIo, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }) {
|
||||
if (maxLayers === -1)
|
||||
maxLayers = undefined;
|
||||
if (maxContextSize === -1)
|
||||
maxContextSize = undefined;
|
||||
if (minLayers < 1)
|
||||
minLayers = 1;
|
||||
const exitAfterEachMeasurement = measureMemoryType === "ram" || measureMemoryType === "all";
|
||||
const headers = resolveHeaderFlag(headerArg);
|
||||
// ensure a llama build is available
|
||||
const llama = gpu == null
|
||||
? await getLlama("lastBuild", {
|
||||
logLevel: LlamaLogLevel.error
|
||||
})
|
||||
: await getLlama({
|
||||
gpu,
|
||||
logLevel: LlamaLogLevel.error
|
||||
});
|
||||
const platform = getPlatform();
|
||||
const useMmap = !noMmap && llama.supportsMmap;
|
||||
const useDirectIo = !noDirectIo;
|
||||
const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, {
|
||||
flashAttention, swaFullCache, useMmap
|
||||
});
|
||||
console.info(`${chalk.yellow("File:")} ${getReadablePath(resolvedGgufPath)}`);
|
||||
console.info(`${chalk.yellow("GPU:")} ${getPrettyBuildGpuName(llama.gpu)}${gpu == null ? chalk.gray(" (last build)") : ""}`);
|
||||
console.info(chalk.yellow("mmap:") + " " + (!llama.supportsMmap
|
||||
? "unsupported"
|
||||
: useMmap
|
||||
? "enabled"
|
||||
: "disabled"));
|
||||
if (platform !== "mac") // Direct I/O is not supported on macOS
|
||||
console.info(chalk.yellow("Direct I/O:") + " " + (useDirectIo
|
||||
? "enabled"
|
||||
: "disabled"));
|
||||
if (measureMemoryType === "ram" || measureMemoryType === "all")
|
||||
console.warn(chalk.yellow("RAM measurements are greatly inaccurate due to OS optimizations that prevent released memory from being immediately available"));
|
||||
console.info();
|
||||
const ggufMetadata = await readGgufFileInfo(resolvedGgufPath, {
|
||||
sourceType: "filesystem"
|
||||
});
|
||||
const ggufInsights = await GgufInsights.from(ggufMetadata, llama);
|
||||
const totalVram = (await llama.getVramState()).total;
|
||||
const totalRam = os.totalmem();
|
||||
let lastGpuLayers = maxLayers ?? ggufInsights.totalLayers;
|
||||
let previousContextSizeCheck = undefined;
|
||||
const measureTable = getMeasureTable(measureMemoryType);
|
||||
measureTable.logHeader({ drawRowSeparator: !printHeaderBeforeEachLayer });
|
||||
while (lastGpuLayers >= (minLayers ?? 0)) {
|
||||
let printedAlreadyWithThisProcess = false;
|
||||
let hadSuccessInThisProcess = false;
|
||||
const getNewProccessValue = () => {
|
||||
if (printedAlreadyWithThisProcess)
|
||||
return undefined;
|
||||
printedAlreadyWithThisProcess = true;
|
||||
return chalk.green("*");
|
||||
};
|
||||
const done = await measureModel({
|
||||
modelPath: resolvedGgufPath,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
gpu: gpu == null
|
||||
? undefined
|
||||
: llama.gpu,
|
||||
maxGpuLayers: lastGpuLayers,
|
||||
minGpuLayers: minLayers,
|
||||
initialMaxContextSize: previousContextSizeCheck,
|
||||
maxContextSize,
|
||||
minContextSize,
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
batchSize,
|
||||
tests: measures,
|
||||
evaluateText: evaluateText == null
|
||||
? undefined
|
||||
: evaluateText.repeat(repeatEvaluateText ?? 1),
|
||||
exitAfterMeasurement: exitAfterEachMeasurement,
|
||||
onInfo({ gpuLayers, result }) {
|
||||
if (lastGpuLayers !== gpuLayers) {
|
||||
lastGpuLayers = gpuLayers;
|
||||
previousContextSizeCheck = undefined;
|
||||
measureTable.logLine({});
|
||||
if (printHeaderBeforeEachLayer)
|
||||
measureTable.logHeader({ drawRowSeparator: false });
|
||||
}
|
||||
if (result.type === "crash") {
|
||||
if (!hadSuccessInThisProcess) {
|
||||
measureTable.logLine({
|
||||
newProcess: getNewProcessValue(),
|
||||
type: chalk.redBright("Crash"),
|
||||
gpuLayers: String(lastGpuLayers),
|
||||
contextSize: previousContextSizeCheck != null
|
||||
? String(previousContextSizeCheck)
|
||||
: chalk.red(result.result),
|
||||
estimatedModelVram: previousContextSizeCheck == null
|
||||
? undefined
|
||||
: chalk.red(result.result)
|
||||
});
|
||||
lastGpuLayers--;
|
||||
}
|
||||
}
|
||||
else if (result.type === "error") {
|
||||
previousContextSizeCheck = result.contextSize;
|
||||
hadSuccessInThisProcess = true;
|
||||
measureTable.logLine({
|
||||
newProcess: getNewProcessValue(),
|
||||
type: chalk.red("Error"),
|
||||
gpuLayers: String(lastGpuLayers),
|
||||
contextSize: previousContextSizeCheck != null
|
||||
? String(previousContextSizeCheck)
|
||||
: chalk.red(result.error),
|
||||
estimatedModelVram: previousContextSizeCheck == null
|
||||
? undefined
|
||||
: chalk.red(result.error)
|
||||
});
|
||||
}
|
||||
else if (result.type === "success") {
|
||||
previousContextSizeCheck = result.contextSize;
|
||||
hadSuccessInThisProcess = true;
|
||||
const modelResourceEstimation = ggufInsights.estimateModelResourceRequirements({
|
||||
gpuLayers: lastGpuLayers,
|
||||
useMmap
|
||||
});
|
||||
const modelVramEstimation = modelResourceEstimation.gpuVram;
|
||||
const modelVramEstimationDiffBytes = (modelVramEstimation < result.modelVramUsage ? "-" : "") +
|
||||
toBytes(Math.abs(result.modelVramUsage - modelVramEstimation));
|
||||
const modelVramEstimationDiffText = modelVramEstimationDiffBytes.padEnd(9, " ") + " " +
|
||||
padStartAnsi("(" + renderDiffPercentageWithColors(((modelVramEstimation / result.modelVramUsage) - 1) * 100) + ")", 9);
|
||||
const modelRamEstimation = modelResourceEstimation.cpuRam;
|
||||
const modelRamEstimationDiffBytes = (modelRamEstimation < result.modelRamUsage ? "-" : "") +
|
||||
toBytes(Math.abs(result.modelRamUsage - modelRamEstimation));
|
||||
const modelRamEstimationDiffText = modelRamEstimationDiffBytes.padEnd(9, " ") + " " +
|
||||
padStartAnsi("(" + renderDiffPercentageWithColors(((modelRamEstimation / result.modelRamUsage) - 1) * 100) + ")", 9);
|
||||
const contextResourceEstimation = previousContextSizeCheck == null
|
||||
? undefined
|
||||
: ggufInsights.estimateContextResourceRequirements({
|
||||
contextSize: previousContextSizeCheck,
|
||||
modelGpuLayers: lastGpuLayers,
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
batchSize
|
||||
});
|
||||
const contextVramEstimation = contextResourceEstimation?.gpuVram;
|
||||
const contextVramEstimationDiffBytes = (result.contextVramUsage == null || contextVramEstimation == null)
|
||||
? undefined
|
||||
: ((contextVramEstimation < result.contextVramUsage ? "-" : "") +
|
||||
toBytes(Math.abs(result.contextVramUsage - contextVramEstimation)));
|
||||
const contextVramEstimationDiffText = (contextVramEstimation == null || contextVramEstimationDiffBytes == null || result.contextVramUsage == null)
|
||||
? undefined
|
||||
: (contextVramEstimationDiffBytes.padEnd(9, " ") + " " +
|
||||
padStartAnsi("(" + renderDiffPercentageWithColors(((contextVramEstimation / result.contextVramUsage) - 1) * 100) + ")", 9));
|
||||
const contextRamEstimation = contextResourceEstimation?.cpuRam;
|
||||
const contextRamEstimationDiffBytes = (result.contextRamUsage == null || contextRamEstimation == null)
|
||||
? undefined
|
||||
: ((contextRamEstimation < result.contextRamUsage ? "-" : "") +
|
||||
toBytes(Math.abs(result.contextRamUsage - contextRamEstimation)));
|
||||
const contextRamEstimationDiffText = (contextRamEstimation == null || contextRamEstimationDiffBytes == null || result.contextRamUsage == null)
|
||||
? undefined
|
||||
: (contextRamEstimationDiffBytes.padEnd(9, " ") + " " +
|
||||
padStartAnsi("(" + renderDiffPercentageWithColors(((contextRamEstimation / result.contextRamUsage) - 1) * 100) + ")", 9));
|
||||
measureTable.logLine({
|
||||
newProcess: getNewProcessValue(),
|
||||
type: previousContextSizeCheck == null
|
||||
? "Model"
|
||||
: "Context",
|
||||
gpuLayers: String(lastGpuLayers),
|
||||
contextSize: previousContextSizeCheck != null
|
||||
? String(previousContextSizeCheck)
|
||||
: undefined,
|
||||
estimatedModelVram: toBytes(modelVramEstimation),
|
||||
actualModelVram: toBytes(result.modelVramUsage),
|
||||
modelVramEstimationDiff: modelVramEstimationDiffText,
|
||||
estimatedModelRam: toBytes(modelRamEstimation),
|
||||
actualModelRam: toBytes(result.modelRamUsage),
|
||||
modelRamEstimationDiff: modelRamEstimationDiffText,
|
||||
estimatedContextVram: contextVramEstimation == null
|
||||
? undefined
|
||||
: toBytes(contextVramEstimation),
|
||||
actualContextVram: result.contextVramUsage == null
|
||||
? undefined
|
||||
: toBytes(result.contextVramUsage),
|
||||
contextVramEstimationDiff: contextVramEstimationDiffText,
|
||||
totalVramUsage: ((result.totalVramUsage / totalVram) * 100).toFixed(2).padStart(5, "0") + "% " +
|
||||
chalk.gray("(" + toBytes(result.totalVramUsage) + "/" + toBytes(totalVram) + ")"),
|
||||
estimatedContextRam: contextRamEstimation == null
|
||||
? undefined
|
||||
: toBytes(contextRamEstimation),
|
||||
actualContextRam: result.contextRamUsage == null
|
||||
? undefined
|
||||
: toBytes(result.contextRamUsage),
|
||||
contextRamEstimationDiff: contextRamEstimationDiffText,
|
||||
totalRamUsage: ((result.totalRamUsage / totalRam) * 100).toFixed(2).padStart(5, "0") + "% " +
|
||||
chalk.gray("(" + toBytes(result.totalRamUsage) + "/" + toBytes(totalRam) + ")")
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
if (done)
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
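// Builds the console table used for the measurement rows; column visibility depends on whether "vram", "ram", or "all" memory measurements were requested.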
function getMeasureTable(memoryType) {
|
||||
return new ConsoleTable([{
|
||||
key: "newProcess",
|
||||
title: " ",
|
||||
width: 1
|
||||
}, {
|
||||
key: "type",
|
||||
title: "Type",
|
||||
width: Math.max("Type".length, "Model".length, "Context".length),
|
||||
canSpanOverEmptyColumns: true
|
||||
}, {
|
||||
key: "gpuLayers",
|
||||
title: "Layers",
|
||||
width: "Layers".length,
|
||||
canSpanOverEmptyColumns: true
|
||||
}, {
|
||||
key: "contextSize",
|
||||
title: "Context size",
|
||||
width: "Context size".length,
|
||||
canSpanOverEmptyColumns: true
|
||||
}, {
|
||||
key: "estimatedModelVram",
|
||||
visible: memoryType === "vram" || memoryType === "all",
|
||||
title: "Estimated model VRAM",
|
||||
width: "Estimated model VRAM".length,
|
||||
canSpanOverEmptyColumns: true
|
||||
}, {
|
||||
key: "actualModelVram",
|
||||
visible: memoryType === "vram" || memoryType === "all",
|
||||
title: "Model VRAM",
|
||||
width: "Model VRAM".length
|
||||
}, {
|
||||
key: "modelVramEstimationDiff",
|
||||
visible: memoryType === "vram" || memoryType === "all",
|
||||
title: "Diff",
|
||||
width: Math.max("Diff".length, 9 + 1 + 9)
|
||||
}, {
|
||||
key: "estimatedModelRam",
|
||||
visible: memoryType === "ram" || memoryType === "all",
|
||||
title: "Estimated model RAM",
|
||||
width: "Estimated model RAM".length,
|
||||
canSpanOverEmptyColumns: true
|
||||
}, {
|
||||
key: "actualModelRam",
|
||||
visible: memoryType === "ram" || memoryType === "all",
|
||||
title: "Model RAM",
|
||||
width: "Model RAM".length
|
||||
}, {
|
||||
key: "modelRamEstimationDiff",
|
||||
visible: memoryType === "ram" || memoryType === "all",
|
||||
title: "Diff",
|
||||
width: Math.max("Diff".length, 9 + 1 + 9)
|
||||
}, {
|
||||
key: "estimatedContextVram",
|
||||
visible: memoryType === "vram" || memoryType === "all",
|
||||
title: "Estimated context VRAM",
|
||||
width: "Estimated context VRAM".length
|
||||
}, {
|
||||
key: "actualContextVram",
|
||||
visible: memoryType === "vram" || memoryType === "all",
|
||||
title: "Context VRAM",
|
||||
width: "Context VRAM".length
|
||||
}, {
|
||||
key: "contextVramEstimationDiff",
|
||||
visible: memoryType === "vram" || memoryType === "all",
|
||||
title: "Diff",
|
||||
width: Math.max("Diff".length, 9 + 1 + 9)
|
||||
}, {
|
||||
key: "totalVramUsage",
|
||||
visible: memoryType === "vram" || memoryType === "all",
|
||||
title: "VRAM usage",
|
||||
width: Math.max("VRAM usage".length, 8 + 1 + 8 + 1 + 8)
|
||||
}, {
|
||||
key: "estimatedContextRam",
|
||||
visible: memoryType === "ram" || memoryType === "all",
|
||||
title: "Estimated context RAM",
|
||||
width: "Estimated context RAM".length
|
||||
}, {
|
||||
key: "actualContextRam",
|
||||
visible: memoryType === "ram" || memoryType === "all",
|
||||
title: "Context RAM",
|
||||
width: "Context RAM".length
|
||||
}, {
|
||||
key: "contextRamEstimationDiff",
|
||||
visible: memoryType === "ram" || memoryType === "all",
|
||||
title: "Diff",
|
||||
width: Math.max("Diff".length, 9 + 1 + 9)
|
||||
}, {
|
||||
key: "totalRamUsage",
|
||||
visible: memoryType === "ram" || memoryType === "all",
|
||||
title: "RAM usage",
|
||||
width: Math.max("RAM usage".length, 8 + 1 + 8 + 1 + 8)
|
||||
}]);
|
||||
}
|
||||
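// Colors an estimation-diff percentage by magnitude: below ~2% bright green, then green, yellow, and bright yellow at the default 6%/10%/14% thresholds, and red beyond that.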
function renderDiffPercentageWithColors(percentage, { greenBright = 2, green = 6, yellow = 10, yellowBright = 14 } = {}) {
|
||||
const percentageText = percentage.toFixed(2).padStart(5, "0") + "%";
|
||||
const absPercentage = Math.abs(percentage);
|
||||
if (absPercentage < greenBright)
|
||||
return chalk.greenBright(percentageText);
|
||||
else if (absPercentage < green)
|
||||
return chalk.green(percentageText);
|
||||
else if (absPercentage < yellow)
|
||||
return chalk.yellow(percentageText);
|
||||
else if (absPercentage < yellowBright)
|
||||
return chalk.yellowBright(percentageText);
|
||||
return chalk.red(percentageText);
|
||||
}
|
||||
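// The measurement sub-process is spawned by re-running this exact file, so detect whether the file kept its expected name (i.e. wasn't bundled or renamed) before attempting to fork it.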
const __filename = fileURLToPath(import.meta.url);
|
||||
const detectedFileName = path.basename(__filename);
|
||||
const expectedFileName = "InspectMeasureCommand";
|
||||
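// Forks this file as a sub-process (so a crash, e.g. from over-allocation, doesn't take down the CLI), drives it over IPC, and forwards its per-measurement stats, errors, and crash output to onInfo.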
async function measureModel({ modelPath, useMmap, useDirectIo, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo }) {
|
||||
if (!detectedFileName.startsWith(expectedFileName)) {
|
||||
console.warn(getConsoleLogPrefix() +
|
||||
`"${expectedFileName}.js" file is not independent, so running sub-process tests cannot be done with it\n` +
|
||||
getConsoleLogPrefix() +
|
||||
'To resolve this issue, make sure that "node-llama-cpp" is not bundled together with other code.');
|
||||
throw new Error("Sub-process tests cannot be done with the current file");
|
||||
}
|
||||
const subProcess = fork(__filename, [], {
|
||||
detached: false,
|
||||
stdio: [null, null, null, "ipc"],
|
||||
env: {
|
||||
...process.env,
|
||||
MEASURE_MODEL_CP: "true",
|
||||
MEASURE_MODEL_CP_GPU: gpu == null
|
||||
? undefined
|
||||
: JSON.stringify(gpu)
|
||||
}
|
||||
});
|
||||
let isPlannedExit = false;
|
||||
let isDone = false;
|
||||
let forkSucceeded = false;
|
||||
let timeoutHandle = null;
|
||||
const processCreationTimeout = 1000 * 60 * 5;
|
||||
const stdTexts = [];
|
||||
let lastGpuLayers = maxGpuLayers;
|
||||
function cleanup() {
|
||||
if (subProcess.exitCode == null)
|
||||
subProcess.kill("SIGKILL");
|
||||
if (timeoutHandle != null)
|
||||
clearTimeout(timeoutHandle);
|
||||
process.off("exit", cleanup);
|
||||
}
|
||||
process.on("exit", cleanup);
|
||||
subProcess.stdout?.on("data", (data) => {
|
||||
stdTexts.push(data.toString());
|
||||
});
|
||||
subProcess.stderr?.on("data", (data) => {
|
||||
stdTexts.push(data.toString());
|
||||
});
|
||||
return Promise.race([
|
||||
new Promise((_, reject) => {
|
||||
timeoutHandle = setTimeout(() => {
|
||||
if (!forkSucceeded) {
|
||||
reject(new Error("Measuring using a sub-process timed out"));
|
||||
cleanup();
|
||||
}
|
||||
}, processCreationTimeout);
|
||||
}),
|
||||
new Promise((resolve, reject) => {
|
||||
function done() {
|
||||
if (!forkSucceeded)
|
||||
reject(new Error(`Measuring a model failed to run a sub-process via file "${__filename}"`));
|
||||
else if (isPlannedExit)
|
||||
resolve(isPlannedExit && isDone);
|
||||
cleanup();
|
||||
}
|
||||
subProcess.on("message", (message) => {
|
||||
if (message.type === "ready") {
|
||||
forkSucceeded = true;
|
||||
subProcess.send({
|
||||
type: "start",
|
||||
modelPath,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
tests,
|
||||
initialMaxContextSize,
|
||||
maxContextSize,
|
||||
minContextSize,
|
||||
maxGpuLayers,
|
||||
minGpuLayers,
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
batchSize,
|
||||
evaluateText,
|
||||
exitAfterMeasurement
|
||||
});
|
||||
if (timeoutHandle != null) {
|
||||
clearTimeout(timeoutHandle);
|
||||
timeoutHandle = null;
|
||||
}
|
||||
}
|
||||
else if (message.type === "done") {
|
||||
isPlannedExit = true;
|
||||
isDone = true;
|
||||
subProcess.send({ type: "exit" });
|
||||
}
|
||||
else if (message.type === "exit") {
|
||||
isPlannedExit = true;
|
||||
subProcess.send({ type: "exit" });
|
||||
}
|
||||
else if (message.type === "error") {
|
||||
lastGpuLayers = message.gpuLayers;
|
||||
onInfo({
|
||||
gpuLayers: lastGpuLayers,
|
||||
result: {
|
||||
type: "error",
|
||||
error: message.error,
|
||||
contextSize: message.contextSize
|
||||
}
|
||||
});
|
||||
}
|
||||
else if (message.type === "stats") {
|
||||
lastGpuLayers = message.gpuLayers;
|
||||
onInfo({
|
||||
gpuLayers: message.gpuLayers,
|
||||
result: {
|
||||
type: "success",
|
||||
modelVramUsage: message.modelVramUsage,
|
||||
modelRamUsage: message.modelRamUsage,
|
||||
contextSize: message.contextSize,
|
||||
contextVramUsage: message.contextVramUsage,
|
||||
contextRamUsage: message.contextRamUsage,
|
||||
contextStateSize: message.contextStateSize,
|
||||
totalVramUsage: message.totalVramUsage,
|
||||
totalRamUsage: message.totalRamUsage
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
subProcess.on("exit", (code) => {
|
||||
if (code !== 0 || !isPlannedExit)
|
||||
onInfo({
|
||||
gpuLayers: lastGpuLayers,
|
||||
result: {
|
||||
type: "crash",
|
||||
result: stdTexts.join("")
|
||||
}
|
||||
});
|
||||
done();
|
||||
});
|
||||
if (subProcess.killed || subProcess.exitCode != null) {
|
||||
if (subProcess.exitCode !== 0 || !isPlannedExit)
|
||||
onInfo({
|
||||
gpuLayers: lastGpuLayers,
|
||||
result: {
|
||||
type: "crash",
|
||||
result: stdTexts.join("")
|
||||
}
|
||||
});
|
||||
done();
|
||||
}
|
||||
})
|
||||
]);
|
||||
}
|
||||
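// When this file is launched as the measurement sub-process (see measureModel above), switch into worker mode instead of acting as a CLI command.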
if (process.env.MEASURE_MODEL_CP === "true" && process.send != null) {
|
||||
void runTestWorkerLogic();
|
||||
}
|
||||
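// Worker entry point: creates its own llama instance, then waits for a "start" message carrying the measurement parameters.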
async function runTestWorkerLogic() {
|
||||
const gpuEnvVar = process.env.MEASURE_MODEL_CP_GPU;
|
||||
const llama = (gpuEnvVar == null || gpuEnvVar === "")
|
||||
? await getLlama("lastBuild", {
|
||||
logLevel: LlamaLogLevel.error
|
||||
})
|
||||
: await getLlama({
|
||||
gpu: JSON.parse(gpuEnvVar),
|
||||
logLevel: LlamaLogLevel.error
|
||||
});
|
||||
if (process.send == null)
|
||||
throw new Error("No IPC channel to parent process");
|
||||
function sendInfoBack(info) {
|
||||
if (process.send == null)
|
||||
process.exit(1);
|
||||
process.send(info);
|
||||
}
|
||||
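// For an already-loaded model, walks the descending context-size plan, creating and disposing a context at each size and reporting the VRAM/RAM delta that context added.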
async function testContextSizes({ model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false }) {
|
||||
let measurementsDone = 0;
|
||||
const contextSizeCheckPlan = getContextSizesCheckPlan(maxContextSize != null
|
||||
? Math.min(model.trainContextSize, maxContextSize)
|
||||
: model.trainContextSize, tests, minContextSize);
|
||||
let currentContextSizeCheck = startContextSize == null
|
||||
? -1
|
||||
: getNextItemInCheckContextSizesPlan(contextSizeCheckPlan, startContextSize);
|
||||
while (currentContextSizeCheck != null) {
|
||||
if (currentContextSizeCheck === -1)
|
||||
currentContextSizeCheck = null;
|
||||
try {
|
||||
const preContextVramUsage = (await llama.getVramState()).used;
|
||||
const preContextRamUsage = getMemoryUsage(llama);
|
||||
const context = await model.createContext({
|
||||
contextSize: currentContextSizeCheck ?? (maxContextSize != null
|
||||
? { max: maxContextSize }
|
||||
: undefined),
|
||||
ignoreMemorySafetyChecks: currentContextSizeCheck != null,
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
batchSize,
|
||||
failedCreationRemedy: false
|
||||
});
|
||||
if (evaluateText != null && evaluateText != "") {
|
||||
const sequence = context.getSequence();
|
||||
await sequence.evaluateWithoutGeneratingNewTokens(model.tokenize(evaluateText));
|
||||
}
|
||||
const postContextVramUsage = (await llama.getVramState()).used;
|
||||
const postContextRamUsage = getMemoryUsage(llama);
|
||||
measurementsDone++;
|
||||
sendInfoBack({
|
||||
type: "stats",
|
||||
gpuLayers: model.gpuLayers,
|
||||
modelVramUsage,
|
||||
modelRamUsage,
|
||||
contextSize: context.contextSize,
|
||||
contextVramUsage: postContextVramUsage - preContextVramUsage,
|
||||
contextRamUsage: postContextRamUsage - preContextRamUsage,
|
||||
contextStateSize: context.stateSize,
|
||||
totalVramUsage: postContextVramUsage,
|
||||
totalRamUsage: postContextRamUsage
|
||||
});
|
||||
currentContextSizeCheck = context.contextSize;
|
||||
await context.dispose();
|
||||
}
|
||||
catch (err) {
|
||||
sendInfoBack({
|
||||
type: "error",
|
||||
error: String(err),
|
||||
gpuLayers: model.gpuLayers,
|
||||
contextSize: currentContextSizeCheck == null
|
||||
? undefined
|
||||
: currentContextSizeCheck
|
||||
});
|
||||
if (currentContextSizeCheck == null) {
|
||||
currentContextSizeCheck = contextSizeCheckPlan[0];
|
||||
continue;
|
||||
}
|
||||
}
|
||||
currentContextSizeCheck = getNextItemInCheckContextSizesPlan(contextSizeCheckPlan, currentContextSizeCheck);
|
||||
if (exitAfterMeasurement)
|
||||
return measurementsDone;
|
||||
}
|
||||
return measurementsDone;
|
||||
}
|
||||
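// Loads the model with the given number of GPU layers, reports the model's own memory footprint, then measures context sizes on top of it.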
async function testWithGpuLayers({ modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false }) {
|
||||
try {
|
||||
const preModelVramUsage = (await llama.getVramState()).used;
|
||||
const preModelRamUsage = getMemoryUsage(llama);
|
||||
const model = await llama.loadModel({
|
||||
modelPath,
|
||||
useMmap,
|
||||
useDirectIo,
|
||||
gpuLayers,
|
||||
defaultContextFlashAttention: flashAttention,
|
||||
defaultContextSwaFullCache: swaFullCache,
|
||||
ignoreMemorySafetyChecks: true
|
||||
});
|
||||
const postModelVramUsage = (await llama.getVramState()).used;
|
||||
const postModelRamUsage = getMemoryUsage(llama);
|
||||
sendInfoBack({
|
||||
type: "stats",
|
||||
gpuLayers: model.gpuLayers,
|
||||
modelVramUsage: postModelVramUsage - preModelVramUsage,
|
||||
modelRamUsage: postModelRamUsage - preModelRamUsage,
|
||||
totalVramUsage: postModelVramUsage,
|
||||
totalRamUsage: postModelRamUsage
|
||||
});
|
||||
const measurementsDone = await testContextSizes({
|
||||
model,
|
||||
modelVramUsage: postModelVramUsage - preModelVramUsage,
|
||||
modelRamUsage: postModelRamUsage - preModelRamUsage,
|
||||
startContextSize,
|
||||
maxContextSize,
|
||||
minContextSize,
|
||||
flashAttention,
|
||||
swaFullCache,
|
||||
batchSize,
|
||||
tests,
|
||||
evaluateText,
|
||||
exitAfterMeasurement
|
||||
});
|
||||
await model.dispose();
|
||||
return measurementsDone;
|
||||
}
|
||||
catch (err) {
|
||||
sendInfoBack({
|
||||
type: "error",
|
||||
error: String(err),
|
||||
gpuLayers: gpuLayers
|
||||
});
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
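// The parent drives the worker over IPC: "start" runs the measurement loop from maxGpuLayers down to minGpuLayers, and "exit" disposes the llama instance and terminates the process.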
process.on("message", async (message) => {
|
||||
if (message.type === "start") {
|
||||
for (let gpuLayers = message.maxGpuLayers; gpuLayers >= (message.minGpuLayers ?? 0); gpuLayers--) {
|
||||
if (gpuLayers == message.maxGpuLayers && message.initialMaxContextSize != null) {
|
||||
const ggufInsights = await GgufInsights.from(await readGgufFileInfo(message.modelPath), llama);
|
||||
const contextSizeCheckPlan = getContextSizesCheckPlan(message.maxContextSize != null
|
||||
? Math.min(ggufInsights.trainContextSize ?? 4096, message.maxContextSize)
|
||||
: ggufInsights.trainContextSize ?? 4096, message.tests, message.minContextSize);
|
||||
const firstContextSizeCheck = getNextItemInCheckContextSizesPlan(contextSizeCheckPlan, message.initialMaxContextSize);
|
||||
if (firstContextSizeCheck == null)
|
||||
continue;
|
||||
}
|
||||
const measurementsDone = await testWithGpuLayers({
|
||||
modelPath: message.modelPath,
|
||||
useMmap: message.useMmap,
|
||||
useDirectIo: message.useDirectIo,
|
||||
gpuLayers,
|
||||
tests: message.tests,
|
||||
startContextSize: gpuLayers == message.maxGpuLayers
|
||||
? message.initialMaxContextSize
|
||||
: undefined,
|
||||
maxContextSize: message.maxContextSize,
|
||||
minContextSize: message.minContextSize,
|
||||
flashAttention: message.flashAttention,
|
||||
swaFullCache: message.swaFullCache,
|
||||
batchSize: message.batchSize,
|
||||
evaluateText: message.evaluateText,
|
||||
exitAfterMeasurement: message.exitAfterMeasurement
|
||||
});
|
||||
if (measurementsDone > 0 && message.exitAfterMeasurement) {
|
||||
sendInfoBack({ type: "exit" });
|
||||
return;
|
||||
}
|
||||
}
|
||||
sendInfoBack({ type: "done" });
|
||||
}
|
||||
else if (message.type === "exit") {
|
||||
await llama.dispose();
|
||||
process.exit(0);
|
||||
}
|
||||
});
|
||||
process.send({ type: "ready" });
|
||||
}
|
||||
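// Builds the list of context sizes to test: it starts from the small common sizes (256, 512, ...), then steps roughly evenly up toward the train context size, and returns the plan ordered from largest to smallest.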
function getContextSizesCheckPlan(trainContextSize, tests = 10, minContextSize) {
|
||||
const res = [];
|
||||
let shouldStop = false;
|
||||
const attemptToCoverSizes = [256, 512, 1024, 2048, 4096];
|
||||
function addSize(size) {
|
||||
if (size > trainContextSize) {
|
||||
size = trainContextSize;
|
||||
shouldStop = true;
|
||||
}
|
||||
if (size < 2)
|
||||
size = 2;
|
||||
size = padSafeContextSize(size, "up");
|
||||
if (res[res.length - 1] === size) {
|
||||
shouldStop = true;
|
||||
return;
|
||||
}
|
||||
res.push(size);
|
||||
}
|
||||
while (!shouldStop && res.length < tests) {
|
||||
const lastSize = res[res.length - 1];
|
||||
if (lastSize == null) {
|
||||
addSize(Math.max(minContextSize ?? 0, Math.min(attemptToCoverSizes[0], trainContextSize / tests)));
|
||||
continue;
|
||||
}
|
||||
const stepSizesLeft = Math.floor((trainContextSize - Math.min(lastSize, attemptToCoverSizes[attemptToCoverSizes.length - 1])) / (tests - res.length));
|
||||
let stopAddingAttemptedSizes = false;
|
||||
for (const size of attemptToCoverSizes) {
|
||||
if (stepSizesLeft > lastSize && lastSize < size && size <= trainContextSize) {
|
||||
addSize(size);
|
||||
stopAddingAttemptedSizes = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (stopAddingAttemptedSizes)
|
||||
continue;
|
||||
addSize(lastSize + stepSizesLeft);
|
||||
}
|
||||
return res.reverse();
|
||||
}
|
||||
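// The plan is ordered from largest to smallest, so the next size to test is the first planned size strictly below the current one (or null once the plan is exhausted).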
function getNextItemInCheckContextSizesPlan(plan, currentSize) {
|
||||
for (const size of plan) {
|
||||
if (size < currentSize)
|
||||
return size;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
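// Pads based on the visible (ANSI-stripped) length so colored values still line up in the table columns.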
function padStartAnsi(text, length, padChar = " ") {
|
||||
const textWithoutAnsi = stripAnsi(text);
|
||||
return padChar.repeat(Math.max(0, length - textWithoutAnsi.length)) + text;
|
||||
}
|
||||
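// Estimates RAM usage from the bindings' memory info, subtracting VRAM that appears to be backed by unified memory so it isn't counted twice.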
function getMemoryUsage(llama) {
|
||||
const totalMemoryUsage = llama._bindings.getMemoryInfo().total;
|
||||
const vramUsage = llama._bindings.getGpuVramInfo();
|
||||
let memoryUsage = totalMemoryUsage;
|
||||
const unifiedMemoryVramUsage = Math.min(vramUsage.unifiedSize, vramUsage.used);
|
||||
if (unifiedMemoryVramUsage <= memoryUsage)
|
||||
memoryUsage -= unifiedMemoryVramUsage;
|
||||
return memoryUsage;
|
||||
}
|
||||
//# sourceMappingURL=InspectMeasureCommand.js.map
|
||||
1
node_modules/node-llama-cpp/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
4
node_modules/node-llama-cpp/dist/cli/commands/source/SourceCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
import { CommandModule } from "yargs";
|
||||
type SourceCommand = {};
|
||||
export declare const SourceCommand: CommandModule<object, SourceCommand>;
|
||||
export {};
|
||||
19
node_modules/node-llama-cpp/dist/cli/commands/source/SourceCommand.js
generated
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
import { withCliCommandDescriptionDocsUrl } from "../../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
import { documentationPageUrls } from "../../../config.js";
|
||||
import { DownloadCommand } from "./commands/DownloadCommand.js";
|
||||
import { BuildCommand } from "./commands/BuildCommand.js";
|
||||
import { ClearCommand } from "./commands/ClearCommand.js";
|
||||
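// Parent "source" command that only groups the download/build/clear sub-commands (e.g. something like "node-llama-cpp source download"); its own handler is intentionally a no-op.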
export const SourceCommand = {
|
||||
command: "source <command>",
|
||||
describe: withCliCommandDescriptionDocsUrl("Manage `llama.cpp` source code", documentationPageUrls.CLI.Source.index),
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.command(DownloadCommand)
|
||||
.command(BuildCommand)
|
||||
.command(ClearCommand);
|
||||
},
|
||||
async handler() {
|
||||
// this function must exist, even though we do nothing here
|
||||
}
|
||||
};
|
||||
//# sourceMappingURL=SourceCommand.js.map
|
||||
1
node_modules/node-llama-cpp/dist/cli/commands/source/SourceCommand.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"SourceCommand.js","sourceRoot":"","sources":["../../../../src/cli/commands/source/SourceCommand.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,gCAAgC,EAAC,MAAM,iDAAiD,CAAC;AACjG,OAAO,EAAC,qBAAqB,EAAC,MAAM,oBAAoB,CAAC;AACzD,OAAO,EAAC,eAAe,EAAC,MAAM,+BAA+B,CAAC;AAC9D,OAAO,EAAC,YAAY,EAAC,MAAM,4BAA4B,CAAC;AACxD,OAAO,EAAC,YAAY,EAAC,MAAM,4BAA4B,CAAC;AAMxD,MAAM,CAAC,MAAM,aAAa,GAAyC;IAC/D,OAAO,EAAE,kBAAkB;IAC3B,QAAQ,EAAE,gCAAgC,CACtC,gCAAgC,EAChC,qBAAqB,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CACzC;IACD,OAAO,CAAC,KAAK;QACT,OAAO,KAAK;aACP,OAAO,CAAC,eAAe,CAAC;aACxB,OAAO,CAAC,YAAY,CAAC;aACrB,OAAO,CAAC,YAAY,CAAC,CAAC;IAC/B,CAAC;IACD,KAAK,CAAC,OAAO;QACT,2DAA2D;IAC/D,CAAC;CACJ,CAAC"}
|
||||
16
node_modules/node-llama-cpp/dist/cli/commands/source/commands/BuildCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
import process from "process";
|
||||
import { CommandModule } from "yargs";
|
||||
import { BuildGpu } from "../../../../bindings/types.js";
|
||||
type BuildCommand = {
|
||||
arch?: typeof process.arch;
|
||||
nodeTarget?: string;
|
||||
gpu?: BuildGpu | "auto";
|
||||
noUsageExample?: boolean;
|
||||
};
|
||||
export declare const BuildCommand: CommandModule<object, BuildCommand>;
|
||||
export declare function BuildLlamaCppCommand({ arch, nodeTarget, gpu, noUsageExample,
|
||||
/** @internal */
|
||||
noCustomCmakeBuildOptionsInBinaryFolderName,
|
||||
/** @internal */
|
||||
ciMode }: BuildCommand): Promise<void>;
|
||||
export {};
|
||||
148
node_modules/node-llama-cpp/dist/cli/commands/source/commands/BuildCommand.js
generated
vendored
Normal file
@@ -0,0 +1,148 @@
|
||||
import process from "process";
|
||||
import chalk from "chalk";
|
||||
import { compileLlamaCpp } from "../../../../bindings/utils/compileLLamaCpp.js";
|
||||
import withOra from "../../../../utils/withOra.js";
|
||||
import { clearTempFolder } from "../../../../utils/clearTempFolder.js";
|
||||
import { builtinLlamaCppGitHubRepo, builtinLlamaCppRelease, isCI, defaultLlamaCppGpuSupport, documentationPageUrls } from "../../../../config.js";
|
||||
import { downloadCmakeIfNeeded } from "../../../../utils/cmake.js";
|
||||
import withStatusLogs from "../../../../utils/withStatusLogs.js";
|
||||
import { logBinaryUsageExampleToConsole } from "../../../../bindings/utils/logBinaryUsageExampleToConsole.js";
|
||||
import { getPlatform } from "../../../../bindings/utils/getPlatform.js";
|
||||
import { resolveCustomCmakeOptions } from "../../../../bindings/utils/resolveCustomCmakeOptions.js";
|
||||
import { getClonedLlamaCppRepoReleaseInfo, isLlamaCppRepoCloned } from "../../../../bindings/utils/cloneLlamaCppRepo.js";
|
||||
import { nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption } from "../../../../bindings/types.js";
|
||||
import { logUsedGpuTypeOption } from "../../../utils/logUsedGpuTypeOption.js";
|
||||
import { getGpuTypesToUseForOption } from "../../../../bindings/utils/getGpuTypesToUseForOption.js";
|
||||
import { getConsoleLogPrefix } from "../../../../utils/getConsoleLogPrefix.js";
|
||||
import { getPrettyBuildGpuName } from "../../../../bindings/consts.js";
|
||||
import { getPlatformInfo } from "../../../../bindings/utils/getPlatformInfo.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
export const BuildCommand = {
|
||||
command: "build",
|
||||
aliases: ["compile"],
|
||||
describe: withCliCommandDescriptionDocsUrl("Compile the currently downloaded `llama.cpp` source code", documentationPageUrls.CLI.Source.Build),
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.option("arch", {
|
||||
alias: "a",
|
||||
type: "string",
|
||||
coerce: (value) => value,
|
||||
description: "The architecture to compile llama.cpp for"
|
||||
})
|
||||
.option("nodeTarget", {
|
||||
alias: "t",
|
||||
type: "string",
|
||||
description: "The Node.js version to compile llama.cpp for. Example: `v18.0.0`"
|
||||
})
|
||||
.option("gpu", {
|
||||
type: "string",
|
||||
default: defaultLlamaCppGpuSupport,
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: nodeLlamaCppGpuOptions,
|
||||
coerce: parseNodeLlamaCppGpuOption,
|
||||
description: "Compute layer implementation type to use for llama.cpp"
|
||||
})
|
||||
.option("noUsageExample", {
|
||||
alias: "nu",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Don't print code usage example after building"
|
||||
})
|
||||
.option("noCustomCmakeBuildOptionsInBinaryFolderName", {
|
||||
type: "boolean",
|
||||
hidden: true, // this is only for the CI to use
|
||||
default: false,
|
||||
description: "Don't include custom CMake build options in build folder name"
|
||||
})
|
||||
.option("ciMode", {
|
||||
type: "boolean",
|
||||
hidden: true, // this is only for the CI to use
|
||||
default: false,
|
||||
description: "Enable CI only build options"
|
||||
});
|
||||
},
|
||||
handler: BuildLlamaCppCommand
|
||||
};
|
||||
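// Tries each GPU type resolved from the requested gpu option in order; if compiling llama.cpp for one fails, it logs the error and falls back to the next type, stopping after the first successful build.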
export async function BuildLlamaCppCommand({ arch = undefined, nodeTarget = undefined, gpu = defaultLlamaCppGpuSupport, noUsageExample = false,
|
||||
/** @internal */
|
||||
noCustomCmakeBuildOptionsInBinaryFolderName = false,
|
||||
/** @internal */
|
||||
ciMode = false }) {
|
||||
if (!(await isLlamaCppRepoCloned())) {
|
||||
console.log(chalk.red('llama.cpp is not downloaded. Please run "node-llama-cpp source download" first'));
|
||||
process.exit(1);
|
||||
}
|
||||
const includeBuildOptionsInBinaryFolderName = !noCustomCmakeBuildOptionsInBinaryFolderName || !isCI;
|
||||
const clonedLlamaCppRepoReleaseInfo = await getClonedLlamaCppRepoReleaseInfo();
|
||||
const platform = getPlatform();
|
||||
const platformInfo = await getPlatformInfo();
|
||||
const customCmakeOptions = resolveCustomCmakeOptions();
|
||||
const buildGpusToTry = await getGpuTypesToUseForOption(gpu, { platform, arch });
|
||||
let downloadedCmake = false;
|
||||
for (let i = 0; i < buildGpusToTry.length; i++) {
|
||||
const gpuToTry = buildGpusToTry[i];
|
||||
const isLastItem = i === buildGpusToTry.length - 1;
|
||||
if (gpuToTry == null)
|
||||
continue;
|
||||
logUsedGpuTypeOption(gpuToTry);
|
||||
if (!downloadedCmake) {
|
||||
await downloadCmakeIfNeeded(true);
|
||||
downloadedCmake = true;
|
||||
}
|
||||
const buildOptions = {
|
||||
customCmakeOptions,
|
||||
progressLogs: true,
|
||||
platform,
|
||||
platformInfo,
|
||||
arch: arch
|
||||
? arch
|
||||
: process.arch,
|
||||
gpu: gpuToTry,
|
||||
llamaCpp: {
|
||||
repo: clonedLlamaCppRepoReleaseInfo?.llamaCppGithubRepo ?? builtinLlamaCppGitHubRepo,
|
||||
release: clonedLlamaCppRepoReleaseInfo?.tag ?? builtinLlamaCppRelease
|
||||
}
|
||||
};
|
||||
try {
|
||||
await withStatusLogs({
|
||||
loading: chalk.blue("Compiling llama.cpp"),
|
||||
success: chalk.blue("Compiled llama.cpp"),
|
||||
fail: chalk.blue("Failed to compile llama.cpp")
|
||||
}, async () => {
|
||||
await compileLlamaCpp(buildOptions, {
|
||||
nodeTarget: nodeTarget ? nodeTarget : undefined,
|
||||
updateLastBuildInfo: true,
|
||||
downloadCmakeIfNeeded: false,
|
||||
ensureLlamaCppRepoIsCloned: false,
|
||||
includeBuildOptionsInBinaryFolderName,
|
||||
ciMode: isCI && ciMode
|
||||
});
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
console.error(getConsoleLogPrefix() +
|
||||
`Failed to build llama.cpp with ${getPrettyBuildGpuName(gpuToTry)} support. ` +
|
||||
(!isLastItem
|
||||
? `Falling back to building llama.cpp with ${getPrettyBuildGpuName(buildGpusToTry[i + 1])} support. `
|
||||
: "") +
|
||||
"Error:", err);
|
||||
if (isLastItem)
|
||||
throw err;
|
||||
continue;
|
||||
}
|
||||
await withOra({
|
||||
loading: chalk.blue("Removing temporary files"),
|
||||
success: chalk.blue("Removed temporary files"),
|
||||
fail: chalk.blue("Failed to remove temporary files")
|
||||
}, async () => {
|
||||
await clearTempFolder();
|
||||
});
|
||||
if (!noUsageExample) {
|
||||
console.log();
|
||||
logBinaryUsageExampleToConsole(buildOptions, gpu !== "auto", true);
|
||||
console.log();
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=BuildCommand.js.map
|
||||
1
node_modules/node-llama-cpp/dist/cli/commands/source/commands/BuildCommand.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"BuildCommand.js","sourceRoot":"","sources":["../../../../../src/cli/commands/source/commands/BuildCommand.ts"],"names":[],"mappings":"AAAA,OAAO,OAAO,MAAM,SAAS,CAAC;AAE9B,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAC,eAAe,EAAC,MAAM,+CAA+C,CAAC;AAC9E,OAAO,OAAO,MAAM,8BAA8B,CAAC;AACnD,OAAO,EAAC,eAAe,EAAC,MAAM,sCAAsC,CAAC;AACrE,OAAO,EAAC,yBAAyB,EAAE,sBAAsB,EAAE,IAAI,EAAE,yBAAyB,EAAE,qBAAqB,EAAC,MAAM,uBAAuB,CAAC;AAChJ,OAAO,EAAC,qBAAqB,EAAC,MAAM,4BAA4B,CAAC;AACjE,OAAO,cAAc,MAAM,qCAAqC,CAAC;AACjE,OAAO,EAAC,8BAA8B,EAAC,MAAM,8DAA8D,CAAC;AAC5G,OAAO,EAAC,WAAW,EAAC,MAAM,2CAA2C,CAAC;AACtE,OAAO,EAAC,yBAAyB,EAAC,MAAM,yDAAyD,CAAC;AAClG,OAAO,EAAC,gCAAgC,EAAE,oBAAoB,EAAC,MAAM,iDAAiD,CAAC;AACvH,OAAO,EAAyB,sBAAsB,EAAE,0BAA0B,EAAC,MAAM,+BAA+B,CAAC;AACzH,OAAO,EAAC,oBAAoB,EAAC,MAAM,wCAAwC,CAAC;AAC5E,OAAO,EAAC,yBAAyB,EAAC,MAAM,yDAAyD,CAAC;AAClG,OAAO,EAAC,mBAAmB,EAAC,MAAM,0CAA0C,CAAC;AAC7E,OAAO,EAAC,qBAAqB,EAAC,MAAM,gCAAgC,CAAC;AACrE,OAAO,EAAC,eAAe,EAAC,MAAM,+CAA+C,CAAC;AAC9E,OAAO,EAAC,gCAAgC,EAAC,MAAM,oDAAoD,CAAC;AAepG,MAAM,CAAC,MAAM,YAAY,GAAwC;IAC7D,OAAO,EAAE,OAAO;IAChB,OAAO,EAAE,CAAC,SAAS,CAAC;IACpB,QAAQ,EAAE,gCAAgC,CACtC,0DAA0D,EAC1D,qBAAqB,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CACzC;IACD,OAAO,CAAC,KAAK;QACT,OAAO,KAAK;aACP,MAAM,CAAC,MAAM,EAAE;YACZ,KAAK,EAAE,GAAG;YACV,IAAI,EAAE,QAAQ;YACd,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK;YACxB,WAAW,EAAE,2CAA2C;SAC3D,CAAC;aACD,MAAM,CAAC,YAAY,EAAE;YAClB,KAAK,EAAE,GAAG;YACV,IAAI,EAAE,QAAQ;YACd,WAAW,EAAE,kEAAkE;SAClF,CAAC;aACD,MAAM,CAAC,KAAK,EAAE;YACX,IAAI,EAAE,QAAQ;YACd,OAAO,EAAE,yBAAyB;YAElC,2FAA2F;YAC3F,OAAO,EAAE,sBAAwF;YACjG,MAAM,EAAE,0BAA0B;YAClC,WAAW,EAAE,wDAAwD;SACxE,CAAC;aACD,MAAM,CAAC,gBAAgB,EAAE;YACtB,KAAK,EAAE,IAAI;YACX,IAAI,EAAE,SAAS;YACf,OAAO,EAAE,KAAK;YACd,WAAW,EAAE,+CAA+C;SAC/D,CAAC;aACD,MAAM,CAAC,6CAA6C,EAAE;YACnD,IAAI,EAAE,SAAS;YACf,MAAM,EAAE,IAAI,EAAE,iCAAiC;YAC/C,OAAO,EAAE,KAAK;YACd,WAAW,EAAE,+DAA+D;SAC/E,CAAC;aACD,MAAM,CAAC,QAAQ,EAAE;YACd,IAAI,EAAE,SAAS;YACf,MAAM,EAAE,IAAI,EAAE,iCAAiC;YAC/C,OAAO,EAAE,KAAK;YACd,WAAW,EAAE,8BAA8B;SAC9C,CAAC,CAAC;IACX,CAAC;IACD,OAAO,EAAE,oBAAoB;CAChC,CAAC;AAEF,MAAM,CAAC,KAAK,UAAU,oBAAoB,CAAC,EACvC,IAAI,GAAG,SAAS,EAChB,UAAU,GAAG,SAAS,EACtB,GAAG,GAAG,yBAAyB,EAC/B,cAAc,GAAG,KAAK;AAEtB,gBAAgB;AAChB,2CAA2C,GAAG,KAAK;AAEnD,gBAAgB;AAChB,MAAM,GAAG,KAAK,EACH;IACX,IAAI,CAAC,CAAC,MAAM,oBAAoB,EAAE,CAAC,EAAE,CAAC;QAClC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,gFAAgF,CAAC,CAAC,CAAC;QACzG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED,MAAM,qCAAqC,GAAG,CAAC,2CAA2C,IAAI,CAAC,IAAI,CAAC;IAEpG,MAAM,6BAA6B,GAAG,MAAM,gCAAgC,EAAE,CAAC;IAE/E,MAAM,QAAQ,GAAG,WAAW,EAAE,CAAC;IAC/B,MAAM,YAAY,GAAG,MAAM,eAAe,EAAE,CAAC;IAC7C,MAAM,kBAAkB,GAAG,yBAAyB,EAAE,CAAC;IACvD,MAAM,cAAc,GAAe,MAAM,yBAAyB,CAAC,GAAG,EAAE,EAAC,QAAQ,EAAE,IAAI,EAAC,CAAC,CAAC;IAC1F,IAAI,eAAe,GAAG,KAAK,CAAC;IAE5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7C,MAAM,QAAQ,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC;QACnC,MAAM,UAAU,GAAG,CAAC,KAAK,cAAc,CAAC,MAAM,GAAG,CAAC,CAAC;QAEnD,IAAI,QAAQ,IAAI,IAAI;YAChB,SAAS;QAEb,oBAAoB,CAAC,QAAQ,CAAC,CAAC;QAE/B,IAAI,CAAC,eAAe,EAAE,CAAC;YACnB,MAAM,qBAAqB,CAAC,IAAI,CAAC,CAAC;YAClC,eAAe,GAAG,IAAI,CAAC;QAC3B,CAAC;QAED,MAAM,YAAY,GAAiB;YAC/B,kBAAkB;YAClB,YAAY,EAAE,IAAI;YAClB,QAAQ;YACR,YAAY;YACZ,IAAI,EAAE,IAAI;gBACN,CAAC,CAAC,IAA2B;gBAC7B,CAAC,CAAC,OAAO,CAAC,IAAI;YAClB,GAAG,EAAE,QAAQ;YACb,QAAQ,EAAE;gBACN,IAAI,EAAE,6BAA6B,EAAE,kBAAkB,IAAI,yBAAyB;gBACpF,OAAO,EAAE,6BAA6B,EAAE,GAAG,IAAI,sBAAsB;aACxE;SACJ,CAAC;QAEF,IAAI,CAAC;YACD,MAAM,cAAc,CAAC;gBACjB,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,qBAAqB,CAAC;gBAC1C,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,oBAAoB,CAA
C;gBACzC,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,6BAA6B,CAAC;aAClD,EAAE,KAAK,IAAI,EAAE;gBACV,MAAM,eAAe,CAAC,YAAY,EAAE;oBAChC,UAAU,EAAE,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS;oBAC/C,mBAAmB,EAAE,IAAI;oBACzB,qBAAqB,EAAE,KAAK;oBAC5B,0BAA0B,EAAE,KAAK;oBACjC,qCAAqC;oBACrC,MAAM,EAAE,IAAI,IAAI,MAAM;iBACzB,CAAC,CAAC;YACP,CAAC,CAAC,CAAC;QACP,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CACT,mBAAmB,EAAE;gBACrB,kCAAkC,qBAAqB,CAAC,QAAQ,CAAC,YAAY;gBAC7E,CACI,CAAC,UAAU;oBACP,CAAC,CAAC,2CAA2C,qBAAqB,CAAC,cAAc,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,YAAY;oBACrG,CAAC,CAAC,EAAE,CACX;gBACD,QAAQ,EACR,GAAG,CACN,CAAC;YAEF,IAAI,UAAU;gBACV,MAAM,GAAG,CAAC;YAEd,SAAS;QACb,CAAC;QAED,MAAM,OAAO,CAAC;YACV,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,0BAA0B,CAAC;YAC/C,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,yBAAyB,CAAC;YAC9C,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,kCAAkC,CAAC;SACvD,EAAE,KAAK,IAAI,EAAE;YACV,MAAM,eAAe,EAAE,CAAC;QAC5B,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,cAAc,EAAE,CAAC;YAClB,OAAO,CAAC,GAAG,EAAE,CAAC;YACd,8BAA8B,CAAC,YAAY,EAAE,GAAG,KAAK,MAAM,EAAE,IAAI,CAAC,CAAC;YACnE,OAAO,CAAC,GAAG,EAAE,CAAC;QAClB,CAAC;QAED,MAAM;IACV,CAAC;AACL,CAAC"}
|
||||
7
node_modules/node-llama-cpp/dist/cli/commands/source/commands/ClearCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
import { CommandModule } from "yargs";
|
||||
type ClearCommand = {
|
||||
type: "source" | "builds" | "cmake" | "all";
|
||||
};
|
||||
export declare const ClearCommand: CommandModule<object, ClearCommand>;
|
||||
export declare function ClearLlamaCppBuildCommand({ type }: ClearCommand): Promise<void>;
|
||||
export {};
|
||||
54
node_modules/node-llama-cpp/dist/cli/commands/source/commands/ClearCommand.js
generated
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
import fs from "fs-extra";
|
||||
import chalk from "chalk";
|
||||
import { documentationPageUrls, llamaCppDirectory, llamaCppDirectoryInfoFilePath } from "../../../../config.js";
|
||||
import withOra from "../../../../utils/withOra.js";
|
||||
import { clearAllLocalBuilds } from "../../../../bindings/utils/clearAllLocalBuilds.js";
|
||||
import { clearLocalCmake, fixXpackPermissions } from "../../../../utils/cmake.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
export const ClearCommand = {
|
||||
command: "clear [type]",
|
||||
aliases: ["clean"],
|
||||
describe: withCliCommandDescriptionDocsUrl("Clear files created by `node-llama-cpp`", documentationPageUrls.CLI.Source.Clear),
|
||||
builder(yargs) {
|
||||
return yargs
|
||||
.option("type", {
|
||||
type: "string",
|
||||
choices: ["source", "builds", "cmake", "all"],
|
||||
default: "all",
|
||||
description: "Files to clear"
|
||||
});
|
||||
},
|
||||
handler: ClearLlamaCppBuildCommand
|
||||
};
|
||||
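// Removes the selected artifacts: the cloned llama.cpp source, the locally compiled builds, and/or the internally downloaded CMake, depending on the chosen type ("all" clears everything).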
export async function ClearLlamaCppBuildCommand({ type }) {
|
||||
if (type === "source" || type === "all") {
|
||||
await withOra({
|
||||
loading: chalk.blue("Clearing source"),
|
||||
success: chalk.blue("Cleared source"),
|
||||
fail: chalk.blue("Failed to clear source")
|
||||
}, async () => {
|
||||
await fs.remove(llamaCppDirectory);
|
||||
await fs.remove(llamaCppDirectoryInfoFilePath);
|
||||
});
|
||||
}
|
||||
if (type === "builds" || type === "all") {
|
||||
await withOra({
|
||||
loading: chalk.blue("Clearing all builds"),
|
||||
success: chalk.blue("Cleared all builds"),
|
||||
fail: chalk.blue("Failed to clear all builds")
|
||||
}, async () => {
|
||||
await clearAllLocalBuilds();
|
||||
});
|
||||
}
|
||||
if (type === "cmake" || type === "all") {
|
||||
await withOra({
|
||||
loading: chalk.blue("Clearing internal cmake"),
|
||||
success: chalk.blue("Cleared internal cmake"),
|
||||
fail: chalk.blue("Failed to clear internal cmake")
|
||||
}, async () => {
|
||||
await fixXpackPermissions();
|
||||
await clearLocalCmake();
|
||||
});
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=ClearCommand.js.map
|
||||
1
node_modules/node-llama-cpp/dist/cli/commands/source/commands/ClearCommand.js.map
generated
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"ClearCommand.js","sourceRoot":"","sources":["../../../../../src/cli/commands/source/commands/ClearCommand.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,UAAU,CAAC;AAC1B,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAC,qBAAqB,EAAE,iBAAiB,EAAE,6BAA6B,EAAC,MAAM,uBAAuB,CAAC;AAC9G,OAAO,OAAO,MAAM,8BAA8B,CAAC;AACnD,OAAO,EAAC,mBAAmB,EAAC,MAAM,mDAAmD,CAAC;AACtF,OAAO,EAAC,eAAe,EAAE,mBAAmB,EAAC,MAAM,4BAA4B,CAAC;AAChF,OAAO,EAAC,gCAAgC,EAAC,MAAM,oDAAoD,CAAC;AAMpG,MAAM,CAAC,MAAM,YAAY,GAAwC;IAC7D,OAAO,EAAE,cAAc;IACvB,OAAO,EAAE,CAAC,OAAO,CAAC;IAClB,QAAQ,EAAE,gCAAgC,CACtC,yCAAyC,EACzC,qBAAqB,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CACzC;IACD,OAAO,CAAC,KAAK;QACT,OAAO,KAAK;aACP,MAAM,CAAC,MAAM,EAAE;YACZ,IAAI,EAAE,QAAQ;YACd,OAAO,EAAE,CAAC,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,CAAkC;YAC9E,OAAO,EAAE,KAA6B;YACtC,WAAW,EAAE,gBAAgB;SAChC,CAAC,CAAC;IACX,CAAC;IACD,OAAO,EAAE,yBAAyB;CACrC,CAAC;AAEF,MAAM,CAAC,KAAK,UAAU,yBAAyB,CAAC,EAAC,IAAI,EAAe;IAChE,IAAI,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,KAAK,EAAE,CAAC;QACtC,MAAM,OAAO,CAAC;YACV,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC;YACtC,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC;YACrC,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,wBAAwB,CAAC;SAC7C,EAAE,KAAK,IAAI,EAAE;YACV,MAAM,EAAE,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC;YACnC,MAAM,EAAE,CAAC,MAAM,CAAC,6BAA6B,CAAC,CAAC;QACnD,CAAC,CAAC,CAAC;IACP,CAAC;IAED,IAAI,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,KAAK,EAAE,CAAC;QACtC,MAAM,OAAO,CAAC;YACV,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,qBAAqB,CAAC;YAC1C,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,oBAAoB,CAAC;YACzC,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,4BAA4B,CAAC;SACjD,EAAE,KAAK,IAAI,EAAE;YACV,MAAM,mBAAmB,EAAE,CAAC;QAChC,CAAC,CAAC,CAAC;IACP,CAAC;IAED,IAAI,IAAI,KAAK,OAAO,IAAI,IAAI,KAAK,KAAK,EAAE,CAAC;QACrC,MAAM,OAAO,CAAC;YACV,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,yBAAyB,CAAC;YAC9C,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,wBAAwB,CAAC;YAC7C,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,gCAAgC,CAAC;SACrD,EAAE,KAAK,IAAI,EAAE;YACV,MAAM,mBAAmB,EAAE,CAAC;YAC5B,MAAM,eAAe,EAAE,CAAC;QAC5B,CAAC,CAAC,CAAC;IACP,CAAC;AACL,CAAC"}
|
||||
16
node_modules/node-llama-cpp/dist/cli/commands/source/commands/DownloadCommand.d.ts
generated
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
import process from "process";
|
||||
import { CommandModule } from "yargs";
|
||||
import { BuildGpu } from "../../../../bindings/types.js";
|
||||
type DownloadCommandArgs = {
|
||||
repo?: string;
|
||||
release?: "latest" | string;
|
||||
arch?: typeof process.arch;
|
||||
nodeTarget?: string;
|
||||
gpu?: BuildGpu | "auto";
|
||||
skipBuild?: boolean;
|
||||
noBundle?: boolean;
|
||||
noUsageExample?: boolean;
|
||||
};
|
||||
export declare const DownloadCommand: CommandModule<object, DownloadCommandArgs>;
|
||||
export declare function DownloadLlamaCppCommand(args: DownloadCommandArgs): Promise<void>;
|
||||
export {};
|
||||
219
node_modules/node-llama-cpp/dist/cli/commands/source/commands/DownloadCommand.js
generated
vendored
Normal file
@@ -0,0 +1,219 @@
|
||||
import process from "process";
|
||||
import fs from "fs-extra";
|
||||
import chalk from "chalk";
|
||||
import { defaultLlamaCppGitHubRepo, defaultLlamaCppRelease, isCI, llamaCppDirectory, llamaCppDirectoryInfoFilePath, defaultLlamaCppGpuSupport, documentationPageUrls } from "../../../../config.js";
|
||||
import { compileLlamaCpp } from "../../../../bindings/utils/compileLLamaCpp.js";
|
||||
import withOra from "../../../../utils/withOra.js";
|
||||
import { clearTempFolder } from "../../../../utils/clearTempFolder.js";
|
||||
import { setBinariesGithubRelease } from "../../../../bindings/utils/binariesGithubRelease.js";
|
||||
import { downloadCmakeIfNeeded } from "../../../../utils/cmake.js";
|
||||
import withStatusLogs from "../../../../utils/withStatusLogs.js";
|
||||
import { getIsInDocumentationMode } from "../../../../state.js";
|
||||
import { getGitBundlePathForRelease, unshallowAndSquashCurrentRepoAndSaveItAsReleaseBundle } from "../../../../utils/gitReleaseBundles.js";
|
||||
import { cloneLlamaCppRepo } from "../../../../bindings/utils/cloneLlamaCppRepo.js";
|
||||
import { getPlatform } from "../../../../bindings/utils/getPlatform.js";
|
||||
import { resolveCustomCmakeOptions } from "../../../../bindings/utils/resolveCustomCmakeOptions.js";
|
||||
import { logBinaryUsageExampleToConsole } from "../../../../bindings/utils/logBinaryUsageExampleToConsole.js";
|
||||
import { resolveGithubRelease } from "../../../../utils/resolveGithubRelease.js";
|
||||
import { nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption } from "../../../../bindings/types.js";
|
||||
import { logUsedGpuTypeOption } from "../../../utils/logUsedGpuTypeOption.js";
|
||||
import { getGpuTypesToUseForOption } from "../../../../bindings/utils/getGpuTypesToUseForOption.js";
|
||||
import { getConsoleLogPrefix } from "../../../../utils/getConsoleLogPrefix.js";
|
||||
import { getPrettyBuildGpuName } from "../../../../bindings/consts.js";
|
||||
import { getPlatformInfo } from "../../../../bindings/utils/getPlatformInfo.js";
|
||||
import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
|
||||
export const DownloadCommand = {
|
||||
command: "download",
|
||||
describe: withCliCommandDescriptionDocsUrl("Download a release of `llama.cpp` and compile it", documentationPageUrls.CLI.Source.Download),
|
||||
builder(yargs) {
|
||||
const isInDocumentationMode = getIsInDocumentationMode();
|
||||
return yargs
|
||||
.option("repo", {
|
||||
type: "string",
|
||||
default: defaultLlamaCppGitHubRepo,
|
||||
description: "The GitHub repository to download a release of llama.cpp from. Can also be set via the `NODE_LLAMA_CPP_REPO` environment variable"
|
||||
})
|
||||
.option("release", {
|
||||
type: "string",
|
||||
default: isInDocumentationMode ? "<current build>" : defaultLlamaCppRelease,
|
||||
description: "The tag of the llama.cpp release to download. Set to `latest` to download the latest release. Can also be set via the `NODE_LLAMA_CPP_REPO_RELEASE` environment variable"
|
||||
})
|
||||
.option("arch", {
|
||||
alias: "a",
|
||||
type: "string",
|
||||
coerce: (value) => value,
|
||||
description: "The architecture to compile llama.cpp for"
|
||||
})
|
||||
.option("nodeTarget", {
|
||||
alias: "t",
|
||||
type: "string",
|
||||
description: "The Node.js version to compile llama.cpp for. Example: `v18.0.0`"
|
||||
})
|
||||
.option("gpu", {
|
||||
type: "string",
|
||||
default: defaultLlamaCppGpuSupport,
|
||||
// yargs types don't support passing `false` as a choice, although it is supported by yargs
|
||||
choices: nodeLlamaCppGpuOptions,
|
||||
coerce: parseNodeLlamaCppGpuOption,
|
||||
description: "Compute layer implementation type to use for llama.cpp"
|
||||
})
|
||||
.option("skipBuild", {
|
||||
alias: "sb",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Skip building llama.cpp after downloading it"
|
||||
})
|
||||
.option("noBundle", {
|
||||
alias: "nb",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Download a llama.cpp release only from GitHub, even if a local git bundle exists for the release"
|
||||
})
|
||||
.option("noUsageExample", {
|
||||
alias: "nu",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
description: "Don't print code usage example after building"
|
||||
})
|
||||
.option("updateBinariesReleaseMetadataAndSaveGitBundle", {
|
||||
type: "boolean",
|
||||
hidden: true, // this is only for the CI to use
|
||||
default: false,
|
||||
description: "Update the binariesGithubRelease.json file with the release of llama.cpp that was downloaded"
|
||||
});
|
||||
},
|
||||
handler: DownloadLlamaCppCommand
|
||||
};
|
||||
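// Resolves the requested llama.cpp release (preferring a local git bundle unless the noBundle option is set), re-clones the repository into the module's llama.cpp directory, and then compiles it with GPU-type fallback unless skipBuild was passed.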
export async function DownloadLlamaCppCommand(args) {
|
||||
const { repo = defaultLlamaCppGitHubRepo, release = defaultLlamaCppRelease, arch = undefined, nodeTarget = undefined, gpu = defaultLlamaCppGpuSupport, skipBuild = false, noBundle = false, noUsageExample = false, updateBinariesReleaseMetadataAndSaveGitBundle = false } = args;
|
||||
const useBundle = noBundle != true;
|
||||
const platform = getPlatform();
|
||||
const platformInfo = await getPlatformInfo();
|
||||
const customCmakeOptions = resolveCustomCmakeOptions();
|
||||
const buildGpusToTry = skipBuild
|
||||
? []
|
||||
: await getGpuTypesToUseForOption(gpu, { platform, arch });
|
||||
const [githubOwner, githubRepo] = repo.split("/");
|
||||
if (githubOwner == null || githubRepo == null)
|
||||
throw new Error(`Invalid GitHub repository: ${repo}`);
|
||||
let downloadedCmake = false;
|
||||
console.log(`${chalk.yellow("Repo:")} ${repo}`);
|
||||
console.log(`${chalk.yellow("Release:")} ${release}`);
|
||||
if (!skipBuild) {
|
||||
logUsedGpuTypeOption(buildGpusToTry[0]);
|
||||
}
|
||||
console.log();
|
||||
let githubReleaseTag = (useBundle && (await getGitBundlePathForRelease(githubOwner, githubRepo, release)) != null)
|
||||
? release
|
||||
: null;
|
||||
if (githubReleaseTag == null)
|
||||
await withOra({
|
||||
loading: chalk.blue("Fetching llama.cpp info"),
|
||||
success: chalk.blue("Fetched llama.cpp info"),
|
||||
fail: chalk.blue("Failed to fetch llama.cpp info")
|
||||
}, async () => {
|
||||
githubReleaseTag = await resolveGithubRelease(githubOwner, githubRepo, release);
|
||||
});
|
||||
await clearTempFolder();
|
||||
await withOra({
|
||||
loading: chalk.blue("Removing existing llama.cpp directory"),
|
||||
success: chalk.blue("Removed existing llama.cpp directory"),
|
||||
fail: chalk.blue("Failed to remove existing llama.cpp directory")
|
||||
}, async () => {
|
||||
await fs.remove(llamaCppDirectory);
|
||||
await fs.remove(llamaCppDirectoryInfoFilePath);
|
||||
});
|
||||
await cloneLlamaCppRepo(githubOwner, githubRepo, githubReleaseTag, useBundle);
|
||||
if (!skipBuild) {
|
||||
for (let i = 0; i < buildGpusToTry.length; i++) {
|
||||
const gpuToTry = buildGpusToTry[i];
|
||||
const isLastItem = i === buildGpusToTry.length - 1;
|
||||
if (gpuToTry == null)
|
||||
continue;
|
||||
if (i > 0) // we already logged the first gpu before
|
||||
logUsedGpuTypeOption(gpuToTry);
|
||||
if (!downloadedCmake) {
|
||||
await downloadCmakeIfNeeded(true);
|
||||
downloadedCmake = true;
|
||||
}
|
||||
const buildOptions = {
|
||||
customCmakeOptions,
|
||||
progressLogs: true,
|
||||
platform,
|
||||
platformInfo,
|
||||
arch: arch
|
||||
? arch
|
||||
: process.arch,
|
||||
gpu: gpuToTry,
|
||||
llamaCpp: {
|
||||
repo,
|
||||
release: githubReleaseTag
|
||||
}
|
||||
};
|
||||
try {
|
||||
await withStatusLogs({
|
||||
loading: chalk.blue("Compiling llama.cpp"),
|
||||
success: chalk.blue("Compiled llama.cpp"),
|
||||
fail: chalk.blue("Failed to compile llama.cpp")
|
||||
}, async () => {
|
||||
await compileLlamaCpp(buildOptions, {
|
||||
nodeTarget: nodeTarget ? nodeTarget : undefined,
|
||||
updateLastBuildInfo: true,
|
||||
downloadCmakeIfNeeded: false,
|
||||
ensureLlamaCppRepoIsCloned: false,
|
||||
includeBuildOptionsInBinaryFolderName: true
|
||||
});
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
console.error(getConsoleLogPrefix() +
|
||||
`Failed to build llama.cpp with ${getPrettyBuildGpuName(gpuToTry)} support. ` +
|
||||
(!isLastItem
|
||||
? `Falling back to building llama.cpp with ${getPrettyBuildGpuName(buildGpusToTry[i + 1])} support. `
|
||||
: "") +
|
||||
"Error:", err);
|
||||
if (isLastItem)
|
||||
throw err;
|
||||
continue;
|
||||
}
|
||||
if (!noUsageExample) {
|
||||
console.log();
|
||||
console.log();
|
||||
logBinaryUsageExampleToConsole(buildOptions, gpu !== "auto", true);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (!noUsageExample) {
|
||||
const buildOptions = {
|
||||
customCmakeOptions,
|
||||
progressLogs: true,
|
||||
platform,
|
||||
platformInfo,
|
||||
arch: arch
|
||||
? arch
|
||||
: process.arch,
|
||||
gpu: buildGpusToTry[0],
|
||||
llamaCpp: {
|
||||
repo,
|
||||
release: githubReleaseTag
|
||||
}
|
||||
};
|
||||
console.log();
|
||||
console.log();
|
||||
logBinaryUsageExampleToConsole(buildOptions, gpu !== "auto", true);
|
||||
}
|
||||
if (isCI && updateBinariesReleaseMetadataAndSaveGitBundle) {
|
||||
await setBinariesGithubRelease(githubReleaseTag);
|
||||
await unshallowAndSquashCurrentRepoAndSaveItAsReleaseBundle();
|
||||
}
|
||||
console.log();
|
||||
console.log();
|
||||
console.log(`${chalk.yellow("Repo:")} ${repo}`);
|
||||
console.log(chalk.yellow("Release:") + " " + release + (release === "latest"
|
||||
? (" " + chalk.gray("(" + githubReleaseTag + ")"))
|
||||
: ""));
|
||||
console.log();
|
||||
console.log(chalk.green("Done"));
|
||||
}
|
||||
//# sourceMappingURL=DownloadCommand.js.map
|
||||
1
node_modules/node-llama-cpp/dist/cli/commands/source/commands/DownloadCommand.js.map
generated
vendored
Normal file
File diff suppressed because one or more lines are too long