First upload version 0.0.1
602 node_modules/node-llama-cpp/dist/cli/commands/InfillCommand.js generated vendored Normal file
@@ -0,0 +1,602 @@
import * as readline from "readline";
import process from "process";
import path from "path";
import chalk from "chalk";
import fs from "fs-extra";
import prettyMilliseconds from "pretty-ms";
import { getLlama } from "../../bindings/getLlama.js";
import { LlamaLogLevel, LlamaLogLevelGreaterThan, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption, parseNumaOption } from "../../bindings/types.js";
import { LlamaCompletion } from "../../evaluator/LlamaCompletion.js";
import withOra from "../../utils/withOra.js";
import { TokenMeter } from "../../evaluator/TokenMeter.js";
import { printInfoLine } from "../utils/printInfoLine.js";
import { printCommonInfoLines } from "../utils/printCommonInfoLines.js";
import { resolveCommandGgufPath } from "../utils/resolveCommandGgufPath.js";
import { withProgressLog } from "../../utils/withProgressLog.js";
import { resolveHeaderFlag } from "../utils/resolveHeaderFlag.js";
import { withCliCommandDescriptionDocsUrl } from "../utils/withCliCommandDescriptionDocsUrl.js";
import { documentationPageUrls } from "../../config.js";
import { ConsoleInteraction, ConsoleInteractionKey } from "../utils/ConsoleInteraction.js";
import { DraftSequenceTokenPredictor } from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
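// The `infill` CLI command: a yargs option schema plus a handler that delegates to RunInfill() below.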
export const InfillCommand = {
    command: "infill [modelPath]",
    describe: withCliCommandDescriptionDocsUrl("Generate an infill completion for a given suffix and prefix texts", documentationPageUrls.CLI.Infill),
    builder(yargs) {
        return yargs
            .option("modelPath", {
                alias: ["m", "model", "path", "url", "uri"],
                type: "string",
                description: "Model file to use for the infill. Can be a path to a local file or a URI of a model file to download. Leave empty to choose from a list of recommended models"
            })
            .option("header", {
                alias: ["H"],
                type: "string",
                array: true,
                description: "Headers to use when downloading a model from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers."
            })
            .option("gpu", {
                type: "string",
                // yargs types don't support passing `false` as a choice, although it is supported by yargs
                choices: nodeLlamaCppGpuOptions,
                coerce: (value) => {
                    if (value == null || value == "")
                        return undefined;
                    return parseNodeLlamaCppGpuOption(value);
                },
                defaultDescription: "Uses the latest local build, and falls back to \"auto\"",
                description: "Compute layer implementation type to use for llama.cpp. If omitted, uses the latest local build, and falls back to \"auto\""
            })
            .option("systemInfo", {
                alias: "i",
                type: "boolean",
                default: false,
                description: "Print llama.cpp system info"
            })
            .option("prefix", {
                type: "string",
                description: "First prefix text to automatically load"
            })
            .option("prefixFile", {
                type: "string",
                description: "Path to a file to load prefix text from automatically"
            })
            .option("suffix", {
                type: "string",
                description: "First suffix text to automatically load. Requires `prefix` or `prefixFile` to be set"
            })
            .option("suffixFile", {
                type: "string",
                description: "Path to a file to load suffix text from automatically. Requires `prefix` or `prefixFile` to be set"
            })
            .option("contextSize", {
                alias: "c",
                type: "number",
                description: "Context size to use for the model context",
                default: -1,
                defaultDescription: "Automatically determined based on the available VRAM"
            })
            .option("batchSize", {
                alias: "b",
                type: "number",
                description: "Batch size to use for the model context"
            })
            .option("flashAttention", {
                alias: "fa",
                type: "boolean",
                default: false,
                description: "Enable flash attention"
            })
            .option("swaFullCache", {
                alias: "noSwa",
                type: "boolean",
                default: false,
                description: "Disable SWA (Sliding Window Attention) on supported models"
            })
            .option("threads", {
                type: "number",
                defaultDescription: "Number of cores that are useful for math on the current machine",
                description: "Number of threads to use for the evaluation of tokens"
            })
            .option("temperature", {
                alias: "t",
                type: "number",
                default: 0,
                description: "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The suggested temperature is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. Set to `0` to disable."
            })
            .option("minP", {
                alias: "mp",
                type: "number",
                default: 0,
                description: "From the next token candidates, discard the percentage of tokens with the lowest probability. For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded. This is useful for generating more high-quality results when using a high temperature. Set to a value between `0` and `1` to enable. Only relevant when `temperature` is set to a value greater than `0`."
            })
            .option("topK", {
                alias: "k",
                type: "number",
                default: 40,
                description: "Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation. An integer number between `1` and the size of the vocabulary. Set to `0` to disable (which uses the full vocabulary). Only relevant when `temperature` is set to a value greater than 0."
            })
            .option("topP", {
                alias: "p",
                type: "number",
                default: 0.95,
                description: "Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P, and samples the next token only from this set. A float number between `0` and `1`. Set to `1` to disable. Only relevant when `temperature` is set to a value greater than `0`."
            })
            .option("seed", {
                type: "number",
                description: "Used to control the randomness of the generated text. Only relevant when using `temperature`.",
                defaultDescription: "The current epoch time"
            })
            .option("gpuLayers", {
                alias: "gl",
                type: "number",
                description: "Number of layers to store in VRAM",
                default: -1,
                defaultDescription: "Automatically determined based on the available VRAM"
            })
            .option("repeatPenalty", {
                alias: "rp",
                type: "number",
                default: 1.1,
                description: "Prevent the model from repeating the same token too much. Set to `1` to disable."
            })
            .option("lastTokensRepeatPenalty", {
                alias: "rpn",
                type: "number",
                default: 64,
                description: "Number of recent tokens generated by the model to apply repetition penalties to"
            })
            .option("penalizeRepeatingNewLine", {
                alias: "rpnl",
                type: "boolean",
                default: true,
                description: "Penalize new line tokens. Set `--no-penalizeRepeatingNewLine` or `--no-rpnl` to disable"
            })
            .option("repeatFrequencyPenalty", {
                alias: "rfp",
                type: "number",
                description: "For each time (`n`) a token appears in the `punishTokens` array, lower its probability by `n * repeatFrequencyPenalty`. Set to a value between `0` and `1` to enable."
            })
            .option("repeatPresencePenalty", {
                alias: "rpp",
                type: "number",
                description: "Lower the probability of all the tokens in the `punishTokens` array by `repeatPresencePenalty`. Set to a value between `0` and `1` to enable."
            })
            .option("maxTokens", {
                alias: "mt",
                type: "number",
                default: 0,
                description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size"
            })
            .option("tokenPredictionDraftModel", {
                alias: ["dm", "draftModel"],
                type: "string",
                description: "Model file to use for draft sequence token prediction (speculative decoding). Can be a path to a local file or a URI of a model file to download"
            })
            .option("tokenPredictionModelContextSize", {
                alias: ["dc", "draftContextSize", "draftContext"],
                type: "number",
                description: "Max context size to use for the draft sequence token prediction model context",
                default: 4096
            })
            .option("debug", {
                alias: "d",
                type: "boolean",
                default: false,
                description: "Print llama.cpp info and debug logs"
            })
            .option("numa", {
                type: "string",
                // yargs types don't support passing `false` as a choice, although it is supported by yargs
                choices: llamaNumaOptions,
                coerce: (value) => {
                    if (value == null || value == "")
                        return false;
                    return parseNumaOption(value);
                },
                defaultDescription: "false",
                description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
            })
            .option("meter", {
                type: "boolean",
                default: false,
                description: "Log how many tokens were used as input and output for each response"
            })
            .option("timing", {
                type: "boolean",
                default: false,
                description: "Print how long it took to generate each response"
            })
            .option("noMmap", {
                type: "boolean",
                default: false,
                description: "Disable mmap (memory-mapped file) usage"
            })
            .option("noDirectIo", {
                type: "boolean",
                default: false,
                description: "Disable Direct I/O usage when available"
            })
            .option("printTimings", {
                alias: "pt",
                type: "boolean",
                default: false,
                description: "Print llama.cpp's internal timings after each response"
            });
    },
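    // Forward the parsed CLI flags to RunInfill(); on error, log it and exit with a non-zero code.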
    async handler({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }) {
        try {
            await RunInfill({
                modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
                swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
            });
        }
        catch (err) {
            await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
            console.error(err);
            process.exit(1);
        }
    }
};
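// Loads the model (and optional draft model), creates the contexts and runs an interactive prefix/suffix infill loop.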
async function RunInfill({ modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }) {
    if (contextSize === -1)
        contextSize = undefined;
    if (gpuLayers === -1)
        gpuLayers = undefined;
    const headers = resolveHeaderFlag(headerArg);
    if (debug)
        console.info(`${chalk.yellow("Log level:")} debug`);
    const llamaLogLevel = debug
        ? LlamaLogLevel.debug
        : LlamaLogLevel.warn;
    const llama = gpu == null
        ? await getLlama("lastBuild", {
            logLevel: llamaLogLevel,
            numa
        })
        : await getLlama({
            gpu,
            logLevel: llamaLogLevel,
            numa
        });
    const logBatchSize = batchSize != null;
    const useMmap = !noMmap && llama.supportsMmap;
    const useDirectIo = !noDirectIo;
    const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
        flashAttention,
        swaFullCache,
        useMmap
    });
    const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
        ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
            flashAttention,
            swaFullCache,
            useMmap,
            consoleTitle: "Draft model file"
        })
        : undefined;
    if (systemInfo)
        console.log(llama.systemInfo);
    if (prefixFile != null && prefixFile !== "") {
        if (prefix != null && prefix !== "")
            console.warn(chalk.yellow("Both `prefix` and `prefixFile` were specified. `prefixFile` will be used."));
        prefix = await fs.readFile(path.resolve(process.cwd(), prefixFile), "utf8");
    }
    if (suffixFile != null && suffixFile !== "") {
        if (suffix != null && suffix !== "")
            console.warn(chalk.yellow("Both `suffix` and `suffixFile` were specified. `suffixFile` will be used."));
        suffix = await fs.readFile(path.resolve(process.cwd(), suffixFile), "utf8");
    }
    if (suffix != null && prefix == null) {
        console.warn(chalk.yellow("Suffix was specified but no prefix was specified. Suffix will be ignored."));
        suffix = undefined;
    }
    if (batchSize != null && contextSize != null && batchSize > contextSize) {
        console.warn(chalk.yellow("Batch size is greater than the context size. Batch size will be set to the context size."));
        batchSize = contextSize;
    }
    let initialPrefix = prefix ?? null;
    let initialSuffix = suffix ?? null;
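    // Load the main model with a live progress log; Ctrl+C during loading aborts through the progress updater's abort signal.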
    const model = await withProgressLog({
        loadingText: chalk.blue.bold("Loading model"),
        successText: chalk.blue("Model loaded"),
        failText: chalk.blue("Failed to load model"),
        liveUpdates: !debug,
        noProgress: debug,
        liveCtrlCSendsAbortSignal: true
    }, async (progressUpdater) => {
        try {
            return await llama.loadModel({
                modelPath: resolvedModelPath,
                gpuLayers: gpuLayers != null
                    ? gpuLayers
                    : contextSize != null
                        ? { fitContext: { contextSize } }
                        : undefined,
                defaultContextFlashAttention: flashAttention,
                defaultContextSwaFullCache: swaFullCache,
                useMmap,
                useDirectIo,
                ignoreMemorySafetyChecks: gpuLayers != null,
                onLoadProgress(loadProgress) {
                    progressUpdater.setProgress(loadProgress);
                },
                loadSignal: progressUpdater.abortSignal
            });
        }
        catch (err) {
            if (err === progressUpdater.abortSignal?.reason)
                process.exit(0);
            throw err;
        }
        finally {
            if (llama.logLevel === LlamaLogLevel.debug) {
                await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
                console.info();
            }
        }
    });
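    // Optionally load a second model to drive speculative decoding as a draft-sequence token predictor.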
    const draftModel = resolvedDraftModelPath == null
        ? undefined
        : await withProgressLog({
            loadingText: chalk.blue.bold("Loading draft model"),
            successText: chalk.blue("Draft model loaded"),
            failText: chalk.blue("Failed to load draft model"),
            liveUpdates: !debug,
            noProgress: debug,
            liveCtrlCSendsAbortSignal: true
        }, async (progressUpdater) => {
            try {
                return await llama.loadModel({
                    modelPath: resolvedDraftModelPath,
                    defaultContextFlashAttention: flashAttention,
                    defaultContextSwaFullCache: swaFullCache,
                    useMmap,
                    useDirectIo,
                    onLoadProgress(loadProgress) {
                        progressUpdater.setProgress(loadProgress);
                    },
                    loadSignal: progressUpdater.abortSignal
                });
            }
            catch (err) {
                if (err === progressUpdater.abortSignal?.reason)
                    process.exit(0);
                throw err;
            }
            finally {
                if (llama.logLevel === LlamaLogLevel.debug) {
                    await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
                    console.info();
                }
            }
        });
    const draftContext = draftModel == null
        ? undefined
        : await withOra({
            loading: chalk.blue("Creating draft context"),
            success: chalk.blue("Draft context created"),
            fail: chalk.blue("Failed to create draft context"),
            useStatusLogs: debug
        }, async () => {
            try {
                return await draftModel.createContext({
                    contextSize: { max: tokenPredictionModelContextSize }
                });
            }
            finally {
                if (llama.logLevel === LlamaLogLevel.debug) {
                    await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
                    console.info();
                }
            }
        });
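    // Create the main evaluation context; memory safety checks are skipped when gpuLayers or contextSize were set explicitly.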
    const context = await withOra({
        loading: chalk.blue("Creating context"),
        success: chalk.blue("Context created"),
        fail: chalk.blue("Failed to create context"),
        useStatusLogs: debug
    }, async () => {
        try {
            return await model.createContext({
                contextSize: contextSize != null ? contextSize : undefined,
                batchSize: batchSize != null ? batchSize : undefined,
                threads: threads === null ? undefined : threads,
                ignoreMemorySafetyChecks: gpuLayers != null || contextSize != null,
                performanceTracking: printTimings
            });
        }
        finally {
            if (llama.logLevel === LlamaLogLevel.debug) {
                await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
                console.info();
            }
        }
    });
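    // Attach the draft sequence (if any) to the main sequence as a token predictor, then set up the completion instance.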
    const draftContextSequence = draftContext?.getSequence();
    const contextSequence = draftContextSequence != null
        ? context.getSequence({
            tokenPredictor: new DraftSequenceTokenPredictor(draftContextSequence)
        })
        : context.getSequence();
    const completion = new LlamaCompletion({
        contextSequence
    });
    let lastDraftTokenMeterState = draftContextSequence?.tokenMeter.getState();
    let lastTokenMeterState = contextSequence.tokenMeter.getState();
    let lastTokenPredictionsStats = contextSequence.tokenPredictions;
    await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
    const padTitle = await printCommonInfoLines({
        context,
        draftContext,
        useMmap,
        useDirectIo,
        logBatchSize,
        tokenMeterEnabled: meter
    });
    printInfoLine({
        title: "Infill",
        padTitle: padTitle,
        info: [{
            title: "Repeat penalty",
            value: `${repeatPenalty} (apply to last ${lastTokensRepeatPenalty} tokens)`
        }, {
            show: repeatFrequencyPenalty != null,
            title: "Repeat frequency penalty",
            value: String(repeatFrequencyPenalty)
        }, {
            show: repeatPresencePenalty != null,
            title: "Repeat presence penalty",
            value: String(repeatPresencePenalty)
        }, {
            show: !penalizeRepeatingNewLine,
            title: "Penalize repeating new line",
            value: "disabled"
        }, {
            show: timing,
            title: "Response timing",
            value: "enabled"
        }]
    });
    // this is for ora to not interfere with readline
    await new Promise((resolve) => setTimeout(resolve, 1));
    if (!completion.infillSupported) {
        console.log(chalk.red("Infill is not supported for this model"));
        process.exit(1);
    }
    const replPrefixHistory = [];
    const replSuffixHistory = [];
    async function getInput(name) {
        const rl = readline.createInterface({
            input: process.stdin,
            output: process.stdout,
            history: name === "Prefix"
                ? replPrefixHistory.slice()
                : replSuffixHistory.slice()
        });
        const res = await new Promise((accept) => rl.question(chalk.yellow(name + "> "), accept));
        rl.close();
        return res;
    }
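    // Interactive loop: read a prefix and a suffix (".exit" quits), stream the generated infill, then print optional timing and meter stats.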
    while (true) {
        const prefixInput = initialPrefix != null
            ? initialPrefix
            : await getInput("Prefix");
        if (initialPrefix != null) {
            console.log(chalk.green("Prefix> ") + initialPrefix);
            initialPrefix = null;
        }
        else
            replPrefixHistory.push(prefixInput);
        if (prefixInput === ".exit")
            break;
        const suffixInput = initialSuffix != null
            ? initialSuffix
            : await getInput("Suffix");
        if (initialSuffix != null) {
            console.log(chalk.green("Suffix> ") + initialSuffix);
            initialSuffix = null;
        }
        else
            replSuffixHistory.push(suffixInput);
        if (suffixInput === ".exit")
            break;
        process.stdout.write(chalk.yellow("Infill: "));
        const [startColor, endColor] = chalk.blue("MIDDLE").split("MIDDLE");
        const abortController = new AbortController();
        const consoleInteraction = new ConsoleInteraction();
        consoleInteraction.onKey(ConsoleInteractionKey.ctrlC, async () => {
            abortController.abort();
            consoleInteraction.stop();
        });
        const timeBeforePrompt = Date.now();
        try {
            process.stdout.write(startColor);
            consoleInteraction.start();
            await completion.generateInfillCompletion(prefixInput, suffixInput, {
                temperature,
                minP,
                topK,
                topP,
                seed: seed ?? undefined,
                signal: abortController.signal,
                repeatPenalty: {
                    penalty: repeatPenalty,
                    frequencyPenalty: repeatFrequencyPenalty != null ? repeatFrequencyPenalty : undefined,
                    presencePenalty: repeatPresencePenalty != null ? repeatPresencePenalty : undefined,
                    penalizeNewLine: penalizeRepeatingNewLine,
                    lastTokens: lastTokensRepeatPenalty
                },
                maxTokens: maxTokens === -1
                    ? context.contextSize
                    : maxTokens <= 0
                        ? undefined
                        : maxTokens,
                onTextChunk(chunk) {
                    process.stdout.write(chunk);
                }
            });
        }
        catch (err) {
            if (!(abortController.signal.aborted && err === abortController.signal.reason))
                throw err;
        }
        finally {
            consoleInteraction.stop();
            if (abortController.signal.aborted)
                process.stdout.write(endColor + chalk.yellow("[generation aborted by user]"));
            else
                process.stdout.write(endColor);
            console.log();
        }
        const timeAfterPrompt = Date.now();
        if (printTimings) {
            if (LlamaLogLevelGreaterThan(llama.logLevel, LlamaLogLevel.info))
                llama.logLevel = LlamaLogLevel.info;
            await context.printTimings();
            await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
            llama.logLevel = llamaLogLevel;
        }
        if (timing)
            console.info(chalk.dim("Response duration: ") +
                prettyMilliseconds(timeAfterPrompt - timeBeforePrompt, {
                    keepDecimalsOnWholeSeconds: true,
                    secondsDecimalDigits: 2,
                    separateMilliseconds: true,
                    compact: false
                }));
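        // Report token usage for this response, plus draft-prediction stats when a draft model is in use.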
        if (meter) {
            const newTokenMeterState = contextSequence.tokenMeter.getState();
            const tokenMeterDiff = TokenMeter.diff(newTokenMeterState, lastTokenMeterState);
            lastTokenMeterState = newTokenMeterState;
            const showDraftTokenMeterDiff = lastDraftTokenMeterState != null && draftContextSequence != null;
            const tokenPredictionsStats = contextSequence.tokenPredictions;
            const validatedTokenPredictions = tokenPredictionsStats.validated - lastTokenPredictionsStats.validated;
            const refutedTokenPredictions = tokenPredictionsStats.refuted - lastTokenPredictionsStats.refuted;
            const usedTokenPredictions = tokenPredictionsStats.used - lastTokenPredictionsStats.used;
            const unusedTokenPredictions = tokenPredictionsStats.unused - lastTokenPredictionsStats.unused;
            lastTokenPredictionsStats = tokenPredictionsStats;
            console.info([
                showDraftTokenMeterDiff && (chalk.yellow("Main".padEnd("Drafter".length))),
                chalk.dim("Input tokens:") + " " + String(tokenMeterDiff.usedInputTokens).padEnd(5, " "),
                chalk.dim("Output tokens:") + " " + String(tokenMeterDiff.usedOutputTokens).padEnd(5, " "),
                showDraftTokenMeterDiff && (chalk.dim("Validated predictions:") + " " + String(validatedTokenPredictions).padEnd(5, " ")),
                showDraftTokenMeterDiff && (chalk.dim("Refuted predictions:") + " " + String(refutedTokenPredictions).padEnd(5, " ")),
                showDraftTokenMeterDiff && (chalk.dim("Used predictions:") + " " + String(usedTokenPredictions).padEnd(5, " ")),
                showDraftTokenMeterDiff && (chalk.dim("Unused predictions:") + " " + String(unusedTokenPredictions).padEnd(5, " "))
            ].filter(Boolean).join(" "));
            if (lastDraftTokenMeterState != null && draftContextSequence != null) {
                const newDraftTokenMeterState = draftContextSequence.tokenMeter.getState();
                const draftTokenMeterDiff = TokenMeter.diff(newDraftTokenMeterState, lastDraftTokenMeterState);
                lastDraftTokenMeterState = newDraftTokenMeterState;
                console.info([
                    chalk.yellow("Drafter"),
                    chalk.dim("Input tokens:") + " " + String(draftTokenMeterDiff.usedInputTokens).padEnd(5, " "),
                    chalk.dim("Output tokens:") + " " + String(draftTokenMeterDiff.usedOutputTokens).padEnd(5, " ")
                ].join(" "));
            }
        }
    }
}
//# sourceMappingURL=InfillCommand.js.map