First upload version 0.0.1
546 node_modules/node-llama-cpp/dist/bindings/Llama.js (generated, vendored, normal file)
@@ -0,0 +1,546 @@
import os from "os";
import path from "path";
import chalk from "chalk";
import { DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { getConsoleLogPrefix } from "../utils/getConsoleLogPrefix.js";
import { LlamaModel } from "../evaluator/LlamaModel/LlamaModel.js";
import { DisposeGuard } from "../utils/DisposeGuard.js";
import { LlamaJsonSchemaGrammar } from "../evaluator/LlamaJsonSchemaGrammar.js";
import { LlamaGrammar } from "../evaluator/LlamaGrammar.js";
import { ThreadsSplitter } from "../utils/ThreadsSplitter.js";
import { getLlamaClasses } from "../utils/getLlamaClasses.js";
import { LlamaLocks, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaLogLevelGreaterThanOrEqual } from "./types.js";
import { MemoryOrchestrator } from "./utils/MemoryOrchestrator.js";
export const LlamaLogLevelToAddonLogLevel = new Map([
    [LlamaLogLevel.disabled, 0],
    [LlamaLogLevel.fatal, 1],
    [LlamaLogLevel.error, 2],
    [LlamaLogLevel.warn, 3],
    [LlamaLogLevel.info, 4],
    [LlamaLogLevel.log, 5],
    [LlamaLogLevel.debug, 6]
]);
const addonLogLevelToLlamaLogLevel = new Map([...LlamaLogLevelToAddonLogLevel.entries()].map(([key, value]) => [value, key]));
const defaultLogLevel = 5;
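// Note: addon log level 5 corresponds to `LlamaLogLevel.log` in the map above,
// so unmapped levels fall back to regular log output.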
const defaultCPUMinThreadSplitterThreads = 4;
export class Llama {
    /** @internal */ _bindings;
    /** @internal */ _backendDisposeGuard = new DisposeGuard();
    /** @internal */ _memoryLock = {};
    /** @internal */ _consts;
    /** @internal */ _vramOrchestrator;
    /** @internal */ _vramPadding;
    /** @internal */ _ramOrchestrator;
    /** @internal */ _ramPadding;
    /** @internal */ _swapOrchestrator;
    /** @internal */ _debug;
    /** @internal */ _threadsSplitter;
    /** @internal */ _hadErrorLogs = false;
    /** @internal */ _gpu;
    /** @internal */ _numa;
    /** @internal */ _buildType;
    /** @internal */ _cmakeOptions;
    /** @internal */ _supportsGpuOffloading;
    /** @internal */ _supportsMmap;
    /** @internal */ _gpuSupportsMmap;
    /** @internal */ _supportsMlock;
    /** @internal */ _mathCores;
    /** @internal */ _llamaCppRelease;
    /** @internal */ _logger;
    /** @internal */ _logLevel;
    /** @internal */ _pendingLog = null;
    /** @internal */ _pendingLogLevel = null;
    /** @internal */ _logDispatchQueuedMicrotasks = 0;
    /** @internal */ _previousLog = null;
    /** @internal */ _previousLogLevel = null;
    /** @internal */ _nextLogNeedNewLine = false;
    /** @internal */ _disposed = false;
    _classes;
    onDispose = new EventRelay();
    constructor({ bindings, bindingPath, extBackendsPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, numa, buildGpu, maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator, skipLlamaInit }) {
        this._dispatchPendingLogMicrotask = this._dispatchPendingLogMicrotask.bind(this);
        this._onAddonLog = this._onAddonLog.bind(this);
        this._bindings = bindings;
        this._debug = debug;
        this._numa = numa ?? false;
        this._logLevel = this._debug
            ? LlamaLogLevel.debug
            : (logLevel ?? LlamaLogLevel.debug);
        const previouslyLoaded = bindings.markLoaded();
        if (!this._debug && (!skipLlamaInit || !previouslyLoaded)) {
            this._bindings.setLogger(this._onAddonLog);
            this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(this._logLevel) ?? defaultLogLevel);
        }
        bindings.loadBackends();
        let loadedGpu = bindings.getGpuType();
        if (loadedGpu == null || (loadedGpu === false && buildGpu !== false)) {
            const backendsPath = path.dirname(bindingPath);
            const fallbackBackendsDir = path.join(extBackendsPath ?? backendsPath, "fallback");
            bindings.loadBackends(backendsPath);
            loadedGpu = bindings.getGpuType();
            if (loadedGpu == null || (loadedGpu === false && buildGpu !== false))
                bindings.loadBackends(fallbackBackendsDir);
        }
        bindings.ensureGpuDeviceIsSupported();
        if (this._numa !== false)
            bindings.setNuma(numa);
        this._gpu = bindings.getGpuType() ?? false;
        this._supportsGpuOffloading = bindings.getSupportsGpuOffloading();
        this._supportsMmap = bindings.getSupportsMmap();
        this._gpuSupportsMmap = bindings.getGpuSupportsMmap();
        this._supportsMlock = bindings.getSupportsMlock();
        this._mathCores = Math.floor(bindings.getMathCores());
        this._consts = bindings.getConsts();
        this._vramOrchestrator = vramOrchestrator;
        this._vramPadding = vramPadding;
        this._ramOrchestrator = ramOrchestrator;
        this._ramPadding = ramPadding;
        this._swapOrchestrator = swapOrchestrator;
        this._threadsSplitter = new ThreadsSplitter(maxThreads ?? (this._gpu === false
            ? Math.max(defaultCPUMinThreadSplitterThreads, this._mathCores)
            : 0));
        this._logger = logger;
        this._buildType = buildType;
        this._cmakeOptions = Object.freeze({ ...cmakeOptions });
        this._llamaCppRelease = Object.freeze({
            repo: llamaCppRelease.repo,
            release: llamaCppRelease.release
        });
        this._onExit = this._onExit.bind(this);
        process.on("exit", this._onExit);
    }
    async dispose() {
        if (this._disposed)
            return;
        this._disposed = true;
        this.onDispose.dispatchEvent();
        await this._backendDisposeGuard.acquireDisposeLock();
        await this._bindings.dispose();
    }
    /** @hidden */
    async [Symbol.asyncDispose]() {
        await this.dispose();
    }
    get disposed() {
        return this._disposed;
    }
    get classes() {
        if (this._classes == null)
            this._classes = getLlamaClasses();
        return this._classes;
    }
    get gpu() {
        return this._gpu;
    }
    get supportsGpuOffloading() {
        return this._supportsGpuOffloading;
    }
    get supportsMmap() {
        return this._supportsMmap;
    }
    get gpuSupportsMmap() {
        return this._gpuSupportsMmap;
    }
    get supportsMlock() {
        return this._supportsMlock;
    }
    /** The number of CPU cores that are useful for math */
    get cpuMathCores() {
        return this._mathCores;
    }
    /**
     * The maximum number of threads that can be used by the Llama instance.
     *
     * If set to `0`, the Llama instance will have no limit on the number of threads.
     *
     * See the `maxThreads` option of `getLlama` for more information.
     */
    get maxThreads() {
        return this._threadsSplitter.maxThreads;
    }
    set maxThreads(value) {
        this._threadsSplitter.maxThreads = Math.floor(Math.max(0, value));
    }
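    /*
     * Usage sketch (illustrative, not part of the library code; assumes a
     * `llama` instance obtained via `getLlama()`):
     *
     *     console.log(llama.maxThreads); // `0` means no limit on threads
     *     llama.maxThreads = 4;          // cap CPU work at 4 threads
     */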
    /**
     * See the `numa` option of `getLlama` for more information
     */
    get numa() {
        return this._numa;
    }
    get logLevel() {
        return this._logLevel;
    }
    set logLevel(value) {
        this._ensureNotDisposed();
        if (value === this._logLevel || this._debug)
            return;
        this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(value) ?? defaultLogLevel);
        this._logLevel = value;
    }
    get logger() {
        return this._logger;
    }
    set logger(value) {
        this._logger = value;
        if (value !== Llama.defaultConsoleLogger)
            this._nextLogNeedNewLine = false;
    }
    get buildType() {
        return this._buildType;
    }
    get cmakeOptions() {
        return this._cmakeOptions;
    }
    get llamaCppRelease() {
        return this._llamaCppRelease;
    }
    get systemInfo() {
        this._ensureNotDisposed();
        return this._bindings.systemInfo();
    }
    /**
     * VRAM padding used for memory size calculations, as these calculations are not always accurate.
     * This is set by default to ensure stability, but can be configured when you call `getLlama`.
     *
     * See `vramPadding` on `getLlama` for more information.
     */
    get vramPaddingSize() {
        return this._vramPadding.size;
    }
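    /*
     * Sketch (illustrative): the padding itself is reserved when the instance is
     * created (see the `vramPadding` option of `getLlama` mentioned above); this
     * getter only reports the currently reserved size:
     *
     *     const reservedVramPadding = llama.vramPaddingSize;
     */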
    /**
     * The total amount of VRAM that is currently being used.
     *
     * `unifiedSize` represents the amount of VRAM that is shared between the CPU and GPU.
     * On SoC devices, this is usually the same as `total`.
     */
    async getVramState() {
        this._ensureNotDisposed();
        const { total, used, unifiedSize } = this._bindings.getGpuVramInfo();
        return {
            total,
            used,
            free: Math.max(0, total - used),
            unifiedSize
        };
    }
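    /*
     * Usage sketch (illustrative):
     *
     *     const {total, used, free, unifiedSize} = await llama.getVramState();
     *     console.log(`VRAM: ${used} of ${total} bytes used, ${free} free`);
     */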
    /**
     * Get the state of the swap memory.
     *
     * **`maxSize`** - The maximum size of the swap memory that the system can allocate.
     * If the swap size is dynamic (like on macOS), this will be `Infinity`.
     *
     * **`allocated`** - The total size allocated by the system for swap memory.
     *
     * **`used`** - The amount of swap memory that is currently being used from the `allocated` size.
     *
     * On Windows, this will return the info for the page file.
     */
    async getSwapState() {
        this._ensureNotDisposed();
        const { total, maxSize, free } = this._bindings.getSwapInfo();
        return {
            maxSize: maxSize === -1
                ? Infinity
                : maxSize,
            allocated: total,
            used: total - free
        };
    }
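    /*
     * Usage sketch (illustrative):
     *
     *     const swap = await llama.getSwapState();
     *     if (swap.maxSize === Infinity)
     *         console.log("swap size is dynamic on this system");
     *     console.log(`${swap.used} of ${swap.allocated} bytes of swap in use`);
     */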
    async getGpuDeviceNames() {
        this._ensureNotDisposed();
        const { deviceNames } = this._bindings.getGpuDeviceInfo();
        return deviceNames;
    }
    async loadModel(options) {
        this._ensureNotDisposed();
        return await withLock([this._memoryLock, LlamaLocks.loadToMemory], options.loadSignal, async () => {
            this._ensureNotDisposed();
            const preventDisposalHandle = this._backendDisposeGuard.createPreventDisposalHandle();
            try {
                return await LlamaModel._create(options, { _llama: this });
            }
            finally {
                preventDisposalHandle.dispose();
            }
        });
    }
    /* eslint-disable @stylistic/max-len */
    /**
     * @see [Using a JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#json-schema) tutorial
     * @see [Reducing Hallucinations When Using JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#reducing-json-schema-hallucinations) tutorial
     */
    async createGrammarForJsonSchema(schema) {
        return new LlamaJsonSchemaGrammar(this, schema);
    }
    /* eslint-enable @stylistic/max-len */
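    /*
     * Usage sketch for `createGrammarForJsonSchema` above (illustrative; the
     * exact schema shape follows the JSON schema grammar tutorial linked in its
     * doc comment):
     *
     *     const grammar = await llama.createGrammarForJsonSchema({
     *         type: "object",
     *         properties: {answer: {type: "string"}}
     *     });
     */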
    async getGrammarFor(type) {
        return await LlamaGrammar.getFor(this, type);
    }
    /**
     * @see [Using Grammar](https://node-llama-cpp.withcat.ai/guide/grammar) tutorial
     */
    async createGrammar(options) {
        return new LlamaGrammar(this, options);
    }
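    /*
     * Usage sketch for `createGrammar` (illustrative; assumes a GBNF grammar
     * string, as described in the Using Grammar tutorial linked above):
     *
     *     const grammar = await llama.createGrammar({
     *         grammar: 'root ::= "yes" | "no"'
     *     });
     */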
    /** @internal */
    async _init() {
        await this._bindings.init();
    }
    /**
     * Log messages related to the Llama instance
     * @internal
     */
    _log(level, message) {
        this._onAddonLog(LlamaLogLevelToAddonLogLevel.get(level) ?? defaultLogLevel, message + "\n");
    }
    /** @internal */
    _onAddonLog(level, message) {
        const llamaLogLevel = addonLogLevelToLlamaLogLevel.get(level) ?? LlamaLogLevel.fatal;
        if (this._pendingLog != null && this._pendingLogLevel != null && this._pendingLogLevel != llamaLogLevel) {
            this._callLogger(this._pendingLogLevel, this._pendingLog);
            this._pendingLog = null;
        }
        const sourceMessage = (this._pendingLog ?? "") + message;
        const lastNewLineIndex = sourceMessage.lastIndexOf("\n");
        const currentLog = lastNewLineIndex < 0
            ? sourceMessage
            : sourceMessage.slice(0, lastNewLineIndex);
        const nextLog = lastNewLineIndex < 0
            ? ""
            : sourceMessage.slice(lastNewLineIndex + 1);
        if (currentLog !== "")
            this._callLogger(llamaLogLevel, currentLog);
        if (nextLog !== "") {
            this._pendingLog = nextLog;
            this._pendingLogLevel = llamaLogLevel;
            queueMicrotask(this._dispatchPendingLogMicrotask);
            this._logDispatchQueuedMicrotasks++;
        }
        else
            this._pendingLog = null;
    }
    /** @internal */
    _dispatchPendingLogMicrotask() {
        this._logDispatchQueuedMicrotasks--;
        if (this._logDispatchQueuedMicrotasks !== 0)
            return;
        if (this._pendingLog != null && this._pendingLogLevel != null) {
            this._callLogger(this._pendingLogLevel, this._pendingLog);
            this._pendingLog = null;
        }
    }
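    /*
     * Note on `_onAddonLog` and `_dispatchPendingLogMicrotask` above: addon log
     * output can arrive in fragments. When a chunk contains a newline, only the
     * text up to the last "\n" is forwarded right away; the remainder is kept in
     * `_pendingLog` (flushing first if its level differs from the incoming one)
     * and is emitted by a queued microtask, or by `_onExit`, if no later chunk
     * completes it.
     */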
    /** @internal */
    _callLogger(level, message) {
        // llama.cpp uses dots to indicate progress, so we don't want to print them as different lines,
        // and instead, append to the same log line
        if (logMessageIsOnlyDots(message) && this._logger === Llama.defaultConsoleLogger) {
            if (logMessageIsOnlyDots(this._previousLog) && level === this._previousLogLevel) {
                process.stdout.write(message);
            }
            else {
                this._nextLogNeedNewLine = true;
                process.stdout.write(prefixAndColorMessage(message, getColorForLogLevel(level)));
            }
        }
        else {
            if (this._nextLogNeedNewLine) {
                process.stdout.write("\n");
                this._nextLogNeedNewLine = false;
            }
            try {
                const transformedLogLevel = getTransformedLogLevel(level, message, this.gpu);
                if (LlamaLogLevelGreaterThanOrEqual(transformedLogLevel, this._logLevel))
                    this._logger(transformedLogLevel, message);
            }
            catch (err) {
                // the native addon code calls this function, so there's no use to throw an error here
            }
        }
        this._previousLog = message;
        this._previousLogLevel = level;
        if (!this._hadErrorLogs && LlamaLogLevelGreaterThan(level, LlamaLogLevel.error))
            this._hadErrorLogs = true;
    }
    /** @internal */
    _onExit() {
        if (this._pendingLog != null && this._pendingLogLevel != null) {
            this._callLogger(this._pendingLogLevel, this._pendingLog);
            this._pendingLog = null;
        }
    }
    /** @internal */
    _ensureNotDisposed() {
        if (this._disposed)
            throw new DisposedError();
    }
    /** @internal */
    static async _create({ bindings, bindingPath, extBackendsPath, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, skipLlamaInit = false, debug, numa }) {
        const vramOrchestrator = new MemoryOrchestrator(() => {
            const { total, used, unifiedSize } = bindings.getGpuVramInfo();
            return {
                total,
                free: Math.max(0, total - used),
                unifiedSize
            };
        });
        const ramOrchestrator = new MemoryOrchestrator(() => {
            const used = process.memoryUsage().rss;
            const total = os.totalmem();
            return {
                total,
                free: Math.max(0, total - used),
                unifiedSize: total
            };
        });
        const swapOrchestrator = new MemoryOrchestrator(() => {
            const { total, maxSize, free } = bindings.getSwapInfo();
            const used = total - free;
            if (maxSize === -1)
                return {
                    total: Infinity,
                    free: Infinity,
                    unifiedSize: Infinity
                };
            return {
                total: maxSize,
                free: maxSize - used,
                unifiedSize: maxSize
            };
        });
        let resolvedRamPadding;
        if (ramPadding instanceof Function)
            resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding((await ramOrchestrator.getMemoryState()).total));
        else
            resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding);
        const llama = new Llama({
            bindings,
            bindingPath,
            extBackendsPath,
            buildType,
            cmakeOptions: buildMetadata.buildOptions.customCmakeOptions,
            llamaCppRelease: {
                repo: buildMetadata.buildOptions.llamaCpp.repo,
                release: buildMetadata.buildOptions.llamaCpp.release
            },
            logLevel,
            logger,
            debug,
            numa,
            buildGpu: buildMetadata.buildOptions.gpu,
            vramOrchestrator,
            maxThreads,
            vramPadding: vramOrchestrator.reserveMemory(0),
            ramOrchestrator,
            ramPadding: resolvedRamPadding,
            swapOrchestrator,
            skipLlamaInit
        });
        if (llama.gpu === false || vramPadding === 0) {
            // do nothing since `llama._vramPadding` is already set to 0
        }
        else if (vramPadding instanceof Function) {
            const currentVramPadding = llama._vramPadding;
            llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding((await vramOrchestrator.getMemoryState()).total));
            currentVramPadding.dispose();
        }
        else {
            const currentVramPadding = llama._vramPadding;
            llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding);
            currentVramPadding.dispose();
        }
        if (!skipLlamaInit)
            await llama._init();
        return llama;
    }
    static defaultConsoleLogger(level, message) {
        switch (level) {
            case LlamaLogLevel.disabled:
                break;
            case LlamaLogLevel.fatal:
                // we don't use console.error here because it prints the stack trace
                console.warn(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.error:
                // we don't use console.error here because it prints the stack trace
                console.warn(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.warn:
                console.warn(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.info:
                console.info(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.log:
                console.info(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.debug:
                console.debug(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            default:
                void level;
                console.warn(getConsoleLogPrefix() + getColorForLogLevel(LlamaLogLevel.warn)(`Unknown log level: ${level}`));
                console.log(prefixAndColorMessage(message, getColorForLogLevel(level)));
        }
    }
}
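/*
 * Minimal end-to-end sketch (illustrative only, not part of this file; assumes
 * the `getLlama` factory referenced in the doc comments above and that
 * `loadModel` accepts a `modelPath` option pointing at a local GGUF file):
 *
 *     import {getLlama} from "node-llama-cpp";
 *
 *     const llama = await getLlama();
 *     console.log("GPU:", llama.gpu, "math cores:", llama.cpuMathCores);
 *     const model = await llama.loadModel({modelPath: "./model.gguf"});
 *     // ...use `model`, then clean up
 *     await llama.dispose();
 */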
function getColorForLogLevel(level) {
    switch (level) {
        case LlamaLogLevel.disabled: return chalk.whiteBright;
        case LlamaLogLevel.fatal: return chalk.redBright;
        case LlamaLogLevel.error: return chalk.red;
        case LlamaLogLevel.warn: return chalk.yellow;
        case LlamaLogLevel.info: return chalk.whiteBright;
        case LlamaLogLevel.log: return chalk.white;
        case LlamaLogLevel.debug: return chalk.gray;
        default:
            void level;
            return chalk.whiteBright;
    }
}
function prefixAndColorMessage(message, color) {
    return getConsoleLogPrefix() + (message
        .split("\n")
        .map((line) => color(line))
        .join("\n" + getConsoleLogPrefix()));
}
function logMessageIsOnlyDots(message) {
    if (message == null)
        return false;
    for (let i = 0; i < message.length; i++) {
        if (message[i] !== ".")
            return false;
    }
    return true;
}
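// The function below downgrades known benign llama.cpp messages (Metal/CUDA
// capability notices, backend load logs, and similar) to a less severe level so
// they are not surfaced as warnings by default.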
function getTransformedLogLevel(level, message, gpu) {
    if (level === LlamaLogLevel.warn && message.endsWith("the full capacity of the model will not be utilized"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("ggml_metal_init: skipping kernel_") && message.endsWith("(not supported)"))
        return LlamaLogLevel.log;
    else if (level === LlamaLogLevel.warn && message.startsWith("ggml_cuda_init: GGML_CUDA_FORCE_") && message.endsWith(" no"))
        return LlamaLogLevel.log;
    else if (level === LlamaLogLevel.info && message.startsWith("load_backend: loaded "))
        return LlamaLogLevel.log;
    else if (level === LlamaLogLevel.warn && message.startsWith("make_cpu_buft_list: disabling extra buffer types"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("init: embeddings required but some input tokens were not marked as outputs -> overriding"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("load: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("llama_init_from_model: model default pooling_type is [0], but [-1] was specified"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("llama_model_loader: direct I/O is enabled, disabling mmap"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("llama_model_loader: direct I/O is not available, using mmap"))
        return LlamaLogLevel.info;
    else if (gpu === false && level === LlamaLogLevel.warn && message.startsWith("llama_adapter_lora_init_impl: lora for '") && message.endsWith("' cannot use buft 'CPU_REPACK', fallback to CPU"))
        return LlamaLogLevel.info;
    else if (gpu === "metal" && level === LlamaLogLevel.warn && message.startsWith("ggml_metal_device_init: tensor API disabled for"))
        return LlamaLogLevel.info;
    return level;
}
//# sourceMappingURL=Llama.js.map