import os from "os";
import path from "path";
import chalk from "chalk";
import { DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { getConsoleLogPrefix } from "../utils/getConsoleLogPrefix.js";
import { LlamaModel } from "../evaluator/LlamaModel/LlamaModel.js";
import { DisposeGuard } from "../utils/DisposeGuard.js";
import { LlamaJsonSchemaGrammar } from "../evaluator/LlamaJsonSchemaGrammar.js";
import { LlamaGrammar } from "../evaluator/LlamaGrammar.js";
import { ThreadsSplitter } from "../utils/ThreadsSplitter.js";
import { getLlamaClasses } from "../utils/getLlamaClasses.js";
import { LlamaLocks, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaLogLevelGreaterThanOrEqual } from "./types.js";
import { MemoryOrchestrator } from "./utils/MemoryOrchestrator.js";
export const LlamaLogLevelToAddonLogLevel = new Map([
    [LlamaLogLevel.disabled, 0],
    [LlamaLogLevel.fatal, 1],
    [LlamaLogLevel.error, 2],
    [LlamaLogLevel.warn, 3],
    [LlamaLogLevel.info, 4],
    [LlamaLogLevel.log, 5],
    [LlamaLogLevel.debug, 6]
]);
const addonLogLevelToLlamaLogLevel = new Map([...LlamaLogLevelToAddonLogLevel.entries()].map(([key, value]) => [value, key]));
const defaultLogLevel = 5;
const defaultCPUMinThreadSplitterThreads = 4;
export class Llama {
    /** @internal */ _bindings;
    /** @internal */ _backendDisposeGuard = new DisposeGuard();
    /** @internal */ _memoryLock = {};
    /** @internal */ _consts;
    /** @internal */ _vramOrchestrator;
    /** @internal */ _vramPadding;
    /** @internal */ _ramOrchestrator;
    /** @internal */ _ramPadding;
    /** @internal */ _swapOrchestrator;
    /** @internal */ _debug;
    /** @internal */ _threadsSplitter;
    /** @internal */ _hadErrorLogs = false;
    /** @internal */ _gpu;
    /** @internal */ _numa;
    /** @internal */ _buildType;
    /** @internal */ _cmakeOptions;
    /** @internal */ _supportsGpuOffloading;
    /** @internal */ _supportsMmap;
    /** @internal */ _gpuSupportsMmap;
    /** @internal */ _supportsMlock;
    /** @internal */ _mathCores;
    /** @internal */ _llamaCppRelease;
    /** @internal */ _logger;
    /** @internal */ _logLevel;
    /** @internal */ _pendingLog = null;
    /** @internal */ _pendingLogLevel = null;
    /** @internal */ _logDispatchQueuedMicrotasks = 0;
    /** @internal */ _previousLog = null;
    /** @internal */ _previousLogLevel = null;
    /** @internal */ _nextLogNeedNewLine = false;
    /** @internal */ _disposed = false;
    _classes;
    onDispose = new EventRelay();
    constructor({ bindings, bindingPath, extBackendsPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, numa, buildGpu, maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator, skipLlamaInit }) {
        this._dispatchPendingLogMicrotask = this._dispatchPendingLogMicrotask.bind(this);
        this._onAddonLog = this._onAddonLog.bind(this);
        this._bindings = bindings;
        this._debug = debug;
        this._numa = numa ?? false;
        this._logLevel = this._debug
            ? LlamaLogLevel.debug
            : (logLevel ?? LlamaLogLevel.debug);
        const previouslyLoaded = bindings.markLoaded();
        if (!this._debug && (!skipLlamaInit || !previouslyLoaded)) {
            this._bindings.setLogger(this._onAddonLog);
            this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(this._logLevel) ?? defaultLogLevel);
        }
        bindings.loadBackends();
        let loadedGpu = bindings.getGpuType();
        if (loadedGpu == null || (loadedGpu === false && buildGpu !== false)) {
            const backendsPath = path.dirname(bindingPath);
            const fallbackBackendsDir = path.join(extBackendsPath ?? backendsPath, "fallback");
            bindings.loadBackends(backendsPath);
            loadedGpu = bindings.getGpuType();
            if (loadedGpu == null || (loadedGpu === false && buildGpu !== false))
                bindings.loadBackends(fallbackBackendsDir);
        }
        bindings.ensureGpuDeviceIsSupported();
        if (this._numa !== false)
            bindings.setNuma(numa);
        this._gpu = bindings.getGpuType() ?? false;
        this._supportsGpuOffloading = bindings.getSupportsGpuOffloading();
        this._supportsMmap = bindings.getSupportsMmap();
        this._gpuSupportsMmap = bindings.getGpuSupportsMmap();
        this._supportsMlock = bindings.getSupportsMlock();
        this._mathCores = Math.floor(bindings.getMathCores());
        this._consts = bindings.getConsts();
        this._vramOrchestrator = vramOrchestrator;
        this._vramPadding = vramPadding;
        this._ramOrchestrator = ramOrchestrator;
        this._ramPadding = ramPadding;
        this._swapOrchestrator = swapOrchestrator;
        this._threadsSplitter = new ThreadsSplitter(maxThreads ?? (this._gpu === false
            ? Math.max(defaultCPUMinThreadSplitterThreads, this._mathCores)
            : 0));
        this._logger = logger;
        this._buildType = buildType;
        this._cmakeOptions = Object.freeze({ ...cmakeOptions });
        this._llamaCppRelease = Object.freeze({
            repo: llamaCppRelease.repo,
            release: llamaCppRelease.release
        });
        this._onExit = this._onExit.bind(this);
        process.on("exit", this._onExit);
    }
    async dispose() {
        if (this._disposed)
            return;
        this._disposed = true;
        this.onDispose.dispatchEvent();
        await this._backendDisposeGuard.acquireDisposeLock();
        await this._bindings.dispose();
    }
    /** @hidden */
    async [Symbol.asyncDispose]() {
        await this.dispose();
    }
    get disposed() {
        return this._disposed;
    }
    get classes() {
        if (this._classes == null)
            this._classes = getLlamaClasses();
        return this._classes;
    }
    get gpu() {
        return this._gpu;
    }
    get supportsGpuOffloading() {
        return this._supportsGpuOffloading;
    }
    get supportsMmap() {
        return this._supportsMmap;
    }
    get gpuSupportsMmap() {
        return this._gpuSupportsMmap;
    }
    get supportsMlock() {
        return this._supportsMlock;
    }
    /** The number of CPU cores that are useful for math */
    get cpuMathCores() {
        return this._mathCores;
    }
    /**
     * The maximum number of threads that can be used by the Llama instance.
     *
     * If set to `0`, the Llama instance will have no limit on the number of threads.
     *
     * See the `maxThreads` option of `getLlama` for more information.
     */
    get maxThreads() {
        return this._threadsSplitter.maxThreads;
    }
    set maxThreads(value) {
        this._threadsSplitter.maxThreads = Math.floor(Math.max(0, value));
    }
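    // Illustrative sketch (not part of this class): adjusting the thread limit at runtime,
    // assuming `llama` is an instance obtained from `getLlama()`:
    //
    //     llama.maxThreads = 4; // cap evaluation work to 4 threads
    //     llama.maxThreads = 0; // remove the limit entirely
    //     console.log(llama.maxThreads, llama.cpuMathCores);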
    /**
     * See the `numa` option of `getLlama` for more information
     */
    get numa() {
        return this._numa;
    }
    get logLevel() {
        return this._logLevel;
    }
    set logLevel(value) {
        this._ensureNotDisposed();
        if (value === this._logLevel || this._debug)
            return;
        this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(value) ?? defaultLogLevel);
        this._logLevel = value;
    }
    get logger() {
        return this._logger;
    }
    set logger(value) {
        this._logger = value;
        if (value !== Llama.defaultConsoleLogger)
            this._nextLogNeedNewLine = false;
    }
    get buildType() {
        return this._buildType;
    }
    get cmakeOptions() {
        return this._cmakeOptions;
    }
    get llamaCppRelease() {
        return this._llamaCppRelease;
    }
    get systemInfo() {
        this._ensureNotDisposed();
        return this._bindings.systemInfo();
    }
    /**
     * VRAM padding used for memory size calculations, as these calculations are not always accurate.
     * This is set by default to ensure stability, but can be configured when you call `getLlama`.
     *
     * See `vramPadding` on `getLlama` for more information.
     */
    get vramPaddingSize() {
        return this._vramPadding.size;
    }
    /**
     * The total amount of VRAM that is currently being used.
     *
     * `unifiedSize` represents the amount of VRAM that is shared between the CPU and GPU.
     * On SoC devices, this is usually the same as `total`.
     */
    async getVramState() {
        this._ensureNotDisposed();
        const { total, used, unifiedSize } = this._bindings.getGpuVramInfo();
        return {
            total,
            used,
            free: Math.max(0, total - used),
            unifiedSize
        };
    }
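    // Illustrative sketch: reading VRAM usage, assuming `llama` comes from `getLlama()`.
    // `free` is derived as `total - used`, clamped at 0:
    //
    //     const vram = await llama.getVramState();
    //     console.log(`VRAM: ${vram.used} / ${vram.total} bytes used, ${vram.free} free`);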
    /**
     * Get the state of the swap memory.
     *
     * **`maxSize`** - The maximum size of the swap memory that the system can allocate.
     * If the swap size is dynamic (like on macOS), this will be `Infinity`.
     *
     * **`allocated`** - The total size allocated by the system for swap memory.
     *
     * **`used`** - The amount of swap memory that is currently being used from the `allocated` size.
     *
     * On Windows, this will return the info for the page file.
     */
    async getSwapState() {
        this._ensureNotDisposed();
        const { total, maxSize, free } = this._bindings.getSwapInfo();
        return {
            maxSize: maxSize === -1
                ? Infinity
                : maxSize,
            allocated: total,
            used: total - free
        };
    }
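    // Illustrative sketch: `maxSize` becomes `Infinity` when the bindings report `-1`
    // (a dynamically-sized swap, like on macOS):
    //
    //     const swap = await llama.getSwapState();
    //     if (swap.maxSize === Infinity)
    //         console.log(`Swap is dynamic; ${swap.used} of ${swap.allocated} bytes in use`);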
    async getGpuDeviceNames() {
        this._ensureNotDisposed();
        const { deviceNames } = this._bindings.getGpuDeviceInfo();
        return deviceNames;
    }
    async loadModel(options) {
        this._ensureNotDisposed();
        return await withLock([this._memoryLock, LlamaLocks.loadToMemory], options.loadSignal, async () => {
            this._ensureNotDisposed();
            const preventDisposalHandle = this._backendDisposeGuard.createPreventDisposalHandle();
            try {
                return await LlamaModel._create(options, { _llama: this });
            }
            finally {
                preventDisposalHandle.dispose();
            }
        });
    }
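    // Illustrative sketch, assuming a local GGUF file (the "model.gguf" path is hypothetical):
    // the load is serialized behind `LlamaLocks.loadToMemory` and the backend is guarded
    // against disposal while the load is in progress:
    //
    //     const model = await llama.loadModel({ modelPath: "model.gguf" });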
    /* eslint-disable @stylistic/max-len */
    /**
     * @see [Using a JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#json-schema) tutorial
     * @see [Reducing Hallucinations When Using JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#reducing-json-schema-hallucinations) tutorial
     */
    async createGrammarForJsonSchema(schema) {
        return new LlamaJsonSchemaGrammar(this, schema);
    }
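    // Illustrative sketch: constraining generation to a JSON schema
    // (the schema shape below is just an example):
    //
    //     const grammar = await llama.createGrammarForJsonSchema({
    //         type: "object",
    //         properties: { answer: { type: "string" } }
    //     });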
    /* eslint-enable @stylistic/max-len */
    async getGrammarFor(type) {
        return await LlamaGrammar.getFor(this, type);
    }
    /**
     * @see [Using Grammar](https://node-llama-cpp.withcat.ai/guide/grammar) tutorial
     */
    async createGrammar(options) {
        return new LlamaGrammar(this, options);
    }
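    // Illustrative sketch (argument values are assumptions): `getGrammarFor` resolves a
    // bundled grammar by name, while `createGrammar` accepts a raw GBNF grammar string:
    //
    //     const jsonGrammar = await llama.getGrammarFor("json");
    //     const yesNoGrammar = await llama.createGrammar({ grammar: 'root ::= "yes" | "no"' });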
    /** @internal */
    async _init() {
        await this._bindings.init();
    }
    /**
     * Log messages related to the Llama instance
     * @internal
     */
    _log(level, message) {
        this._onAddonLog(LlamaLogLevelToAddonLogLevel.get(level) ?? defaultLogLevel, message + "\n");
    }
    /** @internal */
    _onAddonLog(level, message) {
        const llamaLogLevel = addonLogLevelToLlamaLogLevel.get(level) ?? LlamaLogLevel.fatal;
        if (this._pendingLog != null && this._pendingLogLevel != null && this._pendingLogLevel != llamaLogLevel) {
            this._callLogger(this._pendingLogLevel, this._pendingLog);
            this._pendingLog = null;
        }
        const sourceMessage = (this._pendingLog ?? "") + message;
        const lastNewLineIndex = sourceMessage.lastIndexOf("\n");
        const currentLog = lastNewLineIndex < 0
            ? sourceMessage
            : sourceMessage.slice(0, lastNewLineIndex);
        const nextLog = lastNewLineIndex < 0
            ? ""
            : sourceMessage.slice(lastNewLineIndex + 1);
        if (currentLog !== "")
            this._callLogger(llamaLogLevel, currentLog);
        if (nextLog !== "") {
            this._pendingLog = nextLog;
            this._pendingLogLevel = llamaLogLevel;
            queueMicrotask(this._dispatchPendingLogMicrotask);
            this._logDispatchQueuedMicrotasks++;
        }
        else
            this._pendingLog = null;
    }
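    // Note (illustrative): the addon may emit partial lines, so only complete lines are
    // forwarded immediately; the trailing fragment is buffered in `_pendingLog` and flushed
    // on a later microtask. For example, receiving "loading model\npart" logs
    // "loading model" right away and keeps "part" pending until more text (or the flush) arrives.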
    /** @internal */
    _dispatchPendingLogMicrotask() {
        this._logDispatchQueuedMicrotasks--;
        if (this._logDispatchQueuedMicrotasks !== 0)
            return;
        if (this._pendingLog != null && this._pendingLogLevel != null) {
            this._callLogger(this._pendingLogLevel, this._pendingLog);
            this._pendingLog = null;
        }
    }
    /** @internal */
    _callLogger(level, message) {
        // llama.cpp uses dots to indicate progress, so we don't want to print them as different lines,
        // and instead, append to the same log line
        if (logMessageIsOnlyDots(message) && this._logger === Llama.defaultConsoleLogger) {
            if (logMessageIsOnlyDots(this._previousLog) && level === this._previousLogLevel) {
                process.stdout.write(message);
            }
            else {
                this._nextLogNeedNewLine = true;
                process.stdout.write(prefixAndColorMessage(message, getColorForLogLevel(level)));
            }
        }
        else {
            if (this._nextLogNeedNewLine) {
                process.stdout.write("\n");
                this._nextLogNeedNewLine = false;
            }
            try {
                const transformedLogLevel = getTransformedLogLevel(level, message, this.gpu);
                if (LlamaLogLevelGreaterThanOrEqual(transformedLogLevel, this._logLevel))
                    this._logger(transformedLogLevel, message);
            }
            catch (err) {
                // the native addon code calls this function, so there's no use to throw an error here
            }
        }
        this._previousLog = message;
        this._previousLogLevel = level;
        if (!this._hadErrorLogs && LlamaLogLevelGreaterThan(level, LlamaLogLevel.error))
            this._hadErrorLogs = true;
    }
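    // Note (illustrative): with the default console logger, a run of dot-only progress
    // messages ("....") is appended to the current stdout line instead of being printed
    // as separate prefixed lines; the next non-dot message first emits the missing newline.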
    /** @internal */
    _onExit() {
        if (this._pendingLog != null && this._pendingLogLevel != null) {
            this._callLogger(this._pendingLogLevel, this._pendingLog);
            this._pendingLog = null;
        }
    }
    /** @internal */
    _ensureNotDisposed() {
        if (this._disposed)
            throw new DisposedError();
    }
    /** @internal */
    static async _create({ bindings, bindingPath, extBackendsPath, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, skipLlamaInit = false, debug, numa }) {
        const vramOrchestrator = new MemoryOrchestrator(() => {
            const { total, used, unifiedSize } = bindings.getGpuVramInfo();
            return {
                total,
                free: Math.max(0, total - used),
                unifiedSize
            };
        });
        const ramOrchestrator = new MemoryOrchestrator(() => {
            const used = process.memoryUsage().rss;
            const total = os.totalmem();
            return {
                total,
                free: Math.max(0, total - used),
                unifiedSize: total
            };
        });
        const swapOrchestrator = new MemoryOrchestrator(() => {
            const { total, maxSize, free } = bindings.getSwapInfo();
            const used = total - free;
            if (maxSize === -1)
                return {
                    total: Infinity,
                    free: Infinity,
                    unifiedSize: Infinity
                };
            return {
                total: maxSize,
                free: maxSize - used,
                unifiedSize: maxSize
            };
        });
        let resolvedRamPadding;
        if (ramPadding instanceof Function)
            resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding((await ramOrchestrator.getMemoryState()).total));
        else
            resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding);
        const llama = new Llama({
            bindings,
            bindingPath,
            extBackendsPath,
            buildType,
            cmakeOptions: buildMetadata.buildOptions.customCmakeOptions,
            llamaCppRelease: {
                repo: buildMetadata.buildOptions.llamaCpp.repo,
                release: buildMetadata.buildOptions.llamaCpp.release
            },
            logLevel,
            logger,
            debug,
            numa,
            buildGpu: buildMetadata.buildOptions.gpu,
            vramOrchestrator,
            maxThreads,
            vramPadding: vramOrchestrator.reserveMemory(0),
            ramOrchestrator,
            ramPadding: resolvedRamPadding,
            swapOrchestrator,
            skipLlamaInit
        });
        if (llama.gpu === false || vramPadding === 0) {
            // do nothing since `llama._vramPadding` is already set to 0
        }
        else if (vramPadding instanceof Function) {
            const currentVramPadding = llama._vramPadding;
            llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding((await vramOrchestrator.getMemoryState()).total));
            currentVramPadding.dispose();
        }
        else {
            const currentVramPadding = llama._vramPadding;
            llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding);
            currentVramPadding.dispose();
        }
        if (!skipLlamaInit)
            await llama._init();
        return llama;
    }
    static defaultConsoleLogger(level, message) {
        switch (level) {
            case LlamaLogLevel.disabled:
                break;
            case LlamaLogLevel.fatal:
                // we don't use console.error here because it prints the stack trace
                console.warn(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.error:
                // we don't use console.error here because it prints the stack trace
                console.warn(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.warn:
                console.warn(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.info:
                console.info(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.log:
                console.info(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.debug:
                console.debug(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            default:
                void level;
                console.warn(getConsoleLogPrefix() + getColorForLogLevel(LlamaLogLevel.warn)(`Unknown log level: ${level}`));
                console.log(prefixAndColorMessage(message, getColorForLogLevel(level)));
        }
    }
}
function getColorForLogLevel(level) {
    switch (level) {
        case LlamaLogLevel.disabled: return chalk.whiteBright;
        case LlamaLogLevel.fatal: return chalk.redBright;
        case LlamaLogLevel.error: return chalk.red;
        case LlamaLogLevel.warn: return chalk.yellow;
        case LlamaLogLevel.info: return chalk.whiteBright;
        case LlamaLogLevel.log: return chalk.white;
        case LlamaLogLevel.debug: return chalk.gray;
        default:
            void level;
            return chalk.whiteBright;
    }
}
function prefixAndColorMessage(message, color) {
    return getConsoleLogPrefix() + (message
        .split("\n")
        .map((line) => color(line))
        .join("\n" + getConsoleLogPrefix()));
}
function logMessageIsOnlyDots(message) {
    if (message == null)
        return false;
    for (let i = 0; i < message.length; i++) {
        if (message[i] !== ".")
            return false;
    }
    return true;
}
function getTransformedLogLevel(level, message, gpu) {
    if (level === LlamaLogLevel.warn && message.endsWith("the full capacity of the model will not be utilized"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("ggml_metal_init: skipping kernel_") && message.endsWith("(not supported)"))
        return LlamaLogLevel.log;
    else if (level === LlamaLogLevel.warn && message.startsWith("ggml_cuda_init: GGML_CUDA_FORCE_") && message.endsWith(" no"))
        return LlamaLogLevel.log;
    else if (level === LlamaLogLevel.info && message.startsWith("load_backend: loaded "))
        return LlamaLogLevel.log;
    else if (level === LlamaLogLevel.warn && message.startsWith("make_cpu_buft_list: disabling extra buffer types"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("init: embeddings required but some input tokens were not marked as outputs -> overriding"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("load: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("llama_init_from_model: model default pooling_type is [0], but [-1] was specified"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("llama_model_loader: direct I/O is enabled, disabling mmap"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("llama_model_loader: direct I/O is not available, using mmap"))
        return LlamaLogLevel.info;
    else if (gpu === false && level === LlamaLogLevel.warn && message.startsWith("llama_adapter_lora_init_impl: lora for '") && message.endsWith("' cannot use buft 'CPU_REPACK', fallback to CPU"))
        return LlamaLogLevel.info;
    else if (gpu === "metal" && level === LlamaLogLevel.warn && message.startsWith("ggml_metal_device_init: tensor API disabled for"))
        return LlamaLogLevel.info;
    return level;
}
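// Note (illustrative): known-noisy llama.cpp warnings are demoted here so they don't surface
// at the `warn` level by default. For example, a Metal "skipping kernel_* (not supported)"
// warning is forwarded as `log` and is only printed when the configured `logLevel` is
// verbose enough (see `_callLogger`).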
//# sourceMappingURL=Llama.js.map