import { getLlamaWithoutBackend } from "../../bindings/utils/getLlamaWithoutBackend.js";
import { getDefaultContextBatchSize, getDefaultContextSequences } from "../../evaluator/LlamaContext/LlamaContext.js";
import { GgufArchitectureType } from "../types/GgufMetadataTypes.js";
import { getReadablePath } from "../../cli/utils/getReadablePath.js";
import { padSafeContextSize } from "../../evaluator/LlamaContext/utils/padSafeContextSize.js";
import { GgufInsightsConfigurationResolver } from "./GgufInsightsConfigurationResolver.js";
import { GgufInsightsTokens } from "./GgufInsightsTokens.js";
export class GgufInsights {
    /** @internal */ _llama;
    /** @internal */ _modelSize;
    /** @internal */ _totalFileLayers = null;
    /** @internal */ _supportsRanking;
    /** @internal */ _ggufFileInfo;
    /** @internal */ _configurationResolver;
    /** @internal */ _tokens;
    constructor(ggufFileInfo, llama) {
        this._llama = llama;
        this._ggufFileInfo = ggufFileInfo;
        this._modelSize = calculateTensorsSize(ggufFileInfo.fullTensorInfo ?? [], llama, true, true);
        this._configurationResolver = GgufInsightsConfigurationResolver._create(this);
        this._tokens = GgufInsightsTokens._create(this);
    }
    /**
     * Get warnings about the model file that would affect its usage.
     *
     * Most of these warnings are also generated by `llama.cpp`
     */
    getWarnings(modelFilePath) {
        const warnings = [];
        const modelFilePathText = (modelFilePath != null && modelFilePath !== "")
            ? ` ("${getReadablePath(modelFilePath)}")`
            : "";
        if (this._ggufFileInfo?.metadata?.tokenizer?.ggml?.model === "gpt2" &&
            this._ggufFileInfo?.metadata?.tokenizer?.ggml?.pre == null) {
            // equivalent to the warning in `llama.cpp` under `llm_load_vocab`: "missing pre-tokenizer type, using: 'default'"
            warnings.push(`This model file${modelFilePathText} is missing a pre-tokenizer configuration. ` +
                "This may cause incorrect tokenization and thus degrade the generation quality. " +
                "Consider using a newer model or regenerating this GGUF model file");
        }
        return warnings;
    }
    get ggufFileInfo() {
        return this._ggufFileInfo;
    }
    get configurationResolver() {
        return this._configurationResolver;
    }
    get tokens() {
        return this._tokens;
    }
    /** The context size the model was trained on */
    get trainContextSize() {
        return this._ggufFileInfo.architectureMetadata.context_length;
    }
    /** The size of an embedding vector the model can produce */
    get embeddingVectorSize() {
        return this._ggufFileInfo.architectureMetadata.embedding_length;
    }
    get totalLayers() {
        const outputLayers = 1;
        return this._getTotalFileLayers() + outputLayers;
    }
    get modelSize() {
        return this._modelSize;
    }
    get flashAttentionSupported() {
        // source: `llama_new_context_with_model` in `llama.cpp`
        if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.grok)
            return false;
        else if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2)
            return false;
        else {
            const nHead = this._ggufFileInfo.architectureMetadata?.attention?.head_count ?? 0;
            const nEmbd = this._ggufFileInfo.architectureMetadata?.embedding_length ?? 0;
            const nEmbdHeadK = this._ggufFileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
            const nEmbdHeadV = this._ggufFileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
            if (nEmbdHeadK !== nEmbdHeadV)
                return false;
        }
        return true;
    }
    get hasEncoder() {
        switch (this._ggufFileInfo.metadata?.general?.architecture) {
            case GgufArchitectureType.t5:
            case GgufArchitectureType.t5encoder:
                return true;
        }
        return false;
    }
    get hasDecoder() {
        switch (this._ggufFileInfo.metadata?.general?.architecture) {
            case GgufArchitectureType.t5encoder:
                return false;
        }
        return true;
    }
    get isRecurrent() {
        switch (this._ggufFileInfo.metadata?.general?.architecture) {
            case GgufArchitectureType.mamba:
            case GgufArchitectureType.mamba2:
            case GgufArchitectureType.rwkv6:
            case GgufArchitectureType.rwkv6qwen2:
            case GgufArchitectureType.rwkv7:
            case GgufArchitectureType.arwkv7:
                return true;
        }
        return false;
    }
    get supportsRanking() {
        if (this._supportsRanking != null)
            return this._supportsRanking;
        const layers = this._ggufFileInfo.fullTensorInfo ?? [];
        for (let i = layers.length - 1; i >= 0; i--) {
            const tensor = layers[i];
            if (tensor == null)
                continue;
            if (tensor.name === "cls.weight" || tensor.name === "cls.output.weight") {
                this._supportsRanking = this.tokens.sepToken != null || this.tokens.eosToken != null ||
                    isRankingTemplateValid(parseRankingTemplate(this._ggufFileInfo.metadata?.tokenizer?.["chat_template.rerank"]));
                this._supportsRanking &&= !(this.hasEncoder && this.hasDecoder); // encoder-decoder models are not supported
                return this._supportsRanking;
            }
        }
        this._supportsRanking = false;
        return this._supportsRanking;
    }
    /**
     * The size of the SWA (Sliding Window Attention).
     *
     * When `undefined`, the model does not use sliding window attention.
     */
    get swaSize() {
        const slidingWindow = this._ggufFileInfo?.architectureMetadata?.attention?.sliding_window;
        if (slidingWindow == null || slidingWindow <= 0)
            return undefined;
        const trainContextSize = this.trainContextSize;
        if (trainContextSize != null && slidingWindow >= trainContextSize)
            return undefined;
        return slidingWindow;
    }
    estimateModelResourceRequirements({ gpuLayers, useMmap = this._llama.supportsMmap, gpuSupportsMmap = this._llama.gpuSupportsMmap }) {
        const { cpu, gpu } = this._getTensorResourceSplit(gpuLayers);
        return {
            cpuRam: calculateTensorsSize(cpu, this._llama, false),
            gpuVram: calculateTensorsSize(gpu, this._llama, useMmap && gpuSupportsMmap)
        };
    }
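    // A minimal usage sketch (the layer count here is illustrative, not a recommendation):
    //   const insights = await GgufInsights.from(ggufFileInfo);
    //   const { cpuRam, gpuVram } = insights.estimateModelResourceRequirements({ gpuLayers: 32 });
    //   // `cpuRam` and `gpuVram` are estimated byte counts for holding the model weights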
    /**
     * Estimates the memory required to create a context of the given parameters, based on the implementation details of `llama.cpp`.
     * The calculation doesn't include a precise estimation of the graph overhead memory, so a rough estimate is used for it instead.
     * The graph overhead estimation will be made more precise in the future, but it's good enough for now.
     */
    estimateContextResourceRequirements({ contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false, swaFullCache = false }) {
        if (sequences == null)
            sequences = getDefaultContextSequences();
        if (batchSize == null)
            batchSize = getDefaultContextBatchSize({ contextSize, sequences });
        const llmData = this._ggufFileInfo.architectureMetadata;
        const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
        const slidingWindow = this.swaSize ?? 0;
        const kvUnified = false;
        const usingSWA = !swaFullCache && slidingWindow > 0 && slidingWindow < contextSize &&
            (this.trainContextSize == null || slidingWindow < this.trainContextSize);
        const swaPattern = getSwaPatternForArchitecture(this._ggufFileInfo.metadata?.general?.architecture);
        const nonSwaPercent = swaPattern <= 1
            ? 1
            : (1 / (swaPattern + (flashAttention ? -0.5 : -1)));
        // source: `llama_kv_cache_unified::get_padding` in `llama-kv-cache.cpp`
        const kvCachePadding = 1;
        const actualContextSize = kvUnified
            ? padSafeContextSize(sequences * contextSize, "up")
            : sequences * padSafeContextSize(contextSize, "up");
        const kvSize = usingSWA
            ? ((1 - nonSwaPercent) * Math.min(actualContextSize, ggmlPad(sequences * slidingWindow + batchSize, kvCachePadding)) +
                nonSwaPercent * actualContextSize)
            : actualContextSize;
        const totalFileLayers = this._getTotalFileLayers();
        const finalGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalFileLayers, totalFileLayers));
        const finalCpuLayers = totalFileLayers - finalGpuLayers;
        const usingGpu = finalGpuLayers !== 0;
        const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0;
        const embeddingSize = llmData.embedding_length ?? 0;
        const floatBytes = 4; // sizeof(float)
        const int32TBytes = 4; // sizeof(int32_t)
        const estimateOutput = (nOutputs) => {
            // source: `llama_context::output_reserve` in `llama-context.cpp`
            const nOutputsMax = Math.max(batchSize, nOutputs);
            const isT5 = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.t5;
            const hasLogits = isT5 || !isEmbeddingContext;
            const hasEmbd = isT5 || isEmbeddingContext;
            const logitsSize = hasLogits
                ? (vocabularySize * nOutputsMax)
                : 0;
            const embdSize = hasEmbd
                ? (embeddingSize * nOutputsMax)
                : 0;
            const outputBufferSize = (logitsSize + embdSize) * floatBytes;
            const outputIdsArr = int32TBytes * batchSize;
            return outputBufferSize + outputIdsArr;
        };
        const estimateGraphOverheadMemory = () => {
            const s1MB = Math.pow(1024, 2);
            const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
            const expertCount = llmData?.expert_count ?? 0;
            const headCount = llmData?.attention?.head_count ?? 0;
            const embeddingLength = llmData?.embedding_length ?? 0;
            let defaultCalculationAdjustment = 0;
            if (batchSize == null)
                return 0;
            if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.llama) {
                if (expertCount > 0) {
                    const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2;
                    return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount));
                }
                return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount));
            }
            else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen2) {
                if (modelGpuLayers === this.totalLayers) {
                    defaultCalculationAdjustment -= (s1MB * 340) * (this.trainContextSize == null
                        ? 1
                        : kvSize / this.trainContextSize);
                }
                else {
                    defaultCalculationAdjustment -= (s1MB * 250) + ((s1MB * 50) * (this.trainContextSize == null
                        ? 1
                        : kvSize / this.trainContextSize));
                }
            }
            else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.gemma) {
                // only works properly when all layers are on the GPU, which is why it's commented out:
                // return int32TBytes * batchSize * ((llmData.embedding_length ?? 0));
                if (modelGpuLayers === this.totalLayers) {
                    defaultCalculationAdjustment += (s1MB * 40) - ((s1MB * 270) * (this.trainContextSize == null
                        ? 1
                        : kvSize / this.trainContextSize));
                }
                else {
                    defaultCalculationAdjustment += -(s1MB * 550) + ((s1MB * 150) * (this.trainContextSize == null
                        ? 1
                        : Math.max(0, (1 - (kvSize / this.trainContextSize)))));
                }
            }
            else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.stablelm) {
                const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
                return (int32TBytes * batchSize * kvSize * headCount) - (50 * s1MB);
                // if (modelGpuLayers === this.totalLayers) {
                //     defaultCalculationAdjustment += -(s1MB * 20) + (
                //         (s1MB * 250) * (
                //             this.trainContextSize == null
                //                 ? 1
                //                 : kvSize / this.trainContextSize
                //         )
                //     );
                // } else {
                //     defaultCalculationAdjustment += -(s1MB * 40) + (
                //         (s1MB * 300) * (
                //             this.trainContextSize == null
                //                 ? 1
                //                 : kvSize / this.trainContextSize
                //         )
                //     );
                // }
            }
            else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen3) {
                return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount));
            }
            else if (expertCount > 0) {
                const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2;
                return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount));
            }
            const totalElements = tensorInfo.length === 0
                ? this.totalLayers * (((llmData.embedding_length ?? 0) +
                    (llmData.feed_forward_length ?? 0)) / 2)
                : tensorInfo.reduce((res, tensor) => {
                    return res + tensor.dimensions.reduce((res, dim) => res + Number(dim), 0);
                }, 0);
            if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.phi3) {
                // magic numbers for estimation. will be improved in the future
                return (totalElements * 123 * (kvSize / 4096)) + defaultCalculationAdjustment;
            }
            // magic numbers for estimation. will be improved in the future
            return (totalElements * 77.655 * (kvSize / 4096)) + defaultCalculationAdjustment;
        };
        const gpuKVCacheSize = usingGpu
            ? this._estimateKvMemorySizeInBytes(kvSize, finalGpuLayers < totalFileLayers
                ? (finalGpuLayers + 1)
                : finalGpuLayers)
            : 0;
        const cpuKVCacheSize = this._estimateKvMemorySizeInBytes(kvSize, finalCpuLayers);
        // source: `llama_context::graph_max_nodes` in `llama-context.cpp`
        const getMaxNodesMultiplier = (arch, nTokens) => {
            if (arch === GgufArchitectureType.qwen3next)
                return {
                    min: nTokens * 40,
                    multiplier: 32
                };
            return {
                min: 1024,
                multiplier: 8
            };
        };
        const maxNodesMultiplier = getMaxNodesMultiplier(this._ggufFileInfo.metadata?.general?.architecture, Math.min(actualContextSize, batchSize));
        const maxNodes = Math.max(maxNodesMultiplier.min, maxNodesMultiplier.multiplier * tensorInfo.length);
        const cpuNodes = maxNodesMultiplier.multiplier * (tensorInfo.length * (finalCpuLayers / totalFileLayers));
        const gpuNodes = maxNodes - cpuNodes;
        const gpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * gpuNodes) +
            this._llama._bindings.getGgmlGraphOverheadCustom(gpuNodes, false);
        const cpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * cpuNodes) +
            this._llama._bindings.getGgmlGraphOverheadCustom(cpuNodes, false);
        const graphOverheadMemory = (flashAttention || !includeGraphOverhead)
            ? 0
            : estimateGraphOverheadMemory();
        const graphOverheadGpuSize = usingGpu
            ? Math.round(graphOverheadMemory * (finalGpuLayers / totalFileLayers))
            : 0;
        const graphOverheadCpuSize = graphOverheadMemory - graphOverheadGpuSize;
        const outputBufferSize = estimateOutput(sequences);
        const gpuVram = gpuKVCacheSize + gpuComputeBufferSize + graphOverheadGpuSize + outputBufferSize;
        const cpuRam = cpuKVCacheSize + cpuComputeBufferSize + graphOverheadCpuSize + outputBufferSize;
        return {
            cpuRam,
            gpuVram: usingGpu
                ? gpuVram
                : 0
        };
    }
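    // A minimal usage sketch, given a `GgufInsights` instance (the parameter values are illustrative):
    //   const { cpuRam, gpuVram } = insights.estimateContextResourceRequirements({
    //       contextSize: 4096,
    //       modelGpuLayers: 32,
    //       sequences: 1
    //   });
    //   // both values are estimated byte counts covering the KV cache, compute buffers and output buffers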
    /**
     * Get the split tensor resources for CPU and GPU based on the number of GPU layers
     * @internal
     */
    _getTensorResourceSplit(gpuLayers) {
        const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
        const architecture = this._ggufFileInfo.metadata?.general?.architecture;
        if (gpuLayers === 0) {
            return {
                cpu: tensorInfo,
                gpu: []
            };
        }
        const fileLayers = this._getFileLayers();
        const startGpuLayer = Math.max(0, fileLayers - gpuLayers);
        const gpuTensors = [];
        const cpuTensors = [];
        let tokenEmbedLayer;
        let mainOutputLayer;
        for (const singleTensorInfo of tensorInfo) {
            if (isMainOutputLayer(singleTensorInfo.name))
                mainOutputLayer = singleTensorInfo;
            else if (isTokenEmbedLayer(singleTensorInfo.name))
                tokenEmbedLayer = singleTensorInfo;
            // in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_INPUT` are always
            // loaded with `model.dev_input`, which is always set to the CPU
            if (isInputLayer(singleTensorInfo.name)) {
                cpuTensors.push(singleTensorInfo);
                continue;
                // in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_OUTPUT` are always
                // loaded with `model.dev_output`, which is set to the GPU only if all the layers are on the GPU
            }
            else if (isOutputLayer(singleTensorInfo.name)) {
                if (gpuLayers === this.totalLayers) {
                    gpuTensors.push(singleTensorInfo);
                    continue;
                }
                else {
                    cpuTensors.push(singleTensorInfo);
                    continue;
                }
            }
            const { layerNumber } = parseTensorName(singleTensorInfo.name);
            if (gpuLayers !== this.totalLayers) {
                if (architecture === GgufArchitectureType.qwen2 || architecture === GgufArchitectureType.gemma) {
                    if (layerNumber != null && layerNumber >= startGpuLayer)
                        gpuTensors.push(singleTensorInfo);
                    else
                        cpuTensors.push(singleTensorInfo);
                    continue;
                }
            }
            if (layerNumber == null || layerNumber >= startGpuLayer)
                gpuTensors.push(singleTensorInfo);
            else
                cpuTensors.push(singleTensorInfo);
        }
        if (mainOutputLayer == null && tokenEmbedLayer != null && gpuLayers === this.totalLayers && !gpuTensors.includes(tokenEmbedLayer))
            gpuTensors.push(tokenEmbedLayer);
        return {
            cpu: cpuTensors,
            gpu: gpuTensors
        };
    }
    /** @internal */
    _determineNumberOfLayersFromTensorInfo() {
        const layerNumbers = new Set();
        for (const singleTensorInfo of (this._ggufFileInfo.fullTensorInfo ?? [])) {
            const { layerNumber } = parseTensorName(singleTensorInfo.name);
            if (layerNumber != null)
                layerNumbers.add(layerNumber);
        }
        return layerNumbers.size;
    }
    /** @internal */
    _getFileLayers() {
        return this._ggufFileInfo.architectureMetadata.block_count ?? this._determineNumberOfLayersFromTensorInfo();
    }
    /** @internal */
    _estimateKvMemorySizeInBytes(kvSize, layers) {
        // source: `llama_kv_cache_init` in `llama.cpp`
        const nHead = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
        const nEmbd = this._ggufFileInfo.architectureMetadata.embedding_length ?? 0;
        const nEmbdHeadK = this._ggufFileInfo.architectureMetadata.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
        const nHeadKv = this._ggufFileInfo.architectureMetadata.attention?.head_count_kv ?? nHead;
        const nEmbdHeadV = this._ggufFileInfo.architectureMetadata.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
        const ssmDConv = this._ggufFileInfo.architectureMetadata.ssm?.conv_kernel ?? 0;
        const ssmDInner = this._ggufFileInfo.architectureMetadata.ssm?.inner_size ?? 0;
        const modelNEmbdKS = (this._ggufFileInfo.architectureMetadata.wkv?.head_size ?? 0) !== 0
            ? (this._ggufFileInfo.architectureMetadata.token_shift_count ?? 0) * nEmbd
            : (ssmDConv > 0 ? (ssmDConv - 1) : 0) * ssmDInner;
        const ssmDState = this._ggufFileInfo.architectureMetadata.ssm?.state_size ?? 0;
        const modelNEmbdVS = (this._ggufFileInfo.architectureMetadata.wkv?.head_size ?? 0) !== 0
            ? nEmbd * (this._ggufFileInfo.architectureMetadata.wkv?.head_size ?? 0)
            : ssmDState * ssmDInner;
        let totalElementsK = 0;
        let totalElementsV = 0;
        for (let i = 0; i < layers; i++) {
            const nHeadKvArrayItem = (typeof nHeadKv === "number")
                ? nHeadKv
                : nHeadKv[i] !== 0
                    ? nHeadKv[i]
                    : nHead;
            const nEmbdKGqa = nEmbdHeadK * nHeadKvArrayItem;
            const nEmbdVGqa = nEmbdHeadV * nHeadKvArrayItem;
            const totalNEmbdKGqa = nEmbdKGqa + modelNEmbdKS;
            const totalNEmbdVGqa = nEmbdVGqa + modelNEmbdVS;
            totalElementsK += totalNEmbdKGqa * kvSize;
            totalElementsV += totalNEmbdVGqa * kvSize;
        }
        const keyTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba
            // if `type_k` of `llama_context_params` changes to be configurable in `LlamaContext`,
            // this would have to depend on that value
            ? this._llama._consts.ggmlTypeF32Size
            : this._llama._consts.ggmlTypeF16Size;
        const valueTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba
            // if `type_v` of `llama_context_params` changes to be configurable in `LlamaContext`,
            // this would have to depend on that value
            ? this._llama._consts.ggmlTypeF32Size
            : this._llama._consts.ggmlTypeF16Size;
        return ((totalElementsK * keyTypeSize) +
            (totalElementsV * valueTypeSize));
    }
    /** @internal */
    _getTotalFileLayers() {
        if (this._totalFileLayers != null)
            return this._totalFileLayers;
        this._totalFileLayers = this._getFileLayers();
        return this._totalFileLayers;
    }
    /**
     * @param ggufFileInfo
     * @param llama - If you already have a `Llama` instance, pass it to reuse it for the `GgufInsights` instance.
     * If you don't pass a `Llama` instance, a basic `Llama` instance is created as a fallback - it's a slim instance that
     * doesn't instantiate a `llama.cpp` backend, so it won't utilize the GPU at all, and it will be shared with other `GgufInsights`
     * instances that need a fallback `Llama` instance.
     */
    static async from(ggufFileInfo, llama) {
        let resolvedLlama = llama;
        if (resolvedLlama == null)
            resolvedLlama = await getLlamaWithoutBackend();
        return new GgufInsights(ggufFileInfo, resolvedLlama);
    }
}
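// A minimal usage sketch (assumes `ggufFileInfo` is a parsed GGUF file info object obtained elsewhere in this library):
//   const insights = await GgufInsights.from(ggufFileInfo);
//   console.log(insights.trainContextSize, insights.totalLayers, insights.modelSize);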
function parseTensorName(tensorName) {
    if (tensorName == null)
        return { layerNumber: undefined };
    const layerTensorPrefix = "blk.";
    if (!tensorName.startsWith(layerTensorPrefix))
        return { layerNumber: undefined };
    const dotIndex = tensorName.indexOf(".", layerTensorPrefix.length);
    const layerNumberString = tensorName.slice(layerTensorPrefix.length, dotIndex < 0
        ? tensorName.length
        : dotIndex);
    const layerNumber = parseInt(layerNumberString);
    if (Number.isFinite(layerNumber))
        return { layerNumber };
    return { layerNumber: undefined };
}
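// For example, `parseTensorName("blk.12.attn_q.weight")` returns `{ layerNumber: 12 }`,
// while a non-layer tensor name such as `"output_norm.weight"` returns `{ layerNumber: undefined }`.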
function calculateTensorsSize(tensorsInfo, llama, useMmap, startFromTensorDataOffset = false) {
    if (!useMmap) {
        let size = 0;
        for (const tensorInfo of tensorsInfo)
            size += calculateTensorSize(tensorInfo, llama);
        return size;
    }
    const fileStats = new Map();
    for (const tensorInfo of tensorsInfo) {
        let stats = fileStats.get(tensorInfo.filePart);
        if (stats == null) {
            stats = {
                tensorsSize: 0
            };
            fileStats.set(tensorInfo.filePart, stats);
        }
        const tensorSize = calculateTensorSize(tensorInfo, llama);
        stats.tensorsSize += tensorSize;
        const startOffset = tensorInfo.offset;
        const endOffset = typeof startOffset === "number"
            ? startOffset + tensorSize
            : startOffset + BigInt(tensorSize);
        if (startFromTensorDataOffset)
            stats.startOffset = Number(BigInt(tensorInfo.fileOffset) - BigInt(tensorInfo.offset));
        else if (stats.startOffset == null || startOffset < stats.startOffset)
            stats.startOffset = startOffset;
        if (stats.endOffset == null || endOffset > stats.endOffset)
            stats.endOffset = endOffset;
    }
    let size = 0;
    for (const [, stats] of fileStats) {
        const offsetSize = (stats.endOffset == null || stats.startOffset == null)
            ? 0
            : Number(BigInt(stats.endOffset) - BigInt(stats.startOffset));
        const tensorsSize = stats.tensorsSize;
        size += Math.max(offsetSize, tensorsSize);
    }
    return size;
}
function calculateTensorSize(tensor, llama) {
    const typeSize = llama._bindings.getTypeSizeForGgmlType(tensor.ggmlType);
    const blockSize = llama._bindings.getBlockSizeForGgmlType(tensor.ggmlType);
    const ggmlMaxDims = llama._consts.ggmlMaxDims;
    if (typeSize == null || blockSize == null)
        throw new Error("Invalid type or block size");
    const { ne, nb } = getTensorNeAndNb(tensor, { typeSize, blockSize, ggmlMaxDims });
    if (blockSize === 1) {
        let totalBytes = typeSize;
        for (let i = 0; i < ggmlMaxDims; i++) {
            totalBytes += (ne[i] - 1) * nb[i];
        }
        return totalBytes;
    }
    else {
        let totalBytes = Math.floor((ne[0] * nb[0]) / blockSize);
        for (let i = 1; i < ggmlMaxDims; i++) {
            totalBytes += (ne[i] - 1) * nb[i];
        }
        return totalBytes;
    }
}
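// For example, for a contiguous F32 tensor (block size 1, type size 4 bytes) with dimensions [4096, 32],
// the `blockSize === 1` branch yields 4 + 4095 * 4 + 31 * 16384 = 524288 bytes, i.e. 4096 * 32 * 4 - the expected dense size.
// Quantized types go through the other branch, where the first dimension is counted in blocks rather than elements.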
function getTensorNeAndNb(tensor, { typeSize, blockSize, ggmlMaxDims }) {
    // number of elements
    // source: `ggml_new_tensor_impl` in `ggml.c`
    const ne = [
        ...tensor.dimensions,
        ...(Array(Math.max(0, ggmlMaxDims - tensor.dimensions.length)).fill(1))
    ].slice(0, ggmlMaxDims);
    // number of bytes
    // source: `ggml_new_tensor_impl` in `ggml.c`
    const nb = [
        typeSize,
        Math.floor(typeSize * (ne[0] / blockSize)),
        ...Array(ggmlMaxDims - 2).fill(0)
    ];
    for (let i = 2; i < ggmlMaxDims; i++) {
        nb[i] = nb[i - 1] * ne[i - 1];
    }
    return {
        ne,
        nb
    };
}
function isInputLayer(layerName) {
    const [firstPart] = layerName.split(".");
    if (firstPart == null)
        return false;
    // source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` that
    // in `llm_tensor_info_mapping` have a mapping to `LLM_TENSOR_LAYER_INPUT`
    switch (firstPart) {
        case "token_embd":
        case "token_embd_norm":
        case "token_types":
        case "position_embd":
            return true;
    }
    return false;
}
function isOutputLayer(layerName) {
    const [firstPart, secondPart] = layerName.split(".");
    if (firstPart == null)
        return false;
    // source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` that
    // in `llm_tensor_info_mapping` have a mapping to `LLM_TENSOR_LAYER_OUTPUT`
    switch (firstPart) {
        case "output":
        case "output_norm":
        case "cls":
            return true;
    }
    if (secondPart == null)
        return false;
    // source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` that
    // in `llm_tensor_info_mapping` have a mapping to `LLM_TENSOR_LAYER_OUTPUT`
    switch (firstPart + "." + secondPart) {
        case "cls.output":
        case "dec.output_norm":
        case "enc.output_norm":
            return true;
    }
    return false;
}
function isMainOutputLayer(layerName) {
    const [firstPart] = layerName.split(".");
    return firstPart === "output";
}
function isTokenEmbedLayer(layerName) {
    const [firstPart] = layerName.split(".");
    return firstPart === "token_embd";
}
function ggmlPad(value, padding) {
    return ((value + padding - 1) & ~(padding - 1));
}
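// For example, `ggmlPad(100, 32)` returns 128 - the value is rounded up to the nearest multiple of `padding`.
// Note that this bit-twiddling form assumes `padding` is a power of two.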
function getSwaPatternForArchitecture(architecture) {
    // source: `llama_model::load_hparams` in `llama-model.cpp` - calls to `hparams.set_swa_pattern`
    switch (architecture) {
        case GgufArchitectureType.llama4:
            return 4;
        case GgufArchitectureType.phi3:
            return 1;
        case GgufArchitectureType.gemma2:
            return 2;
        case GgufArchitectureType.gemma3:
            return 6;
        case GgufArchitectureType.gemma3n:
            return 5;
        case GgufArchitectureType.cohere2:
            return 4;
        case GgufArchitectureType.exaone4:
            return 4;
        case GgufArchitectureType.gptOss:
            return 2;
        case GgufArchitectureType.smallthinker:
            return 4;
    }
    return 1;
}
export function parseRankingTemplate(template) {
    if (template == null)
        return undefined;
    return template
        .replaceAll("{query}", "{{query}}")
        .replaceAll("{document}", "{{document}}");
}
export function isRankingTemplateValid(template) {
    return template != null && template.includes("{{query}}") && template.includes("{{document}}");
}
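// For example, a rerank template like "Query: {query}\nDocument: {document}" is normalized to
// "Query: {{query}}\nDocument: {{document}}", which `isRankingTemplateValid` then accepts since it
// contains both the "{{query}}" and "{{document}}" placeholders.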
//# sourceMappingURL=GgufInsights.js.map