First upload version 0.0.1

Neyra
2026-02-05 15:27:49 +08:00
commit 8e9b7201ed
4182 changed files with 593136 additions and 0 deletions

@@ -0,0 +1,86 @@
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
import { Token, Tokenizer } from "../../types.js";
import { LlamaText } from "../../utils/LlamaText.js";
/**
* Chunk the given document using a context sequence, producing chunks that can be used for RAG (Retrieval Augmented Generation) embeddings.
*
* This chunking method is fast and efficient, and utilizes as much parallelization as your hardware allows.
*
* Based on https://github.com/ZeroEntropy-AI/llama-chunk
* @experimental - this API is experimental and may change or be removed in subsequent releases
* @hidden
*/
export declare function experimentalChunkDocument(options: {
contextSequence: LlamaContextSequence;
document: string;
/**
* The tokens to use as separators for chunking the document.
* Passed to the `getSystemPrompt` function to generate the prompt.
*/
separatorTokens?: Token[];
getSystemPrompt?(options: {
separatorTokens: Token[];
tokenizer: Tokenizer;
maxChunkSize?: number;
}): LlamaText | string;
/**
* Maximum number of tokens to allow in a chunk.
*
* The closer a chunk gets to this limit, the higher the probability of a separator token being inserted.
*
* Set to `0` to disable this mechanism.
*
* Defaults to `500`.
*/
maxChunkSize?: number;
/**
* The alignment curve for the maximum chunk size mechanism.
*
* Adjust the value based on the behavior of the model.
*
* Play around with values between `1` and `4` to see what works best for you.
*
* Set to `1` to disable this mechanism.
*
* Defaults to `4`.
*/
maxChunkSizeAlignmentCurve?: number;
/**
* Append the next few tokens (up to `maxTokens`) to the current chunk if their trimmed content
* matches any of the texts in `trimmedTexts`.
*/
syntaxAlignment?: {
/**
* The maximum number of tokens to append to the current chunk if their trimmed content matches any of the texts in `trimmedTexts`.
*
* Default: `4`
*/
maxTokens?: number;
/**
* The trimmed texts to match against; a token whose trimmed content matches is appended to the current chunk.
*
* Default: `["", ".", ";"]`
*/
trimmedTexts?: string[];
};
/**
* The number of tokens to skip before starting to use the generated separator tokens to split the document.
*
* Defaults to `3`.
*/
skipFirstTokens?: number;
/**
* The number of recent probabilities to keep in the trail for normalization.
*
* Adjust the value based on the behavior of the model.
*
* Defaults to `100`.
*/
normalizationTrailSize?: number;
/**
* Called when a chunk is generated, with the tokens that make up the chunk and the separator token used to split it.
*/
onChunkTokens?(chunkTokens: Token[], usedSeparatorToken: Token): void;
/**
* Called when a chunk is generated, with the text that makes up the chunk and the separator token used to split it.
*/
onChunkText?(chunkText: string, usedSeparatorToken: Token): void;
}): Promise<string[]>;
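
A minimal usage sketch, not part of the file above: it assumes `experimentalChunkDocument` is importable from the package root (the function is marked `@hidden`, so the actual import path may differ), and uses a placeholder model path and document text. The `getLlama`/`loadModel`/`createContext` calls follow node-llama-cpp's standard setup.

import {getLlama, experimentalChunkDocument} from "node-llama-cpp"; // assumed export; the function is marked @hidden

const llama = await getLlama();
const model = await llama.loadModel({
    modelPath: "path/to/model.gguf" // placeholder: any local GGUF model
});
const context = await model.createContext();

const chunks = await experimentalChunkDocument({
    contextSequence: context.getSequence(),
    document: "A long document to chunk...", // placeholder document text
    maxChunkSize: 500,
    onChunkText(chunkText) {
        console.log("chunk:", chunkText.length, "chars");
    }
});
console.log(`Got ${chunks.length} chunks`);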

@@ -0,0 +1,212 @@
import { LlamaText, SpecialTokensText } from "../../utils/LlamaText.js";
import { resolveChatWrapper } from "../../chatWrappers/utils/resolveChatWrapper.js";
import { safeEventCallback } from "../../utils/safeEventCallback.js";
import { maxRecentDetokenizerTokens } from "../../consts.js";
/**
* Chunk the given document using a context sequence, producing chunks that can be used for RAG (Retrieval Augmented Generation) embeddings.
*
* This chunking method is fast and efficient, and utilizes as much parallelization as your hardware allows.
*
* Based on https://github.com/ZeroEntropy-AI/llama-chunk
* @experimental - this API is experimental and may change or be removed in subsequent releases
* @hidden
*/
export async function experimentalChunkDocument(options) {
const {
    contextSequence,
    document,
    separatorTokens = findAppropriateSeparatorTokens(contextSequence.model),
    getSystemPrompt = getDefaultPrompt,
    maxChunkSize = 500,
    maxChunkSizeAlignmentCurve = 4,
    syntaxAlignment: {
        maxTokens: maxSyntaxAlignment = 4,
        trimmedTexts: syntaxAlignmentTrimmedTexts = ["", ".", ";"]
    } = {},
    skipFirstTokens = 3,
    normalizationTrailSize = 100
} = options;
const onChunkTokens = safeEventCallback(options.onChunkTokens);
const onChunkText = safeEventCallback(options.onChunkText);
if (separatorTokens.length === 0)
throw new Error("Separator tokens must be provided");
const chatHistory = [{
type: "system",
text: LlamaText(getSystemPrompt({
separatorTokens,
tokenizer: contextSequence.model.tokenizer,
maxChunkSize: maxChunkSize <= 0
? undefined
: maxChunkSize
})).toJSON()
}, {
type: "user",
text: document
}, {
type: "model",
response: [""]
}];
const chatWrapper = resolveChatWrapper(contextSequence.model);
const { contextText } = chatWrapper.generateContextState({ chatHistory });
const initialContextTokens = contextText.tokenize(contextSequence.model.tokenizer, "trimLeadingSpace");
const documentTokens = contextSequence.model.tokenize(document, false, "trimLeadingSpace");
const syntaxAlignmentTrimmedTextsSet = new Set(syntaxAlignmentTrimmedTexts);
if (initialContextTokens.length + documentTokens.length > contextSequence.context.contextSize)
throw new Error("The context size is too small to chunk the given document");
const evaluateInput = initialContextTokens.slice();
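// Queue every document token except the last (there is nothing left to split after it),
// requesting the next-token probability distribution at each position.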
for (let i = 0; i < documentTokens.length - 1; i++) {
const token = documentTokens[i];
evaluateInput.push([token, {
generateNext: {
probabilities: true
}
}]);
}
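// `weight` tracks the product of the probabilities of the most recent document
// tokens (the normalization trail). When the model has been predicting the
// document confidently, separator probabilities get boosted more aggressively.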
let weight = 1;
const recentProbabilitiesTrail = [];
let chunkStartIndex = 0;
let lastPushedSeparatorIndex = 0;
const chunks = [];
const res = [];
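// Emit the chunk spanning `chunkStartIndex` to `separateIndex`, first extending
// the end over up to `maxSyntaxAlignment` trailing tokens whose trimmed text is in
// `syntaxAlignmentTrimmedTexts`, so chunks end on natural syntax boundaries.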
function pushSeparatorIndex(separateIndex, separatorToken) {
lastPushedSeparatorIndex = separateIndex;
if (separateIndex <= chunkStartIndex)
return;
let endIndex = separateIndex;
for (let i = 0; i < maxSyntaxAlignment && documentTokens[separateIndex + i] != null; i++) {
const text = contextSequence.model.detokenize([documentTokens[separateIndex + i]]);
if (!syntaxAlignmentTrimmedTextsSet.has(text.trim()))
break;
endIndex++;
}
const chunk = documentTokens.slice(chunkStartIndex, endIndex);
const text = contextSequence.model.detokenize(chunk, false, documentTokens.slice(Math.max(0, chunkStartIndex - maxRecentDetokenizerTokens), chunkStartIndex));
chunks.push(chunk);
chunkStartIndex = endIndex;
onChunkTokens?.(chunk, separatorToken);
onChunkText?.(text, separatorToken);
res.push(text);
}
await contextSequence.controlledEvaluate(evaluateInput, {
onTokenResult(inputTokenIndex, result) {
const i = inputTokenIndex - initialContextTokens.length;
const nextProbabilities = result?.next?.probabilities;
const nextDocumentToken = documentTokens[i + 1];
if (nextProbabilities == null)
throw new Error("received no result for token " + i);
const topProbabilityScore = nextProbabilities.entries()
.next().value?.[1];
const [usedSeparatorToken, separatorProbability] = separatorTokens
.filter((token) => token !== nextDocumentToken) // avoid splitting on document tokens
.map((token) => [token, nextProbabilities.get(token)])
.filter((pair) => pair[1] != null)
.reduce(([tokenA, probabilityA], [tokenB, probabilityB]) => {
if (probabilityA >= probabilityB)
return [tokenA, probabilityA];
return [tokenB, probabilityB];
}, [separatorTokens[0], 0]);
if (topProbabilityScore == null || separatorProbability == null || separatorProbability === 0)
return;
// console.log(
// i, contextSequence.model.detokenize([documentTokens[i]!]),
// Array.from(nextProbabilities.entries()).slice(0, 5)
// .map(([token, probability]) => [contextSequence.model.detokenize([token], true), probability])
// );
if (separatorProbability >= topProbabilityScore)
pushSeparatorIndex(i + 1, usedSeparatorToken);
else if (i > skipFirstTokens) {
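// Boost the raw separator probability using the running weight; a confidently
// predicted trail lowers the bar for inserting a split at this position.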
const adjustedProbability = separatorProbability + (weight * (1 - separatorProbability));
let maxChunkSizeAlignment = 0;
if (maxChunkSize !== 0 && adjustedProbability < topProbabilityScore) {
const leftProbability = 1 - adjustedProbability;
const currentChunkSize = Math.max(0, 1 + i - chunkStartIndex);
maxChunkSizeAlignment = currentChunkSize === 0
? 0
: adjustExponential(leftProbability * Math.min(1, currentChunkSize / maxChunkSize), maxChunkSizeAlignmentCurve <= 0
? 1
: maxChunkSizeAlignmentCurve, 0.8);
if (currentChunkSize === maxChunkSize)
maxChunkSizeAlignment = 1;
}
if (adjustedProbability + maxChunkSizeAlignment >= topProbabilityScore && adjustedProbability > 0) {
pushSeparatorIndex(i + 1, usedSeparatorToken);
// update the weight of the current token with the adjusted probability in the trail
if (recentProbabilitiesTrail.length > 1) {
weight /= recentProbabilitiesTrail.pop();
recentProbabilitiesTrail.push(adjustedProbability);
weight *= adjustedProbability;
}
}
}
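// Maintain the rolling normalization trail with the probability of the actual
// next document token, dropping the oldest entry once the trail is full.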
const nextDocumentTokenProbability = nextDocumentToken == null
? undefined
: nextProbabilities.get(nextDocumentToken);
if (nextDocumentTokenProbability != null && nextDocumentTokenProbability > 0) {
recentProbabilitiesTrail.push(nextDocumentTokenProbability);
weight *= nextDocumentTokenProbability;
if (recentProbabilitiesTrail.length > normalizationTrailSize)
weight /= recentProbabilitiesTrail.shift();
}
}
});
if (lastPushedSeparatorIndex !== documentTokens.length)
pushSeparatorIndex(documentTokens.length, separatorTokens[0]);
return res;
}
const idealTokenTexts = [
"\u6bb5", // means "section" in Chinese (according to https://github.com/ZeroEntropy-AI/llama-chunk)
"\u987f", // means "pause" in Chinese (according to Llama 3.1 8B and Qwen 2.5 3B)
"\u00a1", // inverted exclamation mark
"|",
"_"
];
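// Scan the model's vocabulary for non-special tokens whose trimmed text matches one
// of the ideal separator texts, preferring exact (untrimmed) matches, and return up
// to `maxTokens` of them in the priority order of `idealTokenTexts`.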
function findAppropriateSeparatorTokens(model, maxTokens = 2) {
const idealTextsSet = new Set(idealTokenTexts);
const foundTokens = [];
for (const token of model.iterateAllTokens()) {
if (model.isSpecialToken(token))
continue;
const text = model.detokenize([token]);
const trimmedText = text.trim();
if (idealTextsSet.has(trimmedText)) {
const textIndex = idealTokenTexts.findIndex((idealText) => idealText === trimmedText);
if (foundTokens[textIndex] == null || text === trimmedText)
foundTokens[textIndex] = token;
}
}
const res = [];
for (let i = 0; i < idealTokenTexts.length; i++) {
const token = foundTokens[i];
if (token != null)
res.push(token);
}
return res.slice(0, maxTokens);
}
function getDefaultPrompt({ separatorTokens, tokenizer, maxChunkSize = 500 }) {
if (separatorTokens.length === 0)
throw new Error("No separator tokens provided");
else if (separatorTokens.length > 2)
throw new Error("Maximum of 2 separator tokens are supported");
return LlamaText.joinValues("\n", [
'Your job is to act as a "Chunker", for usage in RAG pipelines. The user will provide a long document.',
"",
"You should repeat the exact same message verbatim. EXCEPT, you should insert split tokens throughout the document.",
"",
"# Instructions",
LlamaText([
"- For splits, use `",
new SpecialTokensText(tokenizer.detokenize([separatorTokens[0]])),
'` as the "big split token" separator.'
]),
separatorTokens.length > 1 && (LlamaText([
"- For small splits, use `",
new SpecialTokensText(tokenizer.detokenize([separatorTokens[1]])),
'` as the "big split token" separator.'
])),
"- For example, in text document, small splits will be per-sentence, and big splits will be per-section. Do a big split BEFORE the header that defines a section.",
LlamaText([
"- You may get a user message that is unstructured or not structured cleanly. " +
"Still try to split that input as best as you can, even if it means doing a small split every ", Math.ceil(maxChunkSize / 5),
" characters, and a big split every ", Math.floor(maxChunkSize), " characters."
]),
"- You should prefer to wait until the end of a newline or period to break, instead of breaking one or two tokens before that. If there are no newlines or periods, pick some other reasonable breakpoints instead.",
"- Your input could be anything - code, HTML, markdown, etc. You MUST try to output SOME split regardless of the input. Pick something reasonable! E.g. for nodejs, do a small split after every line or code block, and a big split after every function or class definitions.",
'- For HTML, add a small split token after every closing tag and sentence. Add a big split token after every closing tag of an "important" tag.',
"- Please note that you will sometimes not see your own splits in your previous output, that's OK, you MUST continue to try to output split tokens"
].filter((x) => x !== false));
}
function adjustExponential(value, exponent, weight) {
if (value < 0)
return 0;
else if (value > 1)
return 1;
return (value * (1 - weight)) + (weight * Math.pow(value, exponent));
}
//# sourceMappingURL=chunkDocument.js.map
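
For intuition about the alignment curve: a standalone sketch (re-declaring `adjustExponential` exactly as above) that prints sample values with the default exponent of `4` and the fixed weight of `0.8` used by the max-chunk-size mechanism. The alignment contribution stays small for most of a chunk's growth and ramps up sharply near the limit.

function adjustExponential(value: number, exponent: number, weight: number): number {
    if (value < 0)
        return 0;
    else if (value > 1)
        return 1;
    return (value * (1 - weight)) + (weight * Math.pow(value, exponent));
}

for (const value of [0.25, 0.5, 0.75, 0.9, 1])
    console.log(value, adjustExponential(value, 4, 0.8).toFixed(3));
// prints: 0.25 "0.053", 0.5 "0.150", 0.75 "0.403", 0.9 "0.705", 1 "1.000"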
