diff --git a/packages/README.md b/packages/README.md new file mode 100644 index 0000000000..020157a990 --- /dev/null +++ b/packages/README.md @@ -0,0 +1,3 @@ +A WASM implementation of blake3 using assemblyscript. + +Implementation based on https://github.com/BLAKE3-team/BLAKE3/blob/master/reference_impl/reference_impl.rs \ No newline at end of file diff --git a/packages/blake3-wasm/.npmignore b/packages/blake3-wasm/.npmignore new file mode 100644 index 0000000000..5657f6ea7d --- /dev/null +++ b/packages/blake3-wasm/.npmignore @@ -0,0 +1 @@ +vendor \ No newline at end of file diff --git a/packages/blake3-wasm/asconfig.json b/packages/blake3-wasm/asconfig.json new file mode 100644 index 0000000000..8776597856 --- /dev/null +++ b/packages/blake3-wasm/asconfig.json @@ -0,0 +1,22 @@ +{ + "targets": { + "debug": { + "outFile": "build/debug.wasm", + "textFile": "build/debug.wat", + "sourceMap": true, + "debug": true + }, + "release": { + "outFile": "build/release.wasm", + "textFile": "build/release.wat", + "sourceMap": true, + "optimizeLevel": 3, + "shrinkLevel": 0, + "converge": false, + "noAssert": false + } + }, + "options": { + "bindings": "esm" + } +} \ No newline at end of file diff --git a/packages/blake3-wasm/assembly/blake3.ts b/packages/blake3-wasm/assembly/blake3.ts new file mode 100644 index 0000000000..880abace75 --- /dev/null +++ b/packages/blake3-wasm/assembly/blake3.ts @@ -0,0 +1,373 @@ +// Constants from the reference implementation +const OUT_LEN: i32 = 32; +// const KEY_LEN: usize = 32; +const BLOCK_LEN: i32 = 64; +const CHUNK_LEN: i32 = 1024; + +const CHUNK_START: u32 = 1 << 0; +const CHUNK_END: u32 = 1 << 1; +const PARENT: u32 = 1 << 2; +const ROOT: u32 = 1 << 3; +//const KEYED_HASH: u32 = 1 << 4; +//const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +// const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +const IV: StaticArray = [ + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, +]; + +const MSG_PERMUTATION: 
StaticArray = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; + +// The mixing function, G, which mixes either a column or a diagonal. +function g(state: StaticArray, a: i32, b: i32, c: i32, d: i32, mx: u32, my: u32): void { + state[a] = state[a] + state[b] + mx; + state[d] = rotr(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + my; + state[d] = rotr(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr(state[b] ^ state[c], 7); +} + +function round(state: StaticArray, m: StaticArray): void { + // Mix the columns. + g(state, 0, 4, 8, 12, m[0], m[1]); + g(state, 1, 5, 9, 13, m[2], m[3]); + g(state, 2, 6, 10, 14, m[4], m[5]); + g(state, 3, 7, 11, 15, m[6], m[7]); + // Mix the diagonals. + g(state, 0, 5, 10, 15, m[8], m[9]); + g(state, 1, 6, 11, 12, m[10], m[11]); + g(state, 2, 7, 8, 13, m[12], m[13]); + g(state, 3, 4, 9, 14, m[14], m[15]); +} + +function permute(m: StaticArray): void { + const permuted = new StaticArray(16); + for (let i = 0; i < 16; i++) { + permuted[i] = m[MSG_PERMUTATION[i]]; + } + for (let i = 0; i < 16; i++) { + m[i] = permuted[i]; + } +} + +function compress( + chaining_value: StaticArray, + block_words: StaticArray, + counter: u64, + block_len: u32, + flags: u32 +): StaticArray { + const counter_low = counter as u32; + const counter_high = (counter >> 32) as u32; + const state = new StaticArray(16); + + // Initialize state + for (let i = 0; i < 8; i++) { + state[i] = chaining_value[i]; + state[i + 8] = IV[i]; + } + state[12] = counter_low; + state[13] = counter_high; + state[14] = block_len; + state[15] = flags; + + const block = new StaticArray(16); + for (let i = 0; i < 16; i++) { + block[i] = block_words[i]; + } + + // Apply rounds + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + 
permute(block); + round(state, block); + permute(block); + round(state, block); + + // Final mixing + for (let i = 0; i < 8; i++) { + state[i] ^= state[i + 8]; + state[i + 8] ^= chaining_value[i]; + } + + return state; +} + +function words_from_little_endian_bytes(bytes: Uint8Array, words: StaticArray): void { + for (let i = 0; i < words.length; i++) { + const offset = i * 4; + words[i] = + bytes[offset] | + ((bytes[offset + 1] as u32) << 8) | + ((bytes[offset + 2] as u32) << 16) | + ((bytes[offset + 3] as u32) << 24); + } +} + +class Blake3Hasher { + private chunk_state: ChunkState; + private key_words: StaticArray; + private cv_stack: StaticArray>; + private cv_stack_len: u8; + private flags: u32; + + constructor() { + const key_words = new StaticArray(8); + for (let i = 0; i < 8; i++) { + key_words[i] = IV[i]; + } + this.key_words = key_words; + this.chunk_state = new ChunkState(key_words, 0, 0); + this.cv_stack = new StaticArray>(54); + this.cv_stack_len = 0; + this.flags = 0; + + for (let i = 0; i < 54; i++) { + this.cv_stack[i] = new StaticArray(8); + } + } + + update(input: Uint8Array): void { + let inputPos = 0; + while (inputPos < input.length) { + if (this.chunk_state.len() == CHUNK_LEN) { + const chunk_cv = this.chunk_state.output().chaining_value(); + const total_chunks = this.chunk_state.chunk_counter + 1; + this.add_chunk_chaining_value(chunk_cv, total_chunks); + this.chunk_state = new ChunkState(this.key_words, total_chunks, this.flags); + } + + const want = CHUNK_LEN - this.chunk_state.len(); + const take = min(want, input.length - inputPos); + this.chunk_state.update(input.subarray(inputPos, inputPos + take)); + inputPos += take; + } + } + + finalize(out: Uint8Array): void { + let output = this.chunk_state.output(); + let parent_nodes_remaining = this.cv_stack_len; + + while (parent_nodes_remaining > 0) { + parent_nodes_remaining--; + output = parent_output( + this.cv_stack[parent_nodes_remaining], + output.chaining_value(), + this.key_words, + 
this.flags + ); + } + + output.root_output_bytes(out); + } + + private add_chunk_chaining_value(new_cv: StaticArray, total_chunks: u64): void { + let mut_new_cv = new_cv; + let mut_total_chunks = total_chunks; + + while ((mut_total_chunks & 1) == 0) { + mut_new_cv = parent_cv(this.pop_stack(), mut_new_cv, this.key_words, this.flags); + mut_total_chunks >>= 1; + } + + this.push_stack(mut_new_cv); + } + + private push_stack(cv: StaticArray): void { + for (let i = 0; i < 8; i++) { + this.cv_stack[this.cv_stack_len][i] = cv[i]; + } + this.cv_stack_len++; + } + + private pop_stack(): StaticArray { + this.cv_stack_len--; + return this.cv_stack[this.cv_stack_len]; + } +} + +class ChunkState { + chaining_value: StaticArray; + chunk_counter: u64; + block: Uint8Array; + block_len: u8; + blocks_compressed: u8; + flags: u32; + + constructor(key_words: StaticArray, chunk_counter: u64, flags: u32) { + this.chaining_value = new StaticArray(8); + this.chunk_counter = chunk_counter; + this.block = new Uint8Array(BLOCK_LEN); + this.block_len = 0; + this.blocks_compressed = 0; + this.flags = flags; + + for (let i = 0; i < 8; i++) { + this.chaining_value[i] = key_words[i]; + } + } + + len(): i32 { + return BLOCK_LEN * this.blocks_compressed + this.block_len; + } + + start_flag(): u32 { + return this.blocks_compressed == 0 ? 
CHUNK_START : 0; + } + + update(input: Uint8Array): void { + let inputPos = 0; + while (inputPos < input.length) { + if (this.block_len == BLOCK_LEN) { + const block_words = new StaticArray(16); + words_from_little_endian_bytes(this.block, block_words); + const compressed = compress( + this.chaining_value, + block_words, + this.chunk_counter, + BLOCK_LEN, + this.flags | this.start_flag() + ); + for (let i = 0; i < 8; i++) { + this.chaining_value[i] = compressed[i]; + } + this.blocks_compressed++; + this.block = new Uint8Array(BLOCK_LEN); + this.block_len = 0; + } + + const want = BLOCK_LEN - this.block_len; + const take = min(want, input.length - inputPos); + for (let i = 0; i < take; i++) { + this.block[this.block_len + i] = input[inputPos + i]; + } + this.block_len += take as u8; + inputPos += take; + } + } + + output(): Output { + const block_words = new StaticArray(16); + words_from_little_endian_bytes(this.block, block_words); + return new Output( + this.chaining_value, + block_words, + this.chunk_counter, + this.block_len, + this.flags | this.start_flag() | CHUNK_END + ); + } +} + +class Output { + input_chaining_value: StaticArray; + block_words: StaticArray; + counter: u64; + block_len: u32; + flags: u32; + + constructor( + input_chaining_value: StaticArray, + block_words: StaticArray, + counter: u64, + block_len: u32, + flags: u32 + ) { + this.input_chaining_value = input_chaining_value; + this.block_words = block_words; + this.counter = counter; + this.block_len = block_len; + this.flags = flags; + } + + chaining_value(): StaticArray { + const compressed = compress(this.input_chaining_value, this.block_words, this.counter, this.block_len, this.flags); + const result = new StaticArray(8); + for (let i = 0; i < 8; i++) { + result[i] = compressed[i]; + } + return result; + } + + root_output_bytes(out: Uint8Array): void { + let output_block_counter: u64 = 0; + for (let i = 0; i < out.length; i += 2 * OUT_LEN) { + const words = compress( + 
this.input_chaining_value, + this.block_words, + output_block_counter, + this.block_len, + this.flags | ROOT + ); + const out_block = out.subarray(i, i + 2 * OUT_LEN); + for (let j = 0; j < words.length; j++) { + const word = words[j]; + const offset = j * 4; + if (offset < out_block.length) { + out_block[offset] = word & 0xff; + if (offset + 1 < out_block.length) { + out_block[offset + 1] = (word >> 8) & 0xff; + if (offset + 2 < out_block.length) { + out_block[offset + 2] = (word >> 16) & 0xff; + if (offset + 3 < out_block.length) { + out_block[offset + 3] = (word >> 24) & 0xff; + } + } + } + } + } + output_block_counter++; + } + } +} + +function parent_output( + left_child_cv: StaticArray, + right_child_cv: StaticArray, + key_words: StaticArray, + flags: u32 +): Output { + const block_words = new StaticArray(16); + for (let i = 0; i < 8; i++) { + block_words[i] = left_child_cv[i]; + block_words[i + 8] = right_child_cv[i]; + } + return new Output(key_words, block_words, 0, BLOCK_LEN, PARENT | flags); +} + +function parent_cv( + left_child_cv: StaticArray, + right_child_cv: StaticArray, + key_words: StaticArray, + flags: u32 +): StaticArray { + return parent_output(left_child_cv, right_child_cv, key_words, flags).chaining_value(); +} + +export function blake3(input: Uint8Array): Uint8Array { + const hasher = new Blake3Hasher(); + hasher.update(input); + const output = new Uint8Array(32); + hasher.finalize(output); + return output; +} + +export function blake3Hex(input: Uint8Array): string { + const hash = blake3(input); + const hex = new Array(64); + for (let i = 0; i < 32; i++) { + hex[i * 2] = (hash[i] >> 4).toString(16); + hex[i * 2 + 1] = (hash[i] & 0x0f).toString(16); + } + return hex.join(""); +} diff --git a/packages/blake3-wasm/assembly/index.ts b/packages/blake3-wasm/assembly/index.ts new file mode 100644 index 0000000000..8183303929 --- /dev/null +++ b/packages/blake3-wasm/assembly/index.ts @@ -0,0 +1,2 @@ +// Re-export everything from blake3.ts +export * 
from "./blake3"; diff --git a/packages/blake3-wasm/assembly/tsconfig.json b/packages/blake3-wasm/assembly/tsconfig.json new file mode 100644 index 0000000000..33daff5dac --- /dev/null +++ b/packages/blake3-wasm/assembly/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../node_modules/.pnpm/assemblyscript@0.27.37/node_modules/assemblyscript/std/assembly.json", + "include": ["./**/*.ts"] +} diff --git a/packages/blake3-wasm/build/.gitignore b/packages/blake3-wasm/build/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/blake3-wasm/build/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/blake3-wasm/package.json b/packages/blake3-wasm/package.json new file mode 100644 index 0000000000..4085980507 --- /dev/null +++ b/packages/blake3-wasm/package.json @@ -0,0 +1,33 @@ +{ + "name": "@huggingface/blake3-wasm", + "version": "0.0.1", + "scripts": { + "build:debug": "asc assembly/index.ts --target debug", + "build:release": "asc assembly/index.ts --target release", + "build": "pnpm run build:debug && npm run build:release", + "test": "node tests", + "prepare": "pnpm run build" + }, + "keywords": [ + "blake3", + "assemblyscript", + "assembly", + "wasm" + ], + "type": "module", + "exports": { + ".": { + "import": "./build/release.js", + "types": "./build/release.d.ts" + }, + "./assembly": { + "import": "./assembly/index.ts" + }, + "./wasm": { + "import": "./build/release.wasm" + } + }, + "devDependencies": { + "assemblyscript": "^0.27.36" + } +} diff --git a/packages/blake3-wasm/pnpm-lock.yaml b/packages/blake3-wasm/pnpm-lock.yaml new file mode 100644 index 0000000000..f96d25e4f8 --- /dev/null +++ b/packages/blake3-wasm/pnpm-lock.yaml @@ -0,0 +1,38 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + devDependencies: + assemblyscript: + specifier: ^0.27.36 + version: 0.27.37 + +packages: + + assemblyscript@0.27.37: + resolution: {integrity: 
sha512-YtY5k3PiV3SyUQ6gRlR2OCn8dcVRwkpiG/k2T5buoL2ymH/Z/YbaYWbk/f9mO2HTgEtGWjPiAQrIuvA7G/63Gg==} + engines: {node: '>=18', npm: '>=10'} + hasBin: true + + binaryen@116.0.0-nightly.20240114: + resolution: {integrity: sha512-0GZrojJnuhoe+hiwji7QFaL3tBlJoA+KFUN7ouYSDGZLSo9CKM8swQX8n/UcbR0d1VuZKU+nhogNzv423JEu5A==} + hasBin: true + + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + +snapshots: + + assemblyscript@0.27.37: + dependencies: + binaryen: 116.0.0-nightly.20240114 + long: 5.3.2 + + binaryen@116.0.0-nightly.20240114: {} + + long@5.3.2: {} diff --git a/packages/blake3-wasm/tests/index.js b/packages/blake3-wasm/tests/index.js new file mode 100644 index 0000000000..55463e9495 --- /dev/null +++ b/packages/blake3-wasm/tests/index.js @@ -0,0 +1,163 @@ +// Adapted from https://github.com/mcmilk/BLAKE3-tests/blob/11a8abeceac93b5eba664eae3679efb4ffa5bc0a/blake3_test.c + +import { blake3Hex } from "../build/debug.js"; + +const buffer = new Uint8Array(102400); +let i = 0; +let j = 0; + +for (i = 0, j = 0; i < buffer.length; i++, j++) { + if (j === 251) { + j = 0; + } + buffer[i] = j; +} + +const testCases = [ + { + buf: buffer.slice(0, 0), + expected: "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262", + }, + { + buf: buffer.slice(0, 1), + expected: "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213", + }, + { + buf: buffer.slice(0, 2), + expected: "7b7015bb92cf0b318037702a6cdd81dee41224f734684c2c122cd6359cb1ee63", + }, + { + buf: buffer.slice(0, 3), + expected: "e1be4d7a8ab5560aa4199eea339849ba8e293d55ca0a81006726d184519e647f", + }, + { + buf: buffer.slice(0, 4), + expected: "f30f5ab28fe047904037f77b6da4fea1e27241c5d132638d8bedce9d40494f32", + }, + { + buf: buffer.slice(0, 5), + expected: "b40b44dfd97e7a84a996a91af8b85188c66c126940ba7aad2e7ae6b385402aa2", + }, + { + buf: buffer.slice(0, 6), + expected: 
"06c4e8ffb6872fad96f9aaca5eee1553eb62aed0ad7198cef42e87f6a616c844", + }, + { + buf: buffer.slice(0, 7), + expected: "3f8770f387faad08faa9d8414e9f449ac68e6ff0417f673f602a646a891419fe", + }, + { + buf: buffer.slice(0, 8), + expected: "2351207d04fc16ade43ccab08600939c7c1fa70a5c0aaca76063d04c3228eaeb", + }, + { + buf: buffer.slice(0, 63), + expected: "e9bc37a594daad83be9470df7f7b3798297c3d834ce80ba85d6e207627b7db7b", + }, + { + buf: buffer.slice(0, 64), + expected: "4eed7141ea4a5cd4b788606bd23f46e212af9cacebacdc7d1f4c6dc7f2511b98", + }, + { + buf: buffer.slice(0, 65), + expected: "de1e5fa0be70df6d2be8fffd0e99ceaa8eb6e8c93a63f2d8d1c30ecb6b263dee", + }, + { + buf: buffer.slice(0, 127), + expected: "d81293fda863f008c09e92fc382a81f5a0b4a1251cba1634016a0f86a6bd640d", + }, + { + buf: buffer.slice(0, 128), + expected: "f17e570564b26578c33bb7f44643f539624b05df1a76c81f30acd548c44b45ef", + }, + { + buf: buffer.slice(0, 129), + expected: "683aaae9f3c5ba37eaaf072aed0f9e30bac0865137bae68b1fde4ca2aebdcb12", + }, + { + buf: buffer.slice(0, 1023), + expected: "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11", + }, + { + buf: buffer.slice(0, 1024), + expected: "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af7", + }, + { + buf: buffer.slice(0, 1025), + expected: "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444", + }, + { + buf: buffer.slice(0, 2048), + expected: "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a", + }, + { + buf: buffer.slice(0, 2049), + expected: "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b6879522563030", + }, + { + buf: buffer.slice(0, 3072), + expected: "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd2", + }, + { + buf: buffer.slice(0, 3073), + expected: "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd3", + }, + { + buf: buffer.slice(0, 4096), + expected: "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e969", + }, + { + buf: buffer.slice(0, 4097), + 
expected: "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb995", + }, + { + buf: buffer.slice(0, 5120), + expected: "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833", + }, + { + buf: buffer.slice(0, 5121), + expected: "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff", + }, + { + buf: buffer.slice(0, 6144), + expected: "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca205", + }, + { + buf: buffer.slice(0, 6145), + expected: "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f", + }, + { + buf: buffer.slice(0, 7168), + expected: "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a", + }, + { + buf: buffer.slice(0, 7169), + expected: "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e7817", + }, + { + buf: buffer.slice(0, 8192), + expected: "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a63", + }, + { + buf: buffer.slice(0, 8193), + expected: "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3b", + }, + { + buf: buffer.slice(0, 102400), + expected: "bc3e3d41a1146b069abffad3c0d44860cf664390afce4d9661f7902e7943e085", + }, +]; + +for (const testCase of testCases) { + const result = blake3Hex(testCase.buf); + console.log(result); + + if (result !== testCase.expected) { + console.error(`Test case failed: ${testCase.buf.length} bytes`); + console.error(`Expected: ${testCase.expected}`); + console.error(`Actual: ${result}`); + process.exit(1); + } +} + +console.log("All test cases passed"); diff --git a/packages/blake3-wasm/vendor/Cargo.lock b/packages/blake3-wasm/vendor/Cargo.lock new file mode 100644 index 0000000000..9f0162bf75 --- /dev/null +++ b/packages/blake3-wasm/vendor/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "blake3-example" +version = "0.1.0" diff --git a/packages/blake3-wasm/vendor/Cargo.toml b/packages/blake3-wasm/vendor/Cargo.toml new file mode 100644 index 0000000000..7f31968ed3 --- /dev/null +++ b/packages/blake3-wasm/vendor/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "blake3-example" +version = "0.1.0" +edition = "2021" + +[lib] +name = "reference_impl" +path = "src/lib.rs" + +[[bin]] +name = "blake3-example" +path = "src/main.rs" \ No newline at end of file diff --git a/packages/blake3-wasm/vendor/README.md b/packages/blake3-wasm/vendor/README.md new file mode 100644 index 0000000000..46cce0d076 --- /dev/null +++ b/packages/blake3-wasm/vendor/README.md @@ -0,0 +1,27 @@ +# BLAKE3 Example + +This is a simple example that demonstrates using the BLAKE3 hash function with empty input. + +## Prerequisites + +- Rust and Cargo installed on your system. You can install them from [rustup.rs](https://rustup.rs/) + +## Running the Example + +1. Open a terminal in this directory +2. Run the following command: + ```bash + cargo run + ``` + +The program will output a 32-byte hash in hexadecimal format. For empty input, the expected output should be: +``` +af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 +``` + +## What the Code Does + +1. Creates a new BLAKE3 hasher +2. Updates it with empty input +3. Finalizes the hash into a 32-byte buffer +4. Prints the hash in hexadecimal format \ No newline at end of file diff --git a/packages/blake3-wasm/vendor/src/blake3.rs b/packages/blake3-wasm/vendor/src/blake3.rs new file mode 100644 index 0000000000..bc701784f8 --- /dev/null +++ b/packages/blake3-wasm/vendor/src/blake3.rs @@ -0,0 +1,376 @@ +// From https://github.com/BLAKE3-team/BLAKE3/blob/master/reference_impl/reference_impl.rs + +//! This is the reference implementation of BLAKE3. It is used for testing and +//! as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 +//! 
spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) +//! discusses this implementation. You can render docs for this implementation +//! by running `cargo doc --open` in this directory. +//! +//! # Example +//! +//! ``` +//! let mut hasher = reference_impl::Hasher::new(); +//! hasher.update(b"abc"); +//! hasher.update(b"def"); +//! let mut hash = [0; 32]; +//! hasher.finalize(&mut hash); +//! let mut extended_hash = [0; 500]; +//! hasher.finalize(&mut extended_hash); +//! assert_eq!(hash, extended_hash[..32]); +//! ``` + +use core::cmp::min; + +const OUT_LEN: usize = 32; +const KEY_LEN: usize = 32; +const BLOCK_LEN: usize = 64; +const CHUNK_LEN: usize = 1024; + +const CHUNK_START: u32 = 1 << 0; +const CHUNK_END: u32 = 1 << 1; +const PARENT: u32 = 1 << 2; +const ROOT: u32 = 1 << 3; +const KEYED_HASH: u32 = 1 << 4; +const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +const IV: [u32; 8] = [ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; + +const MSG_PERMUTATION: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; + +// The mixing function, G, which mixes either a column or a diagonal. +fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(mx); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(my); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +fn round(state: &mut [u32; 16], m: &[u32; 16]) { + // Mix the columns. 
+ g(state, 0, 4, 8, 12, m[0], m[1]); + g(state, 1, 5, 9, 13, m[2], m[3]); + g(state, 2, 6, 10, 14, m[4], m[5]); + g(state, 3, 7, 11, 15, m[6], m[7]); + // Mix the diagonals. + g(state, 0, 5, 10, 15, m[8], m[9]); + g(state, 1, 6, 11, 12, m[10], m[11]); + g(state, 2, 7, 8, 13, m[12], m[13]); + g(state, 3, 4, 9, 14, m[14], m[15]); +} + +fn permute(m: &mut [u32; 16]) { + let mut permuted = [0; 16]; + for i in 0..16 { + permuted[i] = m[MSG_PERMUTATION[i]]; + } + *m = permuted; +} + +fn compress( + chaining_value: &[u32; 8], + block_words: &[u32; 16], + counter: u64, + block_len: u32, + flags: u32, +) -> [u32; 16] { + let counter_low = counter as u32; + let counter_high = (counter >> 32) as u32; + #[rustfmt::skip] + let mut state = [ + chaining_value[0], chaining_value[1], chaining_value[2], chaining_value[3], + chaining_value[4], chaining_value[5], chaining_value[6], chaining_value[7], + IV[0], IV[1], IV[2], IV[3], + counter_low, counter_high, block_len, flags, + ]; + let mut block = *block_words; + + round(&mut state, &block); // round 1 + permute(&mut block); + round(&mut state, &block); // round 2 + permute(&mut block); + round(&mut state, &block); // round 3 + permute(&mut block); + round(&mut state, &block); // round 4 + permute(&mut block); + round(&mut state, &block); // round 5 + permute(&mut block); + round(&mut state, &block); // round 6 + permute(&mut block); + round(&mut state, &block); // round 7 + + for i in 0..8 { + state[i] ^= state[i + 8]; + state[i + 8] ^= chaining_value[i]; + } + state +} + +fn first_8_words(compression_output: [u32; 16]) -> [u32; 8] { + compression_output[0..8].try_into().unwrap() +} + +fn words_from_little_endian_bytes(bytes: &[u8], words: &mut [u32]) { + debug_assert_eq!(bytes.len(), 4 * words.len()); + for (four_bytes, word) in bytes.chunks_exact(4).zip(words) { + *word = u32::from_le_bytes(four_bytes.try_into().unwrap()); + } +} + +// Each chunk or parent node can produce either an 8-word chaining value or, by +// setting the 
ROOT flag, any number of final output bytes. The Output struct +// captures the state just prior to choosing between those two possibilities. +struct Output { + input_chaining_value: [u32; 8], + block_words: [u32; 16], + counter: u64, + block_len: u32, + flags: u32, +} + +impl Output { + fn chaining_value(&self) -> [u32; 8] { + first_8_words(compress( + &self.input_chaining_value, + &self.block_words, + self.counter, + self.block_len, + self.flags, + )) + } + + fn root_output_bytes(&self, out_slice: &mut [u8]) { + let mut output_block_counter = 0; + for out_block in out_slice.chunks_mut(2 * OUT_LEN) { + let words = compress( + &self.input_chaining_value, + &self.block_words, + output_block_counter, + self.block_len, + self.flags | ROOT, + ); + // The output length might not be a multiple of 4. + for (word, out_word) in words.iter().zip(out_block.chunks_mut(4)) { + out_word.copy_from_slice(&word.to_le_bytes()[..out_word.len()]); + } + output_block_counter += 1; + } + } +} + +struct ChunkState { + chaining_value: [u32; 8], + chunk_counter: u64, + block: [u8; BLOCK_LEN], + block_len: u8, + blocks_compressed: u8, + flags: u32, +} + +impl ChunkState { + fn new(key_words: [u32; 8], chunk_counter: u64, flags: u32) -> Self { + Self { + chaining_value: key_words, + chunk_counter, + block: [0; BLOCK_LEN], + block_len: 0, + blocks_compressed: 0, + flags, + } + } + + fn len(&self) -> usize { + BLOCK_LEN * self.blocks_compressed as usize + self.block_len as usize + } + + fn start_flag(&self) -> u32 { + if self.blocks_compressed == 0 { + CHUNK_START + } else { + 0 + } + } + + fn update(&mut self, mut input: &[u8]) { + while !input.is_empty() { + // If the block buffer is full, compress it and clear it. More + // input is coming, so this compression is not CHUNK_END. 
+ if self.block_len as usize == BLOCK_LEN { + let mut block_words = [0; 16]; + words_from_little_endian_bytes(&self.block, &mut block_words); + self.chaining_value = first_8_words(compress( + &self.chaining_value, + &block_words, + self.chunk_counter, + BLOCK_LEN as u32, + self.flags | self.start_flag(), + )); + self.blocks_compressed += 1; + self.block = [0; BLOCK_LEN]; + self.block_len = 0; + } + + // Copy input bytes into the block buffer. + let want = BLOCK_LEN - self.block_len as usize; + let take = min(want, input.len()); + self.block[self.block_len as usize..][..take].copy_from_slice(&input[..take]); + self.block_len += take as u8; + input = &input[take..]; + } + } + + fn output(&self) -> Output { + let mut block_words = [0; 16]; + words_from_little_endian_bytes(&self.block, &mut block_words); + Output { + input_chaining_value: self.chaining_value, + block_words, + counter: self.chunk_counter, + block_len: self.block_len as u32, + flags: self.flags | self.start_flag() | CHUNK_END, + } + } +} + +fn parent_output( + left_child_cv: [u32; 8], + right_child_cv: [u32; 8], + key_words: [u32; 8], + flags: u32, +) -> Output { + let mut block_words = [0; 16]; + block_words[..8].copy_from_slice(&left_child_cv); + block_words[8..].copy_from_slice(&right_child_cv); + Output { + input_chaining_value: key_words, + block_words, + counter: 0, // Always 0 for parent nodes. + block_len: BLOCK_LEN as u32, // Always BLOCK_LEN (64) for parent nodes. + flags: PARENT | flags, + } +} + +fn parent_cv( + left_child_cv: [u32; 8], + right_child_cv: [u32; 8], + key_words: [u32; 8], + flags: u32, +) -> [u32; 8] { + parent_output(left_child_cv, right_child_cv, key_words, flags).chaining_value() +} + +/// An incremental hasher that can accept any number of writes. 
+pub struct Hasher { + chunk_state: ChunkState, + key_words: [u32; 8], + cv_stack: [[u32; 8]; 54], // Space for 54 subtree chaining values: + cv_stack_len: u8, // 2^54 * CHUNK_LEN = 2^64 + flags: u32, +} + +impl Hasher { + fn new_internal(key_words: [u32; 8], flags: u32) -> Self { + Self { + chunk_state: ChunkState::new(key_words, 0, flags), + key_words, + cv_stack: [[0; 8]; 54], + cv_stack_len: 0, + flags, + } + } + + /// Construct a new `Hasher` for the regular hash function. + pub fn new() -> Self { + Self::new_internal(IV, 0) + } + + /// Construct a new `Hasher` for the keyed hash function. + pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { + let mut key_words = [0; 8]; + words_from_little_endian_bytes(key, &mut key_words); + Self::new_internal(key_words, KEYED_HASH) + } + + /// Construct a new `Hasher` for the key derivation function. The context + /// string should be hardcoded, globally unique, and application-specific. + pub fn new_derive_key(context: &str) -> Self { + let mut context_hasher = Self::new_internal(IV, DERIVE_KEY_CONTEXT); + context_hasher.update(context.as_bytes()); + let mut context_key = [0; KEY_LEN]; + context_hasher.finalize(&mut context_key); + let mut context_key_words = [0; 8]; + words_from_little_endian_bytes(&context_key, &mut context_key_words); + Self::new_internal(context_key_words, DERIVE_KEY_MATERIAL) + } + + fn push_stack(&mut self, cv: [u32; 8]) { + self.cv_stack[self.cv_stack_len as usize] = cv; + self.cv_stack_len += 1; + } + + fn pop_stack(&mut self) -> [u32; 8] { + self.cv_stack_len -= 1; + self.cv_stack[self.cv_stack_len as usize] + } + + // Section 5.1.2 of the BLAKE3 spec explains this algorithm in more detail. + fn add_chunk_chaining_value(&mut self, mut new_cv: [u32; 8], mut total_chunks: u64) { + // This chunk might complete some subtrees. For each completed subtree, + // its left child will be the current top entry in the CV stack, and + // its right child will be the current value of `new_cv`. 
Pop each left + // child off the stack, merge it with `new_cv`, and overwrite `new_cv` + // with the result. After all these merges, push the final value of + // `new_cv` onto the stack. The number of completed subtrees is given + // by the number of trailing 0-bits in the new total number of chunks. + while total_chunks & 1 == 0 { + new_cv = parent_cv(self.pop_stack(), new_cv, self.key_words, self.flags); + total_chunks >>= 1; + } + self.push_stack(new_cv); + } + + /// Add input to the hash state. This can be called any number of times. + pub fn update(&mut self, mut input: &[u8]) { + while !input.is_empty() { + // If the current chunk is complete, finalize it and reset the + // chunk state. More input is coming, so this chunk is not ROOT. + if self.chunk_state.len() == CHUNK_LEN { + let chunk_cv = self.chunk_state.output().chaining_value(); + let total_chunks = self.chunk_state.chunk_counter + 1; + self.add_chunk_chaining_value(chunk_cv, total_chunks); + self.chunk_state = ChunkState::new(self.key_words, total_chunks, self.flags); + } + + // Compress input bytes into the current chunk state. + let want = CHUNK_LEN - self.chunk_state.len(); + let take = min(want, input.len()); + self.chunk_state.update(&input[..take]); + input = &input[take..]; + } + } + + /// Finalize the hash and write any number of output bytes. + pub fn finalize(&self, out_slice: &mut [u8]) { + // Starting with the Output from the current chunk, compute all the + // parent chaining values along the right edge of the tree, until we + // have the root Output. 
+ let mut output = self.chunk_state.output(); + let mut parent_nodes_remaining = self.cv_stack_len as usize; + while parent_nodes_remaining > 0 { + parent_nodes_remaining -= 1; + output = parent_output( + self.cv_stack[parent_nodes_remaining], + output.chaining_value(), + self.key_words, + self.flags, + ); + } + output.root_output_bytes(out_slice); + } +} diff --git a/packages/blake3-wasm/vendor/src/lib.rs b/packages/blake3-wasm/vendor/src/lib.rs new file mode 100644 index 0000000000..874b108ebf --- /dev/null +++ b/packages/blake3-wasm/vendor/src/lib.rs @@ -0,0 +1,3 @@ +mod blake3; + +pub use blake3::*; \ No newline at end of file diff --git a/packages/blake3-wasm/vendor/src/main.rs b/packages/blake3-wasm/vendor/src/main.rs new file mode 100644 index 0000000000..76a1537cbd --- /dev/null +++ b/packages/blake3-wasm/vendor/src/main.rs @@ -0,0 +1,30 @@ +use std::io::Write; + +fn main() { + println!("Starting BLAKE3 hash computation for empty input"); + + // Create a new hasher + let mut hasher = reference_impl::Hasher::new(); + println!("Created new hasher"); + + // Update with empty input + let input = &[0u8, 1u8]; + println!("Input length: {} bytes", input.len()); + hasher.update(input); + println!("Updated hasher with input"); + + // Create a buffer for the output + let mut output = [0u8; 32]; + + // Get the hash + hasher.finalize(&mut output); + println!("Finalized hash computation"); + + // Print the hash in hex format + let mut stdout = std::io::stdout(); + print!("Final hash: "); + for byte in output { + write!(stdout, "{:02x}", byte).unwrap(); + } + println!(); +} \ No newline at end of file diff --git a/packages/blake3-wasm/vendor/target/.gitignore b/packages/blake3-wasm/vendor/target/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/blake3-wasm/vendor/target/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/gearhash-wasm/README.md b/packages/gearhash-wasm/README.md new file mode 100644 index 
0000000000..cf72dafbd3 --- /dev/null +++ b/packages/gearhash-wasm/README.md @@ -0,0 +1,84 @@

JS and WASM implementations of https://github.com/srijs/rust-gearhash

Using [AssemblyScript](https://www.assemblyscript.org/) to generate a lean WASM.

## Usage

```javascript
import { nextMatch, nextMatches } from '@huggingface/gearhash-wasm';

// Create a Uint8Array of data to search through
const data = new Uint8Array(1000000); // Example: 1MB of data
// ... fill data with your content ...

// Search for a pattern with a specific mask
const mask = 0x0000d90003530000n; // Example mask as a BigInt
const match = nextMatch(data, mask);
const allMatches = nextMatches(data, mask).matches;
```

The `nextMatch` function takes two parameters:
- `data`: A Uint8Array containing the data to search through
- `mask`: A BigInt representing the pattern mask to search for

The function returns an object with `position` (i32) and `hash` (u64) properties.

You can continuously feed data like this:

```javascript
let hash = 0n;
const mask = 0x0000d90003530000n;

let length = 0; // extra length not yet processed
for await (const chunk of dataSource) {
  let index = 0;
  while (1) {
    let match = nextMatch(chunk.subarray(index), mask, hash);

    if (match.position !== -1) {
      console.log({
        length: match.position + length,
        hash: match.hash
      })

      index += match.position;
      length = 0;
      hash = 0n;
    } else {
      length += chunk.length - index;
      break;
    }
  }
}

console.log(length, "bytes without a match, ending hash: ", hash);
```

or, more performantly, with `nextMatches`:

```javascript
let hash = 0n;
const mask = 0x0000d90003530000n;

let length = 0;
for await (const chunk of dataSource) {
  const result = nextMatches(chunk, mask, hash);
  let lastPosition = 0;
  for (const match of result.matches) {
    console.log({
      length: match.position - lastPosition + length,
      hash: match.hash
    });

    length = 0;
    lastPosition = match.position;
  }
length = result.remaining; + hash = result.hash; +} + +console.log(length, "bytes without a match, ending hash: ", hash); +``` + +## Possible improvements + diff --git a/packages/gearhash-wasm/asconfig.json b/packages/gearhash-wasm/asconfig.json new file mode 100644 index 0000000000..8776597856 --- /dev/null +++ b/packages/gearhash-wasm/asconfig.json @@ -0,0 +1,22 @@ +{ + "targets": { + "debug": { + "outFile": "build/debug.wasm", + "textFile": "build/debug.wat", + "sourceMap": true, + "debug": true + }, + "release": { + "outFile": "build/release.wasm", + "textFile": "build/release.wat", + "sourceMap": true, + "optimizeLevel": 3, + "shrinkLevel": 0, + "converge": false, + "noAssert": false + } + }, + "options": { + "bindings": "esm" + } +} \ No newline at end of file diff --git a/packages/gearhash-wasm/assembly/index.ts b/packages/gearhash-wasm/assembly/index.ts new file mode 100644 index 0000000000..447e7776f7 --- /dev/null +++ b/packages/gearhash-wasm/assembly/index.ts @@ -0,0 +1,2 @@ +export { DEFAULT_TABLE } from "./table"; +export { nextMatch, nextMatches } from "./next-match"; diff --git a/packages/gearhash-wasm/assembly/next-match.ts b/packages/gearhash-wasm/assembly/next-match.ts new file mode 100644 index 0000000000..1093f77a80 --- /dev/null +++ b/packages/gearhash-wasm/assembly/next-match.ts @@ -0,0 +1,46 @@ +// The entry file of your WebAssembly module. 
+ +import { DEFAULT_TABLE } from "./table"; + +// Interface for the match result +export class MatchResult { + position: i32 = -1; + hash: u64 = 0; +} + +// Function to find the next match in the buffer +export function nextMatch(buf: Uint8Array, mask: u64, hash: u64 = 0): MatchResult { + for (let i = 0; i < buf.length; i++) { + const b = buf[i]; + hash = (hash << 1) + DEFAULT_TABLE[b]; + + if ((hash & mask) == 0) { + return { position: i + 1, hash }; + } + } + + return { position: -1, hash }; // Return -1 position to indicate no match found, along with the final hash +} + +export class NextMatchesResult { + matches: MatchResult[] = []; + hash: u64 = 0; + remaining: i32 = 0; +} + +export function nextMatches(buf: Uint8Array, mask: u64, hash: u64 = 0): NextMatchesResult { + const result = new NextMatchesResult(); + + let match = nextMatch(buf, mask, hash); + let position = 0; + while (match.position !== -1) { + result.matches.push(match); + position += match.position; + match = nextMatch(buf.subarray(position), mask, 0); + } + + result.remaining = buf.length - position; + result.hash = match.hash; + + return result; +} diff --git a/packages/gearhash-wasm/assembly/table.ts b/packages/gearhash-wasm/assembly/table.ts new file mode 100644 index 0000000000..22a9e52df9 --- /dev/null +++ b/packages/gearhash-wasm/assembly/table.ts @@ -0,0 +1,57 @@ +/* eslint-disable @typescript-eslint/no-loss-of-precision */ + +// Define the Table type as a static array of u64 values +export const DEFAULT_TABLE: StaticArray = [ + 0xb088d3a9e840f559, 0x5652c7f739ed20d6, 0x45b28969898972ab, 0x6b0a89d5b68ec777, 0x368f573e8b7a31b7, + 0x1dc636dce936d94b, 0x207a4c4e5554d5b6, 0xa474b34628239acb, 0x3b06a83e1ca3b912, 0x90e78d6c2f02baf7, + 0xe1c92df7150d9a8a, 0x8e95053a1086d3ad, 0x5a2ef4f1b83a0722, 0xa50fac949f807fae, 0x0e7303eb80d8d681, + 0x99b07edc1570ad0f, 0x689d2fb555fd3076, 0x00005082119ea468, 0xc4b08306a88fcc28, 0x3eb0678af6374afd, + 0xf19f87ab86ad7436, 0xf2129fbfbe6bc736, 0x481149575c98a4ed, 
0x0000010695477bc5, 0x1fba37801a9ceacc, + 0x3bf06fd663a49b6d, 0x99687e9782e3874b, 0x79a10673aa50d8e3, 0xe4accf9e6211f420, 0x2520e71f87579071, + 0x2bd5d3fd781a8a9b, 0x00de4dcddd11c873, 0xeaa9311c5a87392f, 0xdb748eb617bc40ff, 0xaf579a8df620bf6f, + 0x86a6e5da1b09c2b1, 0xcc2fc30ac322a12e, 0x355e2afec1f74267, 0x2d99c8f4c021a47b, 0xbade4b4a9404cfc3, + 0xf7b518721d707d69, 0x3286b6587bf32c20, 0x0000b68886af270c, 0xa115d6e4db8a9079, 0x484f7e9c97b2e199, + 0xccca7bb75713e301, 0xbf2584a62bb0f160, 0xade7e813625dbcc8, 0x000070940d87955a, 0x8ae69108139e626f, + 0xbd776ad72fde38a2, 0xfb6b001fc2fcc0cf, 0xc7a474b8e67bc427, 0xbaf6f11610eb5d58, 0x09cb1f5b6de770d1, + 0xb0b219e6977d4c47, 0x00ccbc386ea7ad4a, 0xcc849d0adf973f01, 0x73a3ef7d016af770, 0xc807d2d386bdbdfe, + 0x7f2ac9966c791730, 0xd037a86bc6c504da, 0xf3f17c661eaa609d, 0xaca626b04daae687, 0x755a99374f4a5b07, + 0x90837ee65b2caede, 0x6ee8ad93fd560785, 0x0000d9e11053edd8, 0x9e063bb2d21cdbd7, 0x07ab77f12a01d2b2, + 0xec550255e6641b44, 0x78fb94a8449c14c6, 0xc7510e1bc6c0f5f5, 0x0000320b36e4cae3, 0x827c33262c8b1a2d, + 0x14675f0b48ea4144, 0x267bd3a6498deceb, 0xf1916ff982f5035e, 0x86221b7ff434fb88, 0x9dbecee7386f49d8, + 0xea58f8cac80f8f4a, 0x008d198692fc64d8, 0x6d38704fbabf9a36, 0xe032cb07d1e7be4c, 0x228d21f6ad450890, + 0x635cb1bfc02589a5, 0x4620a1739ca2ce71, 0xa7e7dfe3aae5fb58, 0x0c10ca932b3c0deb, 0x2727fee884afed7b, + 0xa2df1c6df9e2ab1f, 0x4dcdd1ac0774f523, 0x000070ffad33e24e, 0xa2ace87bc5977816, 0x9892275ab4286049, + 0xc2861181ddf18959, 0xbb9972a042483e19, 0xef70cd3766513078, 0x00000513abfc9864, 0xc058b61858c94083, + 0x09e850859725e0de, 0x9197fb3bf83e7d94, 0x7e1e626d12b64bce, 0x520c54507f7b57d1, 0xbee1797174e22416, + 0x6fd9ac3222e95587, 0x0023957c9adfbf3e, 0xa01c7d7e234bbe15, 0xaba2c758b8a38cbb, 0x0d1fa0ceec3e2b30, + 0x0bb6a58b7e60b991, 0x4333dd5b9fa26635, 0xc2fd3b7d4001c1a3, 0xfb41802454731127, 0x65a56185a50d18cb, + 0xf67a02bd8784b54f, 0x696f11dd67e65063, 0x00002022fca814ab, 0x8cd6be912db9d852, 0x695189b6e9ae8a57, + 0xee9453b50ada0c28, 
0xd8fc5ea91a78845e, 0xab86bf191a4aa767, 0x0000c6b5c86415e5, 0x267310178e08a22e, + 0xed2d101b078bca25, 0x3b41ed84b226a8fb, 0x13e622120f28dc06, 0xa315f5ebfb706d26, 0x8816c34e3301bace, + 0xe9395b9cbb71fdae, 0x002ce9202e721648, 0x4283db1d2bb3c91c, 0xd77d461ad2b1a6a5, 0xe2ec17e46eeb866b, + 0xb8e0be4039fbc47c, 0xdea160c4d5299d04, 0x7eec86c8d28c3634, 0x2119ad129f98a399, 0xa6ccf46b61a283ef, + 0x2c52cedef658c617, 0x2db4871169acdd83, 0x0000f0d6f39ecbe9, 0x3dd5d8c98d2f9489, 0x8a1872a22b01f584, + 0xf282a4c40e7b3cf2, 0x8020ec2ccb1ba196, 0x6693b6e09e59e313, 0x0000ce19cc7c83eb, 0x20cb5735f6479c3b, + 0x762ebf3759d75a5b, 0x207bfe823d693975, 0xd77dc112339cd9d5, 0x9ba7834284627d03, 0x217dc513e95f51e9, + 0xb27b1a29fc5e7816, 0x00d5cd9831bb662d, 0x71e39b806d75734c, 0x7e572af006fb1a23, 0xa2734f2f6ae91f85, + 0xbf82c6b5022cddf2, 0x5c3beac60761a0de, 0xcdc893bb47416998, 0x6d1085615c187e01, 0x77f8ae30ac277c5d, + 0x917c6b81122a2c91, 0x5b75b699add16967, 0x0000cf6ae79a069b, 0xf3c40afa60de1104, 0x2063127aa59167c3, + 0x621de62269d1894d, 0xd188ac1de62b4726, 0x107036e2154b673c, 0x0000b85f28553a1d, 0xf2ef4e4c18236f3d, + 0xd9d6de6611b9f602, 0xa1fc7955fb47911c, 0xeb85fd032f298dbd, 0xbe27502fb3befae1, 0xe3034251c4cd661e, + 0x441364d354071836, 0x0082b36c75f2983e, 0xb145910316fa66f0, 0x021c069c9847caf7, 0x2910dfc75a4b5221, + 0x735b353e1c57a8b5, 0xce44312ce98ed96c, 0xbc942e4506bdfa65, 0xf05086a71257941b, 0xfec3b215d351cead, + 0x00ae1055e0144202, 0xf54b40846f42e454, 0x00007fd9c8bcbcc8, 0xbfbd9ef317de9bfe, 0xa804302ff2854e12, + 0x39ce4957a5e5d8d4, 0xffb9e2a45637ba84, 0x55b9ad1d9ea0818b, 0x00008acbf319178a, 0x48e2bfc8d0fbfb38, + 0x8be39841e848b5e8, 0x0e2712160696a08b, 0xd51096e84b44242a, 0x1101ba176792e13a, 0xc22e770f4531689d, + 0x1689eff272bbc56c, 0x00a92a197f5650ec, 0xbc765990bda1784e, 0xc61441e392fcb8ae, 0x07e13a2ced31e4a0, + 0x92cbe984234e9d4d, 0x8f4ff572bb7d8ac5, 0x0b9670c00b963bd0, 0x62955a581a03eb01, 0x645f83e5ea000254, + 0x41fce516cd88f299, 0xbbda9748da7a98cf, 0x0000aab2fe4845fa, 0x19761b069bf56555, 
0x8b8f5e8343b6ad56, + 0x3e5d1cfd144821d9, 0xec5c1e2ca2b0cd8f, 0xfaf7e0fea7fbb57f, 0x000000d3ba12961b, 0xda3f90178401b18e, + 0x70ff906de33a5feb, 0x0527d5a7c06970e7, 0x22d8e773607c13e9, 0xc9ab70df643c3bac, 0xeda4c6dc8abe12e3, + 0xecef1f410033e78a, 0x0024c2b274ac72cb, 0x06740d954fa900b4, 0x1d7a299b323d6304, 0xb3c37cb298cbead5, + 0xc986e3c76178739b, 0x9fabea364b46f58a, 0x6da214c5af85cc56, 0x17a43ed8b7a38f84, 0x6eccec511d9adbeb, + 0xf9cab30913335afb, 0x4a5e60c5f415eed2, 0x00006967503672b4, 0x9da51d121454bb87, 0x84321e13b9bbc816, + 0xfb3d6fb6ab2fdd8d, 0x60305eed8e160a8d, 0xcbbf4b14e9946ce8, 0x00004f63381b10c3, 0x07d5b7816fcc4e10, + 0xe5a536726a6a8155, 0x57afb23447a07fdd, 0x18f346f7abc9d394, 0x636dc655d61ad33d, 0xcc8bab4939f7f3f6, + 0x63c7a906c1dd187b, +]; diff --git a/packages/gearhash-wasm/assembly/tsconfig.json b/packages/gearhash-wasm/assembly/tsconfig.json new file mode 100644 index 0000000000..f81c3d55e6 --- /dev/null +++ b/packages/gearhash-wasm/assembly/tsconfig.json @@ -0,0 +1,6 @@ +{ + "extends": "../node_modules/.pnpm/assemblyscript@0.27.36/node_modules/assemblyscript/std/assembly.json", + "include": [ + "./**/*.ts" + ] +} \ No newline at end of file diff --git a/packages/gearhash-wasm/build/.gitignore b/packages/gearhash-wasm/build/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/gearhash-wasm/build/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/gearhash-wasm/package.json b/packages/gearhash-wasm/package.json new file mode 100644 index 0000000000..ecc3ef79ff --- /dev/null +++ b/packages/gearhash-wasm/package.json @@ -0,0 +1,33 @@ +{ + "name": "@huggingface/gearhash-wasm", + "version": "0.0.1", + "scripts": { + "build:debug": "asc assembly/index.ts --target debug", + "build:release": "asc assembly/index.ts --target release", + "build": "pnpm run build:debug && npm run build:release", + "test": "node tests", + "prepare": "pnpm run build" + }, + "keywords": [ + "gearhash", + "assemblyscript", + 
"assembly", + "wasm" + ], + "type": "module", + "exports": { + ".": { + "import": "./build/release.js", + "types": "./build/release.d.ts" + }, + "./assembly": { + "import": "./assembly/index.ts" + }, + "./wasm": { + "import": "./build/release.wasm" + } + }, + "devDependencies": { + "assemblyscript": "^0.27.36" + } +} diff --git a/packages/gearhash-wasm/pnpm-lock.yaml b/packages/gearhash-wasm/pnpm-lock.yaml new file mode 100644 index 0000000000..cf8533a3b9 --- /dev/null +++ b/packages/gearhash-wasm/pnpm-lock.yaml @@ -0,0 +1,38 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + devDependencies: + assemblyscript: + specifier: ^0.27.36 + version: 0.27.36 + +packages: + + assemblyscript@0.27.36: + resolution: {integrity: sha512-1qX2zf6p7l/mNYv8r21jC/Yft7kX7XKR3xUHw41zvV4xad5lyC8w7jZiwZBGoy64VKZLc+bTDJDWi8Kb70YrHA==} + engines: {node: '>=18', npm: '>=10'} + hasBin: true + + binaryen@116.0.0-nightly.20240114: + resolution: {integrity: sha512-0GZrojJnuhoe+hiwji7QFaL3tBlJoA+KFUN7ouYSDGZLSo9CKM8swQX8n/UcbR0d1VuZKU+nhogNzv423JEu5A==} + hasBin: true + + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + +snapshots: + + assemblyscript@0.27.36: + dependencies: + binaryen: 116.0.0-nightly.20240114 + long: 5.3.2 + + binaryen@116.0.0-nightly.20240114: {} + + long@5.3.2: {} diff --git a/packages/gearhash-wasm/tests/index.js b/packages/gearhash-wasm/tests/index.js new file mode 100644 index 0000000000..d3d220da00 --- /dev/null +++ b/packages/gearhash-wasm/tests/index.js @@ -0,0 +1,117 @@ +import assert from "assert"; +import { nextMatch, nextMatches } from "../build/debug.js"; + +// Simple seeded random number generator +function seededRandom(seed) { + return function () { + seed = (seed * 16807) % 2147483647; + return (seed - 1) / 2147483646; + }; +} + +// Create seeded random data +const seed = 12345; // Fixed seed for 
deterministic results +const random = seededRandom(seed); +const randomData = new Uint8Array(150_000).map(() => Math.floor(random() * 256)); + +// Test with a known mask +assert.deepStrictEqual(nextMatch(randomData, 0x0000d90003530000n), { position: 459, hash: 9546224108073667431n }); +assert.deepStrictEqual(nextMatch(randomData.subarray(459), 0x0000d90003530000n), { + position: 3658, + hash: 4043712133052525799n, +}); + +assert.deepStrictEqual(nextMatches(randomData, 0x0000d90003530000n), { + remaining: 1206, + hash: 18262966296195680063n, + matches: [ + { position: 459, hash: 9546224108073667431n }, + { position: 3658, hash: 4043712133052525799n }, + { position: 2013, hash: 6111702085179831561n }, + { position: 1593, hash: 12901166541873917249n }, + { position: 1566, hash: 7692186462913612151n }, + { position: 211, hash: 16543980755458487441n }, + { position: 1778, hash: 15644384556715661587n }, + { position: 566, hash: 9793366463237592247n }, + { position: 2079, hash: 11221321116171663064n }, + { position: 2940, hash: 1564726223525919786n }, + { position: 809, hash: 15395839328876515337n }, + { position: 946, hash: 10585747199093122759n }, + { position: 854, hash: 4479393852251501569n }, + { position: 436, hash: 15702966577303948694n }, + { position: 2165, hash: 17148900940125069205n }, + { position: 273, hash: 11505890591385615424n }, + { position: 1459, hash: 10774060112464860369n }, + { position: 158, hash: 2233823235057951370n }, + { position: 7, hash: 1983310208686139647n }, + { position: 1926, hash: 4499661659570185271n }, + { position: 1529, hash: 16090517590946392505n }, + { position: 1751, hash: 12536054222087023458n }, + { position: 1222, hash: 334146166487300408n }, + { position: 2230, hash: 6981431015531396608n }, + { position: 826, hash: 11877997991061156988n }, + { position: 33, hash: 8454422284689001989n }, + { position: 1731, hash: 15095819886766624527n }, + { position: 8842, hash: 6362744947164356842n }, + { position: 928, hash: 
3627691864743766239n }, + { position: 684, hash: 1137480049753900759n }, + { position: 5301, hash: 10541554813326859395n }, + { position: 2546, hash: 14704288147532701373n }, + { position: 11856, hash: 9653226176528805511n }, + { position: 650, hash: 12714262162290274678n }, + { position: 1346, hash: 2525679969999819421n }, + { position: 353, hash: 2532749299807420736n }, + { position: 1091, hash: 693561665209300041n }, + { position: 729, hash: 11014435606385442344n }, + { position: 1204, hash: 10083883741570968570n }, + { position: 1671, hash: 12308901096302322810n }, + { position: 1362, hash: 13399339535394154305n }, + { position: 1858, hash: 792389713896955383n }, + { position: 2248, hash: 15568664728418446816n }, + { position: 1790, hash: 4328805983976714464n }, + { position: 634, hash: 722305044694988273n }, + { position: 741, hash: 17978970776495983968n }, + { position: 901, hash: 5911861036065769110n }, + { position: 302, hash: 1334790489764850513n }, + { position: 1435, hash: 16174119877357924758n }, + { position: 61, hash: 12103430617785210167n }, + { position: 1, hash: 35334639850667n }, + { position: 2074, hash: 7449519750512442798n }, + { position: 2061, hash: 1805950971475184864n }, + { position: 1612, hash: 5837797879339327135n }, + { position: 3281, hash: 6649572008787195357n }, + { position: 39, hash: 16137242368496690753n }, + { position: 263, hash: 8133543763164586431n }, + { position: 2333, hash: 17019949823094703325n }, + { position: 1160, hash: 8949503946391874147n }, + { position: 641, hash: 18344573417262448121n }, + { position: 2588, hash: 13345294745157777411n }, + { position: 3116, hash: 7832639641689314418n }, + { position: 4671, hash: 13762161036402935807n }, + { position: 276, hash: 10924644382434953404n }, + { position: 4430, hash: 9045519457622973922n }, + { position: 32, hash: 4188636638659752674n }, + { position: 2470, hash: 1184167847892138852n }, + { position: 694, hash: 11699508361075635892n }, + { position: 1703, hash: 
9012268790677532920n }, + { position: 47, hash: 6528251874505412319n }, + { position: 2672, hash: 8484789019946020371n }, + { position: 202, hash: 1365160724288031760n }, + { position: 467, hash: 10426152000837661087n }, + { position: 496, hash: 3605417399306471847n }, + { position: 3777, hash: 8410473338876477323n }, + { position: 80, hash: 3693273711429567121n }, + { position: 813, hash: 9224216742837123228n }, + { position: 3115, hash: 5150752707627454542n }, + { position: 806, hash: 8797260981186887018n }, + { position: 4915, hash: 1483374079741560715n }, + { position: 2118, hash: 1742900153494554703n }, + { position: 1515, hash: 4635371751468227093n }, + { position: 2393, hash: 15282968615371427111n }, + { position: 4331, hash: 4659818917792066036n }, + { position: 1188, hash: 3862441883651577693n }, + { position: 2663, hash: 8524789558855117254n }, + ], +}); + +console.log("ok"); diff --git a/packages/xetchunk-wasm/asconfig.json b/packages/xetchunk-wasm/asconfig.json new file mode 100644 index 0000000000..8776597856 --- /dev/null +++ b/packages/xetchunk-wasm/asconfig.json @@ -0,0 +1,22 @@ +{ + "targets": { + "debug": { + "outFile": "build/debug.wasm", + "textFile": "build/debug.wat", + "sourceMap": true, + "debug": true + }, + "release": { + "outFile": "build/release.wasm", + "textFile": "build/release.wat", + "sourceMap": true, + "optimizeLevel": 3, + "shrinkLevel": 0, + "converge": false, + "noAssert": false + } + }, + "options": { + "bindings": "esm" + } +} \ No newline at end of file diff --git a/packages/xetchunk-wasm/assembly/index.ts b/packages/xetchunk-wasm/assembly/index.ts new file mode 100644 index 0000000000..d889a1ba0a --- /dev/null +++ b/packages/xetchunk-wasm/assembly/index.ts @@ -0,0 +1 @@ +export * from "./xet-chunker"; diff --git a/packages/xetchunk-wasm/assembly/tsconfig.json b/packages/xetchunk-wasm/assembly/tsconfig.json new file mode 100644 index 0000000000..33daff5dac --- /dev/null +++ b/packages/xetchunk-wasm/assembly/tsconfig.json @@ 
-0,0 +1,4 @@ +{ + "extends": "../node_modules/.pnpm/assemblyscript@0.27.37/node_modules/assemblyscript/std/assembly.json", + "include": ["./**/*.ts"] +} diff --git a/packages/xetchunk-wasm/assembly/xet-chunker.ts b/packages/xetchunk-wasm/assembly/xet-chunker.ts new file mode 100644 index 0000000000..2145684eab --- /dev/null +++ b/packages/xetchunk-wasm/assembly/xet-chunker.ts @@ -0,0 +1,139 @@ +import { nextMatch } from "@huggingface/gearhash-wasm/assembly"; +import { Blake3Hasher } from "@huggingface/blake3-wasm/assembly"; + +// Constants +const TARGET_CHUNK_SIZE: usize = 64 * 1024; // 64KB +const MINIMUM_CHUNK_DIVISOR: usize = 8; +const MAXIMUM_CHUNK_MULTIPLIER: usize = 2; +const HASH_WINDOW_SIZE: usize = 64; + +export class Chunk { + hash: Uint8Array; + data: Uint8Array; + + constructor(hash: Uint8Array, data: Uint8Array) { + this.hash = hash; + this.data = data; + } +} + +// Type for the next() method return value +export class NextResult { + chunk: Chunk | null; + bytesConsumed: usize; + + constructor(chunk: Chunk | null, bytesConsumed: usize) { + this.chunk = chunk; + this.bytesConsumed = bytesConsumed; + } +} + +export class XetChunker { + private minimumChunk: usize; + private maximumChunk: usize; + private mask: u64; + private chunkBuf: Uint8Array; + private curChunkLen: usize; + private hash: u64; + + constructor(targetChunkSize: usize = TARGET_CHUNK_SIZE) { + // Validate target chunk size is a power of 2 + assert((targetChunkSize & (targetChunkSize - 1)) == 0, "Target chunk size must be a power of 2"); + assert(targetChunkSize > HASH_WINDOW_SIZE, "Target chunk size must be greater than hash window size"); + assert(targetChunkSize < u32.MAX_VALUE, "Target chunk size must be less than u32.MAX_VALUE"); + + let mask = (targetChunkSize - 1) as u64; + // Shift mask left by leading zeros count + mask = mask << (64 - clz(mask)); + + this.minimumChunk = targetChunkSize / MINIMUM_CHUNK_DIVISOR; + this.maximumChunk = targetChunkSize * MAXIMUM_CHUNK_MULTIPLIER; + 
this.mask = mask; + this.chunkBuf = new Uint8Array(this.maximumChunk); + this.curChunkLen = 0; + this.hash = 0; + } + + next(data: Uint8Array, isFinal: boolean): NextResult { + const nBytes = data.length; + let createChunk = false; + let consumeLen: usize = 0; + + if (nBytes != 0) { + // Skip minimum chunk size + if (this.curChunkLen + HASH_WINDOW_SIZE < this.minimumChunk) { + const maxAdvance = min(this.minimumChunk - this.curChunkLen - HASH_WINDOW_SIZE - 1, nBytes - consumeLen); + consumeLen += maxAdvance; + this.curChunkLen += maxAdvance; + } + + // Calculate read end + const readEnd = min(nBytes, consumeLen + this.maximumChunk - this.curChunkLen); + + let bytesToNextBoundary: usize; + const matchResult = nextMatch(data.subarray(consumeLen, readEnd), this.mask, this.hash); + + if (matchResult.position != -1) { + bytesToNextBoundary = matchResult.position; + createChunk = true; + this.hash = matchResult.hash; + } else { + bytesToNextBoundary = readEnd - consumeLen; + this.hash = matchResult.hash; + } + + // Check if we hit maximum chunk + if (bytesToNextBoundary + this.curChunkLen >= this.maximumChunk) { + bytesToNextBoundary = this.maximumChunk - this.curChunkLen; + createChunk = true; + } + + this.curChunkLen += bytesToNextBoundary; + consumeLen += bytesToNextBoundary; + + // Copy data to chunk buffer + this.chunkBuf.set(data.subarray(0, consumeLen), this.curChunkLen - consumeLen); + } + + if (createChunk || (isFinal && this.curChunkLen > 0)) { + const chunkData = this.chunkBuf.subarray(0, this.curChunkLen); + const chunk = new Chunk(computeDataHash(chunkData), chunkData); + this.curChunkLen = 0; + this.hash = 0; + return new NextResult(chunk, consumeLen); + } + + return new NextResult(null, consumeLen); + } + + nextBlock(data: Uint8Array, isFinal: boolean): Chunk[] { + const chunks: Chunk[] = []; + let pos: usize = 0; + + while (pos < data.length) { + const result = this.next(data.subarray(pos), isFinal); + if (result.chunk) { + chunks.push(result.chunk); + } 
+ pos += result.bytesConsumed; + } + + return chunks; + } + + finish(): Chunk | null { + return this.next(new Uint8Array(0), true).chunk; + } +} + +function computeDataHash(data: Uint8Array): Uint8Array { + const hasher = new Blake3Hasher(); + hasher.update(data); + const hash = new Uint8Array(32); + hasher.finalize(hash); + return hash; +} + +export function createXetChunker(targetChunkSize: usize = TARGET_CHUNK_SIZE): XetChunker { + return new XetChunker(targetChunkSize); +} diff --git a/packages/xetchunk-wasm/build/.gitignore b/packages/xetchunk-wasm/build/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/xetchunk-wasm/build/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/xetchunk-wasm/package.json b/packages/xetchunk-wasm/package.json new file mode 100644 index 0000000000..1842aa1c47 --- /dev/null +++ b/packages/xetchunk-wasm/package.json @@ -0,0 +1,39 @@ +{ + "name": "@huggingface/xetchunk-wasm", + "version": "0.0.1", + "scripts": { + "build:debug": "asc assembly/index.ts --target debug", + "build:release": "asc assembly/index.ts --target release", + "build": "pnpm run build:debug && npm run build:release", + "test": "node tests", + "prepare": "pnpm run build" + }, + "keywords": [ + "xet", + "chunk", + "chunking", + "assemblyscript", + "assembly", + "wasm" + ], + "dependencies": { + "@huggingface/blake3-wasm": "workspace:*", + "@huggingface/gearhash-wasm": "workspace:*" + }, + "type": "module", + "exports": { + ".": { + "import": "./build/release.js", + "types": "./build/release.d.ts" + }, + "./assembly": { + "import": "./assembly/index.ts" + }, + "./wasm": { + "import": "./build/release.wasm" + } + }, + "devDependencies": { + "assemblyscript": "^0.27.36" + } +} diff --git a/packages/xetchunk-wasm/pnpm-lock.yaml b/packages/xetchunk-wasm/pnpm-lock.yaml new file mode 100644 index 0000000000..32bdab0b41 --- /dev/null +++ b/packages/xetchunk-wasm/pnpm-lock.yaml @@ -0,0 +1,45 @@ +lockfileVersion: '9.0' + 
+settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + dependencies: + '@huggingface/blake3-wasm': + specifier: workspace:* + version: link:../blake3-wasm + '@huggingface/gearhash-wasm': + specifier: workspace:* + version: link:../gearhash-wasm + devDependencies: + assemblyscript: + specifier: ^0.27.36 + version: 0.27.37 + +packages: + + assemblyscript@0.27.37: + resolution: {integrity: sha512-YtY5k3PiV3SyUQ6gRlR2OCn8dcVRwkpiG/k2T5buoL2ymH/Z/YbaYWbk/f9mO2HTgEtGWjPiAQrIuvA7G/63Gg==} + engines: {node: '>=18', npm: '>=10'} + hasBin: true + + binaryen@116.0.0-nightly.20240114: + resolution: {integrity: sha512-0GZrojJnuhoe+hiwji7QFaL3tBlJoA+KFUN7ouYSDGZLSo9CKM8swQX8n/UcbR0d1VuZKU+nhogNzv423JEu5A==} + hasBin: true + + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + +snapshots: + + assemblyscript@0.27.37: + dependencies: + binaryen: 116.0.0-nightly.20240114 + long: 5.3.2 + + binaryen@116.0.0-nightly.20240114: {} + + long@5.3.2: {} diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index 08e651bb73..bc118ffdff 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -14,3 +14,6 @@ packages: - "packages/ollama-utils" - "packages/mcp-client" - "packages/tiny-agents" + - "packages/gearhash-wasm" + - "packages/blake3-wasm" + - "packages/xetchunk-wasm" diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000000..fbe8ff6fda --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,13 @@ +{ + "compilerOptions": { + "target": "ESNext", + "module": "ESNext", + "moduleResolution": "node", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "lib": ["ESNext"], + "types": ["assemblyscript"] + } +}