Potential incorrect implementation
GuillaumeLeclerc opened this issue · comments
Hello,
I have this piece of code that compares the results of the js-xxhash
implementation with the reference one.
const { hash: binaryProc } = require('xxhash')
const { h32: jsProc } = require('xxhashjs');
const data = 'this is some random piece of data';
const {UINT32: UINT} = require('cuint');
const random = require('random-buffer');
// Hash `data` with the native xxhash binding.
// Accepts the seed as a hex string (or a Buffer) and the data as an
// ascii string (or a Buffer); returns the digest as a hex string.
function hashBinary(data, seed) {
  const seedBuf = typeof seed === 'string' ? Buffer.from(seed, 'hex') : seed;
  const dataBuf = typeof data === 'string' ? Buffer.from(data, 'ascii') : data;
  return binaryProc(dataBuf, seedBuf, 'hex');
}
// Hash `data` with the pure-JS xxhashjs implementation, formatting the
// result to match the native binding's 'hex' output (little-endian bytes).
// `seed` is a hex string; it is parsed into a cuint UINT32.
function hashJS(data, seed) {
  seed = new UINT(0).fromString(seed, 16);
  // BUG FIX: toString(16) drops leading zeros, so hashes below 0x10000000
  // yield fewer than 8 hex chars; the byte-pair split below then misaligns
  // and the reversed string can never equal the native output (~1 in 16
  // hashes — the unexplained residual mismatches). Pad to 8 chars first.
  const hex = jsProc(data, seed).toString(16).padStart(8, '0');
  // Reverse the byte pairs: big-endian hex -> the little-endian byte
  // order the native binding emits for 'hex'.
  return hex.match(/.{2}/g).reverse().join("");
}
// Compare both implementations over COUNT random 4-byte seeds and report
// the percentage of agreements; mismatching seeds are kept for inspection.
let valid = 0;
const invalid = [];
const COUNT = 10000;
for (let trial = 0; trial < COUNT; trial += 1) {
  const buf = random(4);
  const seed = buf.toString('hex');
  const fromNative = hashBinary(data, seed);
  const fromJS = hashJS(data, seed);
  if (fromNative === fromJS) {
    valid += 1;
  } else {
    invalid.push(new Uint8Array(buf.buffer));
  }
}
console.log((valid / COUNT) * 100);
As we can see only 12.5% of the seeds actually return the same result with the two implementations.
I tried to figure it out and I might have a clue.
const { hash: binaryProc } = require('xxhash')
const { h32: jsProc } = require('xxhashjs');
const data = 'this is some random piece of data';
const {UINT32: UINT} = require('cuint');
const random = require('random-buffer');
// Clamp every byte of `seed` into the 0-127 range (strip the high bit)
// in place, then return the same buffer for call-site convenience.
function normalize_seed(seed) {
  let idx = 0;
  while (idx < seed.length) {
    seed[idx] %= 128;
    idx += 1;
  }
  return seed;
}
// Run the native xxhash binding over `data`.
// String inputs are normalized to Buffers first: the seed is decoded
// from hex, the data from ascii. Returns the digest as a hex string.
function hashBinary(data, seed) {
  const seedBuf = typeof seed === 'string' ? Buffer.from(seed, 'hex') : seed;
  const dataBuf = typeof data === 'string' ? Buffer.from(data, 'ascii') : data;
  return binaryProc(dataBuf, seedBuf, 'hex');
}
// Hash `data` with the pure-JS xxhashjs implementation and format the
// digest like the native binding's 'hex' output (little-endian bytes).
// `seed` is a hex string, parsed into a cuint UINT32.
function hashJS(data, seed) {
  seed = new UINT(0).fromString(seed, 16);
  // BUG FIX: toString(16) omits leading zeros, so any hash with a zero
  // top nibble (~1 in 16) comes back shorter than 8 chars, misaligning
  // the byte-pair split below. Pad to 8 hex chars before splitting.
  const hex = jsProc(data, seed).toString(16).padStart(8, '0');
  // Byte-pair reversal: big-endian hex -> native binding's byte order.
  return hex.match(/.{2}/g).reverse().join("");
}
// Same comparison as before, but with each random seed's bytes first
// masked into 0-127 via normalize_seed; prints the agreement percentage.
let valid = 0;
const invalid = [];
const COUNT = 10000;
for (let trial = 0; trial < COUNT; trial += 1) {
  const buf = random(4);
  normalize_seed(buf);
  const seed = buf.toString('hex');
  const fromNative = hashBinary(data, seed);
  const fromJS = hashJS(data, seed);
  if (fromNative === fromJS) {
    valid += 1;
  } else {
    invalid.push(new Uint8Array(buf.buffer));
  }
}
console.log((valid / COUNT) * 100);
In this case I only use seeds whose bytes are between 0 and 127, and the ratio of correct output reaches 93%. My guess is that the C implementation uses char (which is signed) while this one uses unsigned char, so the two interpret the high bit of each byte differently.
However there must be another problem that makes the other 7% wrong.
Do you have any idea what is going on there ?
Hmm, this is interesting. I am not sure where the bug(s) lie: be it cuint or the js xxh code.
Unfortunately I am strapped on time right now so will have a look in a best effort way.
Same problem here
We know it is open source and I don't want you to feel any pressure (I know the feeling). But did you have time to look at it already ? Do you have any clue where it could come from so I could potentially start investigating ?
I am sorry, I haven't had the time to look into this. I suspect the bug is in the cuint library.
I have made some research on this and it appears I am getting wrong checksums not from js-xxhash but from xxhash itself.
I compared the results of js-xxhash, xxhash and the C reference implementation and got (picking random seeds):
seed | js-xxhash | xxhash | C xxhash |
---|---|---|---|
0xf00f85ee | 8cb0299d | 27a08d7c | 8cb0299d |
0xb1164e9f | 116249b8 | f9fbff2b | 116249b8 |
Thoughts?
Any updates on this? How does this affect the use of the library though?
In which case I am closing this issue. Feel free to reopen one if needed.