Does not agree with Saltpack / GMP
coolaj86 opened this issue · comments
Reference Strings
For reference, here's the output of this library (jxskiss) compared to saltpack, which also seems to be compatible with the GMP and GnuPG Base62 implementations. The outputs differ even when using the same alphabet:
enc := base62.NewEncoding("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
Raw : Hello, 世界 (13 bytes)
Base64 : SGVsbG8sIOS4lueVjA (18 chars)
jxskiss : CMvvMYBvWmoRinMP81 (18 chars)
saltpack: 1wJfrzvdbuFbL65vcS (18 chars)
Raw : Hello World (11 bytes)
Base64 : SGVsbG8gV29ybGQ (15 chars)
jxskiss : ancSlT58ln6RbX4 (15 chars)
saltpack: 73XpUgyMwkGr29M (15 chars)
Raw : [0] (1 bytes)
Base64 : AA (2 chars)
jxskiss : 00 (2 chars)
saltpack: 00 (2 chars)
Raw : [0 0 0 0 0 0 0 0 0 0 0 0] (12 bytes)
Base64 : AAAAAAAAAAAAAAAA (16 chars)
jxskiss : 0000000000000000 (16 chars)
saltpack: 00000000000000000 (17 chars)
Raw : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] (24 bytes)
Base64 : AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA (32 chars)
jxskiss : 00000000000000000000000000000000 (32 chars)
saltpack: 000000000000000000000000000000000 (33 chars)
Raw : [0 0 0 0 255 255 255 255] (8 bytes)
Base64 : AAAAAP____8 (11 chars)
jxskiss : VVVVVV300000 (12 chars)
saltpack: 000004gfFC3 (11 chars)
Raw : [255 255 255 255 0 0 0 0] (8 bytes)
Base64 : _____wAAAAA (11 chars)
jxskiss : 00000yVVVVV7 (12 chars)
saltpack: LygHZwPV2MC (11 chars)
As you can see, it's not just a matter of the output being reversed, but the character sequences are entirely different.
Test Output Reference
package main
import (
"encoding/base64"
"fmt"
"github.com/jxskiss/base62"
"github.com/keybase/saltpack/encoding/basex"
)
// main encodes a fixed set of sample inputs three ways — jxskiss/base62 with
// an explicit alphabet, unpadded URL-safe Base64, and saltpack's standard
// Base62 — and prints the results side by side with their lengths.
func main() {
	enc := base62.NewEncoding("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
	for _, src := range [][]byte{
		[]byte("Hello, 世界"),
		[]byte("Hello World"),
		{0},
		{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
		{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
		{0, 0, 0, 0, 255, 255, 255, 255},
		{255, 255, 255, 255, 0, 0, 0, 0},
	} {
		b62 := enc.Encode(src)
		b64 := base64.RawURLEncoding.EncodeToString(src)
		sp62 := basex.Base62StdEncoding.EncodeToString(src)

		// Samples that start with a zero byte, or have 0xFF as their second
		// byte, are printed as a byte list; everything else is printed as
		// text. Both index accesses are length-guarded so that an empty or
		// one-byte sample cannot trigger an index-out-of-range panic.
		asBytes := (len(src) > 0 && src[0] == 0x0) || (len(src) > 1 && src[1] == 255)
		if asBytes {
			fmt.Printf("Raw : %v (%d bytes)\n", src, len(src))
		} else {
			fmt.Printf("Raw : %v (%d bytes)\n", string(src), len(src))
		}
		fmt.Printf("Base64 : %s (%d chars)\n", b64, len(b64))
		fmt.Printf("jxskiss : %s (%d chars)\n", b62, len(b62))
		fmt.Printf("saltpack: %s (%d chars)\n", sp62, len(sp62))
		fmt.Println("")
	}
}
This implementation is not string-compatible with typical big-int based implementations. As a Base62 encoder and decoder, however, it is correct: it encodes arbitrary bytes to a string using 62 characters, then correctly decodes that string back to the original bytes. The correctness is tested against a large amount of random bytes; see the test file https://github.com/jxskiss/base62/blob/master/base62_test.go.
This implementation is much more performant than typical big-int based implementations. The reversed order of bytes is an implementation detail, and it is the main reason we can get this performance.
The following test code gives this:
(anaconda3-2018.12) ➜ temp git:(master) ✗ go test -count=1 ./testbase62
ok temp/testbase62 0.011s
(anaconda3-2018.12) ➜ temp git:(master) ✗ go test -run=none -bench=. ./testbase62
goos: darwin
goarch: amd64
pkg: temp/testbase62
cpu: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
Benchmark_jxskiss_Base62_1K-12 95632 11263 ns/op
Benchmark_saltpack_Base62_1K-12 13999 87196 ns/op
Benchmark_jxskiss_Base62_4K-12 25191 44248 ns/op
Benchmark_saltpack_Base62_4K-12 3537 347388 ns/op
Benchmark_jxskiss_Base62_1M-12 100 11347393 ns/op
Benchmark_saltpack_Base62_1M-12 13 88337949 ns/op
PASS
ok temp/testbase62 8.553s
package testbase62
import (
"bytes"
"crypto/rand"
"testing"
"github.com/jxskiss/base62"
"github.com/keybase/saltpack/encoding/basex"
)
// Random input buffers of 1 KiB, 4 KiB and 1 MiB. They are allocated and
// filled once by init and then only read by the test and benchmarks.
var (
	bytes1K []byte
	bytes4K []byte
	bytes1M []byte
)

// init allocates the shared input buffers and fills each one with bytes from
// crypto/rand. It panics on the first read failure and includes the
// underlying error in the panic message so the cause is not lost.
func init() {
	bytes1K = make([]byte, 1024)
	bytes4K = make([]byte, 4096)
	bytes1M = make([]byte, 1024*1024)
	for _, buf := range [][]byte{bytes1K, bytes4K, bytes1M} {
		if _, err := rand.Read(buf); err != nil {
			panic("rand.Read got error: " + err.Error())
		}
	}
}
// TestCorrectness checks that both Base62 implementations round-trip the
// shared 4 KiB random buffer: encoding and then decoding must return the
// original bytes. Each library runs in its own subtest and reports failures
// through the testing API, so one failure neither aborts the test binary nor
// hides the result of the other library.
func TestCorrectness(t *testing.T) {
	t.Run("jxskiss", func(t *testing.T) {
		encoded := base62.Encode(bytes4K)
		decoded, err := base62.Decode(encoded)
		if err != nil {
			t.Fatalf("base62 error: %v", err)
		}
		if !bytes.Equal(bytes4K, decoded) {
			t.Fatal("base62 not equal")
		}
	})

	t.Run("saltpack", func(t *testing.T) {
		encoded := make([]byte, basex.Base62StdEncoding.EncodedLen(len(bytes4K)))
		basex.Base62StdEncoding.Encode(encoded, bytes4K)

		decoded := make([]byte, basex.Base62StdEncoding.DecodedLen(len(encoded)))
		nDec, err := basex.Base62StdEncoding.Decode(decoded, encoded)
		if err != nil {
			t.Fatalf("basex error: %v", err)
		}
		// Keep only the nDec bytes that Decode reports as written.
		decoded = decoded[:nDec]
		if !bytes.Equal(bytes4K, decoded) {
			t.Fatal("basex not equal")
		}
	})
}
// benchSink receives every encoded result so the encode calls in the
// benchmark loops have an observable effect and cannot be discarded.
var benchSink []byte

// benchJxskiss measures base62.Encode on data, reporting throughput relative
// to the input size.
func benchJxskiss(b *testing.B, data []byte) {
	b.SetBytes(int64(len(data)))
	for i := 0; i < b.N; i++ {
		benchSink = base62.Encode(data)
	}
}

// benchSaltpack measures basex.Base62StdEncoding.Encode on data, reporting
// throughput relative to the input size.
func benchSaltpack(b *testing.B, data []byte) {
	n := basex.Base62StdEncoding.EncodedLen(len(data))
	b.SetBytes(int64(len(data)))
	for i := 0; i < b.N; i++ {
		// The output buffer is allocated on every iteration, as in the
		// original benchmark; presumably this mirrors base62.Encode returning
		// a new slice per call — confirm against the library.
		buf := make([]byte, n)
		basex.Base62StdEncoding.Encode(buf, data)
		benchSink = buf
	}
}

// Benchmark_jxskiss_Base62_1K encodes the 1 KiB buffer with jxskiss/base62.
func Benchmark_jxskiss_Base62_1K(b *testing.B) { benchJxskiss(b, bytes1K) }

// Benchmark_saltpack_Base62_1K encodes the 1 KiB buffer with saltpack basex.
func Benchmark_saltpack_Base62_1K(b *testing.B) { benchSaltpack(b, bytes1K) }

// Benchmark_jxskiss_Base62_4K encodes the 4 KiB buffer with jxskiss/base62.
func Benchmark_jxskiss_Base62_4K(b *testing.B) { benchJxskiss(b, bytes4K) }

// Benchmark_saltpack_Base62_4K encodes the 4 KiB buffer with saltpack basex.
func Benchmark_saltpack_Base62_4K(b *testing.B) { benchSaltpack(b, bytes4K) }

// Benchmark_jxskiss_Base62_1M encodes the 1 MiB buffer with jxskiss/base62.
func Benchmark_jxskiss_Base62_1M(b *testing.B) { benchJxskiss(b, bytes1M) }

// Benchmark_saltpack_Base62_1M encodes the 1 MiB buffer with saltpack basex.
func Benchmark_saltpack_Base62_1M(b *testing.B) { benchSaltpack(b, bytes1M) }
I hope to find some time later to add the implementation details to the README.