emmansun / gmsm

ShangMi (SM) cipher suites for golang (Go语言商用密码软件)

Home Page:https://emmansun.github.io/gmsm

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

sm3: amd64 AVX版本,遇到不支持指令问题

zhangyongding opened this issue · comments

version: v1.21.1

Running tool: /usr/local/go/bin/go test -timeout 30s -run ^ExampleSum$ github.com/emmansun/gmsm/sm3 -count=1 -v

=== RUN ExampleSum
SIGILL: illegal instruction
PC=0x508c04 m=0 sigcode=2
instruction bytes: 0xc4 0x42 0x68 0xf2 0xea 0xc5 0xf9 0xef 0xc1 0x41 0x9 0xfd 0x45 0x1 0xee 0xc1

goroutine 1 [running]:
github.com/emmansun/gmsm/sm3.blockSIMD(0xc0000939d8, {0xc0000939f8, 0x40, 0x40})
/home/godev/go/src/gmsm/sm3/sm3block_simd_amd64.s:764 +0x2ee4 fp=0xc000093870 sp=0xc000093828 pc=0x508c04
github.com/emmansun/gmsm/sm3.block(0x471a25?, {0xc0000939f8?, 0xc000093920?, 0x471a5a?})
/home/godev/go/src/gmsm/sm3/sm3block_amd64.go:25 +0x4c fp=0xc0000938a0 sp=0xc000093870 pc=0x4fdfcc
github.com/emmansun/gmsm/sm3.(*digest).Write(0xc0000939d8, {0xc000093938, 0x34, 0x48})
/home/godev/go/src/gmsm/sm3/sm3.go:171 +0xb7 fp=0xc0000938f8 sp=0xc0000938a0 pc=0x4fdc97
github.com/emmansun/gmsm/sm3.(*digest).checkSum(0xc0000939d8)
/home/godev/go/src/gmsm/sm3/sm3.go:144 +0xba fp=0xc000093990 sp=0xc0000938f8 pc=0x4fdb1a
github.com/emmansun/gmsm/sm3.Sum({0xc000093a94?, 0x7fafef6465b8?, 0x18?})
/home/godev/go/src/gmsm/sm3/sm3.go:212 +0xb6 fp=0xc000093a58 sp=0xc000093990 pc=0x4fdf16
github.com/emmansun/gmsm/sm3_test.ExampleSum()
/home/godev/go/src/gmsm/sm3/example_test.go:13 +0x3b fp=0xc000093ae0 sp=0xc000093a58 pc=0x50c7fb
testing.runExample({{0x542c11, 0xa}, 0x550110, {0x54e4ef, 0x41}, 0x0})
/usr/local/go/src/testing/run_example.go:63 +0x2cd fp=0xc000093bd8 sp=0xc000093ae0 pc=0x4b8aad
testing.runExamples(0xc000093d98, {0x63a180?, 0x2, 0x5?})
/usr/local/go/src/testing/example.go:44 +0x171 fp=0xc000093c78 sp=0xc000093bd8 pc=0x4b4471
testing.(*M).Run(0xc0000a80a0)
/usr/local/go/src/testing/testing.go:1927 +0x6e6 fp=0xc000093ec0 sp=0xc000093c78 pc=0x4be626
main.main()
_testmain.go:73 +0x19c fp=0xc000093f40 sp=0xc000093ec0 pc=0x50cbbc
runtime.main()
/usr/local/go/src/runtime/proc.go:267 +0x2bb fp=0xc000093fe0 sp=0xc000093f40 pc=0x4391fb
runtime.goexit()
/usr/local/go/src/runtime/asm_amd64.s:1650 +0x1 fp=0xc000093fe8 sp=0xc000093fe0 pc=0x468b01

goroutine 2 [force gc (idle)]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
/usr/local/go/src/runtime/proc.go:398 +0xce fp=0xc000052fa8 sp=0xc000052f88 pc=0x43966e
runtime.goparkunlock(...)
/usr/local/go/src/runtime/proc.go:404
runtime.forcegchelper()
/usr/local/go/src/runtime/proc.go:322 +0xb3 fp=0xc000052fe0 sp=0xc000052fa8 pc=0x4394d3
runtime.goexit()
/usr/local/go/src/runtime/asm_amd64.s:1650 +0x1 fp=0xc000052fe8 sp=0xc000052fe0 pc=0x468b01
created by runtime.init.6 in goroutine 1
/usr/local/go/src/runtime/proc.go:310 +0x1a

goroutine 3 [GC sweep wait]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
/usr/local/go/src/runtime/proc.go:398 +0xce fp=0xc000053778 sp=0xc000053758 pc=0x43966e
runtime.goparkunlock(...)
/usr/local/go/src/runtime/proc.go:404
runtime.bgsweep(0x0?)
/usr/local/go/src/runtime/mgcsweep.go:280 +0x94 fp=0xc0000537c8 sp=0xc000053778 pc=0x4240f4
runtime.gcenable.func1()
/usr/local/go/src/runtime/mgc.go:200 +0x25 fp=0xc0000537e0 sp=0xc0000537c8 pc=0x419285
runtime.goexit()
/usr/local/go/src/runtime/asm_amd64.s:1650 +0x1 fp=0xc0000537e8 sp=0xc0000537e0 pc=0x468b01
created by runtime.gcenable in goroutine 1
/usr/local/go/src/runtime/mgc.go:200 +0x66

goroutine 4 [GC scavenge wait]:
runtime.gopark(0xc000028070?, 0x572560?, 0x1?, 0x0?, 0xc000006d00?)
/usr/local/go/src/runtime/proc.go:398 +0xce fp=0xc000053f70 sp=0xc000053f50 pc=0x43966e
runtime.goparkunlock(...)
/usr/local/go/src/runtime/proc.go:404
runtime.(*scavengerState).park(0x63f6c0)
/usr/local/go/src/runtime/mgcscavenge.go:425 +0x49 fp=0xc000053fa0 sp=0xc000053f70 pc=0x421989
runtime.bgscavenge(0x0?)
/usr/local/go/src/runtime/mgcscavenge.go:653 +0x3c fp=0xc000053fc8 sp=0xc000053fa0 pc=0x421f1c
runtime.gcenable.func2()
/usr/local/go/src/runtime/mgc.go:201 +0x25 fp=0xc000053fe0 sp=0xc000053fc8 pc=0x419225
runtime.goexit()
/usr/local/go/src/runtime/asm_amd64.s:1650 +0x1 fp=0xc000053fe8 sp=0xc000053fe0 pc=0x468b01
created by runtime.gcenable in goroutine 1
/usr/local/go/src/runtime/mgc.go:201 +0xa5

goroutine 17 [finalizer wait]:
runtime.gopark(0x400000?, 0x100052670?, 0x0?, 0x0?, 0x66dfc0?)
/usr/local/go/src/runtime/proc.go:398 +0xce fp=0xc000052628 sp=0xc000052608 pc=0x43966e
runtime.runfinq()
/usr/local/go/src/runtime/mfinal.go:193 +0x107 fp=0xc0000527e0 sp=0xc000052628 pc=0x418307
runtime.goexit()
/usr/local/go/src/runtime/asm_amd64.s:1650 +0x1 fp=0xc0000527e8 sp=0xc0000527e0 pc=0x468b01
created by runtime.createfing in goroutine 1
/usr/local/go/src/runtime/mfinal.go:163 +0x3d

goroutine 18 [IO wait]:
runtime.gopark(0x0?, 0xb?, 0x0?, 0x0?, 0x6?)
/usr/local/go/src/runtime/proc.go:398 +0xce fp=0xc00004e578 sp=0xc00004e558 pc=0x43966e
runtime.netpollblock(0x470fd8?, 0x404a86?, 0x0?)
/usr/local/go/src/runtime/netpoll.go:564 +0xf7 fp=0xc00004e5b0 sp=0xc00004e578 pc=0x432377
internal/poll.runtime_pollWait(0x7fafa8ab9f18, 0x72)
/usr/local/go/src/runtime/netpoll.go:343 +0x85 fp=0xc00004e5d0 sp=0xc00004e5b0 pc=0x464265
internal/poll.(*pollDesc).wait(0xc0000ae180?, 0xc000108000?, 0x1)
/usr/local/go/src/internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00004e5f8 sp=0xc00004e5d0 pc=0x484807
internal/poll.(*pollDesc).waitRead(...)
/usr/local/go/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Read(0xc0000ae180, {0xc000108000, 0x8000, 0x8000})
/usr/local/go/src/internal/poll/fd_unix.go:164 +0x27a fp=0xc00004e690 sp=0xc00004e5f8 pc=0x484dda
os.(*File).read(...)
/usr/local/go/src/os/file_posix.go:29
os.(*File).Read(0xc000098050, {0xc000108000?, 0xc000072020?, 0x7fafef6521c0?})
/usr/local/go/src/os/file.go:118 +0x52 fp=0xc00004e6d0 sp=0xc00004e690 pc=0x4869d2
io.copyBuffer({0x574100, 0xc000072020}, {0x5740a0, 0xc000098050}, {0x0, 0x0, 0x0})
/usr/local/go/src/io/io.go:430 +0x1a6 fp=0xc00004e750 sp=0xc00004e6d0 pc=0x482966
io.Copy(...)
/usr/local/go/src/io/io.go:389
testing.runExample.func1()
/usr/local/go/src/testing/run_example.go:37 +0x66 fp=0xc00004e7e0 sp=0xc00004e750 pc=0x4b8ce6
runtime.goexit()
/usr/local/go/src/runtime/asm_amd64.s:1650 +0x1 fp=0xc00004e7e8 sp=0xc00004e7e0 pc=0x468b01
created by testing.runExample in goroutine 1
/usr/local/go/src/testing/run_example.go:35 +0x207

rax 0x7d4193f2
rbx 0x28aec64d
rcx 0x8eef0a9e
rdx 0xa77f37be
rdi 0x254c0630
rsi 0xc0000939d8
rbp 0xc000093860
rsp 0xc000093828
r8 0x7827d881
r9 0x3d4c4e71
r10 0xfcffa058
r11 0x1342c50b
r12 0xe653422d
r13 0xcef029e
r14 0x12498ab
r15 0x7
rip 0x508c04
rflags 0x10206
cs 0x33
fs 0x0
gs 0x0
FAIL github.com/emmansun/gmsm/sm3 0.023s

v1.21.0版本正常

查了一下,应该没有用到非AVX指令啊

    VPALIGNR $12, XDWORD0, XDWORD1, XTMP0; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
    VPSLLD   $7, XTMP0, XTMP1;             \
    VPSRLD   $(32-7), XTMP0, XTMP0;        \
    VPOR     XTMP0, XTMP1, XTMP1;          \ // XTMP1 = W[-13] rol 7
    VPALIGNR $8, XDWORD2, XDWORD3, XTMP0;  \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
    VPXOR   XTMP1, XTMP0, XTMP0;           \ // XTMP0 = W[-6] XOR (W[-13] rol 7) 

    // Prepare P1 parameters 
    VPALIGNR $12, XDWORD1, XDWORD2, XTMP1; \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
    VPXOR XDWORD0, XTMP1, XTMP1;           \ // XTMP1 = W[-9] XOR W[-16]
    VPSHUFD $0xA5, XDWORD3, XTMP2;         \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
    VPSRLQ  $17, XTMP2, XTMP2;             \ // XTMP2 = W[-3] ror 17 {xBxA}
    VPXOR   XTMP1, XTMP2, XTMP2;           \ // XTMP2 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {xxxA}

    // P1
    VPSHUFD $0x00, XTMP2, XTMP2;           \ // XTMP2 = {AAAA}
    VPSRLQ  $17, XTMP2, XTMP3;             \ // XTMP3 = XTMP2 rol 15 {xxxA}
    VPSRLQ  $9, XTMP2, XTMP4;              \ // XTMP4 = XTMP2 rol 23 {xxxA}
    VPXOR    XTMP2, XTMP4, XTMP4;          \ // XTMP4 = XTMP2 XOR (XTMP2 rol 23 {xxxA})
    VPXOR    XTMP4, XTMP3, XTMP4;          \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxxA}) XOR (XTMP2 rol 23 {xxxA})

    // First 1 words message schedule result
    VPXOR    XTMP4, XTMP0, XTMP2;          \ // XTMP2 = {..., ..., ..., W[0]}

    // Prepare P1 parameters
    VPALIGNR $4, XDWORD3, XTMP2, XTMP3;    \ // XTMP3 = {W[0], w15, w14, w13}
    VPSLLD $15, XTMP3, XTMP4;              \
    VPSRLD  $(32-15), XTMP3, XTMP3;        \
    VPOR XTMP3, XTMP4, XTMP4;              \ // XTMP4 = W[-3] rol 15 {DCBA}
    VPXOR   XTMP1, XTMP4, XTMP4;           \ // XTMP4 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {DCBA}

    // P1	
    VPSLLD   $15, XTMP4, XTMP2;            \ 
    VPSRLD   $(32-15), XTMP4, XTMP3;       \
    VPOR     XTMP3, XTMP2, XTMP3;          \ // XTMP3 = XTMP4 rol 15 {DCBA}
    VPSHUFB  r08_mask<>(SB), XTMP3, XTMP1; \ // XTMP1 = XTMP4 rol 23 {DCBA}
    VPXOR    XTMP3, XTMP4, XTMP3;          \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {DCBA})
    VPXOR    XTMP3, XTMP1, XTMP1;          \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {DCBA}) XOR (XTMP4 rol 23 {DCBA})

    // 4 words message schedule result
    VPXOR    XTMP1, XTMP0, XDWORD0;          \ // XDWORD0 = {W[3], W[2], W[1], W[0]}

我目前还没有通过 instruction bytes 反查出对应指令的方法,需要一些时间。

评估是VPALIGNR,我再确认下。
image

有点奇怪,从文档上,VPALIGNR确实是AVX支持的,但其它指令更不可能,试一下吧。

Asked for help on Stack Overflow: https://stackoverflow.com/questions/77169432/how-to-get-related-instruction-code-from-instruction-bytes

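@zhangyongding 帮忙拉最新的代码测试一下,谢谢!根据 https://stackoverflow.com/questions/77169432/how-to-get-related-instruction-code-from-instruction-bytes 的回答,出错的指令字节(0xc4 0x42 0x68 0xf2 0xea)对应的是 ANDN 指令,它属于 BMI1 指令集而不是 AVX,所以在只支持 AVX 的 CPU 上会触发 SIGILL。我已经在最新的代码中去除了 ANDN 的使用。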

测试没有问题

fixed in v0.21.2.