Performance difference between two write methods in kernel
ysh329 opened this issue · comments
There are two ways to write the results in the kernel function. Is there any performance difference between them?
First
// Method 1: each component is stored by dereferencing an explicitly cast
// __global CL_INPUT_TYPE pointer computed from c plus the element offset.
*(__global CL_INPUT_TYPE *)(c + row * N + col) = cc1.s0; *(__global CL_INPUT_TYPE *)(c + row * N + (col+1)) = cc1.s1;
*(__global CL_INPUT_TYPE *)(c + (row+1) * N + col) = cc2.s0; *(__global CL_INPUT_TYPE *)(c + (row+1) * N + (col+1)) = cc2.s1;
Second
// Method 2: each component is stored with plain array indexing on c,
// targeting the same four offsets as method 1.
c[row * N + col] = cc1.s0; c[row * N + (col+1)] = cc1.s1;
c[(row+1) * N + col] = cc2.s0; c[(row+1) * N + (col+1)] = cc2.s1;
Complete Code
This is from a matrix multiplication implementation; its complete kernel code is below:
/*
 * Matrix multiplication kernel in which each work-item produces a 2x2 tile
 * of c.
 *
 * Indexing used by the body: a[row * K + p], b[p * N + col], c[row * N + col].
 * The tile's top-left element is at (row, col), where row and col are twice
 * the work-item's global id in dimensions 1 and 0 respectively.
 * M is accepted but never referenced inside the kernel.
 *
 * Inputs are read through pointers cast to CL_ELEM_TYPE, whose .s0/.s1
 * components are consumed, and the reduction index advances by 2 per
 * iteration.
 *
 * NOTE(review): there are no bounds checks; this presumably requires N and K
 * to be even, a global size matching the tile grid, and addresses suitably
 * aligned for CL_ELEM_TYPE -- confirm against the host code.
 */
__kernel void mat_mult_vec2x2_continue(const int M, const int N, const int K, __global const CL_INPUT_TYPE *a, __global const CL_INPUT_TYPE *b, __global CL_INPUT_TYPE *c) {
    const int col = get_global_id(0) << 1;
    const int row = get_global_id(1) << 1;

    /* Start of the two rows of a that feed this tile. */
    __global const CL_INPUT_TYPE *a_top_row = a + row * K;
    __global const CL_INPUT_TYPE *a_bot_row = a + (row + 1) * K;

    /* Running sums for the upper and lower output rows of the tile. */
    CL_ELEM_TYPE acc_top = 0;
    CL_ELEM_TYPE acc_bot = 0;

    for (int p = 0; p < K; p += 2) {
        /* Two consecutive elements from each of the two rows of a. */
        const CL_ELEM_TYPE a_top = *((__global CL_ELEM_TYPE *)(a_top_row + p));
        const CL_ELEM_TYPE a_bot = *((__global CL_ELEM_TYPE *)(a_bot_row + p));

        /* Two consecutive elements from rows p and p+1 of b. */
        const CL_ELEM_TYPE b_first = *((__global CL_ELEM_TYPE *)(b + p * N + col));
        const CL_ELEM_TYPE b_second = *((__global CL_ELEM_TYPE *)(b + (p + 1) * N + col));

        /* Component-wise 2x2 product, accumulated in the original order. */
        acc_top.s0 += a_top.s0 * b_first.s0 + a_top.s1 * b_second.s0;
        acc_top.s1 += a_top.s0 * b_first.s1 + a_top.s1 * b_second.s1;
        acc_bot.s0 += a_bot.s0 * b_first.s0 + a_bot.s1 * b_second.s0;
        acc_bot.s1 += a_bot.s0 * b_first.s1 + a_bot.s1 * b_second.s1;
    }

    /* Scalar stores of the four tile elements, upper row first. */
    __global CL_INPUT_TYPE *c_top = c + row * N + col;
    __global CL_INPUT_TYPE *c_bot = c + (row + 1) * N + col;
    c_top[0] = acc_top.s0;
    c_top[1] = acc_top.s1;
    c_bot[0] = acc_bot.s0;
    c_bot[1] = acc_bot.s1;
}
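In fact, there is no difference between them. This is expected: `c` is already declared as `__global CL_INPUT_TYPE *`, so the cast in the first method does not change the pointer type and both forms store to the same addresses. I tested different sizes, including 128x128x128, 512x512x512, and 1024x1024x1024. The execution time of the kernel is the same.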
512x512x512
First Method
============== GPU RESULT ==============
>>> [INFO] Device name: Mali-T86x MP4 r2p0 0x0860
>>> [INFO] program_file: ./vec2.cl, kernel_func: mat_mult_vec2x2_continue
>>> [INFO] global_work_size[3]: { 256, 256, 1 }
>>> [WARN] global work size (65536) is smaller than task size (262144)
>>> [INFO] CL_GPU 10 times ./vec2.cl.mat_mult_vec2x2_continue starting ...
0 0.049194
>>> [INFO] skip first 1 time(s)
1 0.049066
2 0.048693
3 0.049986
4 0.048974
5 0.049136
6 0.049800
7 0.049019
8 0.048965
9 0.049934
10 0.048857
>>> [INFO] CL_GPU 512x512x512 0.049243 s 5.451241 GFLOPS
>>> [TEST] correct rate: 1.0000
>>> [TEST] ~ Bingo ~ matrix a == matrix b
Second Method
============== GPU RESULT ==============
>>> [INFO] Device name: Mali-T86x MP4 r2p0 0x0860
>>> [INFO] program_file: ./vec2.cl, kernel_func: mat_mult_vec2x2_continue
>>> [INFO] global_work_size[3]: { 256, 256, 1 }
>>> [WARN] global work size (65536) is smaller than task size (262144)
>>> [INFO] CL_GPU 10 times ./vec2.cl.mat_mult_vec2x2_continue starting ...
0 0.049859
>>> [INFO] skip first 1 time(s)
1 0.052595
2 0.050000
3 0.049764
4 0.049456
5 0.048726
6 0.049554
7 0.048849
8 0.048720
9 0.048496
10 0.049462
>>> [INFO] CL_GPU 512x512x512 0.049562 s 5.416133 GFLOPS
>>> [TEST] correct rate: 1.0000
>>> [TEST] ~ Bingo ~ matrix a == matrix b
1024x1024x1024
First Method
============== GPU RESULT ==============
>>> [INFO] Device name: Mali-T86x MP4 r2p0 0x0860
>>> [INFO] program_file: ./vec2.cl, kernel_func: mat_mult_vec2x2_continue
>>> [INFO] global_work_size[3]: { 512, 512, 1 }
>>> [WARN] global work size (262144) is smaller than task size (1048576)
>>> [INFO] CL_GPU 10 times ./vec2.cl.mat_mult_vec2x2_continue starting ...
0 0.459874
>>> [INFO] skip first 1 time(s)
1 0.455607
2 0.456987
3 0.457147
4 0.468058
5 0.457499
6 0.456280
7 0.459129
8 0.456129
9 0.456616
10 0.457166
>>> [INFO] CL_GPU 1024x1024x1024 0.458062 s 4.688196 GFLOPS
>>> [TEST] correct rate: 1.0000
>>> [TEST] ~ Bingo ~ matrix a == matrix b
Second Method
============== GPU RESULT ==============
>>> [INFO] Device name: Mali-T86x MP4 r2p0 0x0860
>>> [INFO] program_file: ./vec2.cl, kernel_func: mat_mult_vec2x2_continue
>>> [INFO] global_work_size[3]: { 512, 512, 1 }
>>> [WARN] global work size (262144) is smaller than task size (1048576)
>>> [INFO] CL_GPU 10 times ./vec2.cl.mat_mult_vec2x2_continue starting ...
0 0.457250
>>> [INFO] skip first 1 time(s)
1 0.453836
2 0.455416
3 0.454867
4 0.454100
5 0.454240
6 0.454772
7 0.456201
8 0.455817
9 0.461128
10 0.456089
>>> [INFO] CL_GPU 1024x1024x1024 0.455647 s 4.713047 GFLOPS
>>> [TEST] correct rate: 1.0000
>>> [TEST] ~ Bingo ~ matrix a == matrix b
The first (pointer-cast) method is still more convenient, because casting to the vector type lets a whole vector variable be written with one statement, reducing the number of writes:
non-vector variables
// Element-wise form: the 4x4 tile held in cc1..cc4 is written one component
// at a time, i.e. sixteen separate scalar assignments.
c[row * N + col] = cc1.s0; c[row * N + (col+1)] = cc1.s1; c[row * N + (col+2)] = cc1.s2; c[row * N + (col+3)] = cc1.s3;
c[(row+1) * N + col] = cc2.s0; c[(row+1) * N + (col+1)] = cc2.s1; c[(row+1) * N + (col+2)] = cc2.s2; c[(row+1) * N + (col+3)] = cc2.s3;
c[(row+2) * N + col] = cc3.s0; c[(row+2) * N + (col+1)] = cc3.s1; c[(row+2) * N + (col+2)] = cc3.s2; c[(row+2) * N + (col+3)] = cc3.s3;
c[(row+3) * N + col] = cc4.s0; c[(row+3) * N + (col+1)] = cc4.s1; c[(row+3) * N + (col+2)] = cc4.s2; c[(row+3) * N + (col+3)] = cc4.s3;
vector variables
// Vector form: each of cc1..cc4 is written with a single assignment through
// a pointer cast to CL_ELEM_TYPE, i.e. four statements in total.
// NOTE(review): presumably CL_ELEM_TYPE is a 4-component vector here and each
// target address is suitably aligned for it -- confirm before relying on this.
*(__global CL_ELEM_TYPE *)(c + row * N + col) = cc1;
*(__global CL_ELEM_TYPE *)(c + (row+1) * N + col) = cc2;
*(__global CL_ELEM_TYPE *)(c + (row+2) * N + col) = cc3;
*(__global CL_ELEM_TYPE *)(c + (row+3) * N + col) = cc4;