Suggestion for vectorizing MaxPool and Convolution Layer
Mousavikia opened this issue · comments
Hi everyone... I have been working on writing a CNN (convolutional neural network) code in vector intrinsic mode and it is finished but after examining my code I have found that convolution and max pool layers in vector mode are giving really bad latency and are much worse than their scaler counterpart...what I have done in vector mode is this:
I first sorted the input vector (sth like im2col) to have ordered input for my main convolution and max pool layer, and then since my filters are 2*2
in convolution and max pool I utilized vl=4
and then I used vwmul
to multiply and vredsum
to get the final answer of each convolution window (for max pool just vredmax
).... Following is the code for my convolution:
Here is the sortging function(im2col):
void vec_Conv2d_in(signed char* in, signed short int inSizeCol, signed char* out, signed short int outSizeCol, int outSize) {
int i = 0;
int j = 0;
int k = 0;
int f = 1;
signed short int m = outSizeCol;
while (outSize != 0) {
//////////////////
out[j] = in[i];
j++;
i++;
out[j] = in[i];
j++;
i = i + inSizeCol - 1;
out[j] = in[i];
j++;
i++;
out[j] = in[i];
j++;
///////////////////
m -= 2;
if (m != 0) {
k += 1;
i = k;
}
else {
i = inSizeCol * f;
f++;
k = i;
m = outSizeCol;
}
outSize -= 4;
}
}
And here is the main convolution:
void vec_conv2d(signed char* in, signed char* w1, signed char* w2, signed char* w3, signed char* w4, signed short int* out1, signed short int* out2, signed short int* out3, signed short int* out4, signed short int outSize) {
signed short int j = 0;
while (outSize != 0) {
size_t vl = vsetvl_e8m1(4);
vint8m1_t vin = vle8_v_i8m1(in, vl);
vint8m1_t vwc1 = vle8_v_i8m1(w1, vl);
vint8m1_t vwc2 = vle8_v_i8m1(w2, vl);
vint8m1_t vwc3 = vle8_v_i8m1(w3, vl);
vint8m1_t vwc4 = vle8_v_i8m1(w4, vl);
vint16m2_t vout1 = vwmul_vv_i16m2(vin, vwc1, vl);
vint16m1_t vec_zero1 = vmv_v_x_i16m1(0, vl);
vint16m1_t vec_sum1 = vredsum_vs_i16m2_i16m1(vec_zero1, vout1, vec_zero1, vl);
out1[j] = vmv_x_s_i16m1_i16(vec_sum1);
vint16m2_t vout2 = vwmul_vv_i16m2(vin, vwc2, vl);
vint16m1_t vec_zero2 = vmv_v_x_i16m1(0, vl);
vint16m1_t vec_sum2 = vredsum_vs_i16m2_i16m1(vec_zero2, vout2, vec_zero2, vl);
out2[j] = vmv_x_s_i16m1_i16(vec_sum2);
vint16m2_t vout3 = vwmul_vv_i16m2(vin, vwc3, vl);
vint16m1_t vec_zero3 = vmv_v_x_i16m1(0, vl);
vint16m1_t vec_sum3 = vredsum_vs_i16m2_i16m1(vec_zero3, vout3, vec_zero3, vl);
out3[j] = vmv_x_s_i16m1_i16(vec_sum3);
vint16m2_t vout4 = vwmul_vv_i16m2(vin, vwc4, vl);
vint16m1_t vec_zero4 = vmv_v_x_i16m1(0, vl);
vint16m1_t vec_sum4 = vredsum_vs_i16m2_i16m1(vec_zero4, vout4, vec_zero4, vl);
out4[j] = vmv_x_s_i16m1_i16(vec_sum4);
in += vl;
outSize -= 1;
j++;
}
}
The main function:
int main() {
long time1, time2;
time1 = get_mcycle();
signed char x[10000] = { ..... };
signed char xp[39204]; //39204 = 99*99*4
vec_Conv2d_in(x, 100, xp, 198, 39204); //198 = 99*2
signed char wc11[4] = { 0b11110110,0b11011101,0b00100111,0b11110100 };
signed char wc12[4] = { 0b00101110,0b11101011,0b11110011,0b11110100 };
signed char wc13[4] = { 0b00011101,0b00001011,0b11100100,0b11011100 };
signed char wc14[4] = { 0b00011001,0b00100110,0b11110100,0b00000100 };
signed short int oc11[9801]; //9801 = 99*99
signed short int oc12[9801]; //9801 = 99*99
signed short int oc13[9801]; //9801 = 99*99
signed short int oc14[9801]; //9801 = 99*99
vec_conv2d(xp, wc11, wc12, wc13, wc14, oc11, oc12, oc13, oc14, 9801);
time2 = get_mcycle();
uart_printf("The operation took %d cycles\n\r", time2 - time1);
asm volatile("jr x0;");
return 0;
}
Note: Input is 100*100
matrix flattened to vector, the output will be 98*98
since the filter is 2*2
and stride is 1
Since it is worse than the scaler version, can you suggest a better coding algorithm for the vector convolution and max pool?
Hi @michael-platzer ... regarding the above issue I actually tried vector indexed load to eliminate the sorting function and hopefully get better letancy...
The code for maxpooling is sth like this:
void vec_maxPool(signed char* in1, signed short int inSizeCol, signed char* out1, signed short int n){
signed short int j = 0;
unsigned short int stride = 2;
unsigned short int index[4] = { 0,1,inSizeCol,inSizeCol + 1 };
size_t vl = vsetvl_e8m1(4);
vuint16m2_t vindex = vle16_v_u16m2((const uint16_t*)index, vl);
while (n > 0) {
vint8m1_t vin1 = vloxei16_v_i8m1(in1, vindex, vl);
vint8m1_t vec_zero1 = vmv_v_x_i8m1(0, vl);
vint8m1_t vec_max1 = vredmax_vs_i8m1_i8m1(vec_zero1, vin1, vec_zero1, vl);
out1[j] = vmv_x_s_i8m1_i8(vec_max1);
vindex = vadd_vx_u16m2(vindex, stride, vl);
n -= 1;
j++;
}
}
The code is not complete but at least for first element it should work.
I tested this with the following:
signed char in1[16] = { 11,2,1,3,76,19,22,0,18,0,1,90,42,10,0,11 }; //4*4 matrix flattened to 16 vector stride is 2 so output will be n=2*2
vec_maxPool(in1, 4, om1v, 4);
The ouput:
11,76,90,42
It is like normal loading 4 element and then vredmax
ing the numbers....
11,2,1,3 ....>11
76,19,22,0 ....>76
18,0,1,90 ....>90
42,10,0,11 ....> 42
I tried other inputs and no index load is happening...
It is just like a normal load but I have written indexed load... Can you see my mistake?