ysh329 / OpenCL-101

Learn OpenCL step by step.

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

elementwise_mul

ysh329 opened this issue · comments

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Element-wise product of two half-precision images.
//
// Each work-item handles the texel at (get_global_id(0), get_global_id(1)):
// it reads that texel from `input` and from `bias`, multiplies the two half4
// values component-wise and writes the result to the same position in
// `outputImage`.
//
//   input       - image read at the work-item's coordinates
//   bias        - image read at the same coordinates as `input`
//   outputImage - image receiving the product
//
// Image arguments carry only an access qualifier: image objects always live
// in global memory, so no address space qualifier is given for them.
__kernel void elementwise_mul(__read_only image2d_t input,
                              __read_only image2d_t bias,
                              __write_only image2d_t outputImage) {
  // read_imageh with integer coordinates is only defined for a sampler using
  // unnormalized coordinates and nearest filtering.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 coords = (int2)(get_global_id(0), get_global_id(1));

  half4 in = read_imageh(input, sampler, coords);
  half4 biase = read_imageh(bias, sampler, coords);
  half4 output = in * biase;
  write_imageh(outputImage, coords, output);
}

// Multiplies every texel of `input` by a per-column-group texel of `bias`.
//
// The work-item at (x, y) reads `input` at (x, y) and `bias` at (x / w, 0),
// multiplies the two half4 values component-wise and writes the product to
// `outputImage` at (x, y).
//
//   input       - image read at the work-item's coordinates
//   bias        - image of which only row 0 is read
//   outputImage - image receiving the product
//   w           - divisor mapping an output column to a bias column; must be
//                 non-zero. Presumably the image width of one channel block -
//                 confirm against the host code.
//
// Image arguments carry only an access qualifier: image objects always live
// in global memory, so no address space qualifier is given for them.
__kernel void channel_mul(__read_only image2d_t input,
                          __read_only image2d_t bias,
                          __write_only image2d_t outputImage, int w) {
  int x = get_global_id(0);
  int y = get_global_id(1);

  // read_imageh with integer coordinates is only defined for a sampler using
  // unnormalized coordinates and nearest filtering.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 coords = (int2)(x, y);
  // w consecutive output columns share one bias texel from row 0.
  const int2 coords_bias = (int2)(x / w, 0);

  half4 in = read_imageh(input, sampler, coords);
  half4 biase = read_imageh(bias, sampler, coords_bias);
  half4 output = in * biase;
  write_imageh(outputImage, coords, output);
}

// Channel-wise multiply where each bias texel carries a single scalar in .x.
//
// Example from the original note: bias dims "1 1 1 72"; at run time the bias
// image holds [value, 0, 0, 0] in each of 72 texels (NOTE(review): layout
// taken from the original comment, not visible in this kernel - confirm
// against the host code).
//
// The work-item at (x, y) gathers the .x component of the four consecutive
// bias texels starting at column (x / w) * 4 in row 0, packs them into one
// half4, multiplies it component-wise with the `input` texel at (x, y) and
// writes the product to `outputImage` at (x, y).
//
//   input       - image read at the work-item's coordinates
//   bias        - image of which only row 0 is read, one scalar per texel
//   outputImage - image receiving the product
//   w           - divisor mapping an output column to a group of four bias
//                 texels; must be non-zero
//
// Image arguments carry only an access qualifier: image objects always live
// in global memory, so no address space qualifier is given for them.
__kernel void channel_mul_d2(__read_only image2d_t input,
                             __read_only image2d_t bias,
                             __write_only image2d_t outputImage, int w) {
  int x = get_global_id(0);
  int y = get_global_id(1);

  // read_imageh with integer coordinates is only defined for a sampler using
  // unnormalized coordinates and nearest filtering.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 coords = (int2)(x, y);

  // First of the four bias columns that feed this output column.
  const int bias_base = x / w * 4;

  half4 biase0 = read_imageh(bias, sampler, (int2)(bias_base, 0));
  half4 biase1 = read_imageh(bias, sampler, (int2)(bias_base + 1, 0));
  half4 biase2 = read_imageh(bias, sampler, (int2)(bias_base + 2, 0));
  half4 biase3 = read_imageh(bias, sampler, (int2)(bias_base + 3, 0));

  // Only the .x lane of each bias texel is used; pack four of them together.
  half4 biase = (half4)(biase0.x, biase1.x, biase2.x, biase3.x);
  half4 in = read_imageh(input, sampler, coords);
  // Plain component-wise product; no additive term is needed.
  half4 output = in * biase;
  write_imageh(outputImage, coords, output);
}

// Multiplies every texel of `input` by a texel taken from row 0 of `bias`,
// using all four components of the bias texel.
//
// The work-item at (x, y) reads `input` at (x, y) and `bias` at (x / w, 0),
// multiplies the two half4 values component-wise and writes the product to
// `outputImage` at (x, y).
//
//   input       - image read at the work-item's coordinates
//   bias        - image of which only row 0 is read
//   outputImage - image receiving the product
//   w           - divisor mapping an output column to a bias column; must be
//                 non-zero
//
// Image arguments carry only an access qualifier: image objects always live
// in global memory, so no address space qualifier is given for them.
__kernel void channel_mul_d4(__read_only image2d_t input,
                             __read_only image2d_t bias,
                             __write_only image2d_t outputImage, int w) {
  int x = get_global_id(0);
  int y = get_global_id(1);

  // read_imageh with integer coordinates is only defined for a sampler using
  // unnormalized coordinates and nearest filtering.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 coords = (int2)(x, y);
  // w consecutive output columns share one bias texel from row 0.
  const int2 coords_bias = (int2)(x / w, 0);

  half4 in = read_imageh(input, sampler, coords);
  half4 biase = read_imageh(bias, sampler, coords_bias);
  half4 output = in * biase;
  write_imageh(outputImage, coords, output);
}