Why do more threads take longer?
csrdxbb opened this issue · comments
Today I tested NNPACK on an ARMv8 machine, and I found that when I increase the thread count, the elapsed time also increases.
I am very confused and not sure what the problem is; the script looks like this:
#include <iostream>
#include <sys/time.h>
#include <vector>
#include "nnpack.h"
using namespace std;
float test_nnpack(size_t bs, size_t threads)
{
enum nnp_status init_status = nnp_initialize();
if (init_status != nnp_status_success)
{
return 0;
}
enum nnp_convolution_algorithm algorithm;
enum nnp_convolution_transform_strategy strategy=nnp_convolution_transform_strategy_tuple_based;
const size_t batch_size = 1;
const size_t input_channels = 16;
const size_t output_channels = 16;
const size_t kernel_num = 3;
const struct nnp_padding input_padding = {1, 1, 1, 1};
const struct nnp_size input_size = {224, 224};
const struct nnp_size kernel_size = {3, 3};
const struct nnp_size stride = {.width=1, .height=1};
const struct nnp_size output_size = {
.width = (input_padding.left + input_size.width + input_padding.right - kernel_size.width)/stride.width + 1,
.height = (input_padding.top + input_size.height + input_padding.bottom - kernel_size.height)/stride.height + 1
};
float *input, *kernel, *output, *bias;
input = (float *)malloc(batch_size * input_channels * input_size.height * input_size.width * sizeof(float));
kernel = (float *)malloc(input_channels * output_channels * kernel_size.height * kernel_size.width * sizeof(float));
output = (float *)malloc(batch_size * output_channels * output_size.height * output_size.width * sizeof(float));
bias = (float *)malloc(output_channels * sizeof(float));
pthreadpool_t threadpool = nullptr;
if (true) {
threadpool = pthreadpool_create(32);
printf("Threads: %zu\n", pthreadpool_get_threads_count(threadpool));
}
struct nnp_profile computation_profile;
int i, j, c, iter;
struct timeval start, end;
for (c = 0; c < input_channels; c++)
{
for (i = 0; i < input_size.height; i++)
{
for (j = 0; j < input_size.width; j++)
{
input[c * input_size.height * input_size.width + i * input_size.width + j] = (i * input_size.width + j) * 0.1;
}
}
}
for(i = 0; i < output_channels; i++)
{
for (j = 0; j < input_channels * kernel_size.height * kernel_size.width; j++)
{
kernel[i * input_channels * kernel_size.height * kernel_size.width + j] = 0.1;
}
}
for (i = 0; i < output_channels; i++)
{
bias[i] = 1.0;
}
iter = 1;
gettimeofday(&start, nullptr);
for (i = 0; i < iter; i++)
{
algorithm = nnp_convolution_algorithm_wt8x8;
nnp_convolution_output(algorithm,
batch_size,
input_channels,
output_channels,
input_size,
input_padding,
kernel_size,
input,
kernel,
bias,
output,
threadpool,
nullptr);
}
gettimeofday(&end, nullptr);
long second = end.tv_sec - start.tv_sec;
long usecond = end.tv_usec - start.tv_usec;
float mtime = (second * 1000 + usecond / 1000.0);
cout << "Winograd convolution elapsed time:" << mtime << "ms" << endl;
cout << output[10] << endl;
gettimeofday(&start, nullptr);
for (i = 0; i < iter; i++)
{
algorithm = nnp_convolution_algorithm_ft8x8;
nnp_convolution_output(algorithm,
batch_size,
input_channels,
output_channels,
input_size,
input_padding,
kernel_size,
input,
kernel,
bias,
output,
threadpool,
nullptr);
}
gettimeofday(&end, nullptr);
second = end.tv_sec - start.tv_sec;
usecond = end.tv_usec - start.tv_usec;
mtime = (second * 1000 + usecond / 1000.0);
cout << "FFT8x8 convolution elapsed time:" << mtime << "ms" << endl;
cout << output[10] << endl;
gettimeofday(&start, nullptr);
for (i = 0; i < iter; i++)
{
algorithm = nnp_convolution_algorithm_implicit_gemm;
nnp_convolution_output(algorithm,
batch_size,
input_channels,
output_channels,
input_size,
input_padding,
kernel_size,
input,
kernel,
bias,
output,
threadpool,
nullptr);
}
gettimeofday(&end, nullptr);
second = end.tv_sec - start.tv_sec;
usecond = end.tv_usec - start.tv_usec;
mtime = (second * 1000 + usecond / 1000.0);
cout << "GEMM convolution elapsed time:" << mtime << "ms" << endl;
cout << output[10] << endl;
gettimeofday(&start, nullptr);
for (i = 0; i < iter; i++)
{
algorithm = nnp_convolution_algorithm_direct;
nnp_convolution_output(algorithm,
batch_size,
input_channels,
output_channels,
input_size,
input_padding,
kernel_size,
input,
kernel,
bias,
output,
threadpool,
nullptr);
}
gettimeofday(&end, nullptr);
second = end.tv_sec - start.tv_sec;
usecond = end.tv_usec - start.tv_usec;
mtime = (second * 1000 + usecond / 1000.0);
cout << "Direct convolution elapsed time:" << mtime << "ms" << endl;
cout << output[10] << endl;
return 0;
}
/*
 * Entry point.
 * Usage: <prog> <batch_size> <threads>
 *
 * BUG FIX: the original indexed argv[1] and argv[2] unconditionally and
 * crashed with a segfault when run with fewer than two arguments.
 */
int main(int argc, char* argv[])
{
    if (argc < 3)
    {
        fprintf(stderr, "Usage: %s <batch_size> <threads>\n", argv[0]);
        return 1;
    }
    /* strtoul instead of atoi: no UB on out-of-range, and naturally unsigned. */
    const size_t batch_size = (size_t)strtoul(argv[1], nullptr, 10);
    const size_t thread_num = (size_t)strtoul(argv[2], nullptr, 10);
    test_nnpack(batch_size, thread_num);
    return 0;
}
My machine has a NUMA architecture; however, I am sure the 32 threads all run on the same node, so there is no NUMA remote-access issue.
Please tell me how to improve the time. Thanks!