Why do more threads take longer?
csrdxbb opened this issue · comments
Today I tested NNPACK on an ARMv8 machine, and I found that when I increase the thread count, the elapsed time also increases.
I am very confused and not sure what the problem is; the script looks like this:
#include <iostream>
#include <sys/time.h>
#include <vector>
#include "nnpack.h"
using namespace std;
float test_nnpack(size_t bs, size_t threads)
{
enum nnp_status init_status = nnp_initialize();
if (init_status != nnp_status_success)
{
return 0;
}
enum nnp_convolution_algorithm algorithm;
enum nnp_convolution_transform_strategy strategy=nnp_convolution_transform_strategy_tuple_based;
const size_t batch_size = 1;
const size_t input_channels = 16;
const size_t output_channels = 16;
const size_t kernel_num = 3;
const struct nnp_padding input_padding = {1, 1, 1, 1};
const struct nnp_size input_size = {224, 224};
const struct nnp_size kernel_size = {3, 3};
const struct nnp_size stride = {.width=1, .height=1};
const struct nnp_size output_size = {
.width = (input_padding.left + input_size.width + input_padding.right - kernel_size.width)/stride.width + 1,
.height = (input_padding.top + input_size.height + input_padding.bottom - kernel_size.height)/stride.height + 1
};
float *input, *kernel, *output, *bias;
input = (float *)malloc(batch_size * input_channels * input_size.height * input_size.width * sizeof(float));
kernel = (float *)malloc(input_channels * output_channels * kernel_size.height * kernel_size.width * sizeof(float));
output = (float *)malloc(batch_size * output_channels * output_size.height * output_size.width * sizeof(float));
bias = (float *)malloc(output_channels * sizeof(float));
pthreadpool_t threadpool = nullptr;
if (true) {
threadpool = pthreadpool_create(32);
printf("Threads: %zu\n", pthreadpool_get_threads_count(threadpool));
}
struct nnp_profile computation_profile;
int i, j, c, iter;
struct timeval start, end;
for (c = 0; c < input_channels; c++)
{
for (i = 0; i < input_size.height; i++)
{
for (j = 0; j < input_size.width; j++)
{
input[c * input_size.height * input_size.width + i * input_size.width + j] = (i * input_size.width + j) * 0.1;
}
}
}
for(i = 0; i < output_channels; i++)
{
for (j = 0; j < input_channels * kernel_size.height * kernel_size.width; j++)
{
kernel[i * input_channels * kernel_size.height * kernel_size.width + j] = 0.1;
}
}
for (i = 0; i < output_channels; i++)
{
bias[i] = 1.0;
}
iter = 1;
gettimeofday(&start, nullptr);
for (i = 0; i < iter; i++)
{
algorithm = nnp_convolution_algorithm_wt8x8;
nnp_convolution_output(algorithm,
batch_size,
input_channels,
output_channels,
input_size,
input_padding,
kernel_size,
input,
kernel,
bias,
output,
threadpool,
nullptr);
}
gettimeofday(&end, nullptr);
long second = end.tv_sec - start.tv_sec;
long usecond = end.tv_usec - start.tv_usec;
float mtime = (second * 1000 + usecond / 1000.0);
cout << "Winograd convolution elapsed time:" << mtime << "ms" << endl;
cout << output[10] << endl;
gettimeofday(&start, nullptr);
for (i = 0; i < iter; i++)
{
algorithm = nnp_convolution_algorithm_ft8x8;
nnp_convolution_output(algorithm,
batch_size,
input_channels,
output_channels,
input_size,
input_padding,
kernel_size,
input,
kernel,
bias,
output,
threadpool,
nullptr);
}
gettimeofday(&end, nullptr);
second = end.tv_sec - start.tv_sec;
usecond = end.tv_usec - start.tv_usec;
mtime = (second * 1000 + usecond / 1000.0);
cout << "FFT8x8 convolution elapsed time:" << mtime << "ms" << endl;
cout << output[10] << endl;
gettimeofday(&start, nullptr);
for (i = 0; i < iter; i++)
{
algorithm = nnp_convolution_algorithm_implicit_gemm;
nnp_convolution_output(algorithm,
batch_size,
input_channels,
output_channels,
input_size,
input_padding,
kernel_size,
input,
kernel,
bias,
output,
threadpool,
nullptr);
}
gettimeofday(&end, nullptr);
second = end.tv_sec - start.tv_sec;
usecond = end.tv_usec - start.tv_usec;
mtime = (second * 1000 + usecond / 1000.0);
cout << "GEMM convolution elapsed time:" << mtime << "ms" << endl;
cout << output[10] << endl;
gettimeofday(&start, nullptr);
for (i = 0; i < iter; i++)
{
algorithm = nnp_convolution_algorithm_direct;
nnp_convolution_output(algorithm,
batch_size,
input_channels,
output_channels,
input_size,
input_padding,
kernel_size,
input,
kernel,
bias,
output,
threadpool,
nullptr);
}
gettimeofday(&end, nullptr);
second = end.tv_sec - start.tv_sec;
usecond = end.tv_usec - start.tv_usec;
mtime = (second * 1000 + usecond / 1000.0);
cout << "Direct convolution elapsed time:" << mtime << "ms" << endl;
cout << output[10] << endl;
return 0;
}
/*
 * Entry point.
 * Usage: <prog> <batch_size> <threads>
 *
 * BUG FIX: the original indexed argv[1] and argv[2] unconditionally and
 * crashed with a segfault when run with fewer than two arguments.
 */
int main(int argc, char* argv[])
{
    if (argc < 3)
    {
        fprintf(stderr, "Usage: %s <batch_size> <threads>\n", argv[0]);
        return 1;
    }
    /* strtoul instead of atoi: no UB on out-of-range, and naturally unsigned. */
    const size_t batch_size = (size_t)strtoul(argv[1], nullptr, 10);
    const size_t thread_num = (size_t)strtoul(argv[2], nullptr, 10);
    test_nnpack(batch_size, thread_num);
    return 0;
}
My machine has a NUMA architecture; however, I am sure the 32 threads all run on the same node, so there is no NUMA remote-access issue.
Please tell me how to improve the time. Thanks!