opencv / opencv

Open Source Computer Vision Library

Home Page: https://opencv.org

cuda_OpticalFlowDual_TVL1 is not thread-safe in python

daniel-code opened this issue · comments

commented
System information (version)
  • OpenCV => 4.3.0
  • Operating System / Platform => Ubuntu
  • Compiler => gcc
  • Cuda => 10.0
  • GPU => NVIDIA RTX 2080 Ti
Detailed description

I ran the same code twice and found that cuda_OpticalFlowDual_TVL1 produced different results when calculating optical flow with Python multithreading.

The differences between the two optical-flow results for the same video appear to be frame-based: some frames in the same video produce consistent results and some do not.

This problem does not appear when using a single thread.

Steps to reproduce
import cv2
from threading import Thread
import numpy as np


def job(video_path):
    optical_flow = cv2.cuda_OpticalFlowDual_TVL1.create()
    video_capture = cv2.VideoCapture(video_path)
    _, prev_frame = video_capture.read()
    prev_frame = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    output = []
    for i in range(10):
        _, current_frame = video_capture.read()
        current_frame = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
        cuMat1 = cv2.cuda_GpuMat()
        cuMat2 = cv2.cuda_GpuMat()
        cuMat1.upload(prev_frame)
        cuMat2.upload(current_frame)
        cu_flow = optical_flow.calc(cuMat1, cuMat2, None)
        optical_flow_data = cu_flow.download()
        output.append(optical_flow_data)
        prev_frame = current_frame
    np.save('{}.npy'.format(video_path[:-4]), output)


if __name__ == '__main__':
    video_path_list = ['video1.avi',
                       'video2.avi',
                       'video3.avi']
    worker_list = []
    for i in range(3):
        t = Thread(target=job, kwargs={'video_path': video_path_list[i]})
        t.start()
        worker_list.append(t)
    for worker in worker_list:
        worker.join()
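
To check which frames actually differ between two runs of the script above, the saved arrays can be compared element-wise. A minimal sketch (the filenames video1_run1.npy and video1_run2.npy are hypothetical; the script would have to be run twice and its output renamed in between):

import numpy as np

# Two separate runs of the reproducer, saved under different (hypothetical) names.
run1 = np.load('video1_run1.npy')
run2 = np.load('video1_run2.npy')

# Report, per frame, whether the two optical-flow results match.
for idx, (a, b) in enumerate(zip(run1, run2)):
    status = 'consistent' if np.allclose(a, b) else 'DIFFERENT'
    print('frame {}: {}'.format(idx, status))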
Issue submission checklist
  • I report the issue, it's not a question
  • I checked the problem with documentation, FAQ, open issues,
    answers.opencv.org, Stack Overflow, etc and have not found solution
  • I updated to latest OpenCV version and the issue is still there
  • There is reproducer code and related data files: videos, images, onnx, etc

Please try the latest release.
Should be fixed here: #17556 (released with OpenCV 3.4.11 / 4.4.0)
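
As a quick sanity check that the updated build is the one Python actually loads (a minimal sketch using standard OpenCV attributes; worth running inside the same virtual environment as the reproducer):

import cv2

# Version of the cv2 module that is actually imported.
print(cv2.__version__)

# Number of CUDA devices visible to this build (0 means a non-CUDA build).
print(cv2.cuda.getCudaEnabledDeviceCount())

# Full build information, including the CUDA toolkit version it was compiled against.
print(cv2.getBuildInformation())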

commented

I have updated my OpenCV to 4.4.0, but unfortunately the problem still occurs with the above code.
I also tried using a Python thread lock to serialize the data upload and calculation, but the problem remains.
The frames that differ within the same video also seem to differ between v4.3.0 and v4.4.0.

import cv2
from threading import Thread, Lock
import numpy as np


def job(video_path, lock: Lock):
    optical_flow = cv2.cuda_OpticalFlowDual_TVL1.create()
    video_capture = cv2.VideoCapture(video_path)
    _, prev_frame = video_capture.read()
    prev_frame = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    output = []
    for i in range(10):
        _, current_frame = video_capture.read()
        current_frame = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
        # add thread lock
        lock.acquire()
        try:
            cuMat1 = cv2.cuda_GpuMat()
            cuMat2 = cv2.cuda_GpuMat()
            cuMat1.upload(prev_frame)
            cuMat2.upload(current_frame)
            cu_flow = optical_flow.calc(cuMat1, cuMat2, None)
            optical_flow_data = cu_flow.download()
        finally:
            lock.release()
        output.append(optical_flow_data)
        prev_frame = current_frame
    np.save('{}.npy'.format(video_path[:-4]), output)

if __name__ == '__main__':
    video_path_list = ['video1.avi',
                       'video2.avi',
                       'video3.avi']
    worker_list = []
    lock = Lock()
    for i in range(3):
        t = Thread(target=job, kwargs={'video_path': video_path_list[i], 'lock': lock})
        t.start()
        worker_list.append(t)
    for worker in worker_list:
        worker.join()

Updated environment settings

  • OpenCV => 4.3.0 -> 4.4.0
  • Cuda => 10.0 -> 11.0

@nglee Do you have a chance to take a look at this?

@daniel-code
I have slightly modified your test code. It compares the results from synchronous launches and asynchronous runs.
You'll have to use cv2.cuda_Stream() when you call optical_flow.calc().

Test Code

import cv2
from threading import Thread
import numpy as np

def job(video_path, output):
    optical_flow = cv2.cuda_OpticalFlowDual_TVL1.create()
    video_capture = cv2.VideoCapture(video_path)
    _, prev_frame = video_capture.read()
    prev_frame = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    for i in range(10):
        _, current_frame = video_capture.read()
        current_frame = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)

        cuMat1 = cv2.cuda_GpuMat()
        cuMat2 = cv2.cuda_GpuMat()
        cuMat1.upload(prev_frame)
        cuMat2.upload(current_frame)
        cu_flow = optical_flow.calc(cuMat1, cuMat2, None, cv2.cuda_Stream())
        optical_flow_data = cu_flow.download()

        output.append(optical_flow_data)
        prev_frame = current_frame

if __name__ == '__main__':
    video_path_list = ['E:/repos/opencv_extra/testdata/gpu/video/768x576.avi',
                       'E:/repos/opencv_extra/testdata/gpu/video/1920x1080.avi',
                       'E:/repos/opencv_extra/testdata/highgui/video/big_buck_bunny.mp4']

    # synchronous launch
    out0 = []
    out1 = []
    out2 = []
    
    job(video_path_list[0], out0)
    job(video_path_list[1], out1)
    job(video_path_list[2], out2)
    
    print('synchronous run complete')
    
    # asynchronous launch
    tout0 = []
    tout1 = []
    tout2 = []

    t1 = Thread(target=job, kwargs={'video_path': video_path_list[0], 'output': tout0})
    t1.start()
    t2 = Thread(target=job, kwargs={'video_path': video_path_list[1], 'output': tout1})
    t2.start()
    t3 = Thread(target=job, kwargs={'video_path': video_path_list[2], 'output': tout2})
    t3.start()

    t1.join()
    t2.join()
    t3.join()
    
    print('asynchronous run complete')
    
    # compare synchronous and asynchronous result
    print(np.array_equal(out0, tout0))
    print(np.array_equal(out1, tout1))
    print(np.array_equal(out2, tout2))
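
As a side note (my own variation, not required by the fix and untested here): instead of constructing a new cv2.cuda_Stream() on every call, each worker could create one non-default stream up front and reuse it for the uploads, the flow computation, and the download. A sketch of the modified job function, with imports as in the test code above:

def job(video_path, output):
    optical_flow = cv2.cuda_OpticalFlowDual_TVL1.create()
    stream = cv2.cuda_Stream()  # one non-default stream per worker thread
    video_capture = cv2.VideoCapture(video_path)
    _, prev_frame = video_capture.read()
    prev_frame = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    for i in range(10):
        _, current_frame = video_capture.read()
        current_frame = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
        cuMat1 = cv2.cuda_GpuMat()
        cuMat2 = cv2.cuda_GpuMat()
        cuMat1.upload(prev_frame, stream)
        cuMat2.upload(current_frame, stream)
        cu_flow = optical_flow.calc(cuMat1, cuMat2, None, stream)
        optical_flow_data = cu_flow.download(stream)
        stream.waitForCompletion()  # ensure the asynchronous download has finished
        output.append(optical_flow_data)
        prev_frame = current_frame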

Test Code in C++

void helper(const string& path, vector<Mat>* out_vec)
{
    cv::VideoCapture capture(path);

    cv::Mat _prev;
    capture.read(_prev);

    cv::cuda::HostMem prev(_prev.size(), CV_8UC1);
    cv::cuda::HostMem cur(_prev.size(), CV_8UC1);

    cv::cvtColor(_prev, prev.createMatHeader(), cv::COLOR_BGR2GRAY);

    auto alg = cv::cuda::OpticalFlowDual_TVL1::create();
    cv::cuda::Stream stream;

    cv::cuda::GpuMat d_prev;
    d_prev.upload(prev, stream);

    for (int i = 0; i < 90; ++i)
    {
        cv::Mat _cur;
        capture.read(_cur);
        cv::cvtColor(_cur, cur.createMatHeader(), cv::COLOR_BGR2GRAY);

        cv::cuda::GpuMat d_cur;
        d_cur.upload(cur, stream);

        cv::cuda::GpuMat d_out;
        alg->calc(d_prev, d_cur, d_out, stream);

        cv::cuda::HostMem out;

        d_out.download(out, stream);

        stream.waitForCompletion();

        out_vec->push_back(out.createMatHeader().clone());

        d_prev = d_cur;
    }
}

TEST(OpticalFlowDual_TVL1_Issue, Issue18155)
{
    vector<string> video_path_list;
    video_path_list.emplace_back("E:/repos/opencv_extra/testdata/gpu/video/768x576.avi");
    video_path_list.emplace_back("E:/repos/opencv_extra/testdata/gpu/video/1920x1080.avi");
    video_path_list.emplace_back("E:/repos/opencv_extra/testdata/highgui/video/big_buck_bunny.mp4");

    // synchronous run

    vector<Mat> t1_sync;
    vector<Mat> t2_sync;
    vector<Mat> t3_sync;

    auto start = std::chrono::high_resolution_clock::now();
    {
        helper(video_path_list[0], &t1_sync);
        helper(video_path_list[1], &t2_sync);
        helper(video_path_list[2], &t3_sync);
    }
    auto end = std::chrono::high_resolution_clock::now();

    cout << "Synchronous run complete (" << std::to_string(std::chrono::duration<float, std::milli>(end - start).count()) << " ms)" << std::endl;

    // asynchronous run
    vector<Mat> t1_async;
    vector<Mat> t2_async;
    vector<Mat> t3_async;

    start = std::chrono::high_resolution_clock::now();
    {
        std::thread thread1(helper, video_path_list[0], &t1_async);
        std::thread thread2(helper, video_path_list[1], &t2_async);
        std::thread thread3(helper, video_path_list[2], &t3_async);

        thread1.join();
        thread2.join();
        thread3.join();
    }
    end = std::chrono::high_resolution_clock::now();

    cout << "All threads complete (Asynchronous run complete) (" << std::to_string(std::chrono::duration<float, std::milli>(end - start).count()) << " ms)" << std::endl;

    std::cout << std::to_string(t1_sync.size()) << std::endl;
    std::cout << std::to_string(t2_sync.size()) << std::endl;
    std::cout << std::to_string(t3_sync.size()) << std::endl;
    std::cout << std::to_string(t1_async.size()) << std::endl;
    std::cout << std::to_string(t2_async.size()) << std::endl;
    std::cout << std::to_string(t3_async.size()) << std::endl;

    for (int i = 0; i < t1_sync.size(); ++i)
        EXPECT_MAT_NEAR(t1_sync[i], t1_async[i], 0.0);
    for (int i = 0; i < t2_sync.size(); ++i)
        EXPECT_MAT_NEAR(t2_sync[i], t2_async[i], 0.0);
    for (int i = 0; i < t3_sync.size(); ++i)
        EXPECT_MAT_NEAR(t3_sync[i], t3_async[i], 0.0);
}

commented

Thank you for your support.
I will try the code on Monday and report the result.

commented

It runs correctly in v4.4.0.
I found that my virtual environment was still linked to the previous version (4.3), so it is thread-safe in v4.4.0.
Thank you for the support and your time.