HRNet / HRNet-Human-Pose-Estimation

This repo is copied from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch



About the multi-scale testing on the MPII test set: I only got 91.6% instead of the 92.3% reported in the paper.

MaxChu719 opened this issue · comments

I have implemented multi-scale testing, and I have verified that the MPII validation set accuracy is 90.75%.
I then went on to apply it to the test set, and the accuracy I got is only:

| Head | Shoulder | Elbow | Wrist | Hip  | Knee | Ankle | UBody | Total |
|------|----------|-------|-------|------|------|-------|-------|-------|
| 98.3 | 96.5     | 92.4  | 88.3  | 90.6 | 88.3 | 84.1  | 92.4  | 91.6  |

AUC: 61.6

This is not the 92.3% reported in the paper. Below is the code I used for multi-scale testing:

# Imports needed to run this standalone. The repo-internal modules follow the
# layout of deep-high-resolution-net.pytorch (with lib/ on the path);
# AverageMeter, logger and _print_name_value come from lib/core/function.py.
import multiprocessing
import os
import time

import cv2
import numpy as np
import torch
import torchvision.transforms as transforms

from core.evaluate import accuracy
from core.inference import get_final_preds
from utils.transforms import flip_back, get_affine_transform
from utils.vis import save_debug_images


def read_scaled_image(image_file, s, center, scale, image_size, COLOR_RGB, DATA_FORMAT, image_transform):
    """Decode one image and warp it to the network input size at test scale s."""
    if DATA_FORMAT == 'zip':
        from utils import zipreader
        data_numpy = zipreader.imread(image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
    else:
        data_numpy = cv2.imread(image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
    if COLOR_RGB:
        data_numpy = cv2.cvtColor(data_numpy, cv2.COLOR_BGR2RGB)
    trans = get_affine_transform(center, s * scale, 0, image_size)
    images_warp = cv2.warpAffine(data_numpy, trans, tuple(image_size), flags=cv2.INTER_LINEAR)
    return image_transform(images_warp)
    
def validate(config, val_loader, val_dataset, model, criterion, output_dir,
             tb_log_dir, writer_dict=None, test_scale=None):
    batch_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    # switch to evaluate mode
    model.eval()

    num_samples = len(val_dataset)
    all_preds = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 3), dtype=np.float32)
    all_boxes = np.zeros((num_samples, 6))
    image_path = []
    filenames = []
    imgnums = []
    idx = 0

    # Guard against config.PRINT_FREQ // 10 == 0, which would make
    # `i % PRINT_FREQ` below divide by zero.
    PRINT_FREQ = max(min(config.PRINT_FREQ // 10, 5), 1)

    image_size = np.array(config.MODEL.IMAGE_SIZE)
    final_test_scale = test_scale if test_scale is not None else config.TEST.SCALE_FACTOR
    with torch.no_grad():
        end = time.time()

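        # scale_back_output aligns a heatmap predicted from an s-times scaled
        # crop with the canonical (s = 1) crop: for s < 1 the crop is zoomed in,
        # so the heatmap is zero-padded by ~|1 - s|/2 of its size on each side;
        # for s > 1 it is zoomed out, so the same margin is cropped away
        # (e.g. a 64x64 heatmap at s = 1.3 loses int(0.3 * 64 / 2) = 9 px per side).
        # The result is resized to a common size so all test scales can be averaged.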
        def scale_back_output(output_hm, s, output_size):
            hm_size = [output_hm.size(3), output_hm.size(2)]
            if s != 1.0:
                hm_w_margin = int(abs(1.0 - s) * hm_size[0] / 2.0)
                hm_h_margin = int(abs(1.0 - s) * hm_size[1] / 2.0)
                if s < 1.0:
                    hm_padding = torch.nn.ZeroPad2d((hm_w_margin, hm_w_margin, hm_h_margin, hm_h_margin))
                    resized_hm = hm_padding(output_hm)
                else:
                    # hm_size is [w, h] but tensor dims are (n, c, h, w): crop
                    # height with hm_size[1] and width with hm_size[0] (the
                    # original indices were swapped, harmless only for square heatmaps).
                    resized_hm = output_hm[:, :, hm_h_margin:hm_size[1] - hm_h_margin, hm_w_margin:hm_size[0] - hm_w_margin]
                resized_hm = torch.nn.functional.interpolate(
                    resized_hm,
                    size=(output_size[1], output_size[0]),
                    mode='bilinear',
                    align_corners=False
                )
            else:
                resized_hm = output_hm
                if hm_size[0] != output_size[0] or hm_size[1] != output_size[1]:
                    resized_hm = torch.nn.functional.interpolate(
                        resized_hm,
                        size=(output_size[1], output_size[0]),
                        mode='bilinear',
                        align_corners=False
                    )

            # Normalize each heatmap to unit sum so that different scales
            # contribute equally when averaged.
            resized_hm = resized_hm / (torch.sum(resized_hm, dim=[2, 3], keepdim=True) + 1e-9)
            return resized_hm

        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        image_transform = transforms.Compose([transforms.ToTensor(), normalize])
        # A process pool (not threads): per-scale JPEG decoding and warping is
        # CPU-bound, so do it in parallel workers.
        worker_pool = multiprocessing.Pool(multiprocessing.cpu_count())

        start_time = time.time()
        for i, (input, target, target_weight, meta) in enumerate(val_loader):
            # compute output
            # print("Batch", i, "Batch Size", input.size(0))

            target = target.cuda(non_blocking=True)
            target_weight = target_weight.cuda(non_blocking=True)

            outputs = []
            for sidx, s in enumerate(sorted(final_test_scale, reverse=True)):
                print("Test Scale", s)
                if s != 1.0:
                    image_files = meta["image"]
                    centers = meta["center"].numpy()
                    scales = meta["scale"].numpy()

                    # Decode and warp the whole batch at this scale in parallel.
                    images_resized = worker_pool.starmap(
                        read_scaled_image,
                        [(image_file, s, center, scale, image_size,
                          config.DATASET.COLOR_RGB, config.DATASET.DATA_FORMAT,
                          image_transform)
                         for (image_file, center, scale) in zip(image_files, centers, scales)])
                    images_resized = torch.stack(images_resized, dim=0)
                else:
                    images_resized = input

                model_outputs = model(images_resized)
                # Some configs return a list of stage outputs; keep the last one,
                # as the flip branch below already does.
                if isinstance(model_outputs, list):
                    model_outputs = model_outputs[-1]
                hm_size = [model_outputs.size(3), model_outputs.size(2)]

                if config.TEST.FLIP_TEST:
                    print("Test Flip")
                    input_flipped = images_resized.flip(3)
                    output_flipped = model(input_flipped)

                    if isinstance(output_flipped, list):
                        output_flipped = output_flipped[-1]

                    output_flipped = flip_back(output_flipped.cpu().numpy(), val_dataset.flip_pairs)
                    output_flipped = torch.from_numpy(output_flipped.copy()).cuda()

                    # feature is not aligned, shift flipped heatmap for higher accuracy
                    if config.TEST.SHIFT_HEATMAP:
                        output_flipped[:, :, :, 1:] = output_flipped.clone()[:, :, :, 0:-1]

                    model_outputs = 0.5 * (model_outputs + output_flipped)

                output_resized = scale_back_output(model_outputs, s, hm_size)
                outputs.append(output_resized)

            target_size = [target.size(3), target.size(2)]
            if hm_size[0] != target_size[0] or hm_size[1] != target_size[1]:
                # interpolate expects (h, w) while hm_size is [w, h].
                target = torch.nn.functional.interpolate(
                    target,
                    size=(hm_size[1], hm_size[0]),
                    mode='bilinear',
                    align_corners=False
                )
                # Match the unit-sum normalization applied to the outputs above
                # (the original used an L2 normalize here, which made the loss
                # scale inconsistent with the averaged heatmaps).
                target = target / (torch.sum(target, dim=[2, 3], keepdim=True) + 1e-9)

            for indv_output in outputs:
                _, avg_acc, _, _ = accuracy(indv_output.cpu().numpy(), target.cpu().numpy())
                print("Indv Accuracy", avg_acc)

            output = torch.stack(outputs, dim=0).mean(dim=0)

            loss = criterion(output, target, target_weight)

            num_images = input.size(0)
            # measure accuracy and record loss
            losses.update(loss.item(), num_images)
            _, avg_acc, cnt, pred = accuracy(output.cpu().numpy(), target.cpu().numpy())
            print("Avg Accuracy", avg_acc)
            acc.update(avg_acc, cnt)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            c = meta['center'].numpy()
            s = meta['scale'].numpy()
            score = meta['score'].numpy()

            preds, maxvals = get_final_preds(config, output.clone().cpu().numpy(), c, s)

            all_preds[idx:idx + num_images, :, 0:2] = preds[:, :, 0:2]
            all_preds[idx:idx + num_images, :, 2:3] = maxvals
            # double check this all_boxes parts
            all_boxes[idx:idx + num_images, 0:2] = c[:, 0:2]
            all_boxes[idx:idx + num_images, 2:4] = s[:, 0:2]
            all_boxes[idx:idx + num_images, 4] = np.prod(s*200, 1)
            all_boxes[idx:idx + num_images, 5] = score
            image_path.extend(meta['image'])

            idx += num_images

            if i % PRINT_FREQ == 0:
                msg = 'Test: [{0}/{1}]\t' \
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                      'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(i, len(val_loader), batch_time=batch_time, loss=losses, acc=acc)
                logger.info(msg)

                prefix = '{}_{}'.format(os.path.join(output_dir, 'val'), i)
                # pred is in heatmap coordinates; the heatmap is 4x downsampled,
                # so *4 maps predictions back to input resolution for visualization.
                save_debug_images(config, input, meta, target, pred * 4, output, prefix)

        worker_pool.close()
        worker_pool.join()

        total_duration = time.time() - start_time
        logger.info("Total test time: {:.1f}".format(total_duration))
        name_values, perf_indicator = val_dataset.evaluate(config, all_preds, output_dir, all_boxes, image_path, filenames, imgnums)

        model_name = config.MODEL.NAME
        if isinstance(name_values, list):
            for name_value in name_values:
                _print_name_value(name_value, model_name)
        else:
            _print_name_value(name_values, model_name)

        if writer_dict:
            writer = writer_dict['writer']
            global_steps = writer_dict['valid_global_steps']
            writer.add_scalar('valid_loss', losses.avg, global_steps)
            writer.add_scalar('valid_acc', acc.avg, global_steps)
            if isinstance(name_values, list):
                for name_value in name_values:
                    writer.add_scalars('valid', dict(name_value), global_steps)
            else:
                writer.add_scalars('valid', dict(name_values), global_steps)
            writer_dict['valid_global_steps'] = global_steps + 1

    return perf_indicator
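
For reference, this is roughly how I call it (a minimal sketch following the repo's tools/test.py setup; model, criterion, valid_loader and valid_dataset are constructed exactly as there, and the scale list is just an example, not the paper's exact setting):

# Sketch: everything except test_scale follows tools/test.py in the repo.
test_scales = [0.8, 0.9, 1.0, 1.1, 1.2]  # example multi-scale setting
perf_indicator = validate(cfg, valid_loader, valid_dataset, model, criterion,
                          final_output_dir, tb_log_dir, writer_dict=None,
                          test_scale=test_scales)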

@mobeixiaoxin did you also get the same testing accuracy as me? I haven't solved the problem yet...

@mobeixiaoxin why is your latest comment not shown here (only in email)? I will re-post it here so it may help others:

@MaxChu719, first, thanks for your multi-scale code! I used the multi-scale code you published, and it reaches 90.8 on the validation set as in the paper, but the result on the test set is only 91.6%. I think there is no problem with the multi-scale test code you published. I wonder if the 92.3% submitted in the author's paper was obtained with a different dataset split than the one on GitHub: the 92.3% model was trained with the trainval.json file, while the 90.33 one was trained with the train.json file. The biggest difference is the larger amount of training data. Maybe that's the reason; I'm going to give it a try.
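If the split is the cause, the change should just be a config override. A sketch (DATASET.TRAIN_SET is the repo's yacs config key, defaulting to 'train'; this assumes the standard data/mpii/annot/trainval.json annotation file is in place):

# Sketch: train on MPII train+val instead of train only.
from config import cfg  # lib/config in the repo
cfg.defrost()
cfg.merge_from_list(['DATASET.TRAIN_SET', 'trainval'])  # default is 'train'
cfg.freeze()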

Thanks a lot for your interest! Please check our new (simpler) version: https://github.com/HRNet/DEKR