HRNet / HRNet-Human-Pose-Estimation

This repo is copied from

Home Page:

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

About the multi-scale testing on the MPII test set, I only got 91.6% instead of the 92.3% reported in the paper.

MaxChu719 opened this issue · comments

I have implemented the multi-scale testing, and I have verified that the MPII validation set accuracy is 90.75%
I then went on to apply it to the test set, and the accuracy I got is only:

& Head & Shoulder & Elbow & Wrist & Hip & Knee  & Ankle & UBody & Total
& 98.3  & 96.5  & 92.4  & 88.3  & 90.6  & 88.3 & 84.1 & 92.4 & 91.6
AUC: 61.6

Which is not 92.3% as reported in the paper. Below is the code I have used for the multi-scale testing:

def read_scaled_image(image_file, s, center, scale, image_size, COLOR_RGB, DATA_FORMAT, image_transform):
    """Load one image and warp it to the network input size at a rescaled crop.

    Args:
        image_file: Path of the image (looked up through ``utils.zipreader``
            when ``DATA_FORMAT`` is ``'zip'``).
        s: Test-scale factor; the crop scale handed to
            ``get_affine_transform`` is ``s * scale``.
        center: Crop center handed to ``get_affine_transform``.
        scale: Base crop scale of the sample.
        image_size: Two-element (width, height) size of the warped output.
        COLOR_RGB: When truthy, convert the decoded BGR image to RGB.
        DATA_FORMAT: ``'zip'`` selects the zip reader; any other value reads
            the file from disk with ``cv2.imread``.
        image_transform: Callable applied to the warped image; its result is
            returned unchanged.

    Returns:
        Whatever ``image_transform`` returns for the warped image.

    Raises:
        ValueError: If the image cannot be read/decoded.
    """
    read_flags = cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
    if DATA_FORMAT == 'zip':
        from utils import zipreader
        data_numpy = zipreader.imread(image_file, read_flags)
    else:
        # Without this branch the zip result was always overwritten and the
        # non-zip path never assigned data_numpy at all.
        data_numpy = cv2.imread(image_file, read_flags)

    # cv2.imread signals failure by returning None rather than raising.
    if data_numpy is None:
        raise ValueError('Fail to read {}'.format(image_file))

    if COLOR_RGB:
        data_numpy = cv2.cvtColor(data_numpy, cv2.COLOR_BGR2RGB)

    trans = get_affine_transform(center, s * scale, 0, image_size)
    # warpAffine expects plain Python ints for dsize; image_size may be a
    # numpy array.
    dsize = (int(image_size[0]), int(image_size[1]))
    images_warp = cv2.warpAffine(data_numpy, trans, dsize, flags=cv2.INTER_LINEAR)
    return image_transform(images_warp)
def _scale_back_output(output_hm, s, output_size):
    """Map a heatmap predicted at test scale ``s`` back to the 1.0-scale frame.

    For ``s < 1`` the heatmap is zero-padded, for ``s > 1`` it is center-cropped,
    then it is bilinearly resized to ``output_size`` and each channel is
    divided by its spatial sum so heatmaps from different scales can be averaged.

    Args:
        output_hm: 4-D heatmap tensor; dim 2 is treated as height and dim 3 as
            width.
        s: Test-scale factor that produced ``output_hm``.
        output_size: Target ``[width, height]`` of the returned heatmap.

    Returns:
        Tensor of spatial size ``(output_size[1], output_size[0])`` whose
        channels each sum to (approximately) one.
    """
    hm_w, hm_h = output_hm.size(3), output_hm.size(2)
    resized_hm = output_hm

    if s != 1.0:
        # NOTE(review): the margin is |1 - s| * size / 2. If the crop really
        # covers s times the original extent, the exact margin would be
        # |1 - 1/s| * size / 2 -- confirm against get_affine_transform.
        w_margin = int(abs(1.0 - s) * hm_w / 2.0)
        h_margin = int(abs(1.0 - s) * hm_h / 2.0)
        if s < 1.0:
            pad = torch.nn.ZeroPad2d((w_margin, w_margin, h_margin, h_margin))
            resized_hm = pad(output_hm)
        else:
            # Height axis uses the height extent and width axis the width
            # extent (they were swapped, which only worked for square maps).
            resized_hm = output_hm[:, :, h_margin:hm_h - h_margin, w_margin:hm_w - w_margin]

    if resized_hm.size(3) != output_size[0] or resized_hm.size(2) != output_size[1]:
        resized_hm = torch.nn.functional.interpolate(
            resized_hm,
            size=(output_size[1], output_size[0]),
            mode='bilinear',
            align_corners=False,
        )

    # L1-normalize every channel; the epsilon guards all-zero heatmaps.
    return resized_hm / (torch.sum(resized_hm, dim=[2, 3], keepdim=True) + 1e-9)


def validate(config, val_loader, val_dataset, model, criterion, output_dir, tb_log_dir, writer_dict=None, test_scale=None):
    """Run (multi-scale, optionally flip-augmented) evaluation over ``val_loader``.

    For every batch the model is run once per test scale; each heatmap is
    mapped back to the 1.0-scale frame and the results are averaged before
    the loss, accuracy and final keypoint predictions are computed.

    Args:
        config: Experiment config; reads ``MODEL.NUM_JOINTS``,
            ``MODEL.IMAGE_SIZE``, ``MODEL.NAME``, ``PRINT_FREQ``,
            ``TEST.SCALE_FACTOR``, ``TEST.FLIP_TEST``, ``TEST.SHIFT_HEATMAP``,
            ``DATASET.COLOR_RGB`` and ``DATASET.DATA_FORMAT``.
        val_loader: Iterable yielding ``(input, target, target_weight, meta)``.
        val_dataset: Dataset providing ``flip_pairs`` and ``evaluate``.
        model: Network to evaluate; switched to eval mode here.
        criterion: Loss called as ``criterion(output, target, target_weight)``.
        output_dir: Directory for debug images and evaluation output.
        tb_log_dir: Unused; kept for interface compatibility.
        writer_dict: Optional dict with ``'writer'`` and
            ``'valid_global_steps'`` used for scalar logging.
        test_scale: Scale factor or iterable of scale factors; falls back to
            ``config.TEST.SCALE_FACTOR`` when ``None``.

    Returns:
        The performance indicator returned by ``val_dataset.evaluate``.

    Raises:
        ValueError: If no test scale is given.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    # switch to evaluate mode
    model.eval()

    num_samples = len(val_dataset)
    all_preds = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 3), dtype=np.float32)
    all_boxes = np.zeros((num_samples, 6))
    image_path = []
    filenames = []
    imgnums = []
    idx = 0

    # Clamp to at least 1 so "i % print_freq" cannot divide by zero when
    # config.PRINT_FREQ is smaller than 10.
    print_freq = max(1, min(config.PRINT_FREQ // 10, 5))

    image_size = np.array(config.MODEL.IMAGE_SIZE)
    final_test_scale = test_scale if test_scale is not None else config.TEST.SCALE_FACTOR
    if np.isscalar(final_test_scale):
        final_test_scale = [final_test_scale]
    test_scales = sorted(final_test_scale, reverse=True)
    if not test_scales:
        raise ValueError('validate() needs at least one test scale')

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    image_transform = transforms.Compose([transforms.ToTensor(), normalize])

    # Worker processes are only needed to re-read images at scales other
    # than 1.0.
    thread_pool = None
    if any(scale_factor != 1.0 for scale_factor in test_scales):
        thread_pool = multiprocessing.Pool(multiprocessing.cpu_count())

    try:
        with torch.no_grad():
            end = time.time()
            start_time = time.time()
            for i, (input, target, target_weight, meta) in enumerate(val_loader):
                target = target.cuda(non_blocking=True)
                target_weight = target_weight.cuda(non_blocking=True)

                outputs = []
                for scale_factor in test_scales:
                    print("Test Scale", scale_factor)
                    if scale_factor != 1.0:
                        image_files = meta["image"]
                        centers = meta["center"].numpy()
                        scales = meta["scale"].numpy()
                        starmap_args = [
                            (image_file, scale_factor, center, scale, image_size,
                             config.DATASET.COLOR_RGB, config.DATASET.DATA_FORMAT, image_transform)
                            for (image_file, center, scale) in zip(image_files, centers, scales)
                        ]
                        images_resized = thread_pool.starmap(read_scaled_image, starmap_args)
                        images_resized = torch.stack(images_resized, dim=0)
                    else:
                        # The loader already produced the 1.0-scale crop.
                        images_resized = input

                    model_outputs = model(images_resized)
                    if isinstance(model_outputs, list):
                        model_outputs = model_outputs[-1]
                    hm_size = [model_outputs.size(3), model_outputs.size(2)]

                    if config.TEST.FLIP_TEST:
                        print("Test Flip")
                        input_flipped = images_resized.flip(3)
                        output_flipped = model(input_flipped)
                        if isinstance(output_flipped, list):
                            output_flipped = output_flipped[-1]

                        output_flipped = flip_back(output_flipped.cpu().numpy(), val_dataset.flip_pairs)
                        output_flipped = torch.from_numpy(output_flipped.copy()).cuda()

                        # feature is not aligned, shift flipped heatmap for higher accuracy
                        if config.TEST.SHIFT_HEATMAP:
                            output_flipped[:, :, :, 1:] = output_flipped.clone()[:, :, :, 0:-1]

                        model_outputs = 0.5 * (model_outputs + output_flipped)

                    outputs.append(_scale_back_output(model_outputs, scale_factor, hm_size))

                # Bring the target to the heatmap resolution if they differ.
                target_size = [target.size(3), target.size(2)]
                if hm_size[0] != target_size[0] or hm_size[1] != target_size[1]:
                    target = torch.nn.functional.interpolate(
                        target,
                        size=(hm_size[1], hm_size[0]),
                        mode='bilinear',
                        align_corners=False,
                    )
                    target = torch.nn.functional.normalize(target, dim=[2, 3], p=2)

                target_numpy = target.cpu().numpy()
                for indv_output in outputs:
                    _, avg_acc, _, _ = accuracy(indv_output.cpu().numpy(), target_numpy)
                    print("Indv Accuracy", avg_acc)

                # Average the per-scale heatmaps.
                output = torch.stack(outputs, dim=0).mean(dim=0)

                loss = criterion(output, target, target_weight)

                num_images = input.size(0)
                # measure accuracy and record loss
                losses.update(loss.item(), num_images)
                _, avg_acc, cnt, pred = accuracy(output.cpu().numpy(), target_numpy)
                print("Avg Accuracy", avg_acc)
                acc.update(avg_acc, cnt)

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                c = meta['center'].numpy()
                box_scale = meta['scale'].numpy()
                score = meta['score'].numpy()

                preds, maxvals = get_final_preds(config, output.clone().cpu().numpy(), c, box_scale)

                all_preds[idx:idx + num_images, :, 0:2] = preds[:, :, 0:2]
                all_preds[idx:idx + num_images, :, 2:3] = maxvals
                # double check this all_boxes parts
                all_boxes[idx:idx + num_images, 0:2] = c[:, 0:2]
                all_boxes[idx:idx + num_images, 2:4] = box_scale[:, 0:2]
                all_boxes[idx:idx + num_images, 4] = np.prod(box_scale * 200, 1)
                all_boxes[idx:idx + num_images, 5] = score
                # NOTE(review): presumably evaluate() expects one path per
                # sample here -- confirm against the dataset implementation.
                image_path.extend(meta['image'])

                idx += num_images

                if i % print_freq == 0:
                    msg = 'Test: [{0}/{1}]\t' \
                          'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                          'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                          'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(i, len(val_loader), batch_time=batch_time, loss=losses, acc=acc)
                    print(msg)

                    prefix = '{}_{}'.format(os.path.join(output_dir, 'val'), i)
                    save_debug_images(config, input, meta, target, pred*4, output, prefix)

            total_duration = time.time() - start_time
            print("Total test time: {:.1f}".format(total_duration))
    finally:
        # Release the worker processes even if evaluation fails midway.
        if thread_pool is not None:
            thread_pool.close()
            thread_pool.join()

    name_values, perf_indicator = val_dataset.evaluate(config, all_preds, output_dir, all_boxes, image_path, filenames, imgnums)

    model_name = config.MODEL.NAME
    if isinstance(name_values, list):
        for name_value in name_values:
            _print_name_value(name_value, model_name)
    else:
        _print_name_value(name_values, model_name)

    if writer_dict:
        writer = writer_dict['writer']
        global_steps = writer_dict['valid_global_steps']
        writer.add_scalar('valid_loss', losses.avg, global_steps)
        writer.add_scalar('valid_acc', acc.avg, global_steps)
        if isinstance(name_values, list):
            for name_value in name_values:
                writer.add_scalars('valid', dict(name_value), global_steps)
        else:
            writer.add_scalars('valid', dict(name_values), global_steps)
        writer_dict['valid_global_steps'] = global_steps + 1

    return perf_indicator

@mobeixiaoxin you also get the same testing accuracy as me? I haven't solved the problem yet...

@mobeixiaoxin why is your latest comment not shown here (only in email)? I will re-post it here so it may help others:

@MaxChu719, first, thanks for your multi-scale code! I followed the multi-scale code you published, and it reaches 90.8, as in the paper, but the result on the test set is only 91.6%. I think there is no problem with the multi-scale test code you published. I wonder whether the gap arises because the 92.3% result submitted in the authors' paper is inconsistent with the dataset split used on GitHub: 92.3% was obtained by training on the trainval.json file, while 90.33 was obtained by training on the train.json file. The biggest difference is the larger amount of data in the training set. Maybe that's the reason; I'm going to give it a try.

Thanks a lot for your interest! Please check our new version (simpler)