使用预训练模型在got10k上测试与leaderboard上的分数不一致
Z-Xiong opened this issue · comments
Zhuang Xiong commented
您好,我使用您提供的两个预训练模型在got10k测试,得到的结果如下:
Method | AO | SR0.50 | SR0.75 | Hz | Hardware | Language |
---|---|---|---|---|---|---|
TransT_N2 | 0.590 | 0.683 | 0.492 | 11.58 fps | 3090 | Python |
TransT_N4 | 0.610 | 0.693 | 0.533 | 11.92 fps | 3090 | Python |
而在leaderboard上的评分比这个高很多,不知是不是我的推理代码有问题,代码如下:
from got10k.trackers import Tracker
from got10k.experiments import ExperimentGOT10k
import numpy as np
import math
import torchvision.transforms.functional as tvisf
import cv2
import torch
import torch.nn.functional as F
from pytracking.utils.loading import load_network
from easydict import EasyDict as edict
class TransT(Tracker):
    """got10k-toolkit wrapper that runs a TransT network as a tracker.

    Follows the pysot-style Siamese tracking loop: ``init`` crops the
    template patch around the first-frame box and caches its features in
    the network; ``update`` crops a search region around the previous
    center, runs the network, and decodes a new box.

    NOTE(review): the caller is responsible for putting ``net`` into
    ``eval()`` mode before tracking — leaving it in training mode was the
    root cause of the depressed GOT-10k scores reported in this issue.
    """

    def __init__(self, name, net, window_penalty=0.49, exemplar_size=128, instance_size=256):
        # is_deterministic=True: no stochastic component, so the toolkit
        # runs a single pass per sequence.
        super(TransT, self).__init__(
            name=name,
            is_deterministic=True)
        self.net = net                          # TransT model (template/track interface)
        self.window_penalty = window_penalty    # cosine-window blend weight in [0, 1]
        self.exemplar_size = exemplar_size      # template patch side, px
        self.instance_size = instance_size      # search patch side, px

    def _convert_score(self, score):
        """Flatten the 2-class logits map to a 1-D foreground-probability array.

        Channel 0 of the softmax is taken as the foreground probability
        (presumably the TransT convention — confirm against the network head).
        """
        score = score.permute(2, 1, 0).contiguous().view(2, -1).permute(1, 0)
        score = F.softmax(score, dim=1).data[:, 0].cpu().numpy()
        return score

    def _convert_bbox(self, delta):
        """Flatten predicted boxes to a (4, N) numpy array of normalized
        [cx, cy, w, h] values (scaled to pixels by the caller)."""
        delta = delta.permute(2, 1, 0).contiguous().view(4, -1)
        delta = delta.data.cpu().numpy()
        return delta

    def _bbox_clip(self, cx, cy, width, height, boundary):
        """Clamp a center-format box to the image.

        boundary is image.shape[:2] == (H, W), so index 1 bounds x and
        index 0 bounds y. Width/height are floored at 10 px so the target
        cannot collapse to nothing.
        """
        cx = max(0, min(cx, boundary[1]))
        cy = max(0, min(cy, boundary[0]))
        width = max(10, min(width, boundary[1]))
        height = max(10, min(height, boundary[0]))
        return cx, cy, width, height

    def get_subwindow(self, im, pos, model_sz, original_sz, avg_chans):
        """Crop a square ``original_sz`` patch centered at ``pos``, pad any
        out-of-image region with the channel mean, resize to ``model_sz``,
        and return it as a (1, C, H, W) float32 CUDA tensor.

        args:
            im: H x W x C uint8 image (RGB order per ``init``/``update``)
            pos: (x, y) center position
            model_sz: output side length after resize
            original_sz: crop side length in the source image
            avg_chans: per-channel mean used as padding value
        """
        if isinstance(pos, float):
            pos = [pos, pos]
        sz = original_sz
        im_sz = im.shape
        c = (original_sz + 1) / 2
        # floor(x + 0.5) gives identical rounding under py2 and py3
        # (round() changed to banker's rounding in py3).
        # context_xmin = round(pos[0] - c) # py2 and py3 round
        context_xmin = np.floor(pos[0] - c + 0.5)
        context_xmax = context_xmin + sz - 1
        # context_ymin = round(pos[1] - c)
        context_ymin = np.floor(pos[1] - c + 0.5)
        context_ymax = context_ymin + sz - 1
        # how far the crop extends past each image edge
        left_pad = int(max(0., -context_xmin))
        top_pad = int(max(0., -context_ymin))
        right_pad = int(max(0., context_xmax - im_sz[1] + 1))
        bottom_pad = int(max(0., context_ymax - im_sz[0] + 1))
        # shift crop coordinates into the padded image's frame
        context_xmin = context_xmin + left_pad
        context_xmax = context_xmax + left_pad
        context_ymin = context_ymin + top_pad
        context_ymax = context_ymax + top_pad
        r, c, k = im.shape
        if any([top_pad, bottom_pad, left_pad, right_pad]):
            # build a padded canvas, paste the image, fill the borders
            # with the channel average, then crop from the canvas
            size = (r + top_pad + bottom_pad, c + left_pad + right_pad, k)
            te_im = np.zeros(size, np.uint8)
            te_im[top_pad:top_pad + r, left_pad:left_pad + c, :] = im
            if top_pad:
                te_im[0:top_pad, left_pad:left_pad + c, :] = avg_chans
            if bottom_pad:
                te_im[r + top_pad:, left_pad:left_pad + c, :] = avg_chans
            if left_pad:
                te_im[:, 0:left_pad, :] = avg_chans
            if right_pad:
                te_im[:, c + left_pad:, :] = avg_chans
            im_patch = te_im[int(context_ymin):int(context_ymax + 1),
                             int(context_xmin):int(context_xmax + 1), :]
        else:
            # crop lies fully inside the image: slice directly, no copy
            im_patch = im[int(context_ymin):int(context_ymax + 1),
                          int(context_xmin):int(context_xmax + 1), :]
        if not np.array_equal(model_sz, original_sz):
            im_patch = cv2.resize(im_patch, (model_sz, model_sz))
        # HWC uint8 -> 1xCxHxW float32 tensor on GPU
        im_patch = im_patch.transpose(2, 0, 1)
        im_patch = im_patch[np.newaxis, :, :, :]
        im_patch = im_patch.astype(np.float32)
        im_patch = torch.from_numpy(im_patch)
        im_patch = im_patch.cuda()
        return im_patch

    def initialize_features(self):
        # if not getattr(self, 'features_initialized', False):
        # self.net.initialize()
        # No-op beyond the flag: network initialization is commented out
        # (presumably already done by load_network — confirm).
        self.features_initialized = True

    def init(self, image, box):
        """First-frame initialization: cache the template features.

        box is [x, y, w, h] in pixels (got10k convention).
        """
        # PIL to np.array, (H, W, C)
        image = np.array(image).astype(np.uint8)
        # 32x32 cosine window flattened to penalize large displacements;
        # 32 presumably matches the score-map side — TODO confirm.
        hanning = np.hanning(32)
        window = np.outer(hanning, hanning)
        self.window = window.flatten()
        # Initialize
        self.initialize_features()
        self.center_pos = np.array([box[0] + box[2] / 2,
                                    box[1] + box[3] / 2])
        self.size = np.array([box[2], box[3]])
        # calculate z crop size: target plus 0.5*(w+h) context on each side
        w_z = self.size[0] + (2 - 1) * ((self.size[0] + self.size[1]) * 0.5)
        h_z = self.size[1] + (2 - 1) * ((self.size[0] + self.size[1]) * 0.5)
        s_z = math.ceil(math.sqrt(w_z * h_z))
        # calculate channel average (used as padding color)
        self.channel_average = np.mean(image, axis=(0, 1))
        # get crop
        z_crop = self.get_subwindow(image, self.center_pos,
                                    self.exemplar_size,
                                    s_z, self.channel_average)
        # normalize to [0,1] then ImageNet mean/std
        z_crop = z_crop.float().mul(1.0 / 255.0).clamp(0.0, 1.0)
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]
        self.inplace = False
        z_crop[0] = tvisf.normalize(z_crop[0], self.mean, self.std, self.inplace)
        # initialize template feature
        self.net.template(z_crop)
        self.box = box

    def update(self, image):
        """Track one frame; returns [x, y, w, h] in pixels."""
        image = np.array(image).astype(np.uint8)
        # calculate x crop size: search region is ~4x scale of the template
        w_x = self.size[0] + (4 - 1) * ((self.size[0] + self.size[1]) * 0.5)
        h_x = self.size[1] + (4 - 1) * ((self.size[0] + self.size[1]) * 0.5)
        s_x = math.ceil(math.sqrt(w_x * h_x))
        # get crop
        x_crop = self.get_subwindow(image, self.center_pos,
                                    self.instance_size,
                                    round(s_x), self.channel_average)
        # normalize (same ImageNet stats as the template)
        x_crop = x_crop.float().mul(1.0 / 255.0).clamp(0.0, 1.0)
        x_crop[0] = tvisf.normalize(x_crop[0], self.mean, self.std, self.inplace)
        # track
        outputs = self.net.track(x_crop)
        score = self._convert_score(outputs['pred_logits'])
        pred_bbox = self._convert_bbox(outputs['pred_boxes'])
        # window penalty: blend raw score with the cosine window
        pscore = score * (1 - self.window_penalty) + \
                 self.window * self.window_penalty
        best_idx = np.argmax(pscore)
        bbox = pred_bbox[:, best_idx]
        # predictions are normalized to the crop; scale to pixels and map
        # from crop coordinates back to image coordinates
        bbox = bbox * s_x
        cx = bbox[0] + self.center_pos[0] - s_x / 2
        cy = bbox[1] + self.center_pos[1] - s_x / 2
        width = bbox[2]
        height = bbox[3]
        # clip boundary
        cx, cy, width, height = self._bbox_clip(cx, cy, width,
                                                height, image.shape[:2])
        # update state for the next frame
        self.center_pos = np.array([cx, cy])
        self.size = np.array([width, height])
        # center format -> corner format expected by got10k
        bbox = [cx - width / 2,
                cy - height / 2,
                width,
                height]
        self.box = bbox
        return self.box
if __name__ == '__main__':
    # ---- setup tracker ----
    settings = edict()
    settings.device = 'cuda'
    settings.description = 'TransT with default settings.'
    settings.root_dir = 'GOT-10k'
    # Fixed: the original line was missing the opening quote of the path
    # string, which is a SyntaxError as pasted.
    settings.model_path = 'TransT/pytracking/networks/transt.pth'

    model = load_network(settings.model_path)
    # Fixed: switch the network to inference mode. Without eval(),
    # BatchNorm/Dropout layers stay in training mode and the GOT-10k
    # scores come out well below the leaderboard numbers — this was the
    # root cause confirmed at the end of this issue thread.
    model.eval()
    tracker = TransT(name="TransT", net=model)

    # run the GOT-10k benchmark (test subset)
    experiment = ExperimentGOT10k(settings.root_dir, subset='test', result_dir="results", report_dir="reports")
    experiment.run(tracker)

    # report performance
    experiment.report([tracker.name])
chenxin commented
使用github上提供的测试代码,与论文中的性能应是一致的。
如果代码是用的github的代码,有可能是数据集不完整等原因导致的,可以尝试测试其他数据集的指标观察是否有异常,来排查
Zhuang Xiong commented
使用github上提供的测试代码,与论文中的性能应是一致的。 如果代码是用的github的代码,有可能是数据集不完整等原因导致的,可以尝试测试其他数据集的指标观察是否有异常,来排查
谢谢回复,我找到原因了,是因为我加载模型后没有启用eval模式,添加了eval()后就和排行榜的测试结果一致了。