pytorch / pytorch

Tensors and Dynamic neural networks in Python with strong GPU acceleration

Home Page: https://pytorch.org

[JIT] traced model with optimization shows no performance improvement

nnguyen-aurora opened this issue · comments

πŸ› Bug

Using torch.jit.trace with optimize=True shows no performance difference from optimize=False.
The test model is ResNet-50 from torchvision, modified to run only feature extraction (no average pooling or fully connected layer for classification).

Inference script (test.py):

""" Pytorch inference script """

from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
import argparse
import timeit
import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn

# Select appropriate model for test
import resnet
def timeGraph(model, batch_size, num_loops):
    # Create a random input tensor of the given size; it is created on the
    # GPU, so this timing function assumes a CUDA run
    input = torch.rand(batch_size, 3, 1200, 1920, dtype=torch.float).cuda()

    print("Warm up ...")
    with torch.no_grad():
        for _ in range(20):
            model(input)

    print("Start timing ...")
    timings = []
    with torch.no_grad():
        for i in range(num_loops):
            start_time = timeit.default_timer()
            features = model(input)
            end_time = timeit.default_timer()
            timings.append(end_time - start_time)
            print("Iteration {}: {:.6f} s".format(i, end_time - start_time))
        print("Output features size:", features.size())
    return timings
    
def printStats(graphName,timings,batch_size):
    times = np.array(timings)
    steps = len(times)
    speeds = batch_size / times
    time_mean = np.mean(times)
    time_med = np.median(times)
    time_99th = np.percentile(times, 99)
    time_std = np.std(times, ddof=0)
    speed_mean = np.mean(speeds)
    speed_med = np.median(speeds)

    msg = ("\n%s =================================\n"
            "batch size=%d, num iterations=%d\n"
            "  Median FPS: %.1f, mean: %.1f\n"
            "  Median latency: %.6f, mean: %.6f, 99th_p: %.6f, std_dev: %.6f\n"
            ) % (graphName,
                batch_size, steps,
                speed_med, speed_mean,
                time_med, time_mean, time_99th, time_std)

    print(msg)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run inference on a model with random input values")
    parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.')
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size (default=1)")
    parser.add_argument('--optimize', action='store_true', help='Turn on optimization for traced model')
    parser.add_argument("--iter", default=10, type=int, help="Number of iteration loops")
    args = parser.parse_args()

    # Creating model with random weights
    model = resnet.resnet50()
    print("Tracing model... Optimization=", args.optimize)
    example_input = torch.rand(args.batch_size, 3, 1200, 1920, dtype=torch.float)
    traced_model = torch.jit.trace(model, example_input,
        check_trace=True,
        check_tolerance=1e-05,
        optimize=args.optimize,
        ) 

    # Save the script module
    # traced_model.save("model_traced.pt")

    # Create graph on GPU if CUDA is available
    if args.gpu is not None:
        if torch.cuda.is_available():
            # Enable CuDNN autotune for better performance (with fixed inputs)
            cudnn.benchmark = True
            traced_model = traced_model.cuda(args.gpu)
        else:
            raise RuntimeError("CUDA is not available.")

    dev = torch.cuda.current_device()
    print("Cuda device id, count=", dev, torch.cuda.device_count())
    print("Cuda DNN version=", cudnn.version())
    print("Cuda compute capability=", torch.cuda.get_device_capability(dev))
    print("Cuda device name=", torch.cuda.get_device_name(dev))

    # Timing graph inference
    timings = timeGraph(traced_model, args.batch_size, args.iter)
    
    printStats("resnet", timings, args.batch_size)

Modified resnet.py from torchvision:

import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo


__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']


model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

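        # Classification head disabled below; the model returns feature maps
        # only, per the issue description (feature-extraction benchmark)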
        #x = self.avgpool(x)
        #x = x.view(x.size(0), -1)
        #x = self.fc(x)

        return x

def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return model

To Reproduce

Steps to reproduce the behavior:

Run test.py with GPU:

python test.py --gpu 0 --iter 100

Run test.py with GPU and trace optimize:

python test.py --gpu 0 --optimize --iter 100

Expected behavior

Tracing model... Optimization= True
Cuda device id, count= 0 1
Cuda DNN version= 7401
Cuda compute capability= (6, 1)
Cuda device name= GeForce GTX 1080
Warm up ...
Start timing ...
Iteration 0: 0.133147 s
Iteration 1: 0.137695 s
Iteration 2: 0.132463 s
Iteration 3: 0.132877 s
Iteration 4: 0.132633 s
Iteration 5: 0.137405 s
Iteration 6: 0.134528 s
Iteration 7: 0.133907 s
Iteration 8: 0.134656 s
Iteration 9: 0.133537 s
Output features size: (1, 2048, 38, 60)

resnet =================================
batch size=1, num iterations=10
  Median FPS: 7.5, mean: 7.4
  Median latency: 0.133722, mean: 0.134285, 99th_p: 0.137669, std_dev: 0.001777

Environment

  • PyTorch Version (e.g., 1.0): 1.1.0a0
  • OS (e.g., Linux): Ubuntu 14.04 / 16.04
  • How you installed PyTorch (conda, pip, source): pip/source
  • Build command you used (if compiling from source):
  • Python version: 2.7 / 3.5
  • CUDA/cuDNN version: 10.0 / 7.4
  • GPU models and configuration: Nvidia GTX1080
  • Any other relevant information:

Additional context

I also tried running the model without jit.trace, and there seems to be little change in performance there as well.

cc @suo

For such a large, sequential vision model you probably can't get much from tracing, except maybe fusing BatchNorm with Conv in eval mode. But you are measuring in training mode.
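
For context, the Conv–BN fusion mentioned here folds the BatchNorm statistics into the preceding convolution, which is only valid in eval mode, where the running statistics are fixed. A minimal sketch of the idea (fuse_conv_bn is a hypothetical helper, not the JIT's actual fusion pass; groups and dilation are ignored for brevity):

import torch
import torch.nn as nn

def fuse_conv_bn(conv, bn):
    # Folds y = gamma * (conv(x) - mean) / sqrt(var + eps) + beta
    # into a single Conv2d with adjusted weight and bias
    fused = nn.Conv2d(conv.in_channels, conv.out_channels,
                      kernel_size=conv.kernel_size, stride=conv.stride,
                      padding=conv.padding, bias=True)
    scale = bn.weight / torch.sqrt(bn.running_var + bn.eps)  # per-channel gamma / std
    fused.weight.data = conv.weight.data * scale.reshape(-1, 1, 1, 1)
    conv_bias = conv.bias.data if conv.bias is not None else torch.zeros_like(bn.running_mean)
    fused.bias.data = (conv_bias - bn.running_mean) * scale + bn.bias.data
    return fused

In training mode the batch statistics change on every step, so this folding cannot be applied; that is why the mode the model is traced and run in matters here.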

My intention is to measure the forward pass (inference), not training.
I used

with torch.no_grad():

in the inference loop of the test script. Is that not enough to indicate the forward pass? If not, what should I do to make sure I am running inference?

FYI, I also added this to the test script to explicitly switch to inference mode:

traced_model = traced_model.eval()

There is no difference in the result.
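
One detail worth checking here (stated as an assumption, since this behavior has varied across versions): torch.jit.trace can bake the module's train/eval state into the traced graph, so calling .eval() on the already-traced module may not actually switch BatchNorm to its inference behavior. A safer pattern is to put the model in eval mode before tracing:

model = resnet.resnet50()
model.eval()  # set inference mode BEFORE tracing, so BatchNorm's eval
              # behavior is captured in the traced graph
traced_model = torch.jit.trace(model, example_input)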

commented

I did a test with/without optimisation on a traced model (FaceBoxes).
The test was run with no_grad, in eval mode, doing inference. Times are averaged over 205 batches on a Titan V.

Without tracing the model (plain Python):
forward_pass_time: 0.0897s misc: 0.0214s
Accuracy: 99.21% AP

Without optimisation:
forward_pass_time: 0.0840s misc: 0.0196s
Accuracy: 99.21% AP

With optimisation:
forward_pass_time: 0.0834s misc: 0.0193s
Accuracy: 99.21% AP

I see no appreciable difference with/without optimisation.

Repro script using torch.jit.optimized_execution() instead of the optimize flag:

""" Pytorch inference script """

from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
import argparse
import timeit
import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn

# Select appropriate model for test
import resnet
def timeGraph(model, batch_size, num_loops):
    # Create a random input tensor of the given size (kept on the CPU here;
    # note that with --gpu the model is moved to the GPU, so the input would
    # need to move as well)
    input = torch.rand(batch_size, 3, 1200, 1920, dtype=torch.float)  # .cuda()

    print("Warm up ...")
    with torch.no_grad():
        for _ in range(20):
            model(input)

    print("Start timing ...")
    timings = []
    with torch.no_grad():
        for i in range(num_loops):
            start_time = timeit.default_timer()
            features = model(input)
            end_time = timeit.default_timer()
            timings.append(end_time - start_time)
            print("Iteration {}: {:.6f} s".format(i, end_time - start_time))
        print("Output features size:", features.size())
    return timings

def printStats(graphName,timings,batch_size):
    times = np.array(timings)
    steps = len(times)
    speeds = batch_size / times
    time_mean = np.mean(times)
    time_med = np.median(times)
    time_99th = np.percentile(times, 99)
    time_std = np.std(times, ddof=0)
    speed_mean = np.mean(speeds)
    speed_med = np.median(speeds)

    msg = ("\n%s =================================\n"
            "batch size=%d, num iterations=%d\n"
            "  Median FPS: %.1f, mean: %.1f\n"
            "  Median latency: %.6f, mean: %.6f, 99th_p: %.6f, std_dev: %.6f\n"
            ) % (graphName,
                batch_size, steps,
                speed_med, speed_mean,
                time_med, time_mean, time_99th, time_std)

    print(msg)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run inference on a model with random input values")
    parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.')
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size (default=1)")
    parser.add_argument('--optimize', action='store_true', help='Turn on optimization for traced model')
    parser.add_argument("--iter", default=10, type=int, help="Number of iteration loops")
    args = parser.parse_args()

    # Creating model with random weights
    model = resnet.resnet50()
    print("Tracing model... Optimization=", args.optimize)
    example_input = torch.rand(args.batch_size, 3, 1200, 1920, dtype=torch.float)
    with torch.jit.optimized_execution(args.optimize):
        traced_model = torch.jit.trace(model, example_input,
                                       check_trace=True,
                                       check_tolerance=1e-05,
                                       # optimize=args.optimize,
        )

    # Save the script module
    # traced_model.save("model_traced.pt")

    # Create graph on GPU if CUDA is available
    if args.gpu is not None:
        if torch.cuda.is_available():
            # Enable CuDNN autotune for better performance (with fixed inputs)
            cudnn.benchmark = True
            traced_model = traced_model.cuda(args.gpu)
        else:
            raise RuntimeError("CUDA is not available.")

    dev = torch.cuda.current_device()
    print("Cuda device id, count=", dev, torch.cuda.device_count())
    print("Cuda DNN version=", cudnn.version())
    print("Cuda compute capability=", torch.cuda.get_device_capability(dev))
    print("Cuda device name=", torch.cuda.get_device_name(dev))

    # Timing graph inference
    timings = timeGraph(traced_model, args.batch_size, args.iter)

    printStats("resnet", timings, args.batch_size)