pytorch / pytorch

Tensors and Dynamic neural networks in Python with strong GPU acceleration

Home Page: https://pytorch.org

[JIT] traced model with optimization shows no performance improvement

nnguyen-aurora opened this issue · comments

πŸ› Bug

Using torch.jit.trace with optimize=True shows no performance difference from optimize=False.
The test model is ResNet-50 from torchvision, modified to run only feature extraction (no average pooling or fully connected layer for classification).

Inference script (test.py):

""" Pytorch inference script """

from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
import argparse
import timeit
import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn

# Select appropriate model for test
import resnet
def timeGraph(model, batch_size, num_loops):
    # Create a random input tensor of the given size; it is created on the
    # GPU, so this timing function assumes a CUDA run
    input = torch.rand(batch_size, 3, 1200, 1920, dtype=torch.float).cuda()

    print("Warm up ...")
    with torch.no_grad():
        for _ in range(20):
            model(input)

    print("Start timing ...")
    timings = []
    with torch.no_grad():
        for i in range(num_loops):
            start_time = timeit.default_timer()
            features = model(input)
            end_time = timeit.default_timer()
            timings.append(end_time - start_time)
            print("Iteration {}: {:.6f} s".format(i, end_time - start_time))
        print("Output features size:", features.size())
    return timings
    
def printStats(graphName,timings,batch_size):
    times = np.array(timings)
    steps = len(times)
    speeds = batch_size / times
    time_mean = np.mean(times)
    time_med = np.median(times)
    time_99th = np.percentile(times, 99)
    time_std = np.std(times, ddof=0)
    speed_mean = np.mean(speeds)
    speed_med = np.median(speeds)

    msg = ("\n%s =================================\n"
            "batch size=%d, num iterations=%d\n"
            "  Median FPS: %.1f, mean: %.1f\n"
            "  Median latency: %.6f, mean: %.6f, 99th_p: %.6f, std_dev: %.6f\n"
            ) % (graphName,
                batch_size, steps,
                speed_med, speed_mean,
                time_med, time_mean, time_99th, time_std)

    print(msg)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run inference on a model with random input values")
    parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.')
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size (default=1)")
    parser.add_argument('--optimize', action='store_true', help='Turn on optimization for traced model')
    parser.add_argument("--iter", default=10, type=int, help="Number of iteration loops")
    args = parser.parse_args()

    # Creating model with random weights
    model = resnet.resnet50()
    print("Tracing model... Optimization=", args.optimize)
    example_input = torch.rand(args.batch_size, 3, 1200, 1920, dtype=torch.float)
    traced_model = torch.jit.trace(model, example_input,
        check_trace=True,
        check_tolerance=1e-05,
        optimize=args.optimize,
        ) 

    # Save the script module
    # traced_model.save("model_traced.pt")

    # Create graph on GPU if CUDA is available
    if args.gpu is not None:
        if torch.cuda.is_available():
            # Enable CuDNN autotune for better performance (with fixed inputs)
            cudnn.benchmark = True
            traced_model = traced_model.cuda(args.gpu)
        else:
            raise RuntimeError("CUDA is not available.")

    dev = torch.cuda.current_device()
    print("Cuda device id, count=", dev, torch.cuda.device_count())
    print("Cuda DNN version=", cudnn.version())
    print("Cuda compute capability=", torch.cuda.get_device_capability(dev))
    print("Cuda device name=", torch.cuda.get_device_name(dev))

    # Timing graph inference
    timings = timeGraph(traced_model, args.batch_size, args.iter)
    
    printStats("resnet", timings, args.batch_size)

Modified resnet.py from torchvision:

import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo


__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']


model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

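        # Classification head disabled below; the model returns feature maps
        # only, per the issue description (feature-extraction benchmark)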
        #x = self.avgpool(x)
        #x = x.view(x.size(0), -1)
        #x = self.fc(x)

        return x

def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return model

To Reproduce

Steps to reproduce the behavior:

Run test.py with GPU:

python test.py --gpu 0 --iter 100

Run test.py with GPU and trace optimize:

python test.py --gpu 0 --optimize --iter 100

Expected behavior

Tracing model... Optimization= True
Cuda device id, count= 0 1
Cuda DNN version= 7401
Cuda compute capability= (6, 1)
Cuda device name= GeForce GTX 1080
Warm up ...
Start timing ...
Iteration 0: 0.133147 s
Iteration 1: 0.137695 s
Iteration 2: 0.132463 s
Iteration 3: 0.132877 s
Iteration 4: 0.132633 s
Iteration 5: 0.137405 s
Iteration 6: 0.134528 s
Iteration 7: 0.133907 s
Iteration 8: 0.134656 s
Iteration 9: 0.133537 s
Output features size: (1, 2048, 38, 60)

resnet =================================
batch size=1, num iterations=10
  Median FPS: 7.5, mean: 7.4
  Median latency: 0.133722, mean: 0.134285, 99th_p: 0.137669, std_dev: 0.001777

Environment

  • PyTorch Version (e.g., 1.0): 1.1.0a0
  • OS (e.g., Linux): Ubuntu 14.04 / 16.04
  • How you installed PyTorch (conda, pip, source): pip/source
  • Build command you used (if compiling from source):
  • Python version: 2.7 / 3.5
  • CUDA/cuDNN version: 10.0 / 7.4
  • GPU models and configuration: Nvidia GTX1080
  • Any other relevant information:

Additional context

I also tried running the model without jit.trace, and there seems to be little change in performance there as well.

cc @suo

For such a large, sequential vision model you probably can't get much from tracing, except maybe fusing BatchNorm with Conv in eval mode. But you are measuring in training mode.
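
For context, the Conv–BN fusion mentioned here folds the BatchNorm statistics into the preceding convolution, which is only valid in eval mode, where the running statistics are fixed. A minimal sketch of the idea (fuse_conv_bn is a hypothetical helper, not the JIT's actual fusion pass; groups and dilation are ignored for brevity):

import torch
import torch.nn as nn

def fuse_conv_bn(conv, bn):
    # Folds y = gamma * (conv(x) - mean) / sqrt(var + eps) + beta
    # into a single Conv2d with adjusted weight and bias
    fused = nn.Conv2d(conv.in_channels, conv.out_channels,
                      kernel_size=conv.kernel_size, stride=conv.stride,
                      padding=conv.padding, bias=True)
    scale = bn.weight / torch.sqrt(bn.running_var + bn.eps)  # per-channel gamma / std
    fused.weight.data = conv.weight.data * scale.reshape(-1, 1, 1, 1)
    conv_bias = conv.bias.data if conv.bias is not None else torch.zeros_like(bn.running_mean)
    fused.bias.data = (conv_bias - bn.running_mean) * scale + bn.bias.data
    return fused

In training mode the batch statistics change on every step, so this folding cannot be applied; that is why the mode the model is traced and run in matters here.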

My intention is to measure the forward pass (inference), not training.
I used

with torch.no_grad():

in the inference loop of the test script. Is that not enough to indicate the forward pass? If not, what should I do to make sure I am running inference?

FYI, I also added this to the test script to explicitly switch to inference mode:

traced_model = traced_model.eval()

There is no difference in the result.
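
One detail worth checking here (stated as an assumption, since this behavior has varied across versions): torch.jit.trace can bake the module's train/eval state into the traced graph, so calling .eval() on the already-traced module may not actually switch BatchNorm to its inference behavior. A safer pattern is to put the model in eval mode before tracing:

model = resnet.resnet50()
model.eval()  # set inference mode BEFORE tracing, so BatchNorm's eval
              # behavior is captured in the traced graph
traced_model = torch.jit.trace(model, example_input)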

commented

I did a test with/without optimisation on a traced model (FaceBoxes).
The test was run with no_grad, in eval mode, doing inference. Times are averaged over 205 batches on a Titan V.

Without tracing the model (plain Python):
forward_pass_time: 0.0897s misc: 0.0214s
Accuracy: 99.21% AP

Without optimisation:
forward_pass_time: 0.0840s misc: 0.0196s
Accuracy: 99.21% AP

With optimisation:
forward_pass_time: 0.0834s misc: 0.0193s
Accuracy: 99.21% AP

I see no appreciable difference with/without optimisation.

Repro script using torch.jit.optimized_execution() instead of the optimize flag:

""" Pytorch inference script """

from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
import argparse
import timeit
import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn

# Select appropriate model for test
import resnet
def timeGraph(model, batch_size, num_loops):
    # Create a random input tensor of the given size (kept on the CPU here;
    # note that with --gpu the model is moved to the GPU, so the input would
    # need to move as well)
    input = torch.rand(batch_size, 3, 1200, 1920, dtype=torch.float)  # .cuda()

    print("Warm up ...")
    with torch.no_grad():
        for _ in range(20):
            model(input)

    print("Start timing ...")
    timings = []
    with torch.no_grad():
        for i in range(num_loops):
            start_time = timeit.default_timer()
            features = model(input)
            end_time = timeit.default_timer()
            timings.append(end_time - start_time)
            print("Iteration {}: {:.6f} s".format(i, end_time - start_time))
        print("Output features size:", features.size())
    return timings

def printStats(graphName,timings,batch_size):
    times = np.array(timings)
    steps = len(times)
    speeds = batch_size / times
    time_mean = np.mean(times)
    time_med = np.median(times)
    time_99th = np.percentile(times, 99)
    time_std = np.std(times, ddof=0)
    speed_mean = np.mean(speeds)
    speed_med = np.median(speeds)

    msg = ("\n%s =================================\n"
            "batch size=%d, num iterations=%d\n"
            "  Median FPS: %.1f, mean: %.1f\n"
            "  Median latency: %.6f, mean: %.6f, 99th_p: %.6f, std_dev: %.6f\n"
            ) % (graphName,
                batch_size, steps,
                speed_med, speed_mean,
                time_med, time_mean, time_99th, time_std)

    print(msg)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run inference on a model with random input values")
    parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.')
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size (default=1)")
    parser.add_argument('--optimize', action='store_true', help='Turn on optimization for traced model')
    parser.add_argument("--iter", default=10, type=int, help="Number of iteration loops")
    args = parser.parse_args()

    # Creating model with random weights
    model = resnet.resnet50()
    print("Tracing model... Optimization=", args.optimize)
    example_input = torch.rand(args.batch_size, 3, 1200, 1920, dtype=torch.float)
    with torch.jit.optimized_execution(args.optimize):
        traced_model = torch.jit.trace(model, example_input,
                                       check_trace=True,
                                       check_tolerance=1e-05,
                                       # optimize=args.optimize,
        )

    # Save the script module
    # traced_model.save("model_traced.pt")

    # Create graph on GPU if CUDA is available
    if args.gpu is not None:
        if torch.cuda.is_available():
            # Enable CuDNN autotune for better performance (with fixed inputs)
            cudnn.benchmark = True
            traced_model = traced_model.cuda(args.gpu)
        else:
            raise RuntimeError("CUDA is not available.")

    dev = torch.cuda.current_device()
    print("Cuda device id, count=", dev, torch.cuda.device_count())
    print("Cuda DNN version=", cudnn.version())
    print("Cuda compute capability=", torch.cuda.get_device_capability(dev))
    print("Cuda device name=", torch.cuda.get_device_name(dev))

    # Timing graph inference
    timings = timeGraph(traced_model, args.batch_size, args.iter)

    printStats("resnet", timings, args.batch_size)