[RFC] INC PyTorch 3.x API Design
Target
- Keep the API design consistent with PyTorch's design.
Main principles
- `quantize` and `autotune` are the user-facing APIs for quantization: `quantize` performs a one-shot quantization with a single configuration, while `autotune` searches over a set of configurations (a quick sketch follows this list).
- Each algorithm has its own configuration class, such as `GPTQConfig`; `autotune` consumes a set of these configurations.
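As a quick sketch of this split, using the proposed names from this RFC (`model`, `calib_fn`, and `eval_fn` are user-provided placeholders; full examples follow in the sections below):

from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, TuningConfig, quantize, autotune

# one-shot quantization with a single algorithm configuration
q_model = quantize(model, RTNConfig())

# accuracy-driven search over a set of candidate configurations
best_model = autotune(
    model,
    TuningConfig(quant_configs=[RTNConfig(), GPTQConfig()]),
    run_fn=calib_fn,
    eval_fn=eval_fn,
)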
Repo Architecture
- common
  - config  # basic configuration definition.
  - utils
    - logger  # a commonly used logging tool.
- torch
  - algorithms  # make sure all algos here have a user interface.
    - smooth_quant
    - weight_only
  - amp  # autocast for BF16/FP16/FP8.
  - quantization
    - `__init__.py`  # make sure all configurations and `quantize`, `autotune` are imported here.
    - fp8  # special quantization implementation.
    - ipex  # special quantization implementation; imports algos from the `algorithms` folder.
    - weight_only  # special quantization implementation; imports algos from the `algorithms` folder.
    - config.py  # contains all configurations, such as SmoothQuantConfig, get_default_config.
    - quantize.py  # the common quantization interface; implementations live in the fp8/ipex/weight_only folders.
    - layers.py  # INC-defined modules, such as `WeightOnlyLinear`.
  - utils
    - utility.py  # commonly used functions, such as `fetch_modules`.
    - constants.py  # commonly used constants, such as `GGML_TYPE_Q4_K`.
Previous Design
IPEX StaticQuant & SmoothQuant
# Tuning space
# 'weight': {
# 'dtype': ['int8'],
# 'scheme': ['sym'],
# 'granularity': ['per_channel'],
# 'algorithm': ['minmax']
# },
# 'activation': {
# 'dtype': ['uint8'],
# 'scheme': ['asym', 'sym'],
# 'granularity': ['per_tensor'],
# 'algorithm': ['minmax', 'kl']
# },
# },
import numpy as np
from neural_compressor import PostTrainingQuantConfig, quantization

conf = PostTrainingQuantConfig(
    backend="ipex",
    op_name_dict=op_name_dict,  # user-defined per-op-name overrides
    op_type_dict=op_type_dict,  # user-defined per-op-type overrides
    recipes={
        "smooth_quant": True,
        "smooth_quant_args": {"folding": folding, "alpha": np.arange(0.1, 0.4, 0.05).tolist()},  # folding: user-chosen bool
    },  # without the smooth_quant recipe, this is static quantization
)
calib_dataloader = Dataloader()  # user-provided calibration dataloader
int8_model = quantization.fit(
    model,
    conf,
    calib_dataloader=calib_dataloader,
)
int8_model.save("./saved")
PyTorch Weight-only Quantization
# Tuning space
# 'Linear': &cap_weight_only_integer_linear { # only Linear now
# 'weight': {
# 'dtype': ['int', 'int8', 'int4', 'nf4', 'fp4', 'fp4_e2m1_bnb', 'fp4_e2m1'],
# 'bits': [4, 1, 2, 3, 5, 6, 7, 8], # [1-8], # 4
# # group_size=-1 means per-channel, others means per-group
# 'group_size': [32, -1, 1, 4, 8, 16, 64, 128, 256, 512, 1024], # [1-inf], # 32
# 'scheme': ['sym', 'asym'], # sym, no ZP
# 'algorithm': ['RTN', 'AWQ', 'GPTQ', 'TEQ'], # RTN, [RTN, GPTQ, AWQ,] RTN+AWQ+TEQ order
# },
from neural_compressor import PostTrainingQuantConfig

conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {  # matched with re.match
            "weight": {
                "bits": 8,  # 1-8 bits
                "group_size": -1,  # -1 means per-channel; positive values mean per-group
                "scheme": "sym",
                "algorithm": "RTN",  # select from ['RTN', 'AWQ', 'GPTQ', 'TEQ']
            },
        },
    },
    recipes={
        "rtn_args": {"enable_full_range": True, "enable_mse_search": True},
        "gptq_args": {"percdamp": 0.01, "actorder": True, "block_size": 128, "nsamples": 128, "use_full_length": False},
        "awq_args": {"enable_auto_scale": True, "enable_mse_search": True, "n_blocks": 5},
    },
)
New Design
IPEX StaticQuant & SmoothQuant
Configuration
Each config argument accepts either a single value or a list of values. If list-valued parameters can be assembled into multiple distinct configurations, the returned object represents a list of configurations to be used for autotuning.
# default configurations
conf = get_static_quant_default_config()
conf = get_smooth_quant_default_config()

# customized configuration
fp32config = FP32Config()
conf = StaticQuantConfig(
    act_sym=[True, False],
    act_algo=['minmax', 'kl'],
)
conf = SmoothQuantConfig(
    act_sym=[True, False],
    act_algo=['minmax', 'kl'],
    alpha=[0.5, 1.0],
    folding=[True, False],
    scale_sharing=[True, False],  # whether layers with the same input share one scale
    auto_alpha_args={
        "init_alpha": 0.5,
        "alpha_min": 0.0,
        "alpha_max": 1.0,
        "alpha_step": 0.1,
        "shared_criterion": "max",
        "enable_blockwise_loss": False,
    },
)

# By default, conf is a global configuration;
# set_local registers a fallback by op_name ("lm_head") or op_type ("Linear").
conf.set_local(["lm_head", "Linear"], fp32config)
Quantize Interface
from neural_compressor.torch.quantization import (
    StaticQuantConfig,
    SmoothQuantConfig,
    TuningConfig,
    quantize,
    autotune,
)
from neural_compressor.torch.quantization import load
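# Hypothetical user-side helpers, assumed here for illustration only (they are
# not part of the proposed API): example_inputs for jit.trace, a calibration
# run_fn, and an eval_fn returning a metric for autotune to maximize.
import torch

example_inputs = (torch.randn(1, 32),)

def run_fn(model):
    # calibration: forward a few representative batches
    for _ in range(8):
        model(*example_inputs)

def eval_fn(model):
    # placeholder metric; a real eval_fn would return task accuracy
    with torch.no_grad():
        return -float(model(*example_inputs).abs().mean())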
fp32config = FP32Config()
conf = StaticQuantConfig()
conf.set_local(["lm_head", "Linear"], fp32config)  # fallback by op_name, op_type

# For the same user experience, consider offering a quantize_ipex interface.
model = quantize(
    model,
    conf,
    example_inputs,  # example_inputs for jit.trace
    run_fn,  # calibration function
    inplace=True,
)  # the returned int8 model is a pure torch model, not an INC model
# The config below tries static quant and smooth quant one by one.
conf = TuningConfig(quant_configs=[StaticQuantConfig(), SmoothQuantConfig()])
# The config below tries different smooth quant alphas one by one:
# SmoothQuantConfig(alpha=[0.5, 1.0]) expands to
# [SmoothQuantConfig(alpha=0.5), SmoothQuantConfig(alpha=1.0)].
conf = TuningConfig(quant_configs=[SmoothQuantConfig(alpha=[0.5, 1.0])])

# model = BertModel(
#     Linear()
# )
model = autotune(
    model,
    conf,
    example_inputs,  # example_inputs for jit.trace
    run_fn,  # calibration function
    eval_fn,  # evaluation function
)
# model = BertModel(
#     QuantizedLinear()
# )

# Attach a save function to the returned model. If the model already has a
# `save` attribute, rename the original to model.orig_save() and emit a
# logger.warning to make the user aware.
model.save("./saved")
# The reloaded model should also carry the save attribute.
model = load("./saved")  # a jit model doesn't require the fp32 model for initialization
# model = load(model, "./saved")  # for an eager model
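For clarity, here is a minimal sketch of the tuning loop that autotune implies, assuming the quantize/evaluate semantics above (an illustration, not the INC implementation; `quantize_fn` is a stand-in for the quantize entry point):

import copy

def autotune_sketch(model, candidate_configs, run_fn, eval_fn, quantize_fn):
    """Try each candidate config; keep the model with the best eval score."""
    best_model, best_score = None, float("-inf")
    for conf in candidate_configs:
        q_model = quantize_fn(copy.deepcopy(model), conf, run_fn=run_fn)
        score = eval_fn(q_model)
        if score > best_score:
            best_model, best_score = q_model, score
    return best_model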
PyTorch Weight-only Quantization
Configuration
# default configurations
# by default, lm_head/embed_out is quantized
conf = get_rtn_default_config()
conf = get_gptq_default_config()
conf = get_awq_default_config()
conf = get_teq_default_config()

# customized configuration
fp32config = FP32Config()
conf = RTNConfig(
    dtype=['int4', 'nf4'],
    group_size=[32, 128],
    use_sym=[True, False],
)
conf = GPTQConfig(
    dtype=['int4', 'nf4'],
    group_size=[32, 128],
    use_sym=[True, False],
    nsamples=[128, 256],
    use_actorder=[True, False],
)
conf = AWQConfig(
    dtype=['int4', 'nf4'],
    group_size=[32, 128],
    use_sym=[True, False],
    folding=[True, False],
)
conf = TEQConfig(
    dtype=['int4', 'nf4'],
    group_size=[32, 128],
    use_sym=[True, False],
)

# By default, conf is a global configuration;
# set_local registers a fallback by op_name ("lm_head").
conf.set_local(["lm_head"], fp32config)
Quantize Interface
from neural_compressor.torch.quantization import (
    RTNConfig,
    GPTQConfig,
    AWQConfig,
    TEQConfig,
    TuningConfig,
    quantize,
    autotune,
)
from neural_compressor.torch.quantization import load
fp32config = FP32Config()
conf = RTNConfig(dtype=['int8'])
conf = GPTQConfig(dtype=['nf4'])
# lm_head/embed_out is quantized by default; GPTQ quantizes lm_head with RTN.
conf.set_local(["lm_head"], fp32config)  # fallback by op_name
conf.set_local(["*.fc1"], RTNConfig())  # specific configuration for an op_name pattern or op_type
model = quantize(
    model,
    conf,
    example_inputs,  # example_inputs for jit.trace
    run_fn,  # calibration function
    inplace=True,
)  # the returned quantized model is a pure torch model, not an INC model
# The config below tries three weight-only quantization configurations one by one.
conf = TuningConfig(quant_configs=[RTNConfig(), GPTQConfig(), AWQConfig()])
# The config below tries different dtypes with an fp32 lm_head one by one.
conf = TuningConfig(quant_configs=[GPTQConfig(dtype=['int4', 'nf4']).set_local(["lm_head"], fp32config)])
# model = BertModel(
#     Linear()
# )
model = autotune(
    model,
    conf,
    example_inputs,  # example_inputs for jit.trace
    run_fn,  # calibration function
    eval_fn,  # evaluation function
)
# model = BertModel(
#     WeightOnlyLinear()
# )

# Attach a save function to the returned model. If the model already has a
# `save` attribute, rename the original to model.orig_save() and emit a
# logger.warning to make the user aware.
model.save("./saved")
# The reloaded model should also carry the save attribute.
model = load(model, "./saved")  # the original fp32 model is required for initialization
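Finally, a minimal sketch of the save-attachment behavior described in the comments above, i.e. preserving a pre-existing save method as orig_save with a warning (`attach_save` is a hypothetical helper, not the proposed API):

import logging
import types

logger = logging.getLogger(__name__)

def attach_save(model, save_impl):
    """Bind save_impl as model.save, preserving any pre-existing save method."""
    if hasattr(model, "save"):
        model.orig_save = model.save  # keep the original reachable
        logger.warning("Model already defines `save`; the original is now available as model.orig_save().")
    model.save = types.MethodType(save_impl, model)
    return model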