Documentation Index
Fetch the complete documentation index at: https://mintlify.com/microsoft/onnxruntime/llms.txt
Use this file to discover all available pages before exploring further.
Model Quantization Guide
Quantization reduces model size and improves inference performance by converting floating-point weights and activations to lower precision formats (typically 8-bit integers). ONNX Runtime provides comprehensive quantization tools supporting both static and dynamic quantization.
Prerequisites
pip install onnxruntime onnx
Quantization Methods
Dynamic Quantization
Dynamic quantization converts weights to int8 ahead of time, while the quantization parameters (scale and zero point) for activations are computed on the fly during inference:
from onnxruntime.quantization import quantize_dynamic, QuantType
import onnx
# Paths of the float model to read and the quantized model to write.
model_input = "model.onnx"
model_output = "model_quantized.onnx"

# weight_type selects the integer type used for the quantized weights.
quantize_dynamic(
    model_input=model_input,
    model_output=model_output,
    weight_type=QuantType.QInt8,
)
print("Dynamic quantization completed")
Static Quantization
Static quantization uses calibration data to determine optimal quantization parameters:
from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantType, QuantFormat
import numpy as np
# Define calibration data reader
class DataReader(CalibrationDataReader):
    """Serve pre-built calibration samples to the quantizer one at a time.

    Args:
        calibration_data: Sequence of input arrays, one per calibration step.
        input_name: Name of the model input each sample is fed to. Defaults
            to "input"; set it to your model's actual input name.
    """

    def __init__(self, calibration_data, input_name="input"):
        self.data = calibration_data
        self.datasize = len(calibration_data)
        self.input_name = input_name
        self.idx = 0

    def get_next(self):
        """Return the next ``{input_name: sample}`` feed, or ``None`` when exhausted."""
        if self.idx >= self.datasize:
            return None
        feed = {self.input_name: self.data[self.idx]}
        self.idx += 1
        return feed

    def rewind(self):
        """Reset the cursor so the same reader can drive another calibration run."""
        self.idx = 0
# Build the calibration set (use 100-1000 samples). Random tensors are used
# here as stand-ins for model inputs.
calibration_samples = [
    np.random.randn(1, 3, 224, 224).astype(np.float32)
    for _ in range(100)
]
data_reader = DataReader(calibration_samples)
# Statically quantize the model, calibrating with the samples served by data_reader.
quantize_static(
    model_input="model.onnx",
    model_output="model_quantized.onnx",
    calibration_data_reader=data_reader,
    quant_format=QuantFormat.QDQ,
    weight_type=QuantType.QInt8,
    activation_type=QuantType.QInt8,
    per_channel=True,
)
Configuration Options
Quantization Config
Use StaticQuantConfig for fine-grained control:
from onnxruntime.quantization import (
quantize_static,
StaticQuantConfig,
CalibrationDataReader,
CalibrationMethod,
QuantType,
QuantFormat
)
from onnxruntime.quantization import quantize

# Extra switches forwarded to the quantizer via extra_options.
config_extra_options = {
    'ActivationSymmetric': False,
    'WeightSymmetric': True,
    'EnableSubgraph': False,
    'ForceQuantizeNoInputCheck': False,
}

# Gather every static-quantization setting into a single config object.
quant_config = StaticQuantConfig(
    calibration_data_reader=data_reader,
    calibrate_method=CalibrationMethod.MinMax,
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QInt8,
    weight_type=QuantType.QInt8,
    op_types_to_quantize=['Conv', 'MatMul', 'Gemm'],
    per_channel=True,
    reduce_range=False,
    use_external_data_format=False,
    extra_options=config_extra_options,
)

# Quantize the model using the configuration built above.
quantize(
    model_input="model.onnx",
    model_output="model_quantized.onnx",
    quant_config=quant_config,
)
Calibration Methods
from onnxruntime.quantization import CalibrationMethod
# The three assignments below are alternatives: each one overwrites the
# previous value, so keep only the line for the method you want.
# MinMax: Uses min/max values from calibration data
calibrate_method = CalibrationMethod.MinMax
# Entropy: Uses KL divergence to minimize information loss
calibrate_method = CalibrationMethod.Entropy
# Percentile: Uses percentile values to handle outliers
calibrate_method = CalibrationMethod.Percentile
Advanced Quantization
Per-Channel Quantization
Quantize weights per output channel for better accuracy:
from onnxruntime.quantization import quantize_static, QuantFormat, QuantType
# Per-operator axis used when quantizing per channel.
per_channel_axes = {
    'MatMul': 1,  # Specify axis for MatMul
    'Conv': 0,  # Specify axis for Conv
}

quantize_static(
    model_input="model.onnx",
    model_output="model_quantized.onnx",
    calibration_data_reader=data_reader,
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QInt8,
    weight_type=QuantType.QInt8,
    per_channel=True,  # Enable per-channel quantization
    extra_options={'QDQOpTypePerChannelSupportToAxis': per_channel_axes},
)
Selective Quantization
Quantize only specific operators:
from onnxruntime.quantization import quantize_static
# Restrict quantization to chosen operator types and skip named nodes.
quantize_static(
    model_input="model.onnx",
    model_output="model_quantized.onnx",
    calibration_data_reader=data_reader,
    per_channel=True,
    op_types_to_quantize=['Conv', 'MatMul'],  # Only quantize Conv and MatMul
    nodes_to_exclude=['final_layer'],  # Exclude specific nodes
)
Quantize-Dequantize (QDQ) format is recommended for best compatibility:
# QuantType is used below, so it must be imported alongside QuantFormat.
from onnxruntime.quantization import quantize_static, QuantFormat, QuantType

quantize_static(
    model_input="model.onnx",
    model_output="model_qdq.onnx",
    calibration_data_reader=data_reader,
    quant_format=QuantFormat.QDQ,  # Use QDQ format
    activation_type=QuantType.QInt8,
    weight_type=QuantType.QInt8,
    extra_options={
        'AddQDQPairToWeight': False,  # Quantize weights directly
        'QDQKeepRemovableActivations': False,
        'DedicatedQDQPair': False,
    },
)
Specialized quantization for transformer models:
from onnxruntime.quantization import quantize_dynamic
from pathlib import Path
class QuantizeHelper:
    """Namespace for transformer-model quantization helpers."""

    @staticmethod
    def quantize_onnx_model(onnx_model_path, quantized_model_path, use_external_data_format=False):
        """Dynamically quantize an ONNX model and print the size change.

        Args:
            onnx_model_path: Path of the model to quantize.
            quantized_model_path: Destination path; missing parent
                directories are created.
            use_external_data_format: Forwarded unchanged to
                ``quantize_dynamic``.
        """
        import os

        import onnx
        from onnxruntime.quantization import quantize_dynamic

        bytes_per_mb = 1024 * 1024

        # The output directory must exist before the quantized file is written.
        Path(quantized_model_path).parent.mkdir(parents=True, exist_ok=True)

        # Size of the input model, reported in MB.
        original_size = os.path.getsize(onnx_model_path) / bytes_per_mb
        print(f"Original model size: {original_size:.2f} MB")

        quantize_dynamic(
            model_input=onnx_model_path,
            model_output=quantized_model_path,
            use_external_data_format=use_external_data_format,
            extra_options={"DefaultTensorType": onnx.TensorProto.FLOAT},
        )

        # Size of the freshly written model and the relative saving.
        quantized_size = os.path.getsize(quantized_model_path) / bytes_per_mb
        print(f"Quantized model size: {quantized_size:.2f} MB")
        print(f"Size reduction: {(1 - quantized_size/original_size)*100:.1f}%")
# Example: quantize a BERT export and write the result next to it.
QuantizeHelper.quantize_onnx_model(
    onnx_model_path="bert_model.onnx",
    quantized_model_path="bert_model_quantized.onnx",
)
Calibration Data Best Practices
Representative Dataset
import numpy as np
from onnxruntime.quantization import CalibrationDataReader
class ImageNetDataReader(CalibrationDataReader):
    """Stream preprocessed calibration images to the quantizer in batches.

    Images are loaded lazily: nothing is read until the first ``get_next``
    call. Subclasses must implement ``load_image`` for their storage format.

    Args:
        data_folder: Folder holding the calibration images.
        batch_size: Number of images stacked into each feed (must be >= 1).
        start_index: Index of the first image to use (inclusive).
        end_index: Index one past the last image to use (exclusive).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    def __init__(self, data_folder, batch_size=1, start_index=0, end_index=100):
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        self.data_folder = data_folder
        self.batch_size = batch_size
        self.start_index = start_index
        self.end_index = end_index
        self.preprocess_func = self.preprocess_imagenet
        self.enum_data = None
        # Number of images in the [start_index, end_index) window.
        self.datasize = max(0, end_index - start_index)

    def get_next(self):
        """Return the next ``{"input": batch}`` feed, or ``None`` when exhausted."""
        if self.enum_data is None:
            self.enum_data = self.load_batches()
        return next(self.enum_data, None)

    def rewind(self):
        """Restart iteration so the reader can be used for another run."""
        self.enum_data = None

    def load_batches(self):
        """Yield feeds of up to ``batch_size`` preprocessed images each."""
        batch = []
        for idx in range(self.start_index, self.end_index):
            batch.append(self.preprocess_func(self.load_image(idx)))
            if len(batch) == self.batch_size:
                yield {"input": np.stack(batch)}
                batch = []
        # Emit the final, possibly smaller, batch.
        if batch:
            yield {"input": np.stack(batch)}

    def load_image(self, idx):
        """Return image number ``idx`` as an (H, W, 3) array with values in 0-255.

        This hook has no default implementation because decoding depends on
        how the calibration images are stored; override it in a subclass.
        """
        raise NotImplementedError(
            "Override load_image() to read image number idx from data_folder "
            "and return it as an (H, W, 3) array"
        )

    def preprocess_imagenet(self, image):
        """Normalize an (H, W, 3) image and return it as float32 (3, H, W)."""
        # Standard ImageNet per-channel mean and standard deviation.
        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
        image = (image / 255.0 - mean) / std
        # Channels-last -> channels-first, so a stacked batch is (N, 3, H, W).
        return np.transpose(image, (2, 0, 1)).astype(np.float32)
# Calibrate on 100-1000 representative samples; here, images 0 through 499.
data_reader = ImageNetDataReader(
    "calibration_data",
    start_index=0,
    end_index=500,
)
extra_options = {
    # --- Symmetric vs. asymmetric quantization ---
    'ActivationSymmetric': False,  # Asymmetric activations (better accuracy)
    'WeightSymmetric': True,  # Symmetric weights (common practice)

    # --- Calibration ---
    'CalibTensorRangeSymmetric': False,
    'CalibMovingAverage': True,
    'CalibMovingAverageConstant': 0.01,

    # --- Quantization behavior ---
    'ForceQuantizeNoInputCheck': True,
    'MatMulConstBOnly': False,  # Quantize all MatMul operations

    # --- QDQ node placement ---
    'AddQDQPairToWeight': False,
    'DedicatedQDQPair': False,
    'QDQKeepRemovableActivations': False,

    # --- Subgraphs ---
    'EnableSubgraph': False,

    # --- Minimum range enforcement ---
    'MinimumRealRange': None,

    # --- Operator exclusions ---
    'OpTypesToExcludeOutputQuantization': [],
}

# Hand the full option set to the static quantizer.
quantize_static(
    model_input="model.onnx",
    model_output="model_quantized.onnx",
    calibration_data_reader=data_reader,
    extra_options=extra_options,
)
Model Preprocessing
Optimize model before quantization:
from onnxruntime.quantization import quantize_static
# `onnxruntime.quantization.preprocess` is a command-line module, not a
# function; the callable pre-processing entry point is quant_pre_process.
from onnxruntime.quantization.shape_inference import quant_pre_process

# Preprocess model for better quantization. The two paths are passed
# positionally (input model first, output path second).
quant_pre_process(
    "model.onnx",
    "model_preprocessed.onnx",
    auto_merge=True,
    save_as_external_data=False,
)

# Then quantize the preprocessed model
quantize_static(
    model_input="model_preprocessed.onnx",
    model_output="model_quantized.onnx",
    calibration_data_reader=data_reader,
)
Validating Quantized Models
import onnxruntime as ort
import numpy as np
def validate_quantization(original_model, quantized_model, test_data):
    """Run both models on the same input and report how far their outputs differ.

    Args:
        original_model: Path of the original model.
        quantized_model: Path of the quantized model.
        test_data: Input array fed to the first input of each model.

    Returns:
        True when the first outputs agree within rtol=0.01 and atol=0.01.
    """
    sessions = [ort.InferenceSession(path) for path in (original_model, quantized_model)]

    # Both sessions are fed under the original model's first input name.
    feed = {sessions[0].get_inputs()[0].name: test_data}
    original_output, quantized_output = (s.run(None, feed)[0] for s in sessions)

    # Compute the element-wise difference once and derive every metric from it.
    diff = original_output - quantized_output
    abs_diff = np.abs(diff)
    mse = np.mean(diff ** 2)
    mae = np.mean(abs_diff)
    max_diff = np.max(abs_diff)

    print(f"Mean Squared Error: {mse:.6f}")
    print(f"Mean Absolute Error: {mae:.6f}")
    print(f"Max Absolute Difference: {max_diff:.6f}")

    # 1% relative tolerance plus a small absolute tolerance.
    is_close = np.allclose(original_output, quantized_output, rtol=0.01, atol=0.01)
    print("✓ Quantization validation passed" if is_close else "⚠ Quantization may have accuracy loss")
    return is_close
# Compare the two models on one random input.
test_input = np.random.randn(1, 3, 224, 224).astype(np.float32)
validate_quantization(
    original_model="model.onnx",
    quantized_model="model_quantized.onnx",
    test_data=test_input,
)
import time
import onnxruntime as ort
import numpy as np
def benchmark_model(model_path, test_data, iterations=100, warmup=10):
    """Return the average latency, in seconds, of one inference call.

    Args:
        model_path: Path of the model to load.
        test_data: Input array fed to the model's first input.
        iterations: Number of timed runs to average over.
        warmup: Number of untimed runs executed before timing starts.

    Returns:
        Elapsed wall-clock time divided by ``iterations``.
    """
    session = ort.InferenceSession(model_path)
    feed = {session.get_inputs()[0].name: test_data}

    # Untimed warm-up runs, so one-off start-up costs are not measured.
    for _ in range(warmup):
        session.run(None, feed)

    # perf_counter is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    for _ in range(iterations):
        session.run(None, feed)
    elapsed = time.perf_counter() - start
    return elapsed / iterations
import os

# Time both models on the same random input.
test_input = np.random.randn(1, 3, 224, 224).astype(np.float32)
original_time, quantized_time = (
    benchmark_model(path, test_input)
    for path in ("model.onnx", "model_quantized.onnx")
)
print(f"Original model: {original_time*1000:.2f} ms")
print(f"Quantized model: {quantized_time*1000:.2f} ms")
print(f"Speedup: {original_time/quantized_time:.2f}x")

# Compare on-disk sizes, converted from bytes to MB.
bytes_per_mb = 1024**2
original_size = os.path.getsize("model.onnx") / bytes_per_mb
quantized_size = os.path.getsize("model_quantized.onnx") / bytes_per_mb
print(f"\nOriginal size: {original_size:.2f} MB")
print(f"Quantized size: {quantized_size:.2f} MB")
print(f"Size reduction: {(1-quantized_size/original_size)*100:.1f}%")
Best Practices
- Use representative calibration data: 100-1000 samples covering your use cases
- Choose appropriate method: Dynamic for ease, static for best performance
- Enable per-channel quantization: Better accuracy with minimal overhead
- Use QDQ format: Better compatibility with execution providers
- Preprocess models: Run preprocessing before quantization
- Validate accuracy: Always compare quantized vs original outputs
- Test on target hardware: Performance gains vary by platform
- Consider symmetric quantization: For GPU/TensorRT deployment
Hardware-Specific Quantization
For CPUs (VNNI support)
# CPU target: QUInt8 activations with QInt8 per-channel weights.
quantize_static(
    model_input="model.onnx",
    model_output="model_cpu_int8.onnx",
    calibration_data_reader=data_reader,
    weight_type=QuantType.QInt8,  # Symmetric weights
    activation_type=QuantType.QUInt8,  # Asymmetric activations
    reduce_range=False,
    per_channel=True,
)
For GPUs (TensorRT)
# GPU target: both activations and weights are quantized symmetrically.
symmetric_options = {
    'ActivationSymmetric': True,  # Required for TensorRT
    'WeightSymmetric': True,
}

quantize_static(
    model_input="model.onnx",
    model_output="model_gpu_int8.onnx",
    calibration_data_reader=data_reader,
    activation_type=QuantType.QInt8,  # Symmetric
    weight_type=QuantType.QInt8,  # Symmetric
    extra_options=symmetric_options,
)
Next Steps