Documentation Index Fetch the complete documentation index at: https://mintlify.com/microsoft/onnxruntime/llms.txt
Use this file to discover all available pages before exploring further.
Python Inference API
Learn how to run ONNX model inference in Python using the ONNX Runtime API. This guide includes real API signatures and working code examples.
Installation
pip install onnxruntime
# For GPU support (CUDA)
pip install onnxruntime-gpu
Quick Start
Here’s a minimal example to run inference:
import onnxruntime as ort
import numpy as np
# Create inference session
session = ort.InferenceSession( "model.onnx" )
# Get input name
input_name = session.get_inputs()[ 0 ].name
# Prepare input data
input_data = np.random.randn( 1 , 3 , 224 , 224 ).astype(np.float32)
# Run inference
outputs = session.run( None , {input_name: input_data})
print ( "Output shape:" , outputs[ 0 ].shape)
InferenceSession Class
Creating a Session
From file path:
import onnxruntime as ort
# Basic usage
session = ort.InferenceSession(
"model.onnx" ,
providers = [ 'CUDAExecutionProvider' , 'CPUExecutionProvider' ]
)
From bytes:
with open ( "model.onnx" , "rb" ) as f:
model_bytes = f.read()
session = ort.InferenceSession(model_bytes)
With session options:
sess_options = ort.SessionOptions()
sess_options.intra_op_num_threads = 4
sess_options.graph_optimization_level = ort.GraphOptimizationLevel. ORT_ENABLE_ALL
sess_options.enable_profiling = True
session = ort.InferenceSession(
"model.onnx" ,
sess_options = sess_options,
providers = [ 'CPUExecutionProvider' ]
)
Session Methods
run()
Execute the model with input data.
outputs = session.run(
output_names = None , # None = all outputs, or list of output names
input_feed = { "input" : input_array}, # Dict of input_name: numpy_array
run_options = None # Optional RunOptions
)
Complete example:
import onnxruntime as ort
import numpy as np
session = ort.InferenceSession( "model.onnx" )
# Get input/output metadata
input_name = session.get_inputs()[ 0 ].name
output_name = session.get_outputs()[ 0 ].name
# Prepare inputs
x = np.random.randn( 1 , 3 , 224 , 224 ).astype(np.float32)
# Run inference - get all outputs
outputs = session.run( None , {input_name: x})
# Or request specific outputs
outputs = session.run([output_name], {input_name: x})
print ( f "Output: { outputs[ 0 ] } " )
get_inputs()
Get model input metadata.
inputs = session.get_inputs()
for input_meta in inputs:
print ( f "Name: { input_meta.name } " )
print ( f "Shape: { input_meta.shape } " )
print ( f "Type: { input_meta.type } " )
get_outputs()
Get model output metadata.
outputs = session.get_outputs()
for output_meta in outputs:
print ( f "Name: { output_meta.name } " )
print ( f "Shape: { output_meta.shape } " )
print ( f "Type: { output_meta.type } " )
get_modelmeta()
Get model metadata.
meta = session.get_modelmeta()
print ( f "Producer: { meta.producer_name } " )
print ( f "Graph name: { meta.graph_name } " )
print ( f "Domain: { meta.domain } " )
print ( f "Version: { meta.version } " )
print ( f "Custom metadata: { meta.custom_metadata_map } " )
SessionOptions
Configure session behavior before creating the session.
sess_options = ort.SessionOptions()
# Graph optimization
sess_options.graph_optimization_level = ort.GraphOptimizationLevel. ORT_ENABLE_EXTENDED
# Threading
sess_options.intra_op_num_threads = 4
sess_options.inter_op_num_threads = 2
# Execution mode
sess_options.execution_mode = ort.ExecutionMode. ORT_SEQUENTIAL
# Memory optimization
sess_options.enable_cpu_mem_arena = True
sess_options.enable_mem_pattern = True
# Profiling
sess_options.enable_profiling = True
sess_options.profile_file_prefix = "ort_profile"
# Log settings
sess_options.log_severity_level = 2 # 0=Verbose, 1=Info, 2=Warning, 3=Error, 4=Fatal
sess_options.log_verbosity_level = 0
# Save optimized model
sess_options.optimized_model_filepath = "optimized_model.onnx"
Graph Optimization Levels
ort.GraphOptimizationLevel. ORT_DISABLE_ALL # No optimizations
ort.GraphOptimizationLevel. ORT_ENABLE_BASIC # Basic optimizations (constant folding, etc.)
ort.GraphOptimizationLevel. ORT_ENABLE_EXTENDED # Extended optimizations (operator fusion, etc.)
ort.GraphOptimizationLevel. ORT_ENABLE_ALL # All optimizations including layout optimization
RunOptions
Configure individual inference runs.
# Build per-call options and pass them as the third argument to run().
run_options = ort.RunOptions()
run_options.log_severity_level = 2
run_options.log_verbosity_level = 0
# The Python binding exposes the run tag as `logid`; there is no `run_tag`
# attribute on RunOptions, so assigning to it raises AttributeError.
run_options.logid = "my_inference_run"
run_options.terminate = False  # Set to True to terminate inference
outputs = session.run(None, {input_name: x}, run_options)
Execution Providers
Checking Available Providers
import onnxruntime as ort
# Get all available providers
available_providers = ort.get_available_providers()
print ( "Available providers:" , available_providers)
Setting Providers
Priority order:
session = ort.InferenceSession(
"model.onnx" ,
providers = [
'CUDAExecutionProvider' ,
'CPUExecutionProvider'
]
)
With provider options:
# CUDA provider options
cuda_options = {
'device_id' : 0 ,
'gpu_mem_limit' : 2 * 1024 * 1024 * 1024 , # 2GB
'arena_extend_strategy' : 'kNextPowerOfTwo' ,
'cudnn_conv_algo_search' : 'EXHAUSTIVE' ,
}
session = ort.InferenceSession(
"model.onnx" ,
providers = [
( 'CUDAExecutionProvider' , cuda_options),
'CPUExecutionProvider'
]
)
Check active provider:
print ( "Using providers:" , session.get_providers())
Common Providers
# CPU (default)
providers = [ 'CPUExecutionProvider' ]
# NVIDIA GPU
providers = [ 'CUDAExecutionProvider' , 'CPUExecutionProvider' ]
# TensorRT
providers = [ 'TensorrtExecutionProvider' , 'CUDAExecutionProvider' , 'CPUExecutionProvider' ]
# DirectML (Windows)
providers = [ 'DmlExecutionProvider' , 'CPUExecutionProvider' ]
# CoreML (macOS/iOS)
providers = [ 'CoreMLExecutionProvider' , 'CPUExecutionProvider' ]
# OpenVINO (Intel)
providers = [ 'OpenVINOExecutionProvider' , 'CPUExecutionProvider' ]
Working with IOBinding
Use IOBinding for zero-copy inference with GPU tensors.
import onnxruntime as ort
import numpy as np
session = ort.InferenceSession( "model.onnx" , providers = [ 'CUDAExecutionProvider' ])
# Create IO binding
io_binding = session.io_binding()
# Bind input to GPU
input_name = session.get_inputs()[ 0 ].name
x_numpy = np.random.randn( 1 , 3 , 224 , 224 ).astype(np.float32)
x_ortvalue = ort.OrtValue.ortvalue_from_numpy(x_numpy, 'cuda' , 0 )
io_binding.bind_input(
name = input_name,
device_type = 'cuda' ,
device_id = 0 ,
element_type = np.float32,
shape = x_ortvalue.shape(),
buffer_ptr = x_ortvalue.data_ptr()
)
# Bind output to GPU
output_name = session.get_outputs()[ 0 ].name
io_binding.bind_output(output_name, 'cuda' )
# Run with binding
session.run_with_iobinding(io_binding)
# Get output
outputs = io_binding.get_outputs()
result = outputs[ 0 ].numpy()
print ( f "Output shape: { result.shape } " )
Complete Example: Image Classification
import onnxruntime as ort
import numpy as np
from PIL import Image
def preprocess_image(image_path, size=(224, 224)):
    """Preprocess image for ResNet/MobileNet models.

    Args:
        image_path: Path of the image file, passed to ``Image.open``.
        size: Target size passed to ``Image.resize`` (PIL expects
            ``(width, height)``).

    Returns:
        A float32 NumPy array in NCHW layout with a batch dimension of 1,
        i.e. shape ``(1, 3, height, width)``.
    """
    img = Image.open(image_path).convert('RGB')
    img = img.resize(size)
    img_data = np.array(img).astype(np.float32)

    # Normalize to [0, 1]
    img_data = img_data / 255.0

    # Normalize with ImageNet mean/std (broadcast over the trailing RGB axis)
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img_data = (img_data - mean) / std

    # Convert HWC to CHW format
    img_data = np.transpose(img_data, (2, 0, 1))

    # Add batch dimension
    img_data = np.expand_dims(img_data, axis=0)

    # mean/std are float64, so the arithmetic above upcasts; cast back to
    # float32 before returning.
    return img_data.astype(np.float32)
def run_inference(model_path, image_path):
    """Run inference on an image.

    Args:
        model_path: Path to the ONNX model file.
        image_path: Path to the image, forwarded to ``preprocess_image``.

    Returns:
        The first row of the model's first output (``outputs[0][0]``),
        i.e. the prediction vector for the single image in the batch.
    """
    # Create session with GPU support; providers are listed in priority
    # order, with CPU as the fallback.
    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    session = ort.InferenceSession(
        model_path,
        sess_options=sess_options,
        providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
    )
    print(f"Using provider: {session.get_providers()}")

    # Get model metadata
    input_meta = session.get_inputs()[0]
    print(f"Input: {input_meta.name}, Shape: {input_meta.shape}, Type: {input_meta.type}")
    output_meta = session.get_outputs()[0]
    print(f"Output: {output_meta.name}, Shape: {output_meta.shape}, Type: {output_meta.type}")

    # Preprocess image
    input_data = preprocess_image(image_path)
    print(f"Input data shape: {input_data.shape}")

    # Run inference, requesting only the first output by name
    outputs = session.run(
        [output_meta.name],
        {input_meta.name: input_data},
    )

    # Get predictions: first output tensor, first (only) batch entry
    predictions = outputs[0][0]
    # argsort is ascending, so take the last five indices and reverse them
    # to list the highest score first.
    top5_idx = np.argsort(predictions)[-5:][::-1]
    print("\nTop 5 predictions:")
    for idx in top5_idx:
        print(f"  Class {idx}: {predictions[idx]:.4f}")
    return predictions
if __name__ == "__main__" :
predictions = run_inference(
model_path = "resnet50.onnx" ,
image_path = "cat.jpg"
)
Use the Right Execution Provider
Always specify execution providers in priority order. GPU providers like CUDA or TensorRT can provide 10-100x speedups for compute-intensive models.
Enable Graph Optimization
Set graph_optimization_level to ORT_ENABLE_ALL for maximum performance. The runtime will fuse operators and optimize the graph.
Creating a session is expensive. Create once and reuse for multiple inferences.
When using GPU providers, IOBinding eliminates CPU-GPU memory copies for better performance.
Process multiple inputs in a single batch when possible to maximize hardware utilization.
Error Handling
import onnxruntime as ort
# ONNX Runtime's Python exception classes are defined in the pybind state
# module. The top-level package does not export an `OrtException` base class,
# so `except ort.OrtException` would itself fail with AttributeError.
from onnxruntime.capi.onnxruntime_pybind11_state import (
    Fail,
    InvalidArgument,
    InvalidGraph,
    InvalidProtobuf,
    NoSuchFile,
    RuntimeException,
)

try:
    session = ort.InferenceSession("model.onnx")
    outputs = session.run(None, {"input": input_data})
except (
    Fail,
    InvalidArgument,
    InvalidGraph,
    InvalidProtobuf,
    NoSuchFile,
    RuntimeException,
) as e:
    print(f"ONNX Runtime error: {e}")
except Exception as e:
    print(f"Error: {e}")
Next Steps
Model Optimization Learn how to optimize models for production
Execution Providers Configure hardware acceleration