Skip to main content

Documentation Index

Fetch the complete documentation index at: https://mintlify.com/microsoft/onnxruntime/llms.txt

Use this file to discover all available pages before exploring further.

IOBinding

The IOBinding class provides an API to bind model inputs and outputs to specific device memory (CPU, CUDA, DirectML, etc.), enabling zero-copy inference and improved performance for GPU workloads.

Constructor

IOBinding is created through an InferenceSession:
io_binding = session.io_binding()

Methods

bind_cpu_input()

Bind an input to a numpy array on CPU.
bind_cpu_input(
    name: str,
    arr_on_cpu: np.ndarray
)
name
str
required
Name of the input.
arr_on_cpu
np.ndarray
required
Input values as a numpy array on CPU.

bind_input()

Bind an input to pre-allocated device memory.
bind_input(
    name: str,
    device_type: str,
    device_id: int,
    element_type: np.dtype | int,
    shape: tuple[int],
    buffer_ptr: int
)
name
str
required
Name of the input.
device_type
str
required
Device type: "cpu", "cuda", "cann", "dml", etc.
device_id
int
required
Device ID (e.g., 0 for first GPU).
element_type
np.dtype | int
required
Element data type: either a numpy type such as np.float32, or the integer value of an ONNX TensorProto data type.
shape
tuple[int]
required
Shape of the input tensor.
buffer_ptr
int
required
Memory pointer to the input data buffer.

bind_ortvalue_input()

Bind an input to an OrtValue object.
bind_ortvalue_input(
    name: str,
    ortvalue: OrtValue
)
name
str
required
Name of the input.
ortvalue
OrtValue
required
OrtValue instance containing input data.

bind_output()

Bind an output to device memory.
bind_output(
    name: str,
    device_type: str = "cpu",
    device_id: int = 0,
    element_type: np.dtype | int | None = None,
    shape: tuple[int] | None = None,
    buffer_ptr: int | None = None
)
name
str
required
Name of the output.
device_type
str
Device type: "cpu", "cuda", etc. Default is "cpu".
device_id
int
Device ID. Default is 0.
element_type
np.dtype | int
Element data type. Required if buffer_ptr is provided.
shape
tuple[int]
Output shape. Required if buffer_ptr is provided.
buffer_ptr
int
Pointer to pre-allocated memory for the output. If None, ONNX Runtime (ORT) allocates the output memory on the device given by device_type and device_id.

bind_ortvalue_output()

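Bind an output to an OrtValue object. name (str, required) is the name of the output; ortvalue (OrtValue, required) is the OrtValue instance to bind the output to.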
bind_ortvalue_output(
    name: str,
    ortvalue: OrtValue
)

get_outputs()

Get output OrtValues after running inference.
get_outputs() -> list[OrtValue]
outputs
list[OrtValue]
List of OrtValue objects containing output data on their respective devices.

copy_outputs_to_cpu()

Copy output contents to CPU as numpy arrays.
copy_outputs_to_cpu() -> list[np.ndarray]
outputs
list[np.ndarray]
List of output tensors as numpy arrays on CPU.

synchronize_inputs()

Synchronize device inputs before inference.
synchronize_inputs()

synchronize_outputs()

Synchronize device outputs after inference.
synchronize_outputs()

clear_binding_inputs()

Clear all bound inputs.
clear_binding_inputs()

clear_binding_outputs()

Clear all bound outputs.
clear_binding_outputs()

Example Usage

Basic CUDA Inference

import onnxruntime as ort
import numpy as np

sess = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider"]
)

# Create IOBinding
io_binding = sess.io_binding()

# Create input on GPU
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
ortvalue_input = ort.OrtValue.ortvalue_from_numpy(input_data, "cuda", 0)

# Bind input and output
io_binding.bind_ortvalue_input("input", ortvalue_input)
io_binding.bind_output("output", "cuda")

# Run inference on GPU
sess.run_with_iobinding(io_binding)

# Get outputs (still on GPU)
outputs = io_binding.get_outputs()
print(f"Output device: {outputs[0].device_name()}")

# Copy to CPU if needed
output_cpu = io_binding.copy_outputs_to_cpu()

Reusing IOBinding for Multiple Runs

sess = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])
io_binding = sess.io_binding()

# Bind output once
io_binding.bind_output("output", "cuda")

# Run multiple times with different inputs
# (generate_input and process_output are placeholders for your own code)
for i in range(100):
    # Create new input
    input_data = generate_input(i)
    ortvalue_input = ort.OrtValue.ortvalue_from_numpy(input_data, "cuda", 0)
    
    # Update input binding
    io_binding.clear_binding_inputs()
    io_binding.bind_ortvalue_input("input", ortvalue_input)
    
    # Run inference
    sess.run_with_iobinding(io_binding)
    
    # Process outputs
    outputs = io_binding.get_outputs()
    process_output(outputs[0])

Pre-allocated Output Buffers

import torch

sess = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])
io_binding = sess.io_binding()

# Pre-allocate output buffer with PyTorch
output_shape = (1, 1000)
output_buffer = torch.zeros(output_shape, dtype=torch.float32, device="cuda:0")

# Bind to pre-allocated buffer
io_binding.bind_output(
    "output",
    device_type="cuda",
    device_id=0,
    element_type=np.float32,
    shape=output_shape,
    buffer_ptr=output_buffer.data_ptr()
)

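# Input binding (input_data is a NumPy array, e.g. created as in the
# "Basic CUDA Inference" example; ort and np are imported as shown there)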
input_ortvalue = ort.OrtValue.ortvalue_from_numpy(input_data, "cuda", 0)
io_binding.bind_ortvalue_input("input", input_ortvalue)

# Run inference - output written directly to PyTorch tensor
sess.run_with_iobinding(io_binding)
print(f"Output in PyTorch tensor: {output_buffer}")

Multi-Input Model

sess = ort.InferenceSession("multi_input_model.onnx", providers=["CUDAExecutionProvider"])
io_binding = sess.io_binding()

# Bind multiple inputs (data1 and data2 are NumPy arrays prepared beforehand)
input1 = ort.OrtValue.ortvalue_from_numpy(data1, "cuda", 0)
input2 = ort.OrtValue.ortvalue_from_numpy(data2, "cuda", 0)

io_binding.bind_ortvalue_input("input1", input1)
io_binding.bind_ortvalue_input("input2", input2)

# Bind multiple outputs
io_binding.bind_output("output1", "cuda")
io_binding.bind_output("output2", "cuda")

sess.run_with_iobinding(io_binding)

outputs = io_binding.get_outputs()
output1, output2 = outputs[0], outputs[1]

CPU Binding

sess = ort.InferenceSession("model.onnx")
io_binding = sess.io_binding()

# Bind CPU input directly
input_array = np.random.randn(1, 3, 224, 224).astype(np.float32)
io_binding.bind_cpu_input("input", input_array)

# Bind output to CPU
io_binding.bind_output("output", "cpu")

sess.run_with_iobinding(io_binding)
outputs = io_binding.copy_outputs_to_cpu()

Performance Best Practices

# 1. Keep data on GPU throughout pipeline
io_binding = sess.io_binding()
io_binding.bind_ortvalue_input("input", gpu_ortvalue)
io_binding.bind_output("output", "cuda")

sess.run_with_iobinding(io_binding)
gpu_output = io_binding.get_outputs()[0]  # Stays on GPU

# 2. Reuse IOBinding object
for batch in batches:
    io_binding.clear_binding_inputs()
    io_binding.bind_ortvalue_input("input", batch)
    sess.run_with_iobinding(io_binding)
    outputs = io_binding.get_outputs()

# 3. Synchronize explicitly when needed
io_binding.synchronize_outputs()  # Ensure GPU work is complete