More comprehensive examples can be found in our Examples Repo.

import engineml.keras as eml
from keras import backend as K
import tensorflow as tf
# Partition your data set across replicas. This is generally a list
# of files.
data = get_data_files(data_dir='/engine/data')
# NOTE(review): this line was truncated in the source ("data ="). The comment
# above says the data set is partitioned across replicas, so presumably the
# Engine ML data-distribution helper is called here — confirm exact API name
# against the Engine ML docs / Examples Repo.
data = eml.data.distribute(data)

# Scale the learning rate by the number of model replicas
lr = 0.1
lr = eml.optimizer.scale_learning_rate(lr)

# Create your model
model = MyModel(...)

# Wrap your optimizer in the Allreduce Optimizer
opt = tf.train.AdamOptimizer(lr)
opt = eml.optimizer.distribute(opt)

# Compile your model with the distributed optimizer
model.compile(..., optimizer=opt)

# By default, tf.Session() uses 100% of GPU memory for all GPUs
# on the machine. We don't want this behavior. One option is
# setting the environment var CUDA_VISIBLE_DEVICES.
# Unfortunately, this disables high-speed GPU-to-GPU
# communication. Using Engine ML's config disables TensorFlow's
# greedy approach and allows for fast IPC.
config = eml.session.make_distributed_config()
# NOTE(review): `config` was built but never used in the source, and
# `keras.backend` is imported as `K` — presumably the session is installed
# like this; confirm against the original example.
K.set_session(tf.Session(config=config))

# Set the output directory for saving event files and checkpoints.
# `eml.data.output_dir()` returns `None` when running locally, so fall back
# to a local path in that case.
# NOTE(review): the source fused this line with the model.fit(...) call
# below; the helper name is inferred from the surviving backtick comment —
# TODO confirm.
save_dir = eml.data.output_dir() or '/path/when/training/locally'

callbacks = [
    # Synchronize all replica weights
    # NOTE(review): callback constructors were lost in the source; names
    # below are inferred from these comments — TODO confirm against the
    # Engine ML callbacks API.
    eml.callbacks.init_op_callback(),
    # Set the callback to automatically save a model checkpoint if the run
    # is preempted
    eml.callbacks.preempted_checkpoint_callback(save_dir),
]

# Train the model with the distributed data and callbacks
model.fit(..., callbacks=callbacks)