TensorFlow

More comprehensive examples can be found in our Examples Repo.

import os
import engineml.tensorflow as eml
import tensorflow as tf
# Quick-start: distributed TensorFlow training with Engine ML.
#
# `get_data_files` and `MyModel` are placeholders for your own input
# pipeline and model code.

# Partition your data set across replicas. This is generally a list
# of files.
data = get_data_files(data_dir='/engine/data')
data = eml.data.distribute(data)

# Scale the learning rate by the number of model replicas
lr = 0.1
lr = eml.optimizer.scale_learning_rate(lr)

# Create your model and loss
loss = MyModel(...)

# Wrap your optimizer in the Allreduce Optimizer
opt = tf.train.AdamOptimizer(lr)
opt = eml.optimizer.distribute(opt)
train_op = opt.minimize(loss)

# Wrap your saver in `eml.saver()` to ensure checkpoints are saved
# correctly when using Engine ML
saver = tf.train.Saver()
saver = eml.saver(saver)

# By default, tf.Session() uses 100% of GPU memory for all GPUs
# on the machine. We don't want this behavior. One option is
# setting the environment var CUDA_VISIBLE_DEVICES.
# Unfortunately, this disables high-speed GPU-to-GPU
# communication. Using Engine ML's config disables TensorFlow's
# greedy approach and allows for fast IPC.
config = eml.session.make_distributed_config()
sess = tf.Session(config=config)

# Synchronize all replica weights
sess.run(eml.session.init_op())

# Set the output directory for saving event files and checkpoints
# `eml.data.output_dir()` returns `None` when running locally, so fall
# back to a local path. This is resolved *before* registering the
# preemption handler so the handler never joins a path onto `None`.
save_dir = eml.data.output_dir() or '/path/when/training/locally'

# Set a handler to automatically save a model checkpoint if the run is preempted
preempted_ckpt_path = os.path.join(save_dir, 'preempted')
eml.preempted_handler(saver.save, sess, preempted_ckpt_path)

# Optionally, set the input directory for restoring from a
# previous run's checkpoints. The input directory is either
# mounted to a previous run's output directory, or to an
# S3 bucket.
restore_dir = eml.data.input_dir()