PyTorch

More comprehensive examples can be found in our Examples Repo.

import os
import engineml.torch as eml
import torch
import torch.optim as optim
# Partition your data set across replicas. This is generally a list
# of files; each replica receives its own disjoint shard.
# NOTE(review): `get_data_files` is not defined in this snippet —
# presumably a user-supplied helper that lists files under data_dir.
data = get_data_files(data_dir='/engine/data')
data = eml.data.distribute(data)
# Pins your PyTorch code to a single GPU while enabling fast
# GPU-to-GPU communication. Make sure you run `init_devices()`
# before any `.cuda()` calls — ordering matters here.
eml.session.init_devices()
model = MyModel().cuda()
# Scale the learning rate by the number of model replicas, since the
# effective batch size grows with the replica count.
lr = 0.1
lr = eml.optimizer.scale_learning_rate(lr)
# Wrap your optimizer in the Allreduce Optimizer so gradients are
# averaged across replicas on each step.
opt = optim.Adam(model.parameters(), lr=lr)
opt = eml.optimizer.distribute(opt)
# Create a handler to automatically write a checkpoint when a run is preempted
def save_checkpoint(model, opt, checkpoint_path):
    """Write the model and optimizer state to ``checkpoint_path``.

    Registered as the preemption handler so a preempted run can be
    resumed later from this checkpoint.

    Args:
        model: the (possibly wrapped) torch model to snapshot.
        opt: the distributed optimizer to snapshot.
        checkpoint_path: destination path for the checkpoint file.
    """
    state = {
        'model_state': model.state_dict(),
        # Fix: the original referenced the undefined name `optimizer`;
        # the parameter here is called `opt`, so saving would have
        # raised NameError exactly when a preempted run tried to save.
        'optimizer_state': opt.state_dict(),
    }
    # Use eml.save() (not torch.save()) so the checkpoint is written
    # correctly under Engine ML.
    eml.save(state, checkpoint_path)
# Register the preemption handler: if the run is preempted, Engine ML
# calls save_checkpoint(model, opt, <output_dir>/preempted.pt).
eml.preempted_handler(save_checkpoint, model, opt, os.path.join(eml.data.output_dir(), 'preempted.pt'))
# Synchronize all replica weights so every replica starts training
# from identical parameters.
eml.session.sync_model_replicas(model)
# Optionally, set the input directory for restoring from a previous
# run's checkpoints.
restore_dir = eml.data.input_dir()
# Fix: the original discarded the return value of torch.load(), which
# made the restore a silent no-op. Capture the checkpoint so its state
# can actually be loaded.
checkpoint = torch.load(os.path.join(restore_dir, 'pretrained-checkpoint.pt'))
# NOTE(review): the key names inside the pretrained checkpoint depend
# on how it was written — confirm before loading, e.g.:
#   model.load_state_dict(checkpoint['model'])
#   opt.load_state_dict(checkpoint['optimizer'])
# Set the output directory for saving event files (if using
# tensorboardX) and checkpoints. `eml.data.output_dir()` returns
# `None` when running locally, so fall back to a local path.
save_dir = eml.data.output_dir() or '/path/when/training/locally'
state = {
    'model': model.state_dict(),
    'optimizer': opt.state_dict(),
}
# Use `eml.save()` instead of `torch.save()` to ensure checkpoints
# are saved correctly when using Engine ML.
eml.save(state, os.path.join(save_dir, 'checkpoint.pt'))