import itertools
import warnings
from abc import ABC
from collections.abc import Iterable
import numpy as np
from sklearn.utils import shuffle
from .schedules import constant
from ... import Optimizer
[docs]
class StochasticOptimizer(Optimizer, ABC):
"""
Abstract base class for stochastic (mini-batch) first-order optimizers.
At each iteration the gradient is estimated on a mini batch of the data
drawn from f.args() rather than on the whole sample, and the point is moved
by a step controlled by a (possibly adaptive or scheduled) learning rate.
The mini batches are produced by iterating over the data, optionally shuffled
once per epoch, where an epoch is a full pass over all the samples.
"""
def __init__(self,
f,
x=None,
step_size=0.01,
batch_size=None,
eps=1e-6,
tol=1e-8,
epochs=1000,
callback=None,
callback_args=(),
shuffle=True,
random_state=None,
verbose=False):
"""
:param f: the objective function.
:param x: ([n x 1] real column vector): the point where to start the algorithm from.
:param step_size: (real scalar > 0, callable or iterable, optional, default value 0.01): the
learning rate, i.e., the size of the step taken along the search direction.
It can be a positive scalar (kept constant for all iterations), a callable
with signature step_size(X_batch, y_batch) returning an iterator (e.g., one
of the schedules in schedules.py) or an iterable yielding the step size to
use at each iteration.
:param batch_size: (integer scalar or None, optional, default value None): the size of the mini
batches used to estimate the gradient. If None the full sample is used at
every iteration (i.e., plain batch gradient descent); otherwise it is clipped
to lie in [1, n_samples].
:param eps: (real scalar, optional, default value 1e-6): the accuracy in the stopping
criterion: the algorithm is stopped when the norm of the gradient is less
than or equal to eps.
:param tol: (real scalar, optional, default value 1e-8): the tolerance used in the
optimality conditions of the Lagrangian dual (when f is a Lagrangian dual
function), i.e., the algorithm is stopped when the variables or the
constraints change by less than tol.
:param epochs: (integer scalar, optional, default value 1000): the maximum number of epochs,
i.e., full passes over the whole sample, before the algorithm is stopped.
:param callback: (callable, optional, default value None): a function called at each iteration
with the optimizer instance (and callback_args) as arguments; it can raise
StopIteration to interrupt the optimization.
:param callback_args: (tuple, optional, default value ()): additional positional arguments passed
to the callback at each call.
:param shuffle: (boolean, optional, default value True): whether to shuffle the order of the
mini batches at the beginning of each epoch.
:param random_state: (integer scalar or None, optional, default value None): seed for the random
number generator used to initialize x (when not provided) and to shuffle the
mini batches, for reproducibility.
:param verbose: (boolean or integer, optional, default value False): print details about each
iteration if True (or every `verbose` epochs if an integer), nothing otherwise.
"""
super(StochasticOptimizer, self).__init__(f=f,
x=x,
eps=eps,
tol=tol,
max_iter=epochs,
callback=callback,
callback_args=callback_args,
random_state=random_state,
verbose=verbose)
if not callable(step_size) and not isinstance(step_size, Iterable) and not step_size > 0:
raise ValueError('step_size must be > 0 or a callable or an iterator')
if isinstance(step_size, Iterable): # wrap the iterable into a function to deal with mini batches, i.e., *args
self.step_size = lambda *args: step_size
elif callable(step_size): # i.e., step_size(X_batch, y_batch)
self.step_size = step_size
else:
self.step_size = lambda *args: constant(step_size)
self.epochs = epochs
self.epoch = 0
self.shuffle = shuffle
self.step = 0
if batch_size is None:
self.batch_size = None
self.batches = itertools.repeat(f.args())
else:
n_samples = len(f.args()[0])
if batch_size < 1 or batch_size > n_samples:
warnings.warn('Got `batch_size` less than 1 or larger than '
'sample size. It is going to be clipped.')
self.batch_size = np.clip(batch_size, 1, n_samples)
self.n_batches, rest = divmod(len(f.args()[0]), self.batch_size)
if rest:
self.n_batches += 1
self.max_iter *= self.n_batches
self.batches = (i for i in self.iter_mini_batches())
[docs]
def iter_mini_batches(self):
"""
Return an iterator that successively yields tuples of aligned mini batches
of size ``batch_size`` from the sliceable arrays returned by ``f.args()``,
in random order (when ``shuffle`` is True) without replacement.
:return: an infinite iterator of mini batches (one tuple of aligned slices per step).
"""
while True:
idx = list(range(self.n_batches))
while True:
if self.shuffle:
shuffle(idx, random_state=self.random_state)
for i in idx:
start = i * self.batch_size
stop = (i + 1) * self.batch_size
yield [param[slice(start, stop)] for param in self.f.args()]
[docs]
def is_batch_end(self):
return (self.batch_size is None or self.batch_size == len(self.f.args()[0])
or (self.iter and not self.iter % self.n_batches))
[docs]
def is_verbose(self):
return self.verbose and not self.epoch % self.verbose
def _print_header(self):
if self.verbose:
print('epoch\titer\t cost\t', end='')
if self.f.f_star() < np.inf:
print('\t gap\t\t rate', end='')
self.prev_f_x = np.inf
def _print_info(self):
if self.is_batch_end() and self.is_verbose():
print('\n{:4d}\t{:4d}\t{: 1.4e}'.format(self.epoch, self.iter, self.f_x), end='')
if self.f.f_star() < np.inf:
print('\t{: 1.4e}'.format((self.f_x - self.f.f_star()) /
max(abs(self.f.f_star()), 1)), end='')
if self.prev_f_x < np.inf:
print('\t{: 1.4e}'.format((self.f_x - self.f.f_star()) /
max(abs(self.prev_f_x - self.f.f_star()), 1)), end='')
else:
print('\t\t', end='')
self.prev_f_x = self.f_x
[docs]
class StochasticMomentumOptimizer(StochasticOptimizer, ABC):
"""
Abstract base class for stochastic optimizers that support a momentum term.
In addition to the plain stochastic step it keeps a velocity that accumulates
an exponentially decaying fraction of the past steps, which damps oscillations
and accelerates convergence along consistent descent directions. Both the
classical heavy-ball (Polyak) momentum and Nesterov's accelerated momentum are
supported, as selected by momentum_type.
"""
def __init__(self,
f,
x=None,
step_size=0.01,
momentum_type='none',
momentum=0.9,
batch_size=None,
eps=1e-6,
tol=1e-8,
epochs=1000,
callback=None,
callback_args=(),
shuffle=True,
random_state=None,
verbose=False):
"""
:param f: the objective function.
:param x: ([n x 1] real column vector): the point where to start the algorithm from.
:param step_size: (real scalar > 0, callable or iterable, optional, default value 0.01): the
learning rate, i.e., the size of the step taken along the search direction
(see StochasticOptimizer for the accepted forms).
:param momentum_type: (string in {'none', 'polyak', 'nesterov'}, optional, default value 'none'):
the kind of momentum to apply: 'none' disables momentum, 'polyak' uses the
classical heavy-ball momentum and 'nesterov' uses Nesterov's accelerated
momentum (the gradient is evaluated after the momentum jump).
:param momentum: (real scalar in [0, 1) or iterable, optional, default value 0.9): the momentum
factor, i.e., the fraction of the previous step retained in the current one.
It can be a scalar (kept constant) or an iterable yielding the value to use at
each iteration.
:param batch_size: (integer scalar or None, optional, default value None): the size of the mini
batches used to estimate the gradient; if None the full sample is used.
:param eps: (real scalar, optional, default value 1e-6): the accuracy in the stopping
criterion: the algorithm is stopped when the norm of the gradient is less
than or equal to eps.
:param tol: (real scalar, optional, default value 1e-8): the tolerance used in the
optimality conditions of the Lagrangian dual (when f is a Lagrangian dual).
:param epochs: (integer scalar, optional, default value 1000): the maximum number of epochs
before the algorithm is stopped.
:param callback: (callable, optional, default value None): a function called at each iteration
with the optimizer instance (and callback_args) as arguments; it can raise
StopIteration to interrupt the optimization.
:param callback_args: (tuple, optional, default value ()): additional positional arguments passed
to the callback at each call.
:param shuffle: (boolean, optional, default value True): whether to shuffle the order of the
mini batches at the beginning of each epoch.
:param random_state: (integer scalar or None, optional, default value None): seed for the random
number generator, for reproducibility.
:param verbose: (boolean or integer, optional, default value False): print details about each
iteration if True (or every `verbose` epochs if an integer), nothing otherwise.
"""
super(StochasticMomentumOptimizer, self).__init__(f=f,
x=x,
step_size=step_size,
batch_size=batch_size,
eps=eps,
tol=tol,
epochs=epochs,
callback=callback,
callback_args=callback_args,
shuffle=shuffle,
random_state=random_state,
verbose=verbose)
if momentum_type not in ('polyak', 'nesterov', 'none'):
raise ValueError(f'unknown momentum type {momentum_type}')
self.momentum_type = momentum_type
if not isinstance(momentum, Iterable) and not 0 <= momentum < 1:
raise ValueError('momentum must be between 0 and 1 or an iterator')
if isinstance(momentum, Iterable):
self.momentum = momentum
else:
self.momentum = constant(momentum)