diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index 10e0635f57..79073d82cd 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -7,11 +7,10 @@ from pymc3 import Model, Normal from pymc3.variational import ( ADVI, FullRankADVI, SVGD, - Empirical, - fit + Empirical, ASVGD, + MeanField, fit ) from pymc3.variational.operators import KL -from pymc3.variational.approximations import MeanField from pymc3.tests import models from pymc3.tests.helpers import SeededTest @@ -70,7 +69,15 @@ class TestApproximates: class Base(SeededTest): inference = None NITER = 12000 - optimizer = functools.partial(pm.adam, learning_rate=.01) + optimizer = pm.adagrad_window(learning_rate=0.01) + conv_cb = property(lambda self: [ + pm.callbacks.CheckParametersConvergence( + every=500, + diff='relative', tolerance=0.001), + pm.callbacks.CheckParametersConvergence( + every=500, + diff='absolute', tolerance=0.0001) + ]) def test_vars_view(self): _, model, _ = models.multidimensional_model() @@ -147,11 +154,10 @@ def test_optimizer_with_full_data(self): inf.fit(10) approx = inf.fit(self.NITER, obj_optimizer=self.optimizer, - callbacks= - [pm.callbacks.CheckParametersConvergence()],) + callbacks=self.conv_cb,) trace = approx.sample(10000) - np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) - np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) + np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.05) + np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.1) def test_optimizer_minibatch_with_generator(self): n = 1000 @@ -176,11 +182,10 @@ def create_minibatch(data): Normal('x', mu=mu_, sd=sd, observed=minibatches, total_size=n) inf = self.inference() approx = inf.fit(self.NITER * 3, obj_optimizer=self.optimizer, - callbacks= - [pm.callbacks.CheckParametersConvergence()]) + callbacks=self.conv_cb) trace = approx.sample(10000) - np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) - np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) + np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.05) + np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.1) def test_optimizer_minibatch_with_callback(self): n = 1000 @@ -208,12 +213,21 @@ def cb(*_): mu_ = Normal('mu', mu=mu0, sd=sd0, testval=0) Normal('x', mu=mu_, sd=sd, observed=data_t, total_size=n) inf = self.inference(scale_cost_to_minibatch=True) - approx = inf.fit(self.NITER * 3, callbacks= - [cb, pm.callbacks.CheckParametersConvergence()], - obj_n_mc=10, obj_optimizer=self.optimizer) + approx = inf.fit( + self.NITER * 3, callbacks=[cb] + self.conv_cb, obj_optimizer=self.optimizer) trace = approx.sample(10000) - np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.4) - np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) + np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.05) + np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. 
/ d), rtol=0.1) + + def test_n_obj_mc(self): + n_samples = 100 + xs = np.random.binomial(n=1, p=0.2, size=n_samples) + with pm.Model(): + p = pm.Beta('p', alpha=1, beta=1) + pm.Binomial('xs', n=1, p=p, observed=xs) + inf = self.inference(scale_cost_to_minibatch=True) + # should just work + inf.fit(10, obj_n_mc=10, obj_optimizer=self.optimizer) def test_pickling(self): with models.multidimensional_model()[1]: @@ -277,8 +291,14 @@ def test_from_advi(self): class TestSVGD(TestApproximates.Base): inference = functools.partial(SVGD, n_particles=100) - NITER = 2500 - optimizer = functools.partial(pm.adam, learning_rate=.1) + + +class TestASVGD(TestApproximates.Base): + NITER = 15000 + inference = ASVGD + test_aevb = _test_aevb + optimizer = pm.adagrad_window(learning_rate=0.002) + conv_cb = [] class TestEmpirical(SeededTest): diff --git a/pymc3/variational/__init__.py b/pymc3/variational/__init__.py index 0e8a50093c..808442c927 100644 --- a/pymc3/variational/__init__.py +++ b/pymc3/variational/__init__.py @@ -1,39 +1,45 @@ from .advi import advi, sample_vp from .advi_minibatch import advi_minibatch +# commonly used +from . import updates from .updates import ( sgd, apply_momentum, momentum, apply_nesterov_momentum, + adagrad_window, nesterov_momentum, adagrad, - adagrad_window, rmsprop, adadelta, adam, adamax, norm_constraint, - total_norm_constraint, + total_norm_constraint ) +from . import inference from .inference import ( ADVI, FullRankADVI, SVGD, - fit, + ASVGD, + Inference, + fit ) + +from . import approximations from .approximations import ( - Empirical, - FullRank, MeanField, + FullRank, + Empirical, sample_approx ) -from . import approximations +# special +from .stein import Stein from . import operators from . import test_functions from . import opvi -from . import updates -from . import inference from . import callbacks diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 643f169d44..24201b8327 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -28,7 +28,7 @@ class MeanField(Approximation): mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - model : :class:`pymc3.Model` + model : :class:`pymc3.Model` PyMC3 model for inference start : `Point` initial mean @@ -38,14 +38,14 @@ class MeanField(Approximation): 1 at the start and 0 in the end. So slow decay will be ok. See (Sticking the Landing; Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016) for details - scale_cost_to_minibatch : `bool` + scale_cost_to_minibatch : `bool` Scale cost to minibatch instead of full dataset, default False random_seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one References - ---------- + ---------- - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf @@ -76,12 +76,9 @@ def create_shared_params(self, **kwargs): start = start_ start = self.gbij.map(start) return {'mu': theano.shared( - pm.floatX(start), - 'mu'), + pm.floatX(start), 'mu'), 'rho': theano.shared( - np.zeros((self.global_size,), dtype=theano.config.floatX), - 'rho') - } + pm.floatX(np.zeros((self.global_size,))), 'rho')} def log_q_W_global(self, z): """ @@ -120,13 +117,13 @@ class FullRank(Approximation): archiving better convergence properties. 
Common schedule is 1 at the start and 0 in the end. So slow decay will be ok. See (Sticking the Landing; Geoffrey Roeder, - Yuhuai Wu, David Duvenaud, 2016) for details + Yuhuai Wu, David Duvenaud, 2016) for details scale_cost_to_minibatch : bool, default False Scale cost to minibatch instead of full dataset random_seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one - + Other Parameters ---------------- gpu_compat : bool @@ -138,6 +135,7 @@ class FullRank(Approximation): Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf """ + def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, scale_cost_to_minibatch=False, gpu_compat=False, random_seed=None, **kwargs): @@ -173,7 +171,9 @@ def tril_index_matrix(self): num_tril_entries = self.num_tril_entries tril_index_matrix = np.zeros([n, n], dtype=int) tril_index_matrix[np.tril_indices(n)] = np.arange(num_tril_entries) - tril_index_matrix[np.tril_indices(n)[::-1]] = np.arange(num_tril_entries) + tril_index_matrix[ + np.tril_indices(n)[::-1] + ] = np.arange(num_tril_entries) return tril_index_matrix def create_shared_params(self, **kwargs): @@ -184,14 +184,14 @@ def create_shared_params(self, **kwargs): start_ = self.model.test_point.copy() pm.sampling._update_start_vals(start_, start, self.model) start = start_ - start = self.gbij.map(start) + start = pm.floatX(self.gbij.map(start)) n = self.global_size L_tril = ( np.eye(n) [np.tril_indices(n)] .astype(theano.config.floatX) ) - return {'mu': theano.shared(pm.floatX(start), 'mu'), + return {'mu': theano.shared(start, 'mu'), 'L_tril': theano.shared(L_tril, 'L_tril') } @@ -220,7 +220,7 @@ def from_mean_field(cls, mean_field, gpu_compat=False): """Construct FullRank from MeanField approximation Parameters - ---------- + ---------- mean_field : :class:`MeanField` approximation to start with @@ -264,9 +264,9 @@ class Empirical(Approximation): mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - scale_cost_to_minibatch : `bool` + scale_cost_to_minibatch : `bool` Scale cost to minibatch instead of full dataset, default False - model : :class:`pymc3.Model` + model : :class:`pymc3.Model` PyMC3 model for inference random_seed : None or int leave None to use package global RandomStream or other @@ -279,6 +279,7 @@ class Empirical(Approximation): ... trace = sample(1000, step=step) ... 
histogram = Empirical(trace[100:]) """ + def __init__(self, trace, local_rv=None, scale_cost_to_minibatch=False, model=None, random_seed=None, **kwargs): @@ -368,15 +369,15 @@ def from_noise(cls, size, jitter=.01, local_rv=None, Parameters ---------- - size : `int` + size : `int` number of initial particles - jitter : `float` + jitter : `float` initial sd local_rv : `dict` mapping {model_variable -> local_variable} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - start : `Point` + start : `Point` initial point model : :class:`pymc3.Model` PyMC3 model for inference @@ -386,20 +387,25 @@ def from_noise(cls, size, jitter=.01, local_rv=None, kwargs : other kwargs passed to init Returns - ------- + ------- :class:`Empirical` """ - hist = cls(None, local_rv=local_rv, model=model, random_seed=random_seed, **kwargs) + hist = cls( + None, + local_rv=local_rv, + model=model, + random_seed=random_seed, + **kwargs) if start is None: start = hist.model.test_point else: start_ = hist.model.test_point.copy() pm.sampling._update_start_vals(start_, start, hist.model) start = start_ - start = hist.gbij.map(start) + start = pm.floatX(hist.gbij.map(start)) # Initialize particles x0 = np.tile(start, (size, 1)) - x0 += np.random.normal(0, jitter, x0.shape) + x0 += pm.floatX(np.random.normal(0, jitter, x0.shape)) hist.histogram.set_value(x0) return hist @@ -408,7 +414,7 @@ def sample_approx(approx, draws=100, include_transformed=True): """Draw samples from variational posterior. Parameters - ---------- + ---------- approx : :class:`Approximation` Approximation to sample from draws : `int` @@ -417,7 +423,7 @@ def sample_approx(approx, draws=100, include_transformed=True): If True, transformed variables are also sampled. Default is True. Returns - ------- + ------- trace : class:`pymc3.backends.base.MultiTrace` Samples drawn from variational posterior. """ diff --git a/pymc3/variational/callbacks.py b/pymc3/variational/callbacks.py index b6691227e1..964826281a 100644 --- a/pymc3/variational/callbacks.py +++ b/pymc3/variational/callbacks.py @@ -12,12 +12,13 @@ def __call__(self, approx, loss, i): def relative(current, prev, eps=1e-6): - return (np.abs(current - prev)+eps)/(np.abs(prev)+eps) + return (np.abs(current - prev) + eps) / (np.abs(prev) + eps) def absolute(current, prev): return np.abs(current - prev) + _diff = dict( relative=relative, absolute=absolute @@ -32,7 +33,7 @@ class CheckParametersConvergence(Callback): every : int check frequency tolerance : float - if diff norm < tolerance : break + if diff norm < tolerance : break diff : str difference type one of {'absolute', 'relative'} ord : {non-zero int, inf, -inf, 'fro', 'nuc'}, optional @@ -50,7 +51,8 @@ class CheckParametersConvergence(Callback): ... 
) """ - def __init__(self, every=100, tolerance=1e-3, diff='relative', ord=np.inf): + def __init__(self, every=100, tolerance=1e-3, + diff='relative', ord=np.inf): self._diff = _diff[diff] self.ord = ord self.every = every diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index 957c2d9cb2..8cf0ec26a7 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -8,7 +8,7 @@ import pymc3 as pm from pymc3.variational.approximations import MeanField, FullRank, Empirical -from pymc3.variational.operators import KL, KSD +from pymc3.variational.operators import KL, KSD, AKSD from pymc3.variational.opvi import Approximation from pymc3.variational import test_functions @@ -19,6 +19,7 @@ 'ADVI', 'FullRankADVI', 'SVGD', + 'ASVGD', 'Inference', 'fit' ] @@ -41,7 +42,8 @@ class Inference(object): See (AEVB; Kingma and Welling, 2014) for details model : Model PyMC3 Model - kwargs : kwargs for :class:`Approximation` + kwargs : kwargs + additional kwargs for :class:`Approximation` """ def __init__(self, op, approx, tf, local_rv=None, model=None, **kwargs): @@ -53,7 +55,8 @@ def __init__(self, op, approx, tf, local_rv=None, model=None, **kwargs): elif isinstance(approx, Approximation): # pragma: no cover pass else: # pragma: no cover - raise TypeError('approx should be Approximation instance or Approximation subclass') + raise TypeError( + 'approx should be Approximation instance or Approximation subclass') self.objective = op(approx)(tf) approx = property(lambda self: self.objective.approx) @@ -104,7 +107,8 @@ def fit(self, n=10000, score=None, callbacks=None, progressbar=True, calls provided functions after each iteration step progressbar : bool whether to show progressbar or not - kwargs : kwargs for :func:`ObjectiveFunction.step_function` + kwargs : kwargs + additional kwargs for :func:`ObjectiveFunction.step_function` Returns ------- @@ -154,6 +158,9 @@ def _infmean(input_array): if i % 10 == 0: avg_loss = _infmean(scores[max(0, i - 1000):i + 1]) progress.set_description('Average Loss = {:,.5g}'.format(avg_loss)) + avg_loss = scores[max(0, i - 1000):i + 1].mean() + progress.set_description( + 'Average Loss = {:,.5g}'.format(avg_loss)) for callback in callbacks: callback(self.approx, scores[:i + 1], i) except (KeyboardInterrupt, StopIteration) as e: @@ -171,10 +178,12 @@ def _infmean(input_array): i, 100 * i // n, avg_loss)) else: if n < 10: - logger.info('Finished [100%]: Loss = {:,.5g}'.format(scores[-1])) + logger.info( + 'Finished [100%]: Loss = {:,.5g}'.format(scores[-1])) else: avg_loss = _infmean(scores[max(0, i - 1000):i + 1]) - logger.info('Finished [100%]: Average Loss = {:,.5g}'.format(avg_loss)) + logger.info( + 'Finished [100%]: Average Loss = {:,.5g}'.format(avg_loss)) finally: progress.close() self.hist = np.concatenate([self.hist, scores]) @@ -183,13 +192,13 @@ def _infmean(input_array): class ADVI(Inference): R""" Automatic Differentiation Variational Inference (ADVI) - + This class implements the meanfield ADVI, where the variational posterior distribution is assumed to be spherical Gaussian without correlation of parameters and fit to the true posterior distribution. The means and standard deviations of the variational posterior are referred to as variational parameters. - + For explanation, we classify random variables in probabilistic models into three types. Observed random variables :math:`{\cal Y}=\{\mathbf{y}_{i}\}_{i=1}^{N}` are :math:`N` observations. 
@@ -268,42 +277,42 @@ class ADVI(Inference): When using mini-batches, :math:`c_{o}^{k}` and :math:`c_{l}^{k}` should be set to :math:`N/M`, where :math:`M` is the number of observations in each - mini-batch. This is done with supplying `total_size` parameter to + mini-batch. This is done with supplying `total_size` parameter to observed nodes (e.g. :code:`Normal('x', 0, 1, observed=data, total_size=10000)`). In this case it is possible to automatically determine appropriate scaling for :math:`logp` - of observed nodes. Interesting to note that it is possible to have two independent + of observed nodes. Interesting to note that it is possible to have two independent observed variables with different `total_size` and iterate them independently - during inference. + during inference. For working with ADVI, we need to give - + - The probabilistic model `model` with three types of RVs (`observed_RVs`, - `global_RVs` and `local_RVs`). - + `global_RVs` and `local_RVs`). + - (optional) Minibatches - The tensors to which mini-bathced samples are supplied are - handled separately by using callbacks in :func:`Inference.fit` method - that change storage of shared theano variable or by :func:`pymc3.generator` - that automatically iterates over minibatches and defined beforehand. - + The tensors to which mini-bathced samples are supplied are + handled separately by using callbacks in :func:`Inference.fit` method + that change storage of shared theano variable or by :func:`pymc3.generator` + that automatically iterates over minibatches and defined beforehand. + - (optional) Parameters of deterministic mappings - They have to be passed along with other params to :func:`Inference.fit` method - as `more_obj_params` argument. + They have to be passed along with other params to :func:`Inference.fit` method + as `more_obj_params` argument. - For more information concerning training stage please reference + For more information concerning training stage please reference :func:`pymc3.variational.opvi.ObjectiveFunction.step_function` - + Parameters ---------- local_rv : dict[var->tuple] mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - model : :class:`pymc3.Model` + model : :class:`pymc3.Model` PyMC3 model for inference cost_part_grad_scale : `scalar` Scaling score part of gradient can be useful near optimum for @@ -315,7 +324,7 @@ class ADVI(Inference): Scale cost to minibatch instead of full dataset, default False random_seed : None or int leave None to use package global RandomStream or other - valid value to create instance specific one + valid value to create instance specific one start : `Point` starting point for inference @@ -374,10 +383,10 @@ class FullRankADVI(Inference): Parameters ---------- local_rv : dict[var->tuple] - mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} + mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - model : :class:`pymc3.Model` + model : :class:`pymc3.Model` PyMC3 model for inference cost_part_grad_scale : `scalar` Scaling score part of gradient can be useful near optimum for @@ -498,7 +507,7 @@ class SVGD(Inference): they fit target distribution best. 
Algorithm is outlined below - + *Input:* A target distribution with density function :math:`p(x)` and a set of initial particles :math:`{x^0_i}^n_{i=1}` @@ -506,8 +515,8 @@ class SVGD(Inference): .. math:: - x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l) \\ - \hat{\phi}^{*}(x) = \frac{1}{n}\sum^{n}_{j=1}[k(x^l_j,x) \nabla_{x^l_j} logp(x^l_j)+ \nabla_{x^l_j} k(x^l_j,x)] + x_i^{l+1} &\leftarrow x_i^{l} + \epsilon_l \hat{\phi}^{*}(x_i^l) \\ + \hat{\phi}^{*}(x) &= \frac{1}{n}\sum^{n}_{j=1}[k(x^l_j,x) \nabla_{x^l_j} logp(x^l_j)+ \nabla_{x^l_j} k(x^l_j,x)] Parameters ---------- @@ -552,7 +561,95 @@ def __init__(self, n_particles=100, jitter=.01, model=None, kernel=test_function model=model, random_seed=random_seed) -def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, start=None, **kwargs): +class ASVGD(Inference): + R""" + Amortized Stein Variational Gradient Descent + + This inference is based on Kernelized Stein Discrepancy + it's main idea is to move initial noisy particles so that + they fit target distribution best. + + Algorithm is outlined below + + *Input:* Parametrized random generator :math:`R_{\theta}` + + *Output:* :math:`R_{\theta^{*}}` that approximates the target distribution. + + .. math:: + + \Delta x_i &= \hat{\phi}^{*}(x_i) \\ + \hat{\phi}^{*}(x) &= \frac{1}{n}\sum^{n}_{j=1}[k(x_j,x) \nabla_{x_j} logp(x_j)+ \nabla_{x_j} k(x_j,x)] \\ + \Delta_{\theta} &= \frac{1}{n}\sum^{n}_{i=1}\Delta x_i\frac{\partial x_i}{\partial \theta} + + Parameters + ---------- + approx : :class:`Approximation` + local_rv : dict[var->tuple] + mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)} + Local Vars are used for Autoencoding Variational Bayes + See (AEVB; Kingma and Welling, 2014) for details + kernel : `callable` + kernel function for KSD :math:`f(histogram) -> (k(x,.), \nabla_x k(x,.))` + model : :class:`Model` + kwargs : kwargs for :class:`Approximation` + + References + ---------- + - Dilin Wang, Yihao Feng, Qiang Liu (2016) + Learning to Sample Using Stein Discrepancy + http://bayesiandeeplearning.org/papers/BDL_21.pdf + + - Dilin Wang, Qiang Liu (2016) + Learning to Draw Samples: With Application to Amortized MLE for Generative Adversarial Learning + https://arxiv.org/abs/1611.01722 + """ + + def __init__(self, approx=FullRank, local_rv=None, + kernel=test_functions.rbf, model=None, **kwargs): + super(ASVGD, self).__init__( + op=AKSD, + approx=approx, + local_rv=local_rv, + tf=kernel, + model=model, + **kwargs + ) + + def fit(self, n=10000, score=None, callbacks=None, progressbar=True, + obj_n_mc=30, **kwargs): + """ + Performs Amortized Stein Variational Gradient Descent + + Parameters + ---------- + n : int + number of iterations + score : bool + evaluate loss on each iteration or not + callbacks : list[function : (Approximation, losses, i) -> None] + calls provided functions after each iteration step + progressbar : bool + whether to show progressbar or not + obj_n_mc : int + sample `n` particles for Stein gradient + kwargs : kwargs + additional kwargs for :func:`ObjectiveFunction.step_function` + + Returns + ------- + Approximation + """ + return super(ASVGD, self).fit( + n=n, score=score, callbacks=callbacks, + progressbar=progressbar, obj_n_mc=obj_n_mc, **kwargs) + + def run_profiling(self, n=1000, score=None, obj_n_mc=30, **kwargs): + return super(ASVGD, self).run_profiling( + n=n, score=score, obj_n_mc=obj_n_mc, **kwargs) + + +def fit(n=10000, local_rv=None, method='advi', model=None, + random_seed=None, start=None, inf_kwargs=None, 
**kwargs): R""" Handy shortcut for using inference methods in functional way @@ -565,12 +662,14 @@ def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, sta Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details method : str or :class:`Inference` - string name is case insensitive in {'advi', 'fullrank_advi', 'advi->fullrank_advi', 'svgd'} + string name is case insensitive in {'advi', 'fullrank_advi', 'advi->fullrank_advi', 'svgd', 'asvgd'} model : :class:`pymc3.Model` PyMC3 model for inference random_seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one + inf_kwargs : dict + additional kwargs passed to :class:`Inference` start : `Point` starting point for inference @@ -578,26 +677,34 @@ def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, sta ---------------- frac : `float` if method is 'advi->fullrank_advi' represents advi fraction when training - kwargs : kwargs for :func:`Inference.fit` + kwargs : kwargs + additional kwargs for :func:`Inference.fit` Returns ------- :class:`Approximation` """ + if inf_kwargs is None: + inf_kwargs = dict() if model is None: model = pm.modelcontext(model) _select = dict( advi=ADVI, fullrank_advi=FullRankADVI, - svgd=SVGD + svgd=SVGD, + asvgd=ASVGD ) if isinstance(method, str) and method.lower() == 'advi->fullrank_advi': frac = kwargs.pop('frac', .5) if not 0. < frac < 1.: raise ValueError('frac should be in (0, 1)') n1 = int(n * frac) - n2 = n-n1 - inference = ADVI(local_rv=local_rv, model=model, random_seed=random_seed, start=start) + n2 = n - n1 + inference = ADVI( + local_rv=local_rv, + model=model, + random_seed=random_seed, + start=start) logger.info('fitting advi ...') inference.fit(n1, **kwargs) inference = FullRankADVI.from_advi(inference) @@ -608,7 +715,8 @@ def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, sta try: inference = _select[method.lower()]( local_rv=local_rv, model=model, random_seed=random_seed, - start=start + start=start, + **inf_kwargs ) except KeyError: raise KeyError('method should be one of %s ' diff --git a/pymc3/variational/operators.py b/pymc3/variational/operators.py index dd78a813bc..5eb815f467 100644 --- a/pymc3/variational/operators.py +++ b/pymc3/variational/operators.py @@ -1,7 +1,7 @@ from theano import theano, tensor as tt from pymc3.variational.opvi import Operator, ObjectiveFunction, _warn_not_used -from pymc3.variational.updates import adam -import pymc3 as pm +from pymc3.variational.stein import Stein +from pymc3.variational import updates __all__ = [ 'KL', @@ -17,6 +17,7 @@ class KL(Operator): KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv """ + def apply(self, f): z = self.input return self.logq_norm(z) - self.logp_norm(z) @@ -25,16 +26,44 @@ def apply(self, f): class KSDObjective(ObjectiveFunction): - def add_obj_updates(self, updates, obj_n_mc=None, obj_optimizer=adam, - more_obj_params=None, more_replacements=None): - if obj_n_mc is not None: - _warn_not_used('obj_n_mc', self.op) - d_obj_padams = self(None) - d_obj_padams = theano.clone(d_obj_padams, more_replacements, strict=False) - updates.update(obj_optimizer([d_obj_padams], self.obj_params)) + """Helper class for construction loss and updates for variational inference + + Parameters + ---------- + op : :class:`KSD` + OPVI Functional operator + tf : :class:`TestFunction` + OPVI TestFunction + """ - def __call__(self, z): - return self.op.apply(self.tf) + def 
__init__(self, op, tf): + if not isinstance(op, KSD): + raise TypeError('Op should be KSD') + ObjectiveFunction.__init__(self, op, tf) + + def get_input(self, n_mc): + if hasattr(self.approx, 'histogram'): + if n_mc is not None: + _warn_not_used('n_mc', self.op) + return self.approx.histogram + elif n_mc is not None and n_mc > 1: + return self.approx.random(n_mc) + else: + raise ValueError('Variational type approximation requires ' + 'sample size (`n_mc` : int > 1 should be passed)') + + def __call__(self, z, **kwargs): + op = self.op # type: KSD + grad = op.apply(self.tf) + if 'more_obj_params' in kwargs: + params = self.obj_params + kwargs['more_obj_params'] + else: + params = self.test_params + kwargs['more_tf_params'] + grad *= -1 + grad = theano.clone(grad, {op.input_matrix: z}) + grad = tt.grad(None, params, known_grads={z: grad}) + grad = updates.total_norm_constraint(grad, 10) + return grad class KSD(Operator): @@ -45,7 +74,7 @@ class KSD(Operator): and a set of initial particles :math:`\{x^0_i\}^n_{i=1}` Output: A set of particles :math:`\{x_i\}^n_{i=1}` that approximates the target distribution. - + .. math:: x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l) \\ @@ -68,22 +97,14 @@ class KSD(Operator): OBJECTIVE = KSDObjective def __init__(self, approx): - if not isinstance(approx, pm.Empirical): - raise ValueError('approx should be an Empirical approximation, got %r' % approx) Operator.__init__(self, approx) + self.input_matrix = tt.matrix('KSD input matrix') def apply(self, f): # f: kernel function for KSD f(histogram) -> (k(x,.), \nabla_x k(x,.)) - X = self.approx.histogram - t = self.approx.normalizing_constant - dlogpdx = theano.scan( - fn=lambda zg: theano.grad(self.logp_norm(zg), zg), - sequences=[X] - )[0] # bottleneck - Kxy, dxkxy = f(X) - # scaling factor - # not needed for Kxy as we already scaled dlogpdx - dxkxy /= t - n = X.shape[0].astype('float32') / t - svgd_grad = (tt.dot(Kxy, dlogpdx) + dxkxy) / n - return -1 * svgd_grad # gradient + stein = Stein(self.approx, f, self.input_matrix) + return -1 * stein.grad + + +class AKSD(KSD): + SUPPORT_AEVB = True diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py index 39f17f2030..dc0ff7fa5a 100644 --- a/pymc3/variational/opvi.py +++ b/pymc3/variational/opvi.py @@ -69,10 +69,11 @@ class ObjectiveFunction(object): Parameters ---------- op : :class:`Operator` - OPVI Functional operator + OPVI Functional operator tf : :class:`TestFunction` OPVI TestFunction """ + def __init__(self, op, tf): self.op = op self.tf = tf @@ -132,10 +133,6 @@ def updates(self, obj_n_mc=None, tf_n_mc=None, obj_optimizer=adam, test_optimize more_updates = dict() if more_replacements is None: more_replacements = dict() - if not self.op.RETURNS_LOSS and (more_obj_params or more_tf_params): - raise ValueError('%s does not support, differentiation with ' - 'additional params, try passing your updates ' - 'via more_updates kwarg' % self.op.__class__) resulting_updates = ObjectiveUpdates() if self.test_params: self.add_test_updates( @@ -160,20 +157,33 @@ def updates(self, obj_n_mc=None, tf_n_mc=None, obj_optimizer=adam, test_optimize resulting_updates.update(more_updates) return resulting_updates - def add_test_updates(self, updates, tf_n_mc=None, test_optimizer=adam, more_tf_params=None, more_replacements=None): - tf_z = self.random(tf_n_mc) - tf_target = -self(tf_z) + def add_test_updates(self, updates, tf_n_mc=None, test_optimizer=adam, + more_tf_params=None, more_replacements=None): + tf_z = self.get_input(tf_n_mc) + tf_target = 
self(tf_z, more_tf_params=more_tf_params) tf_target = theano.clone(tf_target, more_replacements, strict=False) - updates.update(test_optimizer(tf_target, self.test_params + more_tf_params)) - - def add_obj_updates(self, updates, obj_n_mc=None, obj_optimizer=adam, more_obj_params=None, more_replacements=None): - obj_z = self.random(obj_n_mc) - obj_target = self(obj_z) + updates.update( + test_optimizer( + tf_target, + self.test_params + + more_tf_params)) + + def add_obj_updates(self, updates, obj_n_mc=None, obj_optimizer=adam, + more_obj_params=None, more_replacements=None): + obj_z = self.get_input(obj_n_mc) + obj_target = self(obj_z, more_obj_params=more_obj_params) obj_target = theano.clone(obj_target, more_replacements, strict=False) - updates.update(obj_optimizer(obj_target, self.obj_params + more_obj_params)) + updates.update( + obj_optimizer( + obj_target, + self.obj_params + + more_obj_params)) if self.op.RETURNS_LOSS: updates.loss = obj_target + def get_input(self, n_mc): + return self.random(n_mc) + @memoize @change_flags(compute_test_value='off') def step_function(self, obj_n_mc=None, tf_n_mc=None, @@ -184,7 +194,7 @@ def step_function(self, obj_n_mc=None, tf_n_mc=None, R"""Step function that should be called on each optimization step. Generally it solves the following problem: - + .. math:: \mathbf{\lambda^{*}} = \inf_{\lambda} \sup_{\theta} t(\mathbb{E}_{\lambda}[(O^{p,q}f_{\theta})(z)]) @@ -228,7 +238,8 @@ def step_function(self, obj_n_mc=None, tf_n_mc=None, more_updates=more_updates, more_replacements=more_replacements) if score: - step_fn = theano.function([], updates.loss, updates=updates, **fn_kwargs) + step_fn = theano.function( + [], updates.loss, updates=updates, **fn_kwargs) else: step_fn = theano.function([], None, updates=updates, **fn_kwargs) return step_fn @@ -257,7 +268,10 @@ def score_function(self, sc_n_mc=None, more_replacements=None, fn_kwargs=None): raise NotImplementedError('%s does not have loss' % self.op) if more_replacements is None: more_replacements = {} - loss = theano.clone(self(self.random(sc_n_mc)), more_replacements, strict=False) + loss = theano.clone( + self(self.random(sc_n_mc)), + more_replacements, + strict=False) return theano.function([], loss, **fn_kwargs) def __getstate__(self): @@ -266,14 +280,22 @@ def __getstate__(self): def __setstate__(self, state): self.__init__(*state) - def __call__(self, z): + def __call__(self, z, **kwargs): + if 'more_tf_params' in kwargs: + m = -1 + else: + m = 1 if z.ndim > 1: a = theano.scan( - lambda z_: theano.clone(self.op.apply(self.tf), {self.op.input: z_}, strict=False), + lambda z_: theano.clone( + self.op.apply(self.tf), + {self.op.input: z_}, strict=False), sequences=z, n_steps=z.shape[0])[0].mean() else: - a = theano.clone(self.op.apply(self.tf), {self.op.input: z}, strict=False) - return self.op.T(a) + a = theano.clone( + self.op.apply(self.tf), + {self.op.input: z}, strict=False) + return m * self.op.T(a) class Operator(object): @@ -305,32 +327,14 @@ def __init__(self, approx): flat_view = property(lambda self: self.approx.flat_view) input = property(lambda self: self.approx.flat_view.input) - def logp(self, z): - factors = ([tt.sum(var.logpt)for var in self.model.basic_RVs] + - [tt.sum(var) for var in self.model.potentials]) - - p = self.approx.to_flat_input(tt.add(*factors)) - p = theano.clone(p, {self.input: z}) - return p - - def logp_norm(self, z): - t = self.approx.normalizing_constant - factors = ([tt.sum(var.logpt) / t for var in self.model.basic_RVs] + - [tt.sum(var) / t for var in 
self.model.potentials]) - logpt = tt.add(*factors) - p = self.approx.to_flat_input(logpt) - p = theano.clone(p, {self.input: z}) - return p - - def logq(self, z): - return self.approx.logq(z) - - def logq_norm(self, z): - return self.approx.logq_norm(z) + logp = property(lambda self: self.approx.logp) + logq = property(lambda self: self.approx.logq) + logp_norm = property(lambda self: self.approx.logp_norm) + logq_norm = property(lambda self: self.approx.logq_norm) def apply(self, f): # pragma: no cover R"""Operator itself - + .. math:: (O^{p,q}f_{\theta})(z) @@ -357,7 +361,9 @@ def __call__(self, f=None): f = TestFunction.from_function(f) else: if f is not None: - warnings.warn('TestFunction for %s is redundant and removed' % self) + warnings.warn( + 'TestFunction for %s is redundant and removed' % + self) else: pass f = TestFunction() @@ -399,7 +405,8 @@ def cast_to_list(params): elif params is None: return [] else: - raise TypeError('Unknown type %s for %r, need list, dict or shared variable') + raise TypeError( + 'Unknown type %s for %r, need list, dict or shared variable') class TestFunction(object): @@ -432,7 +439,7 @@ def _setup(self, dim): Parameters ---------- - dim : int + dim : int dimension of posterior distribution """ pass @@ -455,14 +462,14 @@ class Approximation(object): mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - model : :class:`Model` + model : :class:`Model` PyMC3 model for inference cost_part_grad_scale : float or scalar tensor Scaling score part of gradient can be useful near optimum for archiving better convergence properties. Common schedule is 1 at the start and 0 in the end. So slow decay will be ok. 
See (Sticking the Landing; Geoffrey Roeder, - Yuhuai Wu, David Duvenaud, 2016) for details + Yuhuai Wu, David Duvenaud, 2016) for details scale_cost_to_minibatch : bool, default False Scale cost to minibatch instead of full dataset random_seed : None or int @@ -568,10 +575,12 @@ def seed(self, random_seed=None): @property def normalizing_constant(self): - t = self.to_flat_input(tt.max([v.scaling for v in self.model.basic_RVs])) + t = self.to_flat_input( + tt.max([v.scaling for v in self.model.basic_RVs])) t = theano.clone(t, {self.input: tt.zeros(self.total_size)}) # if not scale_cost_to_minibatch: t=1 - t = tt.switch(self.scale_cost_to_minibatch, t, tt.constant(1, dtype=t.dtype)) + t = tt.switch(self.scale_cost_to_minibatch, t, + tt.constant(1, dtype=t.dtype)) return t def _setup(self, **kwargs): @@ -589,8 +598,12 @@ def get_local_vars(self, **kwargs): def check_model(self, model, **kwargs): """Checks that model is valid for variational inference """ - vars_ = [var for var in model.vars if not isinstance(var, pm.model.ObservedRV)] - if any([var.dtype in pm.discrete_types for var in vars_]): # pragma: no cover + vars_ = [ + var for var in model.vars + if not isinstance(var, pm.model.ObservedRV) + ] + if any([var.dtype in pm.discrete_types for var in vars_] + ): # pragma: no cover raise ValueError('Model should not include discrete RVs') def create_shared_params(self, **kwargs): @@ -630,7 +643,8 @@ def construct_replacements(self, include=None, exclude=None, Replacements """ if include is not None and exclude is not None: - raise ValueError('Only one parameter is supported {include|exclude}, got two') + raise ValueError( + 'Only one parameter is supported {include|exclude}, got two') if include is not None: # pragma: no cover replacements = {k: v for k, v in self.flat_view.replacements.items() if k in include} @@ -724,11 +738,11 @@ def initial(self, size, no_rand=False, l=None): Parameters ---------- - size : `int` + size : `int` number of samples no_rand : `bool` return zeros if True - l : `int` + l : `int` length of sample, defaults to latent space dim Returns @@ -924,6 +938,22 @@ def logq(self, z): def logq_norm(self, z): return self.logq(z) / self.normalizing_constant + def logp(self, z): + factors = ([tt.sum(var.logpt)for var in self.model.basic_RVs] + + [tt.sum(var) for var in self.model.potentials]) + p = self.to_flat_input(tt.add(*factors)) + p = theano.clone(p, {self.input: z}) + return p + + def logp_norm(self, z): + t = self.normalizing_constant + factors = ([tt.sum(var.logpt) / t for var in self.model.basic_RVs] + + [tt.sum(var) / t for var in self.model.potentials]) + logpt = tt.add(*factors) + p = self.to_flat_input(logpt) + p = theano.clone(p, {self.input: z}) + return p + def view(self, space, name, reshape=True): """Construct view on a variable from flattened `space` @@ -949,7 +979,9 @@ def view(self, space, name, reshape=True): elif space.ndim < 2: view = space[slc] else: # pragma: no cover - raise ValueError('Space should have no more than 2 dims, got %d' % space.ndim) + raise ValueError( + 'Space should have no more than 2 dims, got %d' % + space.ndim) if reshape: if len(_shape) > 0: if theano_is_here: diff --git a/pymc3/variational/stein.py b/pymc3/variational/stein.py new file mode 100644 index 0000000000..0e6ef30f70 --- /dev/null +++ b/pymc3/variational/stein.py @@ -0,0 +1,71 @@ +from theano import theano, tensor as tt +from pymc3.variational.test_functions import rbf +from pymc3.theanof import memoize + +__all__ = [ + 'Stein' +] + + +class Stein(object): + def 
__init__(self, approx, kernel=rbf, input_matrix=None): + self.approx = approx + self._kernel_f = kernel + if input_matrix is None: + input_matrix = tt.matrix('stein_input_matrix') + input_matrix.tag.test_value = approx.random(10).tag.test_value + self.input_matrix = input_matrix + + @property + @memoize + def grad(self): + t = self.approx.normalizing_constant + Kxy, dxkxy = self.Kxy, self.dxkxy + dlogpdx = self.dlogp # Normalized + n = self.input_matrix.shape[0].astype('float32') + svgd_grad = (tt.dot(Kxy, dlogpdx) + dxkxy/t) / n + return svgd_grad + + @property + def Kxy(self): + return self._kernel()[0] + + @property + def dxkxy(self): + return self._kernel()[1] + + @property + @memoize + def logp(self): + return theano.scan( + fn=lambda zg: self.approx.logp_norm(zg), + sequences=[self.input_matrix] + )[0] + + @property + @memoize + def dlogp(self): + return theano.scan( + fn=lambda zg: theano.grad(self.approx.logp_norm(zg), zg), + sequences=[self.input_matrix] + )[0] + + @memoize + def _kernel(self): + return self._kernel_f(self.input_matrix) + + def get_approx_input(self, size=100): + """ + + Parameters + ---------- + size : if approx is not Empirical, takes `n=size` random samples + + Returns + ------- + matrix + """ + if hasattr(self.approx, 'histogram'): + return self.approx.histogram + else: + return self.approx.random(size) diff --git a/pymc3/variational/test_functions.py b/pymc3/variational/test_functions.py index ed9badfdc2..012fb5f156 100644 --- a/pymc3/variational/test_functions.py +++ b/pymc3/variational/test_functions.py @@ -1,4 +1,4 @@ -from theano import tensor as tt +from theano import theano, tensor as tt from .opvi import TestFunction @@ -10,9 +10,9 @@ class Kernel(TestFunction): """ Dummy base class for kernel SVGD in case we implement more - + .. math:: - + f(x) -> (k(x,.), \nabla_x k(x,.)) """ @@ -21,26 +21,30 @@ class Kernel(TestFunction): class RBF(Kernel): def __call__(self, X): XY = X.dot(X.T) - x2 = tt.reshape(tt.sum(tt.square(X), axis=1), (X.shape[0], 1)) + x2 = tt.sum(X ** 2, axis=1).dimshuffle(0, 'x') X2e = tt.repeat(x2, X.shape[0], axis=1) - H = tt.sub(tt.add(X2e, X2e.T), 2 * XY) + H = X2e + X2e.T - 2. * XY V = tt.sort(H.flatten()) length = V.shape[0] # median distance - h = tt.switch(tt.eq((length % 2), 0), + m = tt.switch(tt.eq((length % 2), 0), # if even vector - tt.mean(V[((length//2)-1):((length//2)+1)]), + tt.mean(V[((length // 2) - 1):((length // 2) + 1)]), # if odd vector V[length // 2]) - h = tt.sqrt(0.5 * h / tt.log(X.shape[0].astype('float32') + 1.0)) + h = .5 * m / tt.log(tt.cast(H.shape[0] + 1., theano.config.floatX)) - Kxy = tt.exp(-H / h ** 2 / 2.0) + # RBF + Kxy = tt.exp(-H / h / 2.0) + + # Derivative dxkxy = -tt.dot(Kxy, X) sumkxy = tt.sum(Kxy, axis=1).dimshuffle(0, 'x') - dxkxy = tt.add(dxkxy, tt.mul(X, sumkxy)) / (h ** 2) + dxkxy = tt.add(dxkxy, tt.mul(X, sumkxy)) / h return Kxy, dxkxy + rbf = RBF() diff --git a/pymc3/variational/updates.py b/pymc3/variational/updates.py index 781f13b36f..598162674b 100755 --- a/pymc3/variational/updates.py +++ b/pymc3/variational/updates.py @@ -187,12 +187,12 @@ def sgd(loss_or_grads=None, params=None, learning_rate=1e-3): ------- OrderedDict A dictionary mapping each parameter to its update expression - + Notes ----- Optimizer can be called without both loss_or_grads and params in that case partial function is returned - + Examples -------- >>> a = theano.shared(1.) 
@@ -210,7 +210,8 @@ def sgd(loss_or_grads=None, params=None, learning_rate=1e-3): if loss_or_grads is None and params is None: return partial(sgd, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() @@ -268,7 +269,8 @@ def apply_momentum(updates, params=None, momentum=0.9): return updates -def momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): +def momentum(loss_or_grads=None, params=None, + learning_rate=1e-3, momentum=0.9): """Stochastic Gradient Descent (SGD) updates with momentum Generates update expressions of the form: @@ -300,12 +302,12 @@ def momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): Optimizer can be called without both loss_or_grads and params in that case partial function is returned - + See Also -------- apply_momentum : Generic function applying momentum to updates nesterov_momentum : Nesterov's variant of SGD with momentum - + Examples -------- >>> a = theano.shared(1.) @@ -323,7 +325,8 @@ def momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): if loss_or_grads is None and params is None: return partial(pm.updates.momentum, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') updates = sgd(loss_or_grads, params, learning_rate) return apply_momentum(updates, momentum=momentum) @@ -382,7 +385,8 @@ def apply_nesterov_momentum(updates, params=None, momentum=0.9): return updates -def nesterov_momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): +def nesterov_momentum(loss_or_grads=None, params=None, + learning_rate=1e-3, momentum=0.9): """Stochastic Gradient Descent (SGD) updates with Nesterov momentum Generates update expressions of the form: @@ -417,7 +421,7 @@ def nesterov_momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momen position in parameter space. Here, we use the formulation described at https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617, which allows the gradient to be evaluated at the current parameters. - + Optimizer can be called without both loss_or_grads and params in that case partial function is returned @@ -442,7 +446,8 @@ def nesterov_momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momen if loss_or_grads is None and params is None: return partial(nesterov_momentum, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') updates = sgd(loss_or_grads, params, learning_rate) return apply_nesterov_momentum(updates, momentum=momentum) @@ -480,7 +485,7 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): as such the learning rate is monotonically decreasing. Epsilon is not included in the typical formula, see [2]_. - + Optimizer can be called without both loss_or_grads and params in that case partial function is returned @@ -492,7 +497,7 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): .. 
[2] Chris Dyer: Notes on AdaGrad. http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf - + Examples -------- >>> a = theano.shared(1.) @@ -510,7 +515,8 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): if loss_or_grads is None and params is None: return partial(adagrad, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() @@ -574,7 +580,8 @@ def adagrad_window(loss_or_grads=None, params=None, return updates -def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon=1e-6): +def rmsprop(loss_or_grads=None, params=None, + learning_rate=1.0, rho=0.9, epsilon=1e-6): """RMSProp updates Scale learning rates by dividing with the moving average of the root mean @@ -610,8 +617,8 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon .. math:: r_t &= \\rho r_{t-1} + (1-\\rho)*g^2\\\\ \\eta_t &= \\frac{\\eta}{\\sqrt{r_t + \\epsilon}} - - + + Optimizer can be called without both loss_or_grads and params in that case partial function is returned @@ -620,7 +627,7 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon .. [1] Tieleman, tt. and Hinton, G. (2012): Neural Networks for Machine Learning, Lecture 6.5 - rmsprop. Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU (formula @5:20) - + Examples -------- >>> a = theano.shared(1.) @@ -638,7 +645,8 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon if loss_or_grads is None and params is None: return partial(rmsprop, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() @@ -657,7 +665,8 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon return updates -def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsilon=1e-6): +def adadelta(loss_or_grads=None, params=None, + learning_rate=1.0, rho=0.95, epsilon=1e-6): """ Adadelta updates Scale learning rates by the ratio of accumulated gradients to accumulated @@ -703,7 +712,7 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil \\eta_t &= \\eta \\frac{\\sqrt{s_{t-1} + \\epsilon}} {\sqrt{r_t + \epsilon}}\\\\ s_t &= \\rho s_{t-1} + (1-\\rho)*(\\eta_t*g)^2 - + Optimizer can be called without both loss_or_grads and params in that case partial function is returned @@ -730,7 +739,8 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil if loss_or_grads is None and params is None: return partial(adadelta, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() @@ -793,7 +803,7 @@ def adam(loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, The paper [1]_ includes an additional hyperparameter lambda. 
This is only needed to prove convergence of the algorithm and has no practical use (personal communication with the authors), it is therefore omitted here. - + Optimizer can be called without both loss_or_grads and params in that case partial function is returned @@ -802,7 +812,7 @@ def adam(loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, .. [1] Kingma, Diederik, and Jimmy Ba (2014): Adam: A Method for Stochastic Optimization. arXiv preprint arXiv:1412.6980. - + Examples -------- >>> a = theano.shared(1.) @@ -820,7 +830,8 @@ def adam(loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, if loss_or_grads is None and params is None: return partial(adam, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') all_grads = get_or_compute_grads(loss_or_grads, params) t_prev = theano.shared(pm.theanof.floatX(0.)) updates = OrderedDict() @@ -829,7 +840,7 @@ def adam(loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, one = tt.constant(1) t = t_prev + 1 - a_t = learning_rate*tt.sqrt(one-beta2**t)/(one-beta1**t) + a_t = learning_rate * tt.sqrt(one - beta2**t) / (one - beta1**t) for param, g_t in zip(params, all_grads): value = param.get_value(borrow=True) @@ -838,9 +849,9 @@ def adam(loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) - m_t = beta1*m_prev + (one-beta1)*g_t - v_t = beta2*v_prev + (one-beta2)*g_t**2 - step = a_t*m_t/(tt.sqrt(v_t) + epsilon) + m_t = beta1 * m_prev + (one - beta1) * g_t + v_t = beta2 * v_prev + (one - beta2) * g_t**2 + step = a_t * m_t / (tt.sqrt(v_t) + epsilon) updates[m_prev] = m_t updates[v_prev] = v_t @@ -876,7 +887,7 @@ def adamax(loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, ------- OrderedDict A dictionary mapping each parameter to its update expression - + Notes ----- Optimizer can be called without both loss_or_grads and params @@ -887,7 +898,7 @@ def adamax(loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, .. [1] Kingma, Diederik, and Jimmy Ba (2014): Adam: A Method for Stochastic Optimization. arXiv preprint arXiv:1412.6980. - + Examples -------- >>> a = theano.shared(1.) 
@@ -905,7 +916,8 @@ def adamax(loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, if loss_or_grads is None and params is None: return partial(adamax, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') all_grads = get_or_compute_grads(loss_or_grads, params) t_prev = theano.shared(pm.theanof.floatX(0.)) updates = OrderedDict() @@ -914,7 +926,7 @@ def adamax(loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, one = tt.constant(1) t = t_prev + 1 - a_t = learning_rate/(one-beta1**t) + a_t = learning_rate / (one - beta1**t) for param, g_t in zip(params, all_grads): value = param.get_value(borrow=True) @@ -923,9 +935,9 @@ def adamax(loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) - m_t = beta1*m_prev + (one-beta1)*g_t - u_t = tt.maximum(beta2*u_prev, abs(g_t)) - step = a_t*m_t/(u_t + epsilon) + m_t = beta1 * m_prev + (one - beta1) * g_t + u_t = tt.maximum(beta2 * u_prev, abs(g_t)) + step = a_t * m_t / (u_t + epsilon) updates[m_prev] = m_t updates[u_prev] = u_t @@ -1075,7 +1087,7 @@ def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, dtype = np.dtype(theano.config.floatX).type target_norm = tt.clip(norm, 0, dtype(max_norm)) multiplier = target_norm / (dtype(epsilon) + norm) - tensor_vars_scaled = [step*multiplier for step in tensor_vars] + tensor_vars_scaled = [step * multiplier for step in tensor_vars] if return_norm: return tensor_vars_scaled, norm
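The ASVGD class and the extended fit() shortcut introduced in this patch can be exercised roughly as follows. This is a minimal sketch, not part of the diff itself: the toy model, iteration count, learning rate and callback settings are illustrative assumptions, not values prescribed by the patch.

import numpy as np
import pymc3 as pm
from pymc3.variational import ASVGD, fit, adagrad_window
from pymc3.variational.test_functions import rbf

# Toy data: Bernoulli observations with unknown success probability
# (an assumption for illustration only).
data = np.random.binomial(n=1, p=0.2, size=1000)

with pm.Model() as model:
    p = pm.Beta('p', alpha=1, beta=1)
    pm.Binomial('obs', n=1, p=p, observed=data)

    # Class-based interface: ASVGD wraps the AKSD operator around a
    # parametric approximation (FullRank by default).
    inference = ASVGD(model=model)
    approx = inference.fit(
        10000,
        obj_n_mc=30,  # particles drawn per Stein gradient estimate
        obj_optimizer=adagrad_window(learning_rate=0.002),
        callbacks=[pm.callbacks.CheckParametersConvergence(every=500)])

    # Functional shortcut with the new 'asvgd' method name; inf_kwargs
    # is forwarded to the ASVGD constructor (rbf is already the default
    # kernel, passed here only to show the plumbing).
    approx2 = fit(10000, method='asvgd', model=model,
                  inf_kwargs=dict(kernel=rbf))

    trace = approx.sample(5000)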