diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index 10e0635f57..79073d82cd 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -7,11 +7,10 @@ from pymc3 import Model, Normal from pymc3.variational import ( ADVI, FullRankADVI, SVGD, - Empirical, - fit + Empirical, ASVGD, + MeanField, fit ) from pymc3.variational.operators import KL -from pymc3.variational.approximations import MeanField from pymc3.tests import models from pymc3.tests.helpers import SeededTest @@ -70,7 +69,15 @@ class TestApproximates: class Base(SeededTest): inference = None NITER = 12000 - optimizer = functools.partial(pm.adam, learning_rate=.01) + optimizer = pm.adagrad_window(learning_rate=0.01) + conv_cb = property(lambda self: [ + pm.callbacks.CheckParametersConvergence( + every=500, + diff='relative', tolerance=0.001), + pm.callbacks.CheckParametersConvergence( + every=500, + diff='absolute', tolerance=0.0001) + ]) def test_vars_view(self): _, model, _ = models.multidimensional_model() @@ -147,11 +154,10 @@ def test_optimizer_with_full_data(self): inf.fit(10) approx = inf.fit(self.NITER, obj_optimizer=self.optimizer, - callbacks= - [pm.callbacks.CheckParametersConvergence()],) + callbacks=self.conv_cb,) trace = approx.sample(10000) - np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) - np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) + np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.05) + np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.1) def test_optimizer_minibatch_with_generator(self): n = 1000 @@ -176,11 +182,10 @@ def create_minibatch(data): Normal('x', mu=mu_, sd=sd, observed=minibatches, total_size=n) inf = self.inference() approx = inf.fit(self.NITER * 3, obj_optimizer=self.optimizer, - callbacks= - [pm.callbacks.CheckParametersConvergence()]) + callbacks=self.conv_cb) trace = approx.sample(10000) - np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.1) - np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) + np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.05) + np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.1) def test_optimizer_minibatch_with_callback(self): n = 1000 @@ -208,12 +213,21 @@ def cb(*_): mu_ = Normal('mu', mu=mu0, sd=sd0, testval=0) Normal('x', mu=mu_, sd=sd, observed=data_t, total_size=n) inf = self.inference(scale_cost_to_minibatch=True) - approx = inf.fit(self.NITER * 3, callbacks= - [cb, pm.callbacks.CheckParametersConvergence()], - obj_n_mc=10, obj_optimizer=self.optimizer) + approx = inf.fit( + self.NITER * 3, callbacks=[cb] + self.conv_cb, obj_optimizer=self.optimizer) trace = approx.sample(10000) - np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.4) - np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4) + np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.05) + np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. 
/ d), rtol=0.1) + + def test_n_obj_mc(self): + n_samples = 100 + xs = np.random.binomial(n=1, p=0.2, size=n_samples) + with pm.Model(): + p = pm.Beta('p', alpha=1, beta=1) + pm.Binomial('xs', n=1, p=p, observed=xs) + inf = self.inference(scale_cost_to_minibatch=True) + # should just work + inf.fit(10, obj_n_mc=10, obj_optimizer=self.optimizer) def test_pickling(self): with models.multidimensional_model()[1]: @@ -277,8 +291,14 @@ def test_from_advi(self): class TestSVGD(TestApproximates.Base): inference = functools.partial(SVGD, n_particles=100) - NITER = 2500 - optimizer = functools.partial(pm.adam, learning_rate=.1) + + +class TestASVGD(TestApproximates.Base): + NITER = 15000 + inference = ASVGD + test_aevb = _test_aevb + optimizer = pm.adagrad_window(learning_rate=0.002) + conv_cb = [] class TestEmpirical(SeededTest): diff --git a/pymc3/variational/__init__.py b/pymc3/variational/__init__.py index 0e8a50093c..808442c927 100644 --- a/pymc3/variational/__init__.py +++ b/pymc3/variational/__init__.py @@ -1,39 +1,45 @@ from .advi import advi, sample_vp from .advi_minibatch import advi_minibatch +# commonly used +from . import updates from .updates import ( sgd, apply_momentum, momentum, apply_nesterov_momentum, + adagrad_window, nesterov_momentum, adagrad, - adagrad_window, rmsprop, adadelta, adam, adamax, norm_constraint, - total_norm_constraint, + total_norm_constraint ) +from . import inference from .inference import ( ADVI, FullRankADVI, SVGD, - fit, + ASVGD, + Inference, + fit ) + +from . import approximations from .approximations import ( - Empirical, - FullRank, MeanField, + FullRank, + Empirical, sample_approx ) -from . import approximations +# special +from .stein import Stein from . import operators from . import test_functions from . import opvi -from . import updates -from . import inference from . import callbacks diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 643f169d44..24201b8327 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -28,7 +28,7 @@ class MeanField(Approximation): mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - model : :class:`pymc3.Model` + model : :class:`pymc3.Model` PyMC3 model for inference start : `Point` initial mean @@ -38,14 +38,14 @@ class MeanField(Approximation): 1 at the start and 0 in the end. So slow decay will be ok. See (Sticking the Landing; Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016) for details - scale_cost_to_minibatch : `bool` + scale_cost_to_minibatch : `bool` Scale cost to minibatch instead of full dataset, default False random_seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one References - ---------- + ---------- - Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016 Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf @@ -76,12 +76,9 @@ def create_shared_params(self, **kwargs): start = start_ start = self.gbij.map(start) return {'mu': theano.shared( - pm.floatX(start), - 'mu'), + pm.floatX(start), 'mu'), 'rho': theano.shared( - np.zeros((self.global_size,), dtype=theano.config.floatX), - 'rho') - } + pm.floatX(np.zeros((self.global_size,))), 'rho')} def log_q_W_global(self, z): """ @@ -120,13 +117,13 @@ class FullRank(Approximation): archiving better convergence properties. 
Common schedule is 1 at the start and 0 in the end. So slow decay will be ok. See (Sticking the Landing; Geoffrey Roeder, - Yuhuai Wu, David Duvenaud, 2016) for details + Yuhuai Wu, David Duvenaud, 2016) for details scale_cost_to_minibatch : bool, default False Scale cost to minibatch instead of full dataset random_seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one - + Other Parameters ---------------- gpu_compat : bool @@ -138,6 +135,7 @@ class FullRank(Approximation): Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI approximateinference.org/accepted/RoederEtAl2016.pdf """ + def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1, scale_cost_to_minibatch=False, gpu_compat=False, random_seed=None, **kwargs): @@ -173,7 +171,9 @@ def tril_index_matrix(self): num_tril_entries = self.num_tril_entries tril_index_matrix = np.zeros([n, n], dtype=int) tril_index_matrix[np.tril_indices(n)] = np.arange(num_tril_entries) - tril_index_matrix[np.tril_indices(n)[::-1]] = np.arange(num_tril_entries) + tril_index_matrix[ + np.tril_indices(n)[::-1] + ] = np.arange(num_tril_entries) return tril_index_matrix def create_shared_params(self, **kwargs): @@ -184,14 +184,14 @@ def create_shared_params(self, **kwargs): start_ = self.model.test_point.copy() pm.sampling._update_start_vals(start_, start, self.model) start = start_ - start = self.gbij.map(start) + start = pm.floatX(self.gbij.map(start)) n = self.global_size L_tril = ( np.eye(n) [np.tril_indices(n)] .astype(theano.config.floatX) ) - return {'mu': theano.shared(pm.floatX(start), 'mu'), + return {'mu': theano.shared(start, 'mu'), 'L_tril': theano.shared(L_tril, 'L_tril') } @@ -220,7 +220,7 @@ def from_mean_field(cls, mean_field, gpu_compat=False): """Construct FullRank from MeanField approximation Parameters - ---------- + ---------- mean_field : :class:`MeanField` approximation to start with @@ -264,9 +264,9 @@ class Empirical(Approximation): mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - scale_cost_to_minibatch : `bool` + scale_cost_to_minibatch : `bool` Scale cost to minibatch instead of full dataset, default False - model : :class:`pymc3.Model` + model : :class:`pymc3.Model` PyMC3 model for inference random_seed : None or int leave None to use package global RandomStream or other @@ -279,6 +279,7 @@ class Empirical(Approximation): ... trace = sample(1000, step=step) ... 
histogram = Empirical(trace[100:]) """ + def __init__(self, trace, local_rv=None, scale_cost_to_minibatch=False, model=None, random_seed=None, **kwargs): @@ -368,15 +369,15 @@ def from_noise(cls, size, jitter=.01, local_rv=None, Parameters ---------- - size : `int` + size : `int` number of initial particles - jitter : `float` + jitter : `float` initial sd local_rv : `dict` mapping {model_variable -> local_variable} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - start : `Point` + start : `Point` initial point model : :class:`pymc3.Model` PyMC3 model for inference @@ -386,20 +387,25 @@ def from_noise(cls, size, jitter=.01, local_rv=None, kwargs : other kwargs passed to init Returns - ------- + ------- :class:`Empirical` """ - hist = cls(None, local_rv=local_rv, model=model, random_seed=random_seed, **kwargs) + hist = cls( + None, + local_rv=local_rv, + model=model, + random_seed=random_seed, + **kwargs) if start is None: start = hist.model.test_point else: start_ = hist.model.test_point.copy() pm.sampling._update_start_vals(start_, start, hist.model) start = start_ - start = hist.gbij.map(start) + start = pm.floatX(hist.gbij.map(start)) # Initialize particles x0 = np.tile(start, (size, 1)) - x0 += np.random.normal(0, jitter, x0.shape) + x0 += pm.floatX(np.random.normal(0, jitter, x0.shape)) hist.histogram.set_value(x0) return hist @@ -408,7 +414,7 @@ def sample_approx(approx, draws=100, include_transformed=True): """Draw samples from variational posterior. Parameters - ---------- + ---------- approx : :class:`Approximation` Approximation to sample from draws : `int` @@ -417,7 +423,7 @@ def sample_approx(approx, draws=100, include_transformed=True): If True, transformed variables are also sampled. Default is True. Returns - ------- + ------- trace : class:`pymc3.backends.base.MultiTrace` Samples drawn from variational posterior. """ diff --git a/pymc3/variational/callbacks.py b/pymc3/variational/callbacks.py index b6691227e1..964826281a 100644 --- a/pymc3/variational/callbacks.py +++ b/pymc3/variational/callbacks.py @@ -12,12 +12,13 @@ def __call__(self, approx, loss, i): def relative(current, prev, eps=1e-6): - return (np.abs(current - prev)+eps)/(np.abs(prev)+eps) + return (np.abs(current - prev) + eps) / (np.abs(prev) + eps) def absolute(current, prev): return np.abs(current - prev) + _diff = dict( relative=relative, absolute=absolute @@ -32,7 +33,7 @@ class CheckParametersConvergence(Callback): every : int check frequency tolerance : float - if diff norm < tolerance : break + if diff norm < tolerance : break diff : str difference type one of {'absolute', 'relative'} ord : {non-zero int, inf, -inf, 'fro', 'nuc'}, optional @@ -50,7 +51,8 @@ class CheckParametersConvergence(Callback): ... 
) """ - def __init__(self, every=100, tolerance=1e-3, diff='relative', ord=np.inf): + def __init__(self, every=100, tolerance=1e-3, + diff='relative', ord=np.inf): self._diff = _diff[diff] self.ord = ord self.every = every diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index 957c2d9cb2..8cf0ec26a7 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -8,7 +8,7 @@ import pymc3 as pm from pymc3.variational.approximations import MeanField, FullRank, Empirical -from pymc3.variational.operators import KL, KSD +from pymc3.variational.operators import KL, KSD, AKSD from pymc3.variational.opvi import Approximation from pymc3.variational import test_functions @@ -19,6 +19,7 @@ 'ADVI', 'FullRankADVI', 'SVGD', + 'ASVGD', 'Inference', 'fit' ] @@ -41,7 +42,8 @@ class Inference(object): See (AEVB; Kingma and Welling, 2014) for details model : Model PyMC3 Model - kwargs : kwargs for :class:`Approximation` + kwargs : kwargs + additional kwargs for :class:`Approximation` """ def __init__(self, op, approx, tf, local_rv=None, model=None, **kwargs): @@ -53,7 +55,8 @@ def __init__(self, op, approx, tf, local_rv=None, model=None, **kwargs): elif isinstance(approx, Approximation): # pragma: no cover pass else: # pragma: no cover - raise TypeError('approx should be Approximation instance or Approximation subclass') + raise TypeError( + 'approx should be Approximation instance or Approximation subclass') self.objective = op(approx)(tf) approx = property(lambda self: self.objective.approx) @@ -104,7 +107,8 @@ def fit(self, n=10000, score=None, callbacks=None, progressbar=True, calls provided functions after each iteration step progressbar : bool whether to show progressbar or not - kwargs : kwargs for :func:`ObjectiveFunction.step_function` + kwargs : kwargs + additional kwargs for :func:`ObjectiveFunction.step_function` Returns ------- @@ -154,6 +158,9 @@ def _infmean(input_array): if i % 10 == 0: avg_loss = _infmean(scores[max(0, i - 1000):i + 1]) progress.set_description('Average Loss = {:,.5g}'.format(avg_loss)) + avg_loss = scores[max(0, i - 1000):i + 1].mean() + progress.set_description( + 'Average Loss = {:,.5g}'.format(avg_loss)) for callback in callbacks: callback(self.approx, scores[:i + 1], i) except (KeyboardInterrupt, StopIteration) as e: @@ -171,10 +178,12 @@ def _infmean(input_array): i, 100 * i // n, avg_loss)) else: if n < 10: - logger.info('Finished [100%]: Loss = {:,.5g}'.format(scores[-1])) + logger.info( + 'Finished [100%]: Loss = {:,.5g}'.format(scores[-1])) else: avg_loss = _infmean(scores[max(0, i - 1000):i + 1]) - logger.info('Finished [100%]: Average Loss = {:,.5g}'.format(avg_loss)) + logger.info( + 'Finished [100%]: Average Loss = {:,.5g}'.format(avg_loss)) finally: progress.close() self.hist = np.concatenate([self.hist, scores]) @@ -183,13 +192,13 @@ def _infmean(input_array): class ADVI(Inference): R""" Automatic Differentiation Variational Inference (ADVI) - + This class implements the meanfield ADVI, where the variational posterior distribution is assumed to be spherical Gaussian without correlation of parameters and fit to the true posterior distribution. The means and standard deviations of the variational posterior are referred to as variational parameters. - + For explanation, we classify random variables in probabilistic models into three types. Observed random variables :math:`{\cal Y}=\{\mathbf{y}_{i}\}_{i=1}^{N}` are :math:`N` observations. 
@@ -268,42 +277,42 @@ class ADVI(Inference): When using mini-batches, :math:`c_{o}^{k}` and :math:`c_{l}^{k}` should be set to :math:`N/M`, where :math:`M` is the number of observations in each - mini-batch. This is done with supplying `total_size` parameter to + mini-batch. This is done with supplying `total_size` parameter to observed nodes (e.g. :code:`Normal('x', 0, 1, observed=data, total_size=10000)`). In this case it is possible to automatically determine appropriate scaling for :math:`logp` - of observed nodes. Interesting to note that it is possible to have two independent + of observed nodes. Interesting to note that it is possible to have two independent observed variables with different `total_size` and iterate them independently - during inference. + during inference. For working with ADVI, we need to give - + - The probabilistic model `model` with three types of RVs (`observed_RVs`, - `global_RVs` and `local_RVs`). - + `global_RVs` and `local_RVs`). + - (optional) Minibatches - The tensors to which mini-bathced samples are supplied are - handled separately by using callbacks in :func:`Inference.fit` method - that change storage of shared theano variable or by :func:`pymc3.generator` - that automatically iterates over minibatches and defined beforehand. - + The tensors to which mini-bathced samples are supplied are + handled separately by using callbacks in :func:`Inference.fit` method + that change storage of shared theano variable or by :func:`pymc3.generator` + that automatically iterates over minibatches and defined beforehand. + - (optional) Parameters of deterministic mappings - They have to be passed along with other params to :func:`Inference.fit` method - as `more_obj_params` argument. + They have to be passed along with other params to :func:`Inference.fit` method + as `more_obj_params` argument. - For more information concerning training stage please reference + For more information concerning training stage please reference :func:`pymc3.variational.opvi.ObjectiveFunction.step_function` - + Parameters ---------- local_rv : dict[var->tuple] mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - model : :class:`pymc3.Model` + model : :class:`pymc3.Model` PyMC3 model for inference cost_part_grad_scale : `scalar` Scaling score part of gradient can be useful near optimum for @@ -315,7 +324,7 @@ class ADVI(Inference): Scale cost to minibatch instead of full dataset, default False random_seed : None or int leave None to use package global RandomStream or other - valid value to create instance specific one + valid value to create instance specific one start : `Point` starting point for inference @@ -374,10 +383,10 @@ class FullRankADVI(Inference): Parameters ---------- local_rv : dict[var->tuple] - mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} + mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - model : :class:`pymc3.Model` + model : :class:`pymc3.Model` PyMC3 model for inference cost_part_grad_scale : `scalar` Scaling score part of gradient can be useful near optimum for @@ -498,7 +507,7 @@ class SVGD(Inference): they fit target distribution best. 
Algorithm is outlined below - + *Input:* A target distribution with density function :math:`p(x)` and a set of initial particles :math:`{x^0_i}^n_{i=1}` @@ -506,8 +515,8 @@ class SVGD(Inference): .. math:: - x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l) \\ - \hat{\phi}^{*}(x) = \frac{1}{n}\sum^{n}_{j=1}[k(x^l_j,x) \nabla_{x^l_j} logp(x^l_j)+ \nabla_{x^l_j} k(x^l_j,x)] + x_i^{l+1} &\leftarrow x_i^{l} + \epsilon_l \hat{\phi}^{*}(x_i^l) \\ + \hat{\phi}^{*}(x) &= \frac{1}{n}\sum^{n}_{j=1}[k(x^l_j,x) \nabla_{x^l_j} logp(x^l_j)+ \nabla_{x^l_j} k(x^l_j,x)] Parameters ---------- @@ -552,7 +561,95 @@ def __init__(self, n_particles=100, jitter=.01, model=None, kernel=test_function model=model, random_seed=random_seed) -def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, start=None, **kwargs): +class ASVGD(Inference): + R""" + Amortized Stein Variational Gradient Descent + + This inference is based on Kernelized Stein Discrepancy + it's main idea is to move initial noisy particles so that + they fit target distribution best. + + Algorithm is outlined below + + *Input:* Parametrized random generator :math:`R_{\theta}` + + *Output:* :math:`R_{\theta^{*}}` that approximates the target distribution. + + .. math:: + + \Delta x_i &= \hat{\phi}^{*}(x_i) \\ + \hat{\phi}^{*}(x) &= \frac{1}{n}\sum^{n}_{j=1}[k(x_j,x) \nabla_{x_j} logp(x_j)+ \nabla_{x_j} k(x_j,x)] \\ + \Delta_{\theta} &= \frac{1}{n}\sum^{n}_{i=1}\Delta x_i\frac{\partial x_i}{\partial \theta} + + Parameters + ---------- + approx : :class:`Approximation` + local_rv : dict[var->tuple] + mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)} + Local Vars are used for Autoencoding Variational Bayes + See (AEVB; Kingma and Welling, 2014) for details + kernel : `callable` + kernel function for KSD :math:`f(histogram) -> (k(x,.), \nabla_x k(x,.))` + model : :class:`Model` + kwargs : kwargs for :class:`Approximation` + + References + ---------- + - Dilin Wang, Yihao Feng, Qiang Liu (2016) + Learning to Sample Using Stein Discrepancy + http://bayesiandeeplearning.org/papers/BDL_21.pdf + + - Dilin Wang, Qiang Liu (2016) + Learning to Draw Samples: With Application to Amortized MLE for Generative Adversarial Learning + https://arxiv.org/abs/1611.01722 + """ + + def __init__(self, approx=FullRank, local_rv=None, + kernel=test_functions.rbf, model=None, **kwargs): + super(ASVGD, self).__init__( + op=AKSD, + approx=approx, + local_rv=local_rv, + tf=kernel, + model=model, + **kwargs + ) + + def fit(self, n=10000, score=None, callbacks=None, progressbar=True, + obj_n_mc=30, **kwargs): + """ + Performs Amortized Stein Variational Gradient Descent + + Parameters + ---------- + n : int + number of iterations + score : bool + evaluate loss on each iteration or not + callbacks : list[function : (Approximation, losses, i) -> None] + calls provided functions after each iteration step + progressbar : bool + whether to show progressbar or not + obj_n_mc : int + sample `n` particles for Stein gradient + kwargs : kwargs + additional kwargs for :func:`ObjectiveFunction.step_function` + + Returns + ------- + Approximation + """ + return super(ASVGD, self).fit( + n=n, score=score, callbacks=callbacks, + progressbar=progressbar, obj_n_mc=obj_n_mc, **kwargs) + + def run_profiling(self, n=1000, score=None, obj_n_mc=30, **kwargs): + return super(ASVGD, self).run_profiling( + n=n, score=score, obj_n_mc=obj_n_mc, **kwargs) + + +def fit(n=10000, local_rv=None, method='advi', model=None, + random_seed=None, start=None, inf_kwargs=None, 
**kwargs): R""" Handy shortcut for using inference methods in functional way @@ -565,12 +662,14 @@ def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, sta Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details method : str or :class:`Inference` - string name is case insensitive in {'advi', 'fullrank_advi', 'advi->fullrank_advi', 'svgd'} + string name is case insensitive in {'advi', 'fullrank_advi', 'advi->fullrank_advi', 'svgd', 'asvgd'} model : :class:`pymc3.Model` PyMC3 model for inference random_seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one + inf_kwargs : dict + additional kwargs passed to :class:`Inference` start : `Point` starting point for inference @@ -578,26 +677,34 @@ def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, sta ---------------- frac : `float` if method is 'advi->fullrank_advi' represents advi fraction when training - kwargs : kwargs for :func:`Inference.fit` + kwargs : kwargs + additional kwargs for :func:`Inference.fit` Returns ------- :class:`Approximation` """ + if inf_kwargs is None: + inf_kwargs = dict() if model is None: model = pm.modelcontext(model) _select = dict( advi=ADVI, fullrank_advi=FullRankADVI, - svgd=SVGD + svgd=SVGD, + asvgd=ASVGD ) if isinstance(method, str) and method.lower() == 'advi->fullrank_advi': frac = kwargs.pop('frac', .5) if not 0. < frac < 1.: raise ValueError('frac should be in (0, 1)') n1 = int(n * frac) - n2 = n-n1 - inference = ADVI(local_rv=local_rv, model=model, random_seed=random_seed, start=start) + n2 = n - n1 + inference = ADVI( + local_rv=local_rv, + model=model, + random_seed=random_seed, + start=start) logger.info('fitting advi ...') inference.fit(n1, **kwargs) inference = FullRankADVI.from_advi(inference) @@ -608,7 +715,8 @@ def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, sta try: inference = _select[method.lower()]( local_rv=local_rv, model=model, random_seed=random_seed, - start=start + start=start, + **inf_kwargs ) except KeyError: raise KeyError('method should be one of %s ' diff --git a/pymc3/variational/operators.py b/pymc3/variational/operators.py index dd78a813bc..5eb815f467 100644 --- a/pymc3/variational/operators.py +++ b/pymc3/variational/operators.py @@ -1,7 +1,7 @@ from theano import theano, tensor as tt from pymc3.variational.opvi import Operator, ObjectiveFunction, _warn_not_used -from pymc3.variational.updates import adam -import pymc3 as pm +from pymc3.variational.stein import Stein +from pymc3.variational import updates __all__ = [ 'KL', @@ -17,6 +17,7 @@ class KL(Operator): KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv """ + def apply(self, f): z = self.input return self.logq_norm(z) - self.logp_norm(z) @@ -25,16 +26,44 @@ def apply(self, f): class KSDObjective(ObjectiveFunction): - def add_obj_updates(self, updates, obj_n_mc=None, obj_optimizer=adam, - more_obj_params=None, more_replacements=None): - if obj_n_mc is not None: - _warn_not_used('obj_n_mc', self.op) - d_obj_padams = self(None) - d_obj_padams = theano.clone(d_obj_padams, more_replacements, strict=False) - updates.update(obj_optimizer([d_obj_padams], self.obj_params)) + """Helper class for construction loss and updates for variational inference + + Parameters + ---------- + op : :class:`KSD` + OPVI Functional operator + tf : :class:`TestFunction` + OPVI TestFunction + """ - def __call__(self, z): - return self.op.apply(self.tf) + def 
__init__(self, op, tf): + if not isinstance(op, KSD): + raise TypeError('Op should be KSD') + ObjectiveFunction.__init__(self, op, tf) + + def get_input(self, n_mc): + if hasattr(self.approx, 'histogram'): + if n_mc is not None: + _warn_not_used('n_mc', self.op) + return self.approx.histogram + elif n_mc is not None and n_mc > 1: + return self.approx.random(n_mc) + else: + raise ValueError('Variational type approximation requires ' + 'sample size (`n_mc` : int > 1 should be passed)') + + def __call__(self, z, **kwargs): + op = self.op # type: KSD + grad = op.apply(self.tf) + if 'more_obj_params' in kwargs: + params = self.obj_params + kwargs['more_obj_params'] + else: + params = self.test_params + kwargs['more_tf_params'] + grad *= -1 + grad = theano.clone(grad, {op.input_matrix: z}) + grad = tt.grad(None, params, known_grads={z: grad}) + grad = updates.total_norm_constraint(grad, 10) + return grad class KSD(Operator): @@ -45,7 +74,7 @@ class KSD(Operator): and a set of initial particles :math:`\{x^0_i\}^n_{i=1}` Output: A set of particles :math:`\{x_i\}^n_{i=1}` that approximates the target distribution. - + .. math:: x_i^{l+1} \leftarrow \epsilon_l \hat{\phi}^{*}(x_i^l) \\ @@ -68,22 +97,14 @@ class KSD(Operator): OBJECTIVE = KSDObjective def __init__(self, approx): - if not isinstance(approx, pm.Empirical): - raise ValueError('approx should be an Empirical approximation, got %r' % approx) Operator.__init__(self, approx) + self.input_matrix = tt.matrix('KSD input matrix') def apply(self, f): # f: kernel function for KSD f(histogram) -> (k(x,.), \nabla_x k(x,.)) - X = self.approx.histogram - t = self.approx.normalizing_constant - dlogpdx = theano.scan( - fn=lambda zg: theano.grad(self.logp_norm(zg), zg), - sequences=[X] - )[0] # bottleneck - Kxy, dxkxy = f(X) - # scaling factor - # not needed for Kxy as we already scaled dlogpdx - dxkxy /= t - n = X.shape[0].astype('float32') / t - svgd_grad = (tt.dot(Kxy, dlogpdx) + dxkxy) / n - return -1 * svgd_grad # gradient + stein = Stein(self.approx, f, self.input_matrix) + return -1 * stein.grad + + +class AKSD(KSD): + SUPPORT_AEVB = True diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py index 39f17f2030..dc0ff7fa5a 100644 --- a/pymc3/variational/opvi.py +++ b/pymc3/variational/opvi.py @@ -69,10 +69,11 @@ class ObjectiveFunction(object): Parameters ---------- op : :class:`Operator` - OPVI Functional operator + OPVI Functional operator tf : :class:`TestFunction` OPVI TestFunction """ + def __init__(self, op, tf): self.op = op self.tf = tf @@ -132,10 +133,6 @@ def updates(self, obj_n_mc=None, tf_n_mc=None, obj_optimizer=adam, test_optimize more_updates = dict() if more_replacements is None: more_replacements = dict() - if not self.op.RETURNS_LOSS and (more_obj_params or more_tf_params): - raise ValueError('%s does not support, differentiation with ' - 'additional params, try passing your updates ' - 'via more_updates kwarg' % self.op.__class__) resulting_updates = ObjectiveUpdates() if self.test_params: self.add_test_updates( @@ -160,20 +157,33 @@ def updates(self, obj_n_mc=None, tf_n_mc=None, obj_optimizer=adam, test_optimize resulting_updates.update(more_updates) return resulting_updates - def add_test_updates(self, updates, tf_n_mc=None, test_optimizer=adam, more_tf_params=None, more_replacements=None): - tf_z = self.random(tf_n_mc) - tf_target = -self(tf_z) + def add_test_updates(self, updates, tf_n_mc=None, test_optimizer=adam, + more_tf_params=None, more_replacements=None): + tf_z = self.get_input(tf_n_mc) + tf_target = 
self(tf_z, more_tf_params=more_tf_params) tf_target = theano.clone(tf_target, more_replacements, strict=False) - updates.update(test_optimizer(tf_target, self.test_params + more_tf_params)) - - def add_obj_updates(self, updates, obj_n_mc=None, obj_optimizer=adam, more_obj_params=None, more_replacements=None): - obj_z = self.random(obj_n_mc) - obj_target = self(obj_z) + updates.update( + test_optimizer( + tf_target, + self.test_params + + more_tf_params)) + + def add_obj_updates(self, updates, obj_n_mc=None, obj_optimizer=adam, + more_obj_params=None, more_replacements=None): + obj_z = self.get_input(obj_n_mc) + obj_target = self(obj_z, more_obj_params=more_obj_params) obj_target = theano.clone(obj_target, more_replacements, strict=False) - updates.update(obj_optimizer(obj_target, self.obj_params + more_obj_params)) + updates.update( + obj_optimizer( + obj_target, + self.obj_params + + more_obj_params)) if self.op.RETURNS_LOSS: updates.loss = obj_target + def get_input(self, n_mc): + return self.random(n_mc) + @memoize @change_flags(compute_test_value='off') def step_function(self, obj_n_mc=None, tf_n_mc=None, @@ -184,7 +194,7 @@ def step_function(self, obj_n_mc=None, tf_n_mc=None, R"""Step function that should be called on each optimization step. Generally it solves the following problem: - + .. math:: \mathbf{\lambda^{*}} = \inf_{\lambda} \sup_{\theta} t(\mathbb{E}_{\lambda}[(O^{p,q}f_{\theta})(z)]) @@ -228,7 +238,8 @@ def step_function(self, obj_n_mc=None, tf_n_mc=None, more_updates=more_updates, more_replacements=more_replacements) if score: - step_fn = theano.function([], updates.loss, updates=updates, **fn_kwargs) + step_fn = theano.function( + [], updates.loss, updates=updates, **fn_kwargs) else: step_fn = theano.function([], None, updates=updates, **fn_kwargs) return step_fn @@ -257,7 +268,10 @@ def score_function(self, sc_n_mc=None, more_replacements=None, fn_kwargs=None): raise NotImplementedError('%s does not have loss' % self.op) if more_replacements is None: more_replacements = {} - loss = theano.clone(self(self.random(sc_n_mc)), more_replacements, strict=False) + loss = theano.clone( + self(self.random(sc_n_mc)), + more_replacements, + strict=False) return theano.function([], loss, **fn_kwargs) def __getstate__(self): @@ -266,14 +280,22 @@ def __getstate__(self): def __setstate__(self, state): self.__init__(*state) - def __call__(self, z): + def __call__(self, z, **kwargs): + if 'more_tf_params' in kwargs: + m = -1 + else: + m = 1 if z.ndim > 1: a = theano.scan( - lambda z_: theano.clone(self.op.apply(self.tf), {self.op.input: z_}, strict=False), + lambda z_: theano.clone( + self.op.apply(self.tf), + {self.op.input: z_}, strict=False), sequences=z, n_steps=z.shape[0])[0].mean() else: - a = theano.clone(self.op.apply(self.tf), {self.op.input: z}, strict=False) - return self.op.T(a) + a = theano.clone( + self.op.apply(self.tf), + {self.op.input: z}, strict=False) + return m * self.op.T(a) class Operator(object): @@ -305,32 +327,14 @@ def __init__(self, approx): flat_view = property(lambda self: self.approx.flat_view) input = property(lambda self: self.approx.flat_view.input) - def logp(self, z): - factors = ([tt.sum(var.logpt)for var in self.model.basic_RVs] + - [tt.sum(var) for var in self.model.potentials]) - - p = self.approx.to_flat_input(tt.add(*factors)) - p = theano.clone(p, {self.input: z}) - return p - - def logp_norm(self, z): - t = self.approx.normalizing_constant - factors = ([tt.sum(var.logpt) / t for var in self.model.basic_RVs] + - [tt.sum(var) / t for var in 
self.model.potentials]) - logpt = tt.add(*factors) - p = self.approx.to_flat_input(logpt) - p = theano.clone(p, {self.input: z}) - return p - - def logq(self, z): - return self.approx.logq(z) - - def logq_norm(self, z): - return self.approx.logq_norm(z) + logp = property(lambda self: self.approx.logp) + logq = property(lambda self: self.approx.logq) + logp_norm = property(lambda self: self.approx.logp_norm) + logq_norm = property(lambda self: self.approx.logq_norm) def apply(self, f): # pragma: no cover R"""Operator itself - + .. math:: (O^{p,q}f_{\theta})(z) @@ -357,7 +361,9 @@ def __call__(self, f=None): f = TestFunction.from_function(f) else: if f is not None: - warnings.warn('TestFunction for %s is redundant and removed' % self) + warnings.warn( + 'TestFunction for %s is redundant and removed' % + self) else: pass f = TestFunction() @@ -399,7 +405,8 @@ def cast_to_list(params): elif params is None: return [] else: - raise TypeError('Unknown type %s for %r, need list, dict or shared variable') + raise TypeError( + 'Unknown type %s for %r, need list, dict or shared variable') class TestFunction(object): @@ -432,7 +439,7 @@ def _setup(self, dim): Parameters ---------- - dim : int + dim : int dimension of posterior distribution """ pass @@ -455,14 +462,14 @@ class Approximation(object): mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details - model : :class:`Model` + model : :class:`Model` PyMC3 model for inference cost_part_grad_scale : float or scalar tensor Scaling score part of gradient can be useful near optimum for archiving better convergence properties. Common schedule is 1 at the start and 0 in the end. So slow decay will be ok. 
See (Sticking the Landing; Geoffrey Roeder, - Yuhuai Wu, David Duvenaud, 2016) for details + Yuhuai Wu, David Duvenaud, 2016) for details scale_cost_to_minibatch : bool, default False Scale cost to minibatch instead of full dataset random_seed : None or int @@ -568,10 +575,12 @@ def seed(self, random_seed=None): @property def normalizing_constant(self): - t = self.to_flat_input(tt.max([v.scaling for v in self.model.basic_RVs])) + t = self.to_flat_input( + tt.max([v.scaling for v in self.model.basic_RVs])) t = theano.clone(t, {self.input: tt.zeros(self.total_size)}) # if not scale_cost_to_minibatch: t=1 - t = tt.switch(self.scale_cost_to_minibatch, t, tt.constant(1, dtype=t.dtype)) + t = tt.switch(self.scale_cost_to_minibatch, t, + tt.constant(1, dtype=t.dtype)) return t def _setup(self, **kwargs): @@ -589,8 +598,12 @@ def get_local_vars(self, **kwargs): def check_model(self, model, **kwargs): """Checks that model is valid for variational inference """ - vars_ = [var for var in model.vars if not isinstance(var, pm.model.ObservedRV)] - if any([var.dtype in pm.discrete_types for var in vars_]): # pragma: no cover + vars_ = [ + var for var in model.vars + if not isinstance(var, pm.model.ObservedRV) + ] + if any([var.dtype in pm.discrete_types for var in vars_] + ): # pragma: no cover raise ValueError('Model should not include discrete RVs') def create_shared_params(self, **kwargs): @@ -630,7 +643,8 @@ def construct_replacements(self, include=None, exclude=None, Replacements """ if include is not None and exclude is not None: - raise ValueError('Only one parameter is supported {include|exclude}, got two') + raise ValueError( + 'Only one parameter is supported {include|exclude}, got two') if include is not None: # pragma: no cover replacements = {k: v for k, v in self.flat_view.replacements.items() if k in include} @@ -724,11 +738,11 @@ def initial(self, size, no_rand=False, l=None): Parameters ---------- - size : `int` + size : `int` number of samples no_rand : `bool` return zeros if True - l : `int` + l : `int` length of sample, defaults to latent space dim Returns @@ -924,6 +938,22 @@ def logq(self, z): def logq_norm(self, z): return self.logq(z) / self.normalizing_constant + def logp(self, z): + factors = ([tt.sum(var.logpt)for var in self.model.basic_RVs] + + [tt.sum(var) for var in self.model.potentials]) + p = self.to_flat_input(tt.add(*factors)) + p = theano.clone(p, {self.input: z}) + return p + + def logp_norm(self, z): + t = self.normalizing_constant + factors = ([tt.sum(var.logpt) / t for var in self.model.basic_RVs] + + [tt.sum(var) / t for var in self.model.potentials]) + logpt = tt.add(*factors) + p = self.to_flat_input(logpt) + p = theano.clone(p, {self.input: z}) + return p + def view(self, space, name, reshape=True): """Construct view on a variable from flattened `space` @@ -949,7 +979,9 @@ def view(self, space, name, reshape=True): elif space.ndim < 2: view = space[slc] else: # pragma: no cover - raise ValueError('Space should have no more than 2 dims, got %d' % space.ndim) + raise ValueError( + 'Space should have no more than 2 dims, got %d' % + space.ndim) if reshape: if len(_shape) > 0: if theano_is_here: diff --git a/pymc3/variational/stein.py b/pymc3/variational/stein.py new file mode 100644 index 0000000000..0e6ef30f70 --- /dev/null +++ b/pymc3/variational/stein.py @@ -0,0 +1,71 @@ +from theano import theano, tensor as tt +from pymc3.variational.test_functions import rbf +from pymc3.theanof import memoize + +__all__ = [ + 'Stein' +] + + +class Stein(object): + def 
__init__(self, approx, kernel=rbf, input_matrix=None): + self.approx = approx + self._kernel_f = kernel + if input_matrix is None: + input_matrix = tt.matrix('stein_input_matrix') + input_matrix.tag.test_value = approx.random(10).tag.test_value + self.input_matrix = input_matrix + + @property + @memoize + def grad(self): + t = self.approx.normalizing_constant + Kxy, dxkxy = self.Kxy, self.dxkxy + dlogpdx = self.dlogp # Normalized + n = self.input_matrix.shape[0].astype('float32') + svgd_grad = (tt.dot(Kxy, dlogpdx) + dxkxy/t) / n + return svgd_grad + + @property + def Kxy(self): + return self._kernel()[0] + + @property + def dxkxy(self): + return self._kernel()[1] + + @property + @memoize + def logp(self): + return theano.scan( + fn=lambda zg: self.approx.logp_norm(zg), + sequences=[self.input_matrix] + )[0] + + @property + @memoize + def dlogp(self): + return theano.scan( + fn=lambda zg: theano.grad(self.approx.logp_norm(zg), zg), + sequences=[self.input_matrix] + )[0] + + @memoize + def _kernel(self): + return self._kernel_f(self.input_matrix) + + def get_approx_input(self, size=100): + """ + + Parameters + ---------- + size : if approx is not Empirical, takes `n=size` random samples + + Returns + ------- + matrix + """ + if hasattr(self.approx, 'histogram'): + return self.approx.histogram + else: + return self.approx.random(size) diff --git a/pymc3/variational/test_functions.py b/pymc3/variational/test_functions.py index ed9badfdc2..012fb5f156 100644 --- a/pymc3/variational/test_functions.py +++ b/pymc3/variational/test_functions.py @@ -1,4 +1,4 @@ -from theano import tensor as tt +from theano import theano, tensor as tt from .opvi import TestFunction @@ -10,9 +10,9 @@ class Kernel(TestFunction): """ Dummy base class for kernel SVGD in case we implement more - + .. math:: - + f(x) -> (k(x,.), \nabla_x k(x,.)) """ @@ -21,26 +21,30 @@ class Kernel(TestFunction): class RBF(Kernel): def __call__(self, X): XY = X.dot(X.T) - x2 = tt.reshape(tt.sum(tt.square(X), axis=1), (X.shape[0], 1)) + x2 = tt.sum(X ** 2, axis=1).dimshuffle(0, 'x') X2e = tt.repeat(x2, X.shape[0], axis=1) - H = tt.sub(tt.add(X2e, X2e.T), 2 * XY) + H = X2e + X2e.T - 2. * XY V = tt.sort(H.flatten()) length = V.shape[0] # median distance - h = tt.switch(tt.eq((length % 2), 0), + m = tt.switch(tt.eq((length % 2), 0), # if even vector - tt.mean(V[((length//2)-1):((length//2)+1)]), + tt.mean(V[((length // 2) - 1):((length // 2) + 1)]), # if odd vector V[length // 2]) - h = tt.sqrt(0.5 * h / tt.log(X.shape[0].astype('float32') + 1.0)) + h = .5 * m / tt.log(tt.cast(H.shape[0] + 1., theano.config.floatX)) - Kxy = tt.exp(-H / h ** 2 / 2.0) + # RBF + Kxy = tt.exp(-H / h / 2.0) + + # Derivative dxkxy = -tt.dot(Kxy, X) sumkxy = tt.sum(Kxy, axis=1).dimshuffle(0, 'x') - dxkxy = tt.add(dxkxy, tt.mul(X, sumkxy)) / (h ** 2) + dxkxy = tt.add(dxkxy, tt.mul(X, sumkxy)) / h return Kxy, dxkxy + rbf = RBF() diff --git a/pymc3/variational/updates.py b/pymc3/variational/updates.py index 781f13b36f..598162674b 100755 --- a/pymc3/variational/updates.py +++ b/pymc3/variational/updates.py @@ -187,12 +187,12 @@ def sgd(loss_or_grads=None, params=None, learning_rate=1e-3): ------- OrderedDict A dictionary mapping each parameter to its update expression - + Notes ----- Optimizer can be called without both loss_or_grads and params in that case partial function is returned - + Examples -------- >>> a = theano.shared(1.) 
@@ -210,7 +210,8 @@ def sgd(loss_or_grads=None, params=None, learning_rate=1e-3): if loss_or_grads is None and params is None: return partial(sgd, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() @@ -268,7 +269,8 @@ def apply_momentum(updates, params=None, momentum=0.9): return updates -def momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): +def momentum(loss_or_grads=None, params=None, + learning_rate=1e-3, momentum=0.9): """Stochastic Gradient Descent (SGD) updates with momentum Generates update expressions of the form: @@ -300,12 +302,12 @@ def momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): Optimizer can be called without both loss_or_grads and params in that case partial function is returned - + See Also -------- apply_momentum : Generic function applying momentum to updates nesterov_momentum : Nesterov's variant of SGD with momentum - + Examples -------- >>> a = theano.shared(1.) @@ -323,7 +325,8 @@ def momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): if loss_or_grads is None and params is None: return partial(pm.updates.momentum, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') updates = sgd(loss_or_grads, params, learning_rate) return apply_momentum(updates, momentum=momentum) @@ -382,7 +385,8 @@ def apply_nesterov_momentum(updates, params=None, momentum=0.9): return updates -def nesterov_momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): +def nesterov_momentum(loss_or_grads=None, params=None, + learning_rate=1e-3, momentum=0.9): """Stochastic Gradient Descent (SGD) updates with Nesterov momentum Generates update expressions of the form: @@ -417,7 +421,7 @@ def nesterov_momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momen position in parameter space. Here, we use the formulation described at https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617, which allows the gradient to be evaluated at the current parameters. - + Optimizer can be called without both loss_or_grads and params in that case partial function is returned @@ -442,7 +446,8 @@ def nesterov_momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momen if loss_or_grads is None and params is None: return partial(nesterov_momentum, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') updates = sgd(loss_or_grads, params, learning_rate) return apply_nesterov_momentum(updates, momentum=momentum) @@ -480,7 +485,7 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): as such the learning rate is monotonically decreasing. Epsilon is not included in the typical formula, see [2]_. - + Optimizer can be called without both loss_or_grads and params in that case partial function is returned @@ -492,7 +497,7 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): .. 
[2] Chris Dyer: Notes on AdaGrad. http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf - + Examples -------- >>> a = theano.shared(1.) @@ -510,7 +515,8 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): if loss_or_grads is None and params is None: return partial(adagrad, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() @@ -574,7 +580,8 @@ def adagrad_window(loss_or_grads=None, params=None, return updates -def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon=1e-6): +def rmsprop(loss_or_grads=None, params=None, + learning_rate=1.0, rho=0.9, epsilon=1e-6): """RMSProp updates Scale learning rates by dividing with the moving average of the root mean @@ -610,8 +617,8 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon .. math:: r_t &= \\rho r_{t-1} + (1-\\rho)*g^2\\\\ \\eta_t &= \\frac{\\eta}{\\sqrt{r_t + \\epsilon}} - - + + Optimizer can be called without both loss_or_grads and params in that case partial function is returned @@ -620,7 +627,7 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon .. [1] Tieleman, tt. and Hinton, G. (2012): Neural Networks for Machine Learning, Lecture 6.5 - rmsprop. Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU (formula @5:20) - + Examples -------- >>> a = theano.shared(1.) @@ -638,7 +645,8 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon if loss_or_grads is None and params is None: return partial(rmsprop, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() @@ -657,7 +665,8 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon return updates -def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsilon=1e-6): +def adadelta(loss_or_grads=None, params=None, + learning_rate=1.0, rho=0.95, epsilon=1e-6): """ Adadelta updates Scale learning rates by the ratio of accumulated gradients to accumulated @@ -703,7 +712,7 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil \\eta_t &= \\eta \\frac{\\sqrt{s_{t-1} + \\epsilon}} {\sqrt{r_t + \epsilon}}\\\\ s_t &= \\rho s_{t-1} + (1-\\rho)*(\\eta_t*g)^2 - + Optimizer can be called without both loss_or_grads and params in that case partial function is returned @@ -730,7 +739,8 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil if loss_or_grads is None and params is None: return partial(adadelta, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() @@ -793,7 +803,7 @@ def adam(loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, The paper [1]_ includes an additional hyperparameter lambda. 
This is only needed to prove convergence of the algorithm and has no practical use (personal communication with the authors), it is therefore omitted here. - + Optimizer can be called without both loss_or_grads and params in that case partial function is returned @@ -802,7 +812,7 @@ def adam(loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, .. [1] Kingma, Diederik, and Jimmy Ba (2014): Adam: A Method for Stochastic Optimization. arXiv preprint arXiv:1412.6980. - + Examples -------- >>> a = theano.shared(1.) @@ -820,7 +830,8 @@ def adam(loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, if loss_or_grads is None and params is None: return partial(adam, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') all_grads = get_or_compute_grads(loss_or_grads, params) t_prev = theano.shared(pm.theanof.floatX(0.)) updates = OrderedDict() @@ -829,7 +840,7 @@ def adam(loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, one = tt.constant(1) t = t_prev + 1 - a_t = learning_rate*tt.sqrt(one-beta2**t)/(one-beta1**t) + a_t = learning_rate * tt.sqrt(one - beta2**t) / (one - beta1**t) for param, g_t in zip(params, all_grads): value = param.get_value(borrow=True) @@ -838,9 +849,9 @@ def adam(loss_or_grads=None, params=None, learning_rate=0.001, beta1=0.9, v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) - m_t = beta1*m_prev + (one-beta1)*g_t - v_t = beta2*v_prev + (one-beta2)*g_t**2 - step = a_t*m_t/(tt.sqrt(v_t) + epsilon) + m_t = beta1 * m_prev + (one - beta1) * g_t + v_t = beta2 * v_prev + (one - beta2) * g_t**2 + step = a_t * m_t / (tt.sqrt(v_t) + epsilon) updates[m_prev] = m_t updates[v_prev] = v_t @@ -876,7 +887,7 @@ def adamax(loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, ------- OrderedDict A dictionary mapping each parameter to its update expression - + Notes ----- Optimizer can be called without both loss_or_grads and params @@ -887,7 +898,7 @@ def adamax(loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, .. [1] Kingma, Diederik, and Jimmy Ba (2014): Adam: A Method for Stochastic Optimization. arXiv preprint arXiv:1412.6980. - + Examples -------- >>> a = theano.shared(1.) 
@@ -905,7 +916,8 @@ def adamax(loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, if loss_or_grads is None and params is None: return partial(adamax, **_get_call_kwargs(locals())) elif loss_or_grads is None or params is None: - raise ValueError('Please provide both `loss_or_grads` and `params` to get updates') + raise ValueError( + 'Please provide both `loss_or_grads` and `params` to get updates') all_grads = get_or_compute_grads(loss_or_grads, params) t_prev = theano.shared(pm.theanof.floatX(0.)) updates = OrderedDict() @@ -914,7 +926,7 @@ def adamax(loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, one = tt.constant(1) t = t_prev + 1 - a_t = learning_rate/(one-beta1**t) + a_t = learning_rate / (one - beta1**t) for param, g_t in zip(params, all_grads): value = param.get_value(borrow=True) @@ -923,9 +935,9 @@ def adamax(loss_or_grads=None, params=None, learning_rate=0.002, beta1=0.9, u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) - m_t = beta1*m_prev + (one-beta1)*g_t - u_t = tt.maximum(beta2*u_prev, abs(g_t)) - step = a_t*m_t/(u_t + epsilon) + m_t = beta1 * m_prev + (one - beta1) * g_t + u_t = tt.maximum(beta2 * u_prev, abs(g_t)) + step = a_t * m_t / (u_t + epsilon) updates[m_prev] = m_t updates[u_prev] = u_t @@ -1075,7 +1087,7 @@ def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, dtype = np.dtype(theano.config.floatX).type target_norm = tt.clip(norm, 0, dtype(max_norm)) multiplier = target_norm / (dtype(epsilon) + norm) - tensor_vars_scaled = [step*multiplier for step in tensor_vars] + tensor_vars_scaled = [step * multiplier for step in tensor_vars] if return_norm: return tensor_vars_scaled, norm
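The ASVGD class and the extended fit() shortcut introduced in this patch can be exercised roughly as follows. This is a minimal sketch, not part of the diff itself: the toy model, iteration count, learning rate and callback settings are illustrative assumptions, not values prescribed by the patch.

import numpy as np
import pymc3 as pm
from pymc3.variational import ASVGD, fit, adagrad_window
from pymc3.variational.test_functions import rbf

# Toy data: Bernoulli observations with unknown success probability
# (an assumption for illustration only).
data = np.random.binomial(n=1, p=0.2, size=1000)

with pm.Model() as model:
    p = pm.Beta('p', alpha=1, beta=1)
    pm.Binomial('obs', n=1, p=p, observed=data)

    # Class-based interface: ASVGD wraps the AKSD operator around a
    # parametric approximation (FullRank by default).
    inference = ASVGD(model=model)
    approx = inference.fit(
        10000,
        obj_n_mc=30,  # particles drawn per Stein gradient estimate
        obj_optimizer=adagrad_window(learning_rate=0.002),
        callbacks=[pm.callbacks.CheckParametersConvergence(every=500)])

    # Functional shortcut with the new 'asvgd' method name; inf_kwargs
    # is forwarded to the ASVGD constructor (rbf is already the default
    # kernel, passed here only to show the plumbing).
    approx2 = fit(10000, method='asvgd', model=model,
                  inf_kwargs=dict(kernel=rbf))

    trace = approx.sample(5000)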