diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index ef46d5a3fe5f1..32f49dd77d3bf 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1029,7 +1029,7 @@ Vector Machine, a Decision Tree, and a K-nearest neighbor classifier:: >>> # Training classifiers >>> clf1 = DecisionTreeClassifier(max_depth=4) >>> clf2 = KNeighborsClassifier(n_neighbors=7) - >>> clf3 = SVC(kernel='rbf', probability=True) + >>> clf3 = SVC(gamma='scale', kernel='rbf', probability=True) >>> eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)], voting='soft', weights=[2,1,2]) >>> clf1 = clf1.fit(X,y) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 03aacf194df7d..36045a7fa6199 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -98,9 +98,9 @@ Usage examples: >>> from sklearn.model_selection import cross_val_score >>> iris = datasets.load_iris() >>> X, y = iris.data, iris.target - >>> clf = svm.SVC(probability=True, random_state=0) + >>> clf = svm.SVC(gamma='scale', probability=True, random_state=0) >>> cross_val_score(clf, X, y, scoring='neg_log_loss') # doctest: +ELLIPSIS - array([-0.07..., -0.16..., -0.06...]) + array([-0.09..., -0.16..., -0.07...]) >>> model = svm.SVC() >>> cross_val_score(model, X, y, scoring='wrong_choice') Traceback (most recent call last): @@ -1775,7 +1775,7 @@ Next, let's compare the accuracy of ``SVC`` and ``most_frequent``:: We see that ``SVC`` doesn't do much better than a dummy classifier. Now, let's change the kernel:: - >>> clf = SVC(kernel='rbf', C=1).fit(X_train, y_train) + >>> clf = SVC(gamma='scale', kernel='rbf', C=1).fit(X_train, y_train) >>> clf.score(X_test, y_test) # doctest: +ELLIPSIS 0.97... diff --git a/doc/modules/model_persistence.rst b/doc/modules/model_persistence.rst index d64657717ba79..15ecf3c2d88f6 100644 --- a/doc/modules/model_persistence.rst +++ b/doc/modules/model_persistence.rst @@ -18,12 +18,12 @@ persistence model, namely `pickle >> from sklearn import svm >>> from sklearn import datasets - >>> clf = svm.SVC() + >>> clf = svm.SVC(gamma='scale') >>> iris = datasets.load_iris() >>> X, y = iris.data, iris.target >>> clf.fit(X, y) # doctest: +NORMALIZE_WHITESPACE SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, - decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', + decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) diff --git a/doc/modules/pipeline.rst b/doc/modules/pipeline.rst index 0667d259c4b24..bc03640d97a3f 100644 --- a/doc/modules/pipeline.rst +++ b/doc/modules/pipeline.rst @@ -172,7 +172,7 @@ object:: >>> from sklearn.datasets import load_digits >>> digits = load_digits() >>> pca1 = PCA() - >>> svm1 = SVC() + >>> svm1 = SVC(gamma='scale') >>> pipe = Pipeline([('reduce_dim', pca1), ('clf', svm1)]) >>> pipe.fit(digits.data, digits.target) ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS @@ -193,7 +193,7 @@ object:: >>> cachedir = mkdtemp() >>> pca2 = PCA() - >>> svm2 = SVC() + >>> svm2 = SVC(gamma='scale') >>> cached_pipe = Pipeline([('reduce_dim', pca2), ('clf', svm2)], ... 
memory=cachedir) >>> cached_pipe.fit(digits.data, digits.target) diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 8f253437690c3..f0c6a0e76a576 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -75,10 +75,10 @@ n_features]`` holding the training samples, and an array y of class labels >>> from sklearn import svm >>> X = [[0, 0], [1, 1]] >>> y = [0, 1] - >>> clf = svm.SVC() + >>> clf = svm.SVC(gamma='scale') >>> clf.fit(X, y) # doctest: +NORMALIZE_WHITESPACE SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, - decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', + decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) @@ -119,10 +119,10 @@ n_classes)``:: >>> X = [[0], [1], [2], [3]] >>> Y = [0, 1, 2, 3] - >>> clf = svm.SVC(decision_function_shape='ovo') + >>> clf = svm.SVC(gamma='scale', decision_function_shape='ovo') >>> clf.fit(X, Y) # doctest: +NORMALIZE_WHITESPACE SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, - decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf', + decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) >>> dec = clf.decision_function([[1]]) @@ -318,8 +318,9 @@ floating point values instead of integer values:: >>> y = [0.5, 2.5] >>> clf = svm.SVR() >>> clf.fit(X, y) # doctest: +NORMALIZE_WHITESPACE - SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto', - kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) + SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, + gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True, + tol=0.001, verbose=False) >>> clf.predict([[1, 1]]) array([ 1.5]) @@ -534,7 +535,7 @@ test vectors must be provided. 
>>> gram = np.dot(X, X.T) >>> clf.fit(gram, y) # doctest: +NORMALIZE_WHITESPACE SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, - decision_function_shape='ovr', degree=3, gamma='auto', + decision_function_shape='ovr', degree=3, gamma='auto_deprecated', kernel='precomputed', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) >>> # predict on training examples diff --git a/doc/tutorial/basic/tutorial.rst b/doc/tutorial/basic/tutorial.rst index a29162269d53a..edb2f046d7bee 100644 --- a/doc/tutorial/basic/tutorial.rst +++ b/doc/tutorial/basic/tutorial.rst @@ -216,12 +216,12 @@ persistence model, `pickle `_:: >>> from sklearn import svm >>> from sklearn import datasets - >>> clf = svm.SVC() + >>> clf = svm.SVC(gamma='scale') >>> iris = datasets.load_iris() >>> X, y = iris.data, iris.target >>> clf.fit(X, y) # doctest: +NORMALIZE_WHITESPACE SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, - decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', + decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) @@ -291,10 +291,10 @@ maintained:: >>> from sklearn import datasets >>> from sklearn.svm import SVC >>> iris = datasets.load_iris() - >>> clf = SVC() + >>> clf = SVC(gamma='scale') >>> clf.fit(iris.data, iris.target) # doctest: +NORMALIZE_WHITESPACE SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, - decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', + decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) @@ -303,7 +303,7 @@ maintained:: >>> clf.fit(iris.data, iris.target_names[iris.target]) # doctest: +NORMALIZE_WHITESPACE SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, - decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', + decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) @@ -332,19 +332,19 @@ more than once will overwrite what was learned by any previous ``fit()``:: >>> clf = SVC() >>> clf.set_params(kernel='linear').fit(X, y) # doctest: +NORMALIZE_WHITESPACE SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, - decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear', - max_iter=-1, probability=False, random_state=None, shrinking=True, - tol=0.001, verbose=False) + decision_function_shape='ovr', degree=3, gamma='auto_deprecated', + kernel='linear', max_iter=-1, probability=False, random_state=None, + shrinking=True, tol=0.001, verbose=False) >>> clf.predict(X_test) array([1, 0, 1, 1, 0]) - >>> clf.set_params(kernel='rbf').fit(X, y) # doctest: +NORMALIZE_WHITESPACE + >>> clf.set_params(kernel='rbf', gamma='scale').fit(X, y) # doctest: +NORMALIZE_WHITESPACE SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, - decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', + decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) >>> clf.predict(X_test) - array([0, 0, 0, 1, 0]) + array([1, 0, 1, 1, 0]) Here, the default kernel ``rbf`` is first changed to ``linear`` after the estimator has been constructed via ``SVC()``, and changed back to ``rbf`` to @@ -364,7 +364,8 @@ the target data fit upon:: >>> X = [[1, 2], [2, 
4], [4, 5], [3, 2], [3, 1]]
     >>> y = [0, 0, 1, 1, 2]
-    >>> classif = OneVsRestClassifier(estimator=SVC(random_state=0))
+    >>> classif = OneVsRestClassifier(estimator=SVC(gamma='scale',
+    ...                                             random_state=0))
     >>> classif.fit(X, y).predict(X)
     array([0, 0, 1, 1, 2])
 
diff --git a/doc/tutorial/statistical_inference/supervised_learning.rst b/doc/tutorial/statistical_inference/supervised_learning.rst
index 6f53e9af791c5..49e69d9ec80d4 100644
--- a/doc/tutorial/statistical_inference/supervised_learning.rst
+++ b/doc/tutorial/statistical_inference/supervised_learning.rst
@@ -455,9 +455,9 @@ classification --:class:`SVC` (Support Vector Classification).
     >>> svc = svm.SVC(kernel='linear')
     >>> svc.fit(iris_X_train, iris_y_train)    # doctest: +NORMALIZE_WHITESPACE
     SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
-      decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
-      max_iter=-1, probability=False, random_state=None, shrinking=True,
-      tol=0.001, verbose=False)
+      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
+      kernel='linear', max_iter=-1, probability=False, random_state=None,
+      shrinking=True, tol=0.001, verbose=False)
 
 .. warning:: **Normalizing data**
 
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index dd9895f7e8577..33eb61d6f2cc3 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -399,6 +399,12 @@ Linear, kernelized and related models
   estimators will report at most ``max_iter`` iterations even if more were
   performed. :issue:`10723` by `Joel Nothman`_.
 
+- The default value of the ``gamma`` parameter of :class:`svm.SVC`,
+  :class:`svm.NuSVC`, :class:`svm.SVR`, :class:`svm.NuSVR` and
+  :class:`svm.OneClassSVM` will change from ``'auto'`` to ``'scale'``
+  in version 0.22 to account better for unscaled features. :issue:`8361` by :user:`Gaurav Dhingra `
+ Metrics - Deprecate ``reorder`` parameter in :func:`metrics.auc` as it's no longer required diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 50820d4512b5b..014e613ccd260 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -69,7 +69,7 @@ def test_classification(): Perceptron(tol=1e-3), DecisionTreeClassifier(), KNeighborsClassifier(), - SVC()]: + SVC(gamma="scale")]: for params in grid: BaggingClassifier(base_estimator=base_estimator, random_state=rng, @@ -115,7 +115,8 @@ def fit(self, X, y): for f in ['predict', 'predict_proba', 'predict_log_proba', 'decision_function']: # Trained on sparse format sparse_classifier = BaggingClassifier( - base_estimator=CustomSVC(decision_function_shape='ovr'), + base_estimator=CustomSVC(gamma='scale', + decision_function_shape='ovr'), random_state=1, **params ).fit(X_train_sparse, y_train) @@ -123,12 +124,13 @@ def fit(self, X, y): # Trained on dense format dense_classifier = BaggingClassifier( - base_estimator=CustomSVC(decision_function_shape='ovr'), + base_estimator=CustomSVC(gamma='scale', + decision_function_shape='ovr'), random_state=1, **params ).fit(X_train, y_train) dense_results = getattr(dense_classifier, f)(X_test) - assert_array_equal(sparse_results, dense_results) + assert_array_almost_equal(sparse_results, dense_results) sparse_type = type(X_train_sparse) types = [i.data_type_ for i in sparse_classifier.estimators_] @@ -151,7 +153,7 @@ def test_regression(): DummyRegressor(), DecisionTreeRegressor(), KNeighborsRegressor(), - SVR()]: + SVR(gamma='scale')]: for params in grid: BaggingRegressor(base_estimator=base_estimator, random_state=rng, @@ -197,7 +199,7 @@ def fit(self, X, y): # Trained on sparse format sparse_classifier = BaggingRegressor( - base_estimator=CustomSVR(), + base_estimator=CustomSVR(gamma='scale'), random_state=1, **params ).fit(X_train_sparse, y_train) @@ -205,7 +207,7 @@ def fit(self, X, y): # Trained on dense format dense_results = BaggingRegressor( - base_estimator=CustomSVR(), + base_estimator=CustomSVR(gamma='scale'), random_state=1, **params ).fit(X_train, y_train).predict(X_test) @@ -310,7 +312,7 @@ def test_oob_score_classification(): iris.target, random_state=rng) - for base_estimator in [DecisionTreeClassifier(), SVC()]: + for base_estimator in [DecisionTreeClassifier(), SVC(gamma="scale")]: clf = BaggingClassifier(base_estimator=base_estimator, n_estimators=100, bootstrap=True, @@ -440,7 +442,8 @@ def test_parallel_classification(): assert_array_almost_equal(y1, y3) # decision_function - ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'), + ensemble = BaggingClassifier(SVC(gamma='scale', + decision_function_shape='ovr'), n_jobs=3, random_state=0).fit(X_train, y_train) @@ -457,7 +460,8 @@ def test_parallel_classification(): "".format(X_test.shape[1], X_err.shape[1]), ensemble.decision_function, X_err) - ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'), + ensemble = BaggingClassifier(SVC(gamma='scale', + decision_function_shape='ovr'), n_jobs=1, random_state=0).fit(X_train, y_train) @@ -501,7 +505,7 @@ def test_gridsearch(): parameters = {'n_estimators': (1, 2), 'base_estimator__C': (1, 2)} - GridSearchCV(BaggingClassifier(SVC()), + GridSearchCV(BaggingClassifier(SVC(gamma="scale")), parameters, scoring="roc_auc").fit(X, y) @@ -550,7 +554,7 @@ def test_base_estimator(): assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor)) - ensemble = BaggingRegressor(SVR(), + ensemble 
= BaggingRegressor(SVR(gamma='scale'), n_jobs=3, random_state=0).fit(X_train, y_train) assert_true(isinstance(ensemble.base_estimator_, SVR)) diff --git a/sklearn/ensemble/tests/test_voting_classifier.py b/sklearn/ensemble/tests/test_voting_classifier.py index ef6c0c739763b..d5a8e055f5d45 100644 --- a/sklearn/ensemble/tests/test_voting_classifier.py +++ b/sklearn/ensemble/tests/test_voting_classifier.py @@ -251,7 +251,7 @@ def test_sample_weight(): """Tests sample_weight parameter of VotingClassifier""" clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(random_state=123) - clf3 = SVC(probability=True, random_state=123) + clf3 = SVC(gamma='scale', probability=True, random_state=123) eclf1 = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('svc', clf3)], voting='soft').fit(X, y, sample_weight=np.ones((len(y),))) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index b6912de138dd6..4a8a806ed6a62 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -280,29 +280,27 @@ def test_error(): def test_base_estimator(): # Test different base estimators. from sklearn.ensemble import RandomForestClassifier - from sklearn.svm import SVC # XXX doesn't work with y_class because RF doesn't support classes_ # Shouldn't AdaBoost run a LabelBinarizer? clf = AdaBoostClassifier(RandomForestClassifier()) clf.fit(X, y_regr) - clf = AdaBoostClassifier(SVC(), algorithm="SAMME") + clf = AdaBoostClassifier(SVC(gamma="scale"), algorithm="SAMME") clf.fit(X, y_class) from sklearn.ensemble import RandomForestRegressor - from sklearn.svm import SVR clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0) clf.fit(X, y_regr) - clf = AdaBoostRegressor(SVR(), random_state=0) + clf = AdaBoostRegressor(SVR(gamma='scale'), random_state=0) clf.fit(X, y_regr) # Check that an empty discrete ensemble fails in fit, not predict. 
X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]] y_fail = ["foo", "bar", 1, 2] - clf = AdaBoostClassifier(SVC(), algorithm="SAMME") + clf = AdaBoostClassifier(SVC(gamma="scale"), algorithm="SAMME") assert_raises_regexp(ValueError, "worse than random", clf.fit, X_fail, y_fail) @@ -344,14 +342,14 @@ def fit(self, X, y, sample_weight=None): # Trained on sparse format sparse_classifier = AdaBoostClassifier( - base_estimator=CustomSVC(probability=True), + base_estimator=CustomSVC(gamma='scale', probability=True), random_state=1, algorithm="SAMME" ).fit(X_train_sparse, y_train) # Trained on dense format dense_classifier = AdaBoostClassifier( - base_estimator=CustomSVC(probability=True), + base_estimator=CustomSVC(gamma='scale', probability=True), random_state=1, algorithm="SAMME" ).fit(X_train, y_train) @@ -438,13 +436,13 @@ def fit(self, X, y, sample_weight=None): # Trained on sparse format sparse_classifier = AdaBoostRegressor( - base_estimator=CustomSVR(), + base_estimator=CustomSVR(gamma='scale'), random_state=1 ).fit(X_train_sparse, y_train) # Trained on dense format dense_classifier = dense_results = AdaBoostRegressor( - base_estimator=CustomSVR(), + base_estimator=CustomSVR(gamma='scale'), random_state=1 ).fit(X_train, y_train) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 630de2860c637..6ba673d2fbf7c 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -740,7 +740,7 @@ class GridSearchCV(BaseSearchCV): >>> from sklearn import svm, grid_search, datasets >>> iris = datasets.load_iris() >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} - >>> svr = svm.SVC() + >>> svr = svm.SVC(gamma="scale") >>> clf = grid_search.GridSearchCV(svr, parameters) >>> clf.fit(iris.data, iris.target) ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 4af8a03c3e194..d9e7d3166b8ff 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -937,7 +937,7 @@ class GridSearchCV(BaseSearchCV): >>> from sklearn.model_selection import GridSearchCV >>> iris = datasets.load_iris() >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} - >>> svc = svm.SVC() + >>> svc = svm.SVC(gamma="scale") >>> clf = GridSearchCV(svc, parameters) >>> clf.fit(iris.data, iris.target) ... 
# doctest: +NORMALIZE_WHITESPACE +ELLIPSIS diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 6412a14c036ed..87d93894683b8 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -484,7 +484,7 @@ def test_grid_search_bad_param_grid(): GridSearchCV, clf, param_dict) param_dict = {"C": []} - clf = SVC() + clf = SVC(gamma="scale") assert_raise_message( ValueError, "Parameter values for parameter (C) need to be a non-empty sequence.", @@ -499,7 +499,7 @@ def test_grid_search_bad_param_grid(): GridSearchCV, clf, param_dict) param_dict = {"C": np.ones(6).reshape(3, 2)} - clf = SVC() + clf = SVC(gamma="scale") assert_raises(ValueError, GridSearchCV, clf, param_dict) @@ -828,7 +828,8 @@ def test_grid_search_cv_results(): n_candidates = n_grid_points for iid in (False, True): - search = GridSearchCV(SVC(), cv=n_splits, iid=iid, param_grid=params) + search = GridSearchCV(SVC(gamma='scale'), cv=n_splits, iid=iid, + param_grid=params) search.fit(X, y) assert_equal(iid, search.iid) cv_results = search.cv_results_ @@ -878,8 +879,9 @@ def test_random_search_cv_results(): n_cand = n_search_iter for iid in (False, True): - search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_splits, - iid=iid, param_distributions=params) + search = RandomizedSearchCV(SVC(gamma='scale'), n_iter=n_search_iter, + cv=n_splits, iid=iid, + param_distributions=params) search.fit(X, y) assert_equal(iid, search.iid) cv_results = search.cv_results_ @@ -908,7 +910,8 @@ def test_search_iid_param(): # create "cv" for splits cv = [[mask, ~mask], [~mask, mask]] # once with iid=True (default) - grid_search = GridSearchCV(SVC(), param_grid={'C': [1, 10]}, cv=cv) + grid_search = GridSearchCV(SVC(), param_grid={'C': [1, 10]}, + cv=cv) random_search = RandomizedSearchCV(SVC(), n_iter=2, param_distributions={'C': [1, 10]}, cv=cv) @@ -942,7 +945,8 @@ def test_search_iid_param(): assert_almost_equal(test_mean, expected_test_mean) assert_almost_equal(test_std, expected_test_std) assert_array_almost_equal(test_cv_scores, - cross_val_score(SVC(C=1), X, y, cv=cv)) + cross_val_score(SVC(C=1), X, + y, cv=cv)) # For the train scores, we do not take a weighted mean irrespective of # i.i.d. or not @@ -998,9 +1002,9 @@ def test_grid_search_cv_results_multimetric(): for scoring in ({'accuracy': make_scorer(accuracy_score), 'recall': make_scorer(recall_score)}, 'accuracy', 'recall'): - grid_search = GridSearchCV(SVC(), cv=n_splits, iid=iid, - param_grid=params, scoring=scoring, - refit=False) + grid_search = GridSearchCV(SVC(gamma='scale'), cv=n_splits, + iid=iid, param_grid=params, + scoring=scoring, refit=False) grid_search.fit(X, y) assert_equal(grid_search.iid, iid) grid_searches.append(grid_search) @@ -1095,8 +1099,8 @@ def test_search_cv_results_rank_tie_breaking(): # which would result in a tie of their mean cv-scores param_grid = {'C': [1, 1.001, 0.001]} - grid_search = GridSearchCV(SVC(), param_grid=param_grid) - random_search = RandomizedSearchCV(SVC(), n_iter=3, + grid_search = GridSearchCV(SVC(gamma="scale"), param_grid=param_grid) + random_search = RandomizedSearchCV(SVC(gamma="scale"), n_iter=3, param_distributions=param_grid) for search in (grid_search, random_search): @@ -1282,7 +1286,7 @@ def test_predict_proba_disabled(): # Test predict_proba when disabled on estimator. 
X = np.arange(20).reshape(5, -1) y = [0, 0, 1, 1, 1] - clf = SVC(probability=False) + clf = SVC(gamma='scale', probability=False) gs = GridSearchCV(clf, {}, cv=2).fit(X, y) assert_false(hasattr(gs, "predict_proba")) @@ -1536,18 +1540,18 @@ def test_deprecated_grid_search_iid(): depr_message = ("The default of the `iid` parameter will change from True " "to False in version 0.22") X, y = make_blobs(n_samples=54, random_state=0, centers=2) - grid = GridSearchCV(SVC(), param_grid={'C': [1]}, cv=3) + grid = GridSearchCV(SVC(gamma='scale'), param_grid={'C': [1]}, cv=3) # no warning with equally sized test sets assert_no_warnings(grid.fit, X, y) - grid = GridSearchCV(SVC(), param_grid={'C': [1]}, cv=5) + grid = GridSearchCV(SVC(gamma='scale'), param_grid={'C': [1]}, cv=5) # warning because 54 % 5 != 0 assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y) - grid = GridSearchCV(SVC(), param_grid={'C': [1]}, cv=2) + grid = GridSearchCV(SVC(gamma='scale'), param_grid={'C': [1]}, cv=2) # warning because stratification into two classes and 27 % 2 != 0 assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y) - grid = GridSearchCV(SVC(), param_grid={'C': [1]}, cv=KFold(2)) + grid = GridSearchCV(SVC(gamma='scale'), param_grid={'C': [1]}, cv=KFold(2)) # no warning because no stratification and 54 % 2 == 0 assert_no_warnings(grid.fit, X, y) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 23d127631ad7d..2929916619769 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -339,10 +339,10 @@ def test_cross_validate_invalid_scoring_param(): # Multiclass Scorers that return multiple values are not supported yet assert_raises_regex(ValueError, "scoring must return a number, got", - cross_validate, SVC(), X, y, + cross_validate, SVC(gamma='scale'), X, y, scoring=multivalued_scorer) assert_raises_regex(ValueError, "scoring must return a number, got", - cross_validate, SVC(), X, y, + cross_validate, SVC(gamma='scale'), X, y, scoring={"foo": multivalued_scorer}) assert_raises_regex(ValueError, "'mse' is not a valid scoring value.", @@ -572,7 +572,7 @@ def test_cross_val_score_precomputed(): assert_array_almost_equal(score_precomputed, score_linear) # test with callable - svm = SVC(kernel=lambda x, y: np.dot(x, y.T)) + svm = SVC(gamma='scale', kernel=lambda x, y: np.dot(x, y.T)) score_callable = cross_val_score(svm, X, y) assert_array_almost_equal(score_precomputed, score_callable) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 03947d2f923e4..51c37097adca2 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1773,7 +1773,8 @@ def test_cv_pipeline_precomputed(): y_true = np.ones((4,)) K = X.dot(X.T) kcent = KernelCenterer() - pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())]) + pipeline = Pipeline([("kernel_centerer", kcent), ("svr", + SVR(gamma='scale'))]) # did the pipeline set the _pairwise attribute? assert_true(pipeline._pairwise) diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index eb5bb01508953..289064718bf65 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -168,7 +168,32 @@ def fit(self, X, y, sample_weight=None): "boolean masks (use `indices=True` in CV)." 
                             % (sample_weight.shape, X.shape))
 
-        if self.gamma == 'auto':
+        if self.gamma in ('scale', 'auto_deprecated'):
+            if sparse:
+                # std = sqrt(E[X^2] - E[X]^2)
+                X_std = np.sqrt((X.multiply(X)).mean() - (X.mean())**2)
+            else:
+                X_std = X.std()
+            if self.gamma == 'scale':
+                if X_std != 0:
+                    self._gamma = 1.0 / (X.shape[1] * X_std)
+                else:
+                    self._gamma = 1.0
+            else:
+                kernel_uses_gamma = (not callable(self.kernel) and self.kernel
+                                     not in ('linear', 'precomputed'))
+                if kernel_uses_gamma and not np.isclose(X_std, 1.0):
+                    # NOTE: when deprecation ends we need to remove explicitly
+                    # setting `gamma` in examples (also in tests). See
+                    # https://github.com/scikit-learn/scikit-learn/pull/10331
+                    # for the examples/tests that need to be reverted.
+                    warnings.warn("The default value of gamma will change "
+                                  "from 'auto' to 'scale' in version 0.22 to "
+                                  "account better for unscaled features. Set "
+                                  "gamma explicitly to 'auto' or 'scale' to "
+                                  "avoid this warning.", FutureWarning)
+                self._gamma = 1.0 / X.shape[1]
+        elif self.gamma == 'auto':
             self._gamma = 1.0 / X.shape[1]
         else:
             self._gamma = self.gamma
diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py
index 7a7955d30d29e..9ed8c4d6a2db1 100644
--- a/sklearn/svm/classes.py
+++ b/sklearn/svm/classes.py
@@ -446,12 +446,12 @@ class SVC(BaseSVC):
         Penalty parameter C of the error term.
 
     kernel : string, optional (default='rbf')
-         Specifies the kernel type to be used in the algorithm.
-         It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or
-         a callable.
-         If none is given, 'rbf' will be used. If a callable is given it is
-         used to pre-compute the kernel matrix from data matrices; that matrix
-         should be an array of shape ``(n_samples, n_samples)``.
+        Specifies the kernel type to be used in the algorithm.
+        It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or
+        a callable.
+        If none is given, 'rbf' will be used. If a callable is given it is
+        used to pre-compute the kernel matrix from data matrices; that matrix
+        should be an array of shape ``(n_samples, n_samples)``.
 
     degree : int, optional (default=3)
         Degree of the polynomial kernel function ('poly').
@@ -459,7 +459,13 @@ class SVC(BaseSVC):
 
     gamma : float, optional (default='auto')
         Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
-        If gamma is 'auto' then 1/n_features will be used instead.
+
+        The current default is 'auto', which uses 1 / n_features. If
+        ``gamma='scale'`` is passed, 1 / (n_features * X.std()) is used
+        as the value of gamma instead. The default will change from
+        'auto' to 'scale' in version 0.22. 'auto_deprecated', a
+        deprecated variant of 'auto', serves as the default to indicate
+        that no explicit value of gamma was passed.
 
     coef0 : float, optional (default=0.0)
         Independent term in kernel function.
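A quick way to sanity-check the ``'scale'`` heuristic documented above; this is a standalone sketch run against the patched branch, not part of the patch itself, and ``_gamma`` is the private attribute the new code and tests use::

    import numpy as np
    from sklearn import svm

    X = np.array([[0.], [1.]])   # n_features = 1, X.std() = 0.5
    y = [0, 1]

    clf = svm.SVC(gamma='scale').fit(X, y)
    # gamma='scale' resolves to 1 / (n_features * X.std()) = 2.0 here
    assert np.isclose(clf._gamma, 1.0 / (X.shape[1] * X.std()))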
@@ -550,7 +556,7 @@ class SVC(BaseSVC):
     >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
     >>> y = np.array([1, 1, 2, 2])
     >>> from sklearn.svm import SVC
-    >>> clf = SVC()
+    >>> clf = SVC(gamma='auto')
     >>> clf.fit(X, y) #doctest: +NORMALIZE_WHITESPACE
     SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
       decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
@@ -573,7 +579,7 @@ class SVC(BaseSVC):
 
     _impl = 'c_svc'
 
-    def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto',
+    def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated',
                  coef0=0.0, shrinking=True, probability=False,
                  tol=1e-3, cache_size=200, class_weight=None,
                  verbose=False, max_iter=-1, decision_function_shape='ovr',
@@ -618,7 +624,13 @@ class NuSVC(BaseSVC):
 
     gamma : float, optional (default='auto')
         Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
-        If gamma is 'auto' then 1/n_features will be used instead.
+
+        The current default is 'auto', which uses 1 / n_features. If
+        ``gamma='scale'`` is passed, 1 / (n_features * X.std()) is used
+        as the value of gamma instead. The default will change from
+        'auto' to 'scale' in version 0.22. 'auto_deprecated', a
+        deprecated variant of 'auto', serves as the default to indicate
+        that no explicit value of gamma was passed.
 
     coef0 : float, optional (default=0.0)
         Independent term in kernel function.
@@ -708,10 +720,10 @@ class NuSVC(BaseSVC):
     >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
     >>> y = np.array([1, 1, 2, 2])
     >>> from sklearn.svm import NuSVC
-    >>> clf = NuSVC()
+    >>> clf = NuSVC(gamma='scale')
     >>> clf.fit(X, y) #doctest: +NORMALIZE_WHITESPACE
     NuSVC(cache_size=200, class_weight=None, coef0=0.0,
-      decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
+      decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
       max_iter=-1, nu=0.5, probability=False, random_state=None,
       shrinking=True, tol=0.001, verbose=False)
     >>> print(clf.predict([[-0.8, -1]]))
@@ -729,9 +741,9 @@ class NuSVC(BaseSVC):
 
     _impl = 'nu_svc'
 
-    def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
-                 shrinking=True, probability=False, tol=1e-3, cache_size=200,
-                 class_weight=None, verbose=False, max_iter=-1,
+    def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma='auto_deprecated',
+                 coef0=0.0, shrinking=True, probability=False, tol=1e-3,
+                 cache_size=200, class_weight=None, verbose=False, max_iter=-1,
                  decision_function_shape='ovr', random_state=None):
 
         super(NuSVC, self).__init__(
@@ -776,7 +788,13 @@ class SVR(BaseLibSVM, RegressorMixin):
 
     gamma : float, optional (default='auto')
         Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
-        If gamma is 'auto' then 1/n_features will be used instead.
+
+        The current default is 'auto', which uses 1 / n_features. If
+        ``gamma='scale'`` is passed, 1 / (n_features * X.std()) is used
+        as the value of gamma instead. The default will change from
+        'auto' to 'scale' in version 0.22. 'auto_deprecated', a
+        deprecated variant of 'auto', serves as the default to indicate
+        that no explicit value of gamma was passed.
 
     coef0 : float, optional (default=0.0)
         Independent term in kernel function.
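The deprecation path added in ``base.py`` only warns when the default is actually relied upon. A sketch of the observable behaviour, assuming this branch is installed (not part of the patch)::

    import warnings
    import numpy as np
    from sklearn import svm

    X, y = np.array([[0., 1.2], [1., 1.3]]), [0, 1]

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        svm.SVC().fit(X, y)              # default gamma -> FutureWarning
    assert any(w.category is FutureWarning for w in caught)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        svm.SVC(gamma='auto').fit(X, y)  # explicit gamma -> no warning
    assert not caught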
@@ -831,9 +849,9 @@ class SVR(BaseLibSVM, RegressorMixin):
     >>> np.random.seed(0)
     >>> y = np.random.randn(n_samples)
     >>> X = np.random.randn(n_samples, n_features)
-    >>> clf = SVR(C=1.0, epsilon=0.2)
+    >>> clf = SVR(gamma='scale', C=1.0, epsilon=0.2)
     >>> clf.fit(X, y) #doctest: +NORMALIZE_WHITESPACE
-    SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
+    SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='scale',
       kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
 
     See also
@@ -849,8 +867,8 @@ class SVR(BaseLibSVM, RegressorMixin):
 
     _impl = 'epsilon_svr'
 
-    def __init__(self, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
-                 tol=1e-3, C=1.0, epsilon=0.1, shrinking=True,
+    def __init__(self, kernel='rbf', degree=3, gamma='auto_deprecated',
+                 coef0=0.0, tol=1e-3, C=1.0, epsilon=0.1, shrinking=True,
                  cache_size=200, verbose=False, max_iter=-1):
 
         super(SVR, self).__init__(
@@ -894,7 +912,13 @@ class NuSVR(BaseLibSVM, RegressorMixin):
 
     gamma : float, optional (default='auto')
         Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
-        If gamma is 'auto' then 1/n_features will be used instead.
+
+        The current default is 'auto', which uses 1 / n_features. If
+        ``gamma='scale'`` is passed, 1 / (n_features * X.std()) is used
+        as the value of gamma instead. The default will change from
+        'auto' to 'scale' in version 0.22. 'auto_deprecated', a
+        deprecated variant of 'auto', serves as the default to indicate
+        that no explicit value of gamma was passed.
 
     coef0 : float, optional (default=0.0)
         Independent term in kernel function.
@@ -946,9 +970,9 @@ class NuSVR(BaseLibSVM, RegressorMixin):
     >>> np.random.seed(0)
     >>> y = np.random.randn(n_samples)
     >>> X = np.random.randn(n_samples, n_features)
-    >>> clf = NuSVR(C=1.0, nu=0.1)
+    >>> clf = NuSVR(gamma='scale', C=1.0, nu=0.1)
     >>> clf.fit(X, y)  #doctest: +NORMALIZE_WHITESPACE
-    NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma='auto',
+    NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma='scale',
       kernel='rbf', max_iter=-1, nu=0.1, shrinking=True, tol=0.001,
       verbose=False)
 
@@ -965,8 +989,8 @@ class NuSVR(BaseLibSVM, RegressorMixin):
     _impl = 'nu_svr'
 
     def __init__(self, nu=0.5, C=1.0, kernel='rbf', degree=3,
-                 gamma='auto', coef0=0.0, shrinking=True, tol=1e-3,
-                 cache_size=200, verbose=False, max_iter=-1):
+                 gamma='auto_deprecated', coef0=0.0, shrinking=True,
+                 tol=1e-3, cache_size=200, verbose=False, max_iter=-1):
 
         super(NuSVR, self).__init__(
             kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
@@ -1005,7 +1029,13 @@ class OneClassSVM(BaseLibSVM, OutlierMixin):
 
     gamma : float, optional (default='auto')
         Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
-        If gamma is 'auto' then 1/n_features will be used instead.
+
+        The current default is 'auto', which uses 1 / n_features. If
+        ``gamma='scale'`` is passed, 1 / (n_features * X.std()) is used
+        as the value of gamma instead. The default will change from
+        'auto' to 'scale' in version 0.22. 'auto_deprecated', a
+        deprecated variant of 'auto', serves as the default to indicate
+        that no explicit value of gamma was passed.
 
     coef0 : float, optional (default=0.0)
         Independent term in kernel function.
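For sparse input, ``base.py`` above computes the standard deviation through the identity std = sqrt(E[X^2] - E[X]^2), since scipy.sparse matrices expose no ``.std()`` method. A minimal check of that identity (illustration only, not part of the patch)::

    import numpy as np
    import scipy.sparse as sp

    X_dense = np.array([[0., 2.], [1., 0.], [3., 1.]])
    X_sparse = sp.csr_matrix(X_dense)

    # same formula as in sklearn/svm/base.py for the sparse branch
    X_std = np.sqrt((X_sparse.multiply(X_sparse)).mean()
                    - X_sparse.mean() ** 2)
    assert np.isclose(X_std, X_dense.std())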
@@ -1066,8 +1096,8 @@ class OneClassSVM(BaseLibSVM, OutlierMixin): _impl = 'one_class' - def __init__(self, kernel='rbf', degree=3, gamma='auto', coef0=0.0, - tol=1e-3, nu=0.5, shrinking=True, cache_size=200, + def __init__(self, kernel='rbf', degree=3, gamma='auto_deprecated', + coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1, random_state=None): super(OneClassSVM, self).__init__( diff --git a/sklearn/svm/tests/test_sparse.py b/sklearn/svm/tests/test_sparse.py index 56ea0b9eaf280..5fa83050a98f1 100644 --- a/sklearn/svm/tests/test_sparse.py +++ b/sklearn/svm/tests/test_sparse.py @@ -83,10 +83,10 @@ def test_svc(): kernels = ["linear", "poly", "rbf", "sigmoid"] for dataset in datasets: for kernel in kernels: - clf = svm.SVC(kernel=kernel, probability=True, random_state=0, - decision_function_shape='ovo') - sp_clf = svm.SVC(kernel=kernel, probability=True, random_state=0, - decision_function_shape='ovo') + clf = svm.SVC(gamma='scale', kernel=kernel, probability=True, + random_state=0, decision_function_shape='ovo') + sp_clf = svm.SVC(gamma='scale', kernel=kernel, probability=True, + random_state=0, decision_function_shape='ovo') check_svm_model_equal(clf, sp_clf, *dataset) @@ -127,15 +127,16 @@ def test_svc_with_custom_kernel(): def kfunc(x, y): return safe_sparse_dot(x, y.T) clf_lin = svm.SVC(kernel='linear').fit(X_sp, Y) - clf_mylin = svm.SVC(kernel=kfunc).fit(X_sp, Y) + clf_mylin = svm.SVC(gamma='scale', kernel=kfunc).fit(X_sp, Y) assert_array_equal(clf_lin.predict(X_sp), clf_mylin.predict(X_sp)) def test_svc_iris(): # Test the sparse SVC with the iris dataset for k in ('linear', 'poly', 'rbf'): - sp_clf = svm.SVC(kernel=k).fit(iris.data, iris.target) - clf = svm.SVC(kernel=k).fit(iris.data.toarray(), iris.target) + sp_clf = svm.SVC(gamma='scale', kernel=k).fit(iris.data, iris.target) + clf = svm.SVC(gamma='scale', kernel=k).fit(iris.data.toarray(), + iris.target) assert_array_almost_equal(clf.support_vectors_, sp_clf.support_vectors_.toarray()) @@ -175,16 +176,16 @@ def test_sparse_decision_function(): def test_error(): # Test that it gives proper exception on deficient input # impossible value of C - assert_raises(ValueError, svm.SVC(C=-1).fit, X, Y) + assert_raises(ValueError, svm.SVC(gamma='scale', C=-1).fit, X, Y) # impossible value of nu - clf = svm.NuSVC(nu=0.0) + clf = svm.NuSVC(gamma='scale', nu=0.0) assert_raises(ValueError, clf.fit, X_sp, Y) Y2 = Y[:-1] # wrong dimensions for labels assert_raises(ValueError, clf.fit, X_sp, Y2) - clf = svm.SVC() + clf = svm.SVC(gamma="scale") clf.fit(X_sp, Y) assert_array_equal(clf.predict(T), true_result) @@ -241,7 +242,7 @@ def test_weight(): X_ = sparse.csr_matrix(X_) for clf in (linear_model.LogisticRegression(), svm.LinearSVC(random_state=0), - svm.SVC()): + svm.SVC(gamma="scale")): clf.set_params(class_weight={0: 5}) clf.fit(X_[:180], y_[:180]) y_pred = clf.predict(X_[180:]) @@ -250,7 +251,7 @@ def test_weight(): def test_sample_weights(): # Test weights on individual samples - clf = svm.SVC() + clf = svm.SVC(gamma="scale") clf.fit(X_sp, Y) assert_array_equal(clf.predict([X[2]]), [1.]) @@ -276,8 +277,8 @@ def test_sparse_oneclasssvm(): kernels = ["linear", "poly", "rbf", "sigmoid"] for dataset in datasets: for kernel in kernels: - clf = svm.OneClassSVM(kernel=kernel) - sp_clf = svm.OneClassSVM(kernel=kernel) + clf = svm.OneClassSVM(gamma='scale', kernel=kernel) + sp_clf = svm.OneClassSVM(gamma='scale', kernel=kernel) check_svm_model_equal(clf, sp_clf, *dataset) @@ -313,15 +314,15 @@ def 
test_sparse_realdata(): def test_sparse_svc_clone_with_callable_kernel(): # Test that the "dense_fit" is called even though we use sparse input # meaning that everything works fine. - a = svm.SVC(C=1, kernel=lambda x, y: x * y.T, probability=True, - random_state=0) + a = svm.SVC(gamma='scale', C=1, kernel=lambda x, y: x * y.T, + probability=True, random_state=0) b = base.clone(a) b.fit(X_sp, Y) pred = b.predict(X_sp) b.predict_proba(X_sp) - dense_svm = svm.SVC(C=1, kernel=lambda x, y: np.dot(x, y.T), + dense_svm = svm.SVC(gamma='scale', C=1, kernel=lambda x, y: np.dot(x, y.T), probability=True, random_state=0) pred_dense = dense_svm.fit(X, Y).predict(X) assert_array_equal(pred_dense, pred) @@ -329,17 +330,17 @@ def test_sparse_svc_clone_with_callable_kernel(): def test_timeout(): - sp = svm.SVC(C=1, kernel=lambda x, y: x * y.T, probability=True, - random_state=0, max_iter=1) + sp = svm.SVC(gamma='scale', C=1, kernel=lambda x, y: x * y.T, + probability=True, random_state=0, max_iter=1) assert_warns(ConvergenceWarning, sp.fit, X_sp, Y) def test_consistent_proba(): - a = svm.SVC(probability=True, max_iter=1, random_state=0) + a = svm.SVC(gamma='scale', probability=True, max_iter=1, random_state=0) with ignore_warnings(category=ConvergenceWarning): proba_1 = a.fit(X, Y).predict_proba(X) - a = svm.SVC(probability=True, max_iter=1, random_state=0) + a = svm.SVC(gamma='scale', probability=True, max_iter=1, random_state=0) with ignore_warnings(category=ConvergenceWarning): proba_2 = a.fit(X, Y).predict_proba(X) assert_array_almost_equal(proba_1, proba_2) diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index 8bbf662b91e80..d3384ffe6a5e4 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -20,6 +20,7 @@ from sklearn.utils.testing import assert_raises_regexp, assert_warns from sklearn.utils.testing import assert_warns_message, assert_raise_message from sklearn.utils.testing import ignore_warnings, assert_raises +from sklearn.utils.testing import assert_no_warnings from sklearn.exceptions import ConvergenceWarning from sklearn.exceptions import NotFittedError from sklearn.multiclass import OneVsRestClassifier @@ -54,7 +55,7 @@ def test_libsvm_iris(): # shuffle the dataset so that labels are not ordered for k in ('linear', 'rbf'): - clf = svm.SVC(kernel=k).fit(iris.data, iris.target) + clf = svm.SVC(gamma='scale', kernel=k).fit(iris.data, iris.target) assert_greater(np.mean(clf.predict(iris.data) == iris.target), 0.9) assert_true(hasattr(clf, "coef_") == (k == 'linear')) @@ -119,7 +120,7 @@ def test_precomputed(): # matrix. 
kernel is just a linear kernel kfunc = lambda x, y: np.dot(x, y.T) - clf = svm.SVC(kernel=kfunc) + clf = svm.SVC(gamma='scale', kernel=kfunc) clf.fit(X, Y) pred = clf.predict(T) @@ -151,7 +152,7 @@ def test_precomputed(): pred = clf.predict(K) assert_almost_equal(np.mean(pred == iris.target), .99, decimal=2) - clf = svm.SVC(kernel=kfunc) + clf = svm.SVC(gamma='scale', kernel=kfunc) clf.fit(iris.data, iris.target) assert_almost_equal(np.mean(pred == iris.target), .99, decimal=2) @@ -171,7 +172,7 @@ def test_svr(): # non-regression test; previously, BaseLibSVM would check that # len(np.unique(y)) < 2, which must only be done for SVC - svm.SVR().fit(diabetes.data, np.ones(len(diabetes.data))) + svm.SVR(gamma='scale').fit(diabetes.data, np.ones(len(diabetes.data))) svm.LinearSVR().fit(diabetes.data, np.ones(len(diabetes.data))) @@ -230,22 +231,22 @@ def test_svr_errors(): y = [0.0, 0.5] # Bad kernel - clf = svm.SVR(kernel=lambda x, y: np.array([[1.0]])) + clf = svm.SVR(gamma='scale', kernel=lambda x, y: np.array([[1.0]])) clf.fit(X, y) assert_raises(ValueError, clf.predict, X) def test_oneclass(): # Test OneClassSVM - clf = svm.OneClassSVM() + clf = svm.OneClassSVM(gamma='scale') clf.fit(X) pred = clf.predict(T) assert_array_equal(pred, [-1, -1, -1]) assert_equal(pred.dtype, np.dtype('intp')) - assert_array_almost_equal(clf.intercept_, [-1.008], decimal=3) + assert_array_almost_equal(clf.intercept_, [-1.117], decimal=3) assert_array_almost_equal(clf.dual_coef_, - [[0.632, 0.233, 0.633, 0.234, 0.632, 0.633]], + [[0.681, 0.139, 0.68, 0.14, 0.68, 0.68]], decimal=3) assert_raises(AttributeError, lambda: clf.coef_) @@ -306,8 +307,9 @@ def test_probability(): # Predict probabilities using SVC # This uses cross validation, so we use a slightly bigger testing set. - for clf in (svm.SVC(probability=True, random_state=0, C=1.0), - svm.NuSVC(probability=True, random_state=0)): + for clf in (svm.SVC(gamma='scale', probability=True, random_state=0, + C=1.0), svm.NuSVC(gamma='scale', probability=True, + random_state=0)): clf.fit(iris.data, iris.target) prob_predict = clf.predict_proba(iris.data) @@ -403,7 +405,7 @@ def test_svr_predict(): def test_weight(): # Test class weights - clf = svm.SVC(class_weight={1: 0.1}) + clf = svm.SVC(gamma='scale', class_weight={1: 0.1}) # we give a small weights to class 1 clf.fit(X, Y) # so all predicted values belong to class 2 @@ -413,7 +415,7 @@ def test_weight(): weights=[0.833, 0.167], random_state=2) for clf in (linear_model.LogisticRegression(), - svm.LinearSVC(random_state=0), svm.SVC()): + svm.LinearSVC(random_state=0), svm.SVC(gamma="scale")): clf.set_params(class_weight={0: .1, 1: 10}) clf.fit(X_[:100], y_[:100]) y_pred = clf.predict(X_[100:]) @@ -423,7 +425,7 @@ def test_weight(): def test_sample_weights(): # Test weights on individual samples # TODO: check on NuSVR, OneClass, etc. 
- clf = svm.SVC() + clf = svm.SVC(gamma="scale") clf.fit(X, Y) assert_array_equal(clf.predict([X[2]]), [1.]) @@ -432,7 +434,7 @@ def test_sample_weights(): assert_array_equal(clf.predict([X[2]]), [2.]) # test that rescaling all samples is the same as changing C - clf = svm.SVC() + clf = svm.SVC(gamma="scale") clf.fit(X, Y) dual_coef_no_weight = clf.dual_coef_ clf.set_params(C=100) @@ -470,17 +472,17 @@ def test_auto_weight(): def test_bad_input(): # Test that it gives proper exception on deficient input # impossible value of C - assert_raises(ValueError, svm.SVC(C=-1).fit, X, Y) + assert_raises(ValueError, svm.SVC(gamma='scale', C=-1).fit, X, Y) # impossible value of nu - clf = svm.NuSVC(nu=0.0) + clf = svm.NuSVC(gamma='scale', nu=0.0) assert_raises(ValueError, clf.fit, X, Y) Y2 = Y[:-1] # wrong dimensions for labels assert_raises(ValueError, clf.fit, X, Y2) # Test with arrays that are non-contiguous. - for clf in (svm.SVC(), svm.LinearSVC(random_state=0)): + for clf in (svm.SVC(gamma="scale"), svm.LinearSVC(random_state=0)): Xf = np.asfortranarray(X) assert_false(Xf.flags['C_CONTIGUOUS']) yf = np.ascontiguousarray(np.tile(Y, (2, 1)).T) @@ -495,18 +497,18 @@ def test_bad_input(): assert_raises(ValueError, clf.fit, X, Y) # sample_weight bad dimensions - clf = svm.SVC() + clf = svm.SVC(gamma="scale") assert_raises(ValueError, clf.fit, X, Y, sample_weight=range(len(X) - 1)) # predict with sparse input when trained with dense - clf = svm.SVC().fit(X, Y) + clf = svm.SVC(gamma="scale").fit(X, Y) assert_raises(ValueError, clf.predict, sparse.lil_matrix(X)) Xt = np.array(X).T clf.fit(np.dot(X, Xt), Y) assert_raises(ValueError, clf.predict, X) - clf = svm.SVC() + clf = svm.SVC(gamma="scale") clf.fit(X, Y) assert_raises(ValueError, clf.predict, Xt) @@ -524,7 +526,7 @@ def test_unicode_kernel(): random_seed=0) # Test default behavior on both versions - clf = svm.SVC(kernel='linear', probability=True) + clf = svm.SVC(gamma='scale', kernel='linear', probability=True) clf.fit(X, Y) clf.predict_proba(T) svm.libsvm.cross_validation(iris.data, @@ -811,7 +813,7 @@ def test_linearsvc_verbose(): def test_svc_clone_with_callable_kernel(): # create SVM with callable linear kernel, check that results are the same # as with built-in linear kernel - svm_callable = svm.SVC(kernel=lambda x, y: np.dot(x, y.T), + svm_callable = svm.SVC(gamma='scale', kernel=lambda x, y: np.dot(x, y.T), probability=True, random_state=0, decision_function_shape='ovr') # clone for checking clonability with lambda functions.. @@ -837,7 +839,7 @@ def test_svc_clone_with_callable_kernel(): def test_svc_bad_kernel(): - svc = svm.SVC(kernel=lambda x, y: x) + svc = svm.SVC(gamma='scale', kernel=lambda x, y: x) assert_raises(ValueError, svc.fit, X, Y) @@ -850,11 +852,11 @@ def test_timeout(): def test_unfitted(): X = "foo!" 
     # input validation not required when SVM not fitted
-    clf = svm.SVC()
+    clf = svm.SVC(gamma="scale")
     assert_raises_regexp(Exception, r".*\bSVC\b.*\bnot\b.*\bfitted\b",
                          clf.predict, X)
 
-    clf = svm.NuSVR()
+    clf = svm.NuSVR(gamma='scale')
     assert_raises_regexp(Exception, r".*\bNuSVR\b.*\bnot\b.*\bfitted\b",
                          clf.predict, X)
 
@@ -913,12 +915,12 @@ def test_hasattr_predict_proba():
     # Method must be (un)available before or after fit, switched by
     # `probability` param
 
-    G = svm.SVC(probability=True)
+    G = svm.SVC(gamma='scale', probability=True)
     assert_true(hasattr(G, 'predict_proba'))
     G.fit(iris.data, iris.target)
     assert_true(hasattr(G, 'predict_proba'))
 
-    G = svm.SVC(probability=False)
+    G = svm.SVC(gamma='scale', probability=False)
     assert_false(hasattr(G, 'predict_proba'))
     G.fit(iris.data, iris.target)
     assert_false(hasattr(G, 'predict_proba'))
@@ -935,7 +937,7 @@ def test_decision_function_shape_two_class():
     for n_classes in [2, 3]:
         X, y = make_blobs(centers=n_classes, random_state=0)
         for estimator in [svm.SVC, svm.NuSVC]:
-            clf = OneVsRestClassifier(estimator(
+            clf = OneVsRestClassifier(estimator(gamma='scale',
                 decision_function_shape="ovr")).fit(X, y)
             assert_equal(len(clf.predict(X)), len(y))
 
@@ -980,3 +982,29 @@ def test_ovr_decision_function():
     # Test if the first point has lower decision value on every quadrant
     # compared to the second point
     assert_true(np.all(pred_class_deci_val[:, 0] < pred_class_deci_val[:, 1]))
+
+
+def test_gamma_auto():
+    X, y = [[0.0, 1.2], [1.0, 1.3]], [0, 1]
+
+    msg = ("The default value of gamma will change from 'auto' to 'scale' in "
+           "version 0.22 to account better for unscaled features. Set gamma "
+           "explicitly to 'auto' or 'scale' to avoid this warning.")
+
+    assert_warns_message(FutureWarning, msg,
+                         svm.SVC().fit, X, y)
+    assert_no_warnings(svm.SVC(kernel='linear').fit, X, y)
+    assert_no_warnings(svm.SVC(kernel='precomputed').fit, X, y)
+
+
+def test_gamma_scale():
+    X, y = [[0.], [1.]], [0, 1]
+
+    clf = svm.SVC(gamma='scale')
+    assert_no_warnings(clf.fit, X, y)
+    assert_equal(clf._gamma, 2.)
+
+    # X_std ~= 1 shouldn't raise a warning when gamma is left at its
+    # default (i.e. not explicitly set).
+    X, y = [[1, 2], [3, 2 * np.sqrt(6) / 3 + 2]], [0, 1]
+    assert_no_warnings(svm.SVC().fit, X, y)
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 4d756bdaa0cf8..6a29a621ff588 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -1036,7 +1036,7 @@ def test_shufflesplit_reproducible():
 
 def test_safe_split_with_precomputed_kernel():
-    clf = SVC(gamma="scale")
+    clf = SVC(gamma="scale")
     clfp = SVC(kernel="precomputed")
 
     iris = load_iris()
diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py
index f3c003e8c5be5..7a42757daea89 100644
--- a/sklearn/tests/test_grid_search.py
+++ b/sklearn/tests/test_grid_search.py
@@ -688,7 +688,7 @@ def test_predict_proba_disabled():
     # Test predict_proba when disabled on estimator.
X = np.arange(20).reshape(5, -1) y = [0, 0, 1, 1, 1] - clf = SVC(probability=False) + clf = SVC(gamma='scale', probability=False) gs = GridSearchCV(clf, {}, cv=2).fit(X, y) assert_false(hasattr(gs, "predict_proba")) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 54071844260b3..f89e156bfd55c 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -179,7 +179,8 @@ def test_ovr_fit_predict_sparse(): assert_array_equal(pred, Y_pred_sprs.toarray()) # Test decision_function - clf_sprs = OneVsRestClassifier(svm.SVC()).fit(X_train, sparse(Y_train)) + clf = svm.SVC(gamma="scale") + clf_sprs = OneVsRestClassifier(clf).fit(X_train, sparse(Y_train)) dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int) assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray()) @@ -274,7 +275,7 @@ def conduct_test(base_clf, test_predict_proba=False): Ridge(), ElasticNet()): conduct_test(base_clf) - for base_clf in (MultinomialNB(), SVC(probability=True), + for base_clf in (MultinomialNB(), SVC(gamma='scale', probability=True), LogisticRegression()): conduct_test(base_clf, test_predict_proba=True) @@ -298,7 +299,7 @@ def test_ovr_multilabel(): def test_ovr_fit_predict_svc(): - ovr = OneVsRestClassifier(svm.SVC()) + ovr = OneVsRestClassifier(svm.SVC(gamma="scale")) ovr.fit(iris.data, iris.target) assert_equal(len(ovr.estimators_), 3) assert_greater(ovr.score(iris.data, iris.target), .9) @@ -343,18 +344,20 @@ def test_ovr_multilabel_predict_proba(): clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) # Decision function only estimator. - decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train) + decision_only = OneVsRestClassifier(svm.SVR(gamma='scale') + ).fit(X_train, Y_train) assert_false(hasattr(decision_only, 'predict_proba')) # Estimator with predict_proba disabled, depending on parameters. - decision_only = OneVsRestClassifier(svm.SVC(probability=False)) + decision_only = OneVsRestClassifier(svm.SVC(gamma='scale', + probability=False)) assert_false(hasattr(decision_only, 'predict_proba')) decision_only.fit(X_train, Y_train) assert_false(hasattr(decision_only, 'predict_proba')) assert_true(hasattr(decision_only, 'decision_function')) # Estimator which can get predict_proba enabled after fitting - gs = GridSearchCV(svm.SVC(probability=False), + gs = GridSearchCV(svm.SVC(gamma='scale', probability=False), param_grid={'probability': [True]}) proba_after_fit = OneVsRestClassifier(gs) assert_false(hasattr(proba_after_fit, 'predict_proba')) @@ -378,7 +381,8 @@ def test_ovr_single_label_predict_proba(): clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) # Decision function only estimator. 
- decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train) + decision_only = OneVsRestClassifier(svm.SVR(gamma='scale') + ).fit(X_train, Y_train) assert_false(hasattr(decision_only, 'predict_proba')) Y_pred = clf.predict(X_test) @@ -401,7 +405,7 @@ def test_ovr_multilabel_decision_function(): random_state=0) X_train, Y_train = X[:80], Y[:80] X_test = X[80:] - clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train) + clf = OneVsRestClassifier(svm.SVC(gamma="scale")).fit(X_train, Y_train) assert_array_equal((clf.decision_function(X_test) > 0).astype(int), clf.predict(X_test)) @@ -412,7 +416,7 @@ def test_ovr_single_label_decision_function(): random_state=0) X_train, Y_train = X[:80], Y[:80] X_test = X[80:] - clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train) + clf = OneVsRestClassifier(svm.SVC(gamma="scale")).fit(X_train, Y_train) assert_array_equal(clf.decision_function(X_test).ravel() > 0, clf.predict(X_test)) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 95172b103be7a..beb4be61d0b07 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -318,7 +318,7 @@ def test_pipeline_methods_pca_svm(): X = iris.data y = iris.target # Test with PCA + SVC - clf = SVC(probability=True, random_state=0) + clf = SVC(gamma='scale', probability=True, random_state=0) pca = PCA(svd_solver='full', n_components='mle', whiten=True) pipe = Pipeline([('pca', pca), ('svc', clf)]) pipe.fit(X, y) @@ -337,7 +337,8 @@ def test_pipeline_methods_preprocessing_svm(): n_classes = len(np.unique(y)) scaler = StandardScaler() pca = PCA(n_components=2, svd_solver='randomized', whiten=True) - clf = SVC(probability=True, random_state=0, decision_function_shape='ovr') + clf = SVC(gamma='scale', probability=True, random_state=0, + decision_function_shape='ovr') for preprocessing in [scaler, pca]: pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)]) @@ -903,8 +904,8 @@ def test_pipeline_wrong_memory(): y = iris.target # Define memory as an integer memory = 1 - cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())], - memory=memory) + cached_pipe = Pipeline([('transf', DummyTransf()), + ('svc', SVC())], memory=memory) assert_raises_regex(ValueError, "'memory' should be None, a string or" " have the same interface as " "sklearn.externals.joblib.Memory." 
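For context on the pipeline tests touched here, this is the cached-pipeline pattern they exercise, with ``gamma`` set explicitly so the new ``FutureWarning`` stays out of the way (an illustrative sketch, not part of the patch)::

    from shutil import rmtree
    from tempfile import mkdtemp

    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    iris = load_iris()
    cachedir = mkdtemp()
    # the fitted PCA step is cached on disk; the SVC is refit each time
    pipe = Pipeline([('reduce_dim', PCA(n_components=2)),
                     ('clf', SVC(gamma='scale'))],
                    memory=cachedir)
    pipe.fit(iris.data, iris.target)
    rmtree(cachedir)  # clean up the cache directory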
@@ -942,7 +943,7 @@ def test_pipeline_memory(): try: memory = Memory(cachedir=cachedir, verbose=10) # Test with Transformer + SVC - clf = SVC(probability=True, random_state=0) + clf = SVC(gamma='scale', probability=True, random_state=0) transf = DummyTransf() pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) cached_pipe = Pipeline([('transf', transf), ('svc', clf)], @@ -976,7 +977,7 @@ def test_pipeline_memory(): assert_equal(ts, cached_pipe.named_steps['transf'].timestamp_) # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit - clf_2 = SVC(probability=True, random_state=0) + clf_2 = SVC(gamma='scale', probability=True, random_state=0) transf_2 = DummyTransf() cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)], memory=memory) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 95f5841ac80ba..a3a4175d7eff4 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -599,7 +599,7 @@ def test_check_is_fitted(): assert_raises(TypeError, check_is_fitted, "SVR", "support_") ard = ARDRegression() - svr = SVR() + svr = SVR(gamma='scale') try: assert_raises(NotFittedError, check_is_fitted, ard, "coef_")
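Downstream code can prepare for the 0.22 default change by setting ``gamma`` explicitly, or by cross-validating over both values and letting the data decide (an illustrative sketch, not part of the patch)::

    from sklearn.datasets import load_iris
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    # every grid candidate sets gamma explicitly, so no FutureWarning fires
    search = GridSearchCV(SVC(), param_grid={'gamma': ['auto', 'scale']},
                          cv=3)
    search.fit(X, y)
    print(search.best_params_)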