Skip to content

Commit fe13f04

Browse files
committed
DEPR: Deprecate str.split return_type
1 parent 13ca328 commit fe13f04

File tree

4 files changed

+123
-35
lines changed

4 files changed

+123
-35
lines changed

Diff for: doc/source/whatsnew/v0.16.1.txt

+29
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,28 @@ enhancements are performed to make string operation easier.
221221
idx.str.startswith('a')
222222
s[s.index.str.startswith('a')]
223223

224+
225+
- ``split`` now takes an ``expand`` keyword to specify whether to expand dimensionality. ``return_type`` is deprecated. (:issue:`9847`)
226+
227+
.. ipython:: python
228+
229+
s = Series(['a,b', 'a,c', 'b,c'])
230+
231+
# return Series
232+
s.str.split(',')
233+
234+
# return DataFrame
235+
s.str.split(',', expand=True)
236+
237+
idx = Index(['a,b', 'a,c', 'b,c'])
238+
239+
# return Index
240+
idx.str.split(',')
241+
242+
# return MultiIndex
243+
idx.str.split(',', expand=True)
244+
245+
224246
- Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`)
225247

226248
.. _whatsnew_0161.api:
@@ -249,6 +271,13 @@ API changes
249271

250272
- By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior (no decompression). (:issue:`9770`)
251273

274+
.. _whatsnew_0161.deprecations:
275+
276+
Deprecations
277+
^^^^^^^^^^^^
278+
279+
- ``Series.str.split``'s ``return_type`` keyword is deprecated in favor of ``expand`` (:issue:`9847`)
280+
252281
.. _whatsnew_0161.performance:
253282

254283
Performance Improvements

Diff for: pandas/core/strings.py

+21-26
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from pandas.compat import zip
44
from pandas.core.common import isnull, _values_from_object, is_bool_dtype
55
import pandas.compat as compat
6-
from pandas.util.decorators import Appender
6+
from pandas.util.decorators import Appender, deprecate_kwarg
77
import re
88
import pandas.lib as lib
99
import warnings
@@ -696,7 +696,7 @@ def str_pad(arr, width, side='left', fillchar=' '):
696696
return _na_map(f, arr)
697697

698698

699-
def str_split(arr, pat=None, n=None, return_type='series'):
699+
def str_split(arr, pat=None, n=None):
700700
"""
701701
Split each string (a la re.split) in the Series/Index by given
702702
pattern, propagating NA values. Equivalent to :meth:`str.split`.
@@ -706,28 +706,19 @@ def str_split(arr, pat=None, n=None, return_type='series'):
706706
pat : string, default None
707707
String or regular expression to split on. If None, splits on whitespace
708708
n : int, default None (all)
709-
return_type : {'series', 'index', 'frame'}, default 'series'
710-
If frame, returns a DataFrame (elements are strings)
711-
If series or index, returns the same type as the original object
712-
(elements are lists of strings).
709+
expand : bool, default False
710+
* If True, return DataFrame/MultiIndex expanding dimensionality.
711+
* If False, return Series/Index.
712+
return_type : deprecated, use `expand`
713713
714714
Notes
715715
-----
716716
Both 0 and -1 will be interpreted as return all splits
717717
718718
Returns
719719
-------
720-
split : Series/Index of objects or DataFrame
720+
split : Series/Index or DataFrame/MultiIndex of objects
721721
"""
722-
from pandas.core.series import Series
723-
from pandas.core.frame import DataFrame
724-
from pandas.core.index import Index
725-
726-
if return_type not in ('series', 'index', 'frame'):
727-
raise ValueError("return_type must be {'series', 'index', 'frame'}")
728-
if return_type == 'frame' and isinstance(arr, Index):
729-
raise ValueError("return_type='frame' is not supported for string "
730-
"methods on Index")
731722
if pat is None:
732723
if n is None or n == 0:
733724
n = -1
@@ -742,10 +733,7 @@ def str_split(arr, pat=None, n=None, return_type='series'):
742733
n = 0
743734
regex = re.compile(pat)
744735
f = lambda x: regex.split(x, maxsplit=n)
745-
if return_type == 'frame':
746-
res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index)
747-
else:
748-
res = _na_map(f, arr)
736+
res = _na_map(f, arr)
749737
return res
750738

751739

@@ -1083,7 +1071,10 @@ def _wrap_result(self, result, **kwargs):
10831071
return DataFrame(result, index=self.series.index)
10841072

10851073
def _wrap_result_expand(self, result, expand=False):
1086-
from pandas.core.index import Index
1074+
if not isinstance(expand, bool):
1075+
raise ValueError("expand must be True or False")
1076+
1077+
from pandas.core.index import Index, MultiIndex
10871078
if not hasattr(result, 'ndim'):
10881079
return result
10891080

@@ -1096,7 +1087,9 @@ def _wrap_result_expand(self, result, expand=False):
10961087

10971088
if expand:
10981089
result = list(result)
1099-
return Index(result, name=name)
1090+
return MultiIndex.from_tuples(result, names=name)
1091+
else:
1092+
return Index(result, name=name)
11001093
else:
11011094
index = self.series.index
11021095
if expand:
@@ -1114,10 +1107,12 @@ def cat(self, others=None, sep=None, na_rep=None):
11141107
result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
11151108
return self._wrap_result(result)
11161109

1110+
@deprecate_kwarg('return_type', 'expand',
1111+
mapping={'series': False, 'frame': True})
11171112
@copy(str_split)
1118-
def split(self, pat=None, n=-1, return_type='series'):
1119-
result = str_split(self.series, pat, n=n, return_type=return_type)
1120-
return self._wrap_result(result)
1113+
def split(self, pat=None, n=-1, expand=False):
1114+
result = str_split(self.series, pat, n=n)
1115+
return self._wrap_result_expand(result, expand=expand)
11211116

11221117
_shared_docs['str_partition'] = ("""
11231118
Split the string at the %(side)s occurrence of `sep`, and return 3 elements
@@ -1131,7 +1126,7 @@ def split(self, pat=None, n=-1, return_type='series'):
11311126
String to split on.
11321127
expand : bool, default True
11331128
* If True, return DataFrame/MultiIndex expanding dimensionality.
1134-
* If False, return Series/Index
1129+
* If False, return Series/Index.
11351130
11361131
Returns
11371132
-------

Diff for: pandas/tests/test_index.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -1280,11 +1280,12 @@ def test_str_attribute(self):
12801280
idx = Index(['a b c', 'd e', 'f'])
12811281
expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']])
12821282
tm.assert_index_equal(idx.str.split(), expected)
1283-
tm.assert_index_equal(idx.str.split(return_type='series'), expected)
1284-
# return_type 'index' is an alias for 'series'
1285-
tm.assert_index_equal(idx.str.split(return_type='index'), expected)
1286-
with self.assertRaisesRegexp(ValueError, 'not supported'):
1287-
idx.str.split(return_type='frame')
1283+
tm.assert_index_equal(idx.str.split(expand=False), expected)
1284+
1285+
expected = MultiIndex.from_tuples([('a', 'b', 'c'),
1286+
('d', 'e', np.nan),
1287+
('f', np.nan, np.nan)])
1288+
tm.assert_index_equal(idx.str.split(expand=True), expected)
12881289

12891290
# test boolean case, should return np.array instead of boolean Index
12901291
idx = Index(['a1', 'a2', 'b1', 'b2'])

Diff for: pandas/tests/test_strings.py

+67-4
Original file line numberDiff line numberDiff line change
@@ -1206,14 +1206,19 @@ def test_split(self):
12061206
result = values.str.split('__')
12071207
tm.assert_series_equal(result, exp)
12081208

1209+
result = values.str.split('__', expand=False)
1210+
tm.assert_series_equal(result, exp)
1211+
12091212
# mixed
12101213
mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(),
12111214
None, 1, 2.])
1212-
1213-
rs = Series(mixed).str.split('_')
1215+
rs = mixed.str.split('_')
12141216
xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
12151217
NA, NA, NA])
1218+
tm.assert_isinstance(rs, Series)
1219+
tm.assert_almost_equal(rs, xp)
12161220

1221+
rs = mixed.str.split('_', expand=False)
12171222
tm.assert_isinstance(rs, Series)
12181223
tm.assert_almost_equal(rs, xp)
12191224

@@ -1226,6 +1231,9 @@ def test_split(self):
12261231
[u('f'), u('g'), u('h')]])
12271232
tm.assert_series_equal(result, exp)
12281233

1234+
result = values.str.split('_', expand=False)
1235+
tm.assert_series_equal(result, exp)
1236+
12291237
def test_split_noargs(self):
12301238
# #1859
12311239
s = Series(['Wes McKinney', 'Travis Oliphant'])
@@ -1259,7 +1267,10 @@ def test_split_no_pat_with_nonzero_n(self):
12591267

12601268
def test_split_to_dataframe(self):
12611269
s = Series(['nosplit', 'alsonosplit'])
1262-
result = s.str.split('_', return_type='frame')
1270+
1271+
with tm.assert_produces_warning():
1272+
result = s.str.split('_', return_type='frame')
1273+
12631274
exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
12641275
tm.assert_frame_equal(result, exp)
12651276

@@ -1282,9 +1293,61 @@ def test_split_to_dataframe(self):
12821293
index=['preserve', 'me'])
12831294
tm.assert_frame_equal(result, exp)
12841295

1285-
with tm.assertRaisesRegexp(ValueError, "return_type must be"):
1296+
with tm.assertRaisesRegexp(ValueError, "expand must be"):
1297+
s.str.split('_', return_type="some_invalid_type")
1298+
1299+
def test_split_to_dataframe_expand(self):
1300+
s = Series(['nosplit', 'alsonosplit'])
1301+
result = s.str.split('_', expand=True)
1302+
exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
1303+
tm.assert_frame_equal(result, exp)
1304+
1305+
s = Series(['some_equal_splits', 'with_no_nans'])
1306+
result = s.str.split('_', expand=True)
1307+
exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'],
1308+
2: ['splits', 'nans']})
1309+
tm.assert_frame_equal(result, exp)
1310+
1311+
s = Series(['some_unequal_splits', 'one_of_these_things_is_not'])
1312+
result = s.str.split('_', expand=True)
1313+
exp = DataFrame({0: ['some', 'one'], 1: ['unequal', 'of'],
1314+
2: ['splits', 'these'], 3: [NA, 'things'],
1315+
4: [NA, 'is'], 5: [NA, 'not']})
1316+
tm.assert_frame_equal(result, exp)
1317+
1318+
s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
1319+
result = s.str.split('_', expand=True)
1320+
exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
1321+
index=['preserve', 'me'])
1322+
tm.assert_frame_equal(result, exp)
1323+
1324+
with tm.assertRaisesRegexp(ValueError, "expand must be"):
12861325
s.str.split('_', return_type="some_invalid_type")
12871326

1327+
def test_split_to_multiindex_expand(self):
1328+
idx = Index(['nosplit', 'alsonosplit'])
1329+
result = idx.str.split('_', expand=True)
1330+
exp = Index([np.array(['nosplit']), np.array(['alsonosplit'])])
1331+
tm.assert_index_equal(result, exp)
1332+
self.assertEqual(result.nlevels, 1)
1333+
1334+
idx = Index(['some_equal_splits', 'with_no_nans'])
1335+
result = idx.str.split('_', expand=True)
1336+
exp = MultiIndex.from_tuples([('some', 'equal', 'splits'),
1337+
('with', 'no', 'nans')])
1338+
tm.assert_index_equal(result, exp)
1339+
self.assertEqual(result.nlevels, 3)
1340+
1341+
idx = Index(['some_unequal_splits', 'one_of_these_things_is_not'])
1342+
result = idx.str.split('_', expand=True)
1343+
exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA),
1344+
('one', 'of', 'these', 'things', 'is', 'not')])
1345+
tm.assert_index_equal(result, exp)
1346+
self.assertEqual(result.nlevels, 6)
1347+
1348+
with tm.assertRaisesRegexp(ValueError, "expand must be"):
1349+
idx.str.split('_', return_type="some_invalid_type")
1350+
12881351
def test_partition_series(self):
12891352
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
12901353

0 commit comments

Comments
 (0)