Skip to content

CLN: move safe_sort from core.algorithms to core.sorting #17034

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 20, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 1 addition & 99 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
from pandas.core.dtypes.missing import isnull

from pandas.core import common as com
from pandas.compat import string_types
from pandas._libs import algos, lib, hashtable as htable
from pandas._libs.tslib import iNaT

Expand Down Expand Up @@ -431,104 +430,6 @@ def isin(comps, values):
return f(comps, values)


def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
    """
    Sort ``values`` and reorder corresponding ``labels``.
    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list-like or None, default None
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.

    Returns
    -------
    ordered : ndarray
        Sorted ``values``. Returned on its own when ``labels`` is None.
    new_labels : ndarray
        Reordered ``labels``; returned (as the second element of a tuple
        ``(ordered, new_labels)``) only when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        # NOTE: the trailing space matters; the two literals are joined
        # by implicit concatenation.
        raise TypeError("Only list-like objects are allowed to be passed to "
                        "safe_sort as values")
    values = np.asarray(values)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, string_types) for x in values],
                           dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return _ensure_object(np.concatenate([nums, strs]))

    # ``sorter`` stays None whenever the sorted result is produced by
    # ``sort_mixed`` instead of ``argsort``.
    sorter = None
    if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer':
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        raise TypeError("Only list-like objects or None are allowed to be "
                        "passed to safe_sort as labels")
    labels = _ensure_platform_int(np.asarray(labels))

    # imported here rather than at module level
    # NOTE(review): presumably to avoid a circular import -- confirm
    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types: no argsort result is available, so rebuild the sorter
        # by looking up every element of ``ordered`` in a hash table built
        # from ``values`` (helper semantics assumed from their names).
        (hash_klass, _), values = _get_data_algo(values, _hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = _ensure_platform_int(t.lookup(ordered))

    # reverse_indexer[old_position] == new_position
    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    # labels outside [-len(values), len(values)) or equal to the sentinel
    # are "not found"
    mask = (labels < -len(values)) | (labels >= len(values)) | \
        (labels == na_sentinel)

    # (Out of bound indices will be masked with `na_sentinel` next, so we may
    # deal with them here without performance loss using `mode='wrap'`.)
    new_labels = reverse_indexer.take(labels, mode='wrap')
    np.putmask(new_labels, mask, na_sentinel)

    return ordered, _ensure_platform_int(new_labels)


def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
"""
Encode input values as an enumerated type or categorical variable
Expand Down Expand Up @@ -568,6 +469,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
uniques = uniques.to_array()

if sort and len(uniques) > 0:
from pandas.core.sorting import safe_sort
uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
assume_unique=True)

Expand Down
5 changes: 3 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import pandas.core.dtypes.concat as _concat
import pandas.core.missing as missing
import pandas.core.algorithms as algos
import pandas.core.sorting as sorting
from pandas.io.formats.printing import pprint_thing
from pandas.core.ops import _comp_method_OBJECT_ARRAY
from pandas.core.strings import StringAccessorMixin
Expand Down Expand Up @@ -2306,7 +2307,7 @@ def difference(self, other):
assume_unique=True)
the_diff = this.values.take(label_diff)
try:
the_diff = algos.safe_sort(the_diff)
the_diff = sorting.safe_sort(the_diff)
except TypeError:
pass

Expand Down Expand Up @@ -2366,7 +2367,7 @@ def symmetric_difference(self, other, result_name=None):

the_diff = _concat._concat_compat([left_diff, right_diff])
try:
the_diff = algos.safe_sort(the_diff)
the_diff = sorting.safe_sort(the_diff)
except TypeError:
pass

Expand Down
3 changes: 2 additions & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

from pandas.core.sorting import is_int64_overflow_possible
import pandas.core.algorithms as algos
import pandas.core.sorting as sorting
import pandas.core.common as com
from pandas._libs import hashtable as libhashtable, join as libjoin, lib
from pandas.errors import MergeError
Expand Down Expand Up @@ -1491,7 +1492,7 @@ def _sort_labels(uniques, left, right):
l = len(left)
labels = np.concatenate([left, right])

_, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
_, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1)
new_labels = _ensure_int64(new_labels)
new_left, new_right = new_labels[:l], new_labels[l:]

Expand Down
108 changes: 107 additions & 1 deletion pandas/core/sorting.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
""" miscellaneous sorting / groupby utilities """

import numpy as np
from pandas.compat import long
from pandas.compat import long, string_types, PY3
from pandas.core.categorical import Categorical
from pandas.core.dtypes.common import (
_ensure_platform_int,
_ensure_int64,
is_list_like,
is_categorical_dtype)
from pandas.core.dtypes.cast import infer_dtype_from_array
from pandas.core.dtypes.missing import isnull
import pandas.core.algorithms as algorithms
from pandas._libs import lib, algos, hashtable
Expand Down Expand Up @@ -376,3 +378,107 @@ def _reorder_by_uniques(uniques, labels):
uniques = algorithms.take_nd(uniques, sorter, allow_fill=False)

return uniques, labels


def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
    """
    Sort ``values`` and reorder corresponding ``labels``.
    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list-like or None, default None
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.

    Returns
    -------
    ordered : ndarray
        Sorted ``values``. Returned on its own when ``labels`` is None.
    new_labels : ndarray
        Reordered ``labels``; returned (as the second element of a tuple
        ``(ordered, new_labels)``) only when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        # NOTE: the trailing space matters; the two literals are joined
        # by implicit concatenation.
        raise TypeError("Only list-like objects are allowed to be passed to "
                        "safe_sort as values")

    if not isinstance(values, np.ndarray):

        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, string_types) for x in values],
                           dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    # ``sorter`` stays None whenever the sorted result is produced by
    # ``sort_mixed`` instead of ``argsort``.
    sorter = None
    if PY3 and lib.infer_dtype(values) == 'mixed-integer':
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        raise TypeError("Only list-like objects or None are allowed to be "
                        "passed to safe_sort as labels")
    labels = _ensure_platform_int(np.asarray(labels))

    # imported here rather than at module level
    # NOTE(review): presumably to avoid a circular import -- confirm
    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types: no argsort result is available, so rebuild the sorter
        # by looking up every element of ``ordered`` in a hash table built
        # from ``values`` (helper semantics assumed from their names).
        (hash_klass, _), values = algorithms._get_data_algo(
            values, algorithms._hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = _ensure_platform_int(t.lookup(ordered))

    # reverse_indexer[old_position] == new_position
    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    # labels outside [-len(values), len(values)) or equal to the sentinel
    # are "not found"
    mask = (labels < -len(values)) | (labels >= len(values)) | \
        (labels == na_sentinel)

    # (Out of bound indices will be masked with `na_sentinel` next, so we may
    # deal with them here without performance loss using `mode='wrap'`.)
    new_labels = reverse_indexer.take(labels, mode='wrap')
    np.putmask(new_labels, mask, na_sentinel)

    return ordered, _ensure_platform_int(new_labels)
88 changes: 0 additions & 88 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import numpy as np
import pytest
import warnings

from numpy.random import RandomState
from numpy import nan
Expand Down Expand Up @@ -60,93 +59,6 @@ def test_strings(self):
tm.assert_series_equal(result, expected)


class TestSafeSort(object):
    """Tests for ``algos.safe_sort``: plain sorting, label reordering,
    mixed int/str input, unsortable input and argument validation."""

    def test_basic_sort(self):
        # plain integers
        result = algos.safe_sort([3, 1, 2, 0, 4])
        tm.assert_numpy_array_equal(result, np.array([0, 1, 2, 3, 4]))

        # strings with repeated entries
        result = algos.safe_sort(list("baaacb"))
        tm.assert_numpy_array_equal(result, np.array(list("aaabbc")))

        # empty input
        result = algos.safe_sort([])
        tm.assert_numpy_array_equal(result, np.array([]))

    def test_labels(self):
        values = [3, 1, 2, 0, 4]
        sorted_values = np.array([0, 1, 2, 3, 4])

        def check(labels, wanted_labels, **kwargs):
            # sort ``values`` with ``labels`` and compare both outputs
            ordered, new_labels = algos.safe_sort(values, labels, **kwargs)
            tm.assert_numpy_array_equal(ordered, sorted_values)
            tm.assert_numpy_array_equal(
                new_labels, np.array(wanted_labels, dtype=np.intp))

        # default sentinel
        check([0, 1, 1, 2, 3, 0, -1, 4], [3, 1, 1, 2, 0, 3, -1, 4])

        # na_sentinel
        check([0, 1, 1, 2, 3, 0, 99, 4], [3, 1, 1, 2, 0, 3, 99, 4],
              na_sentinel=99)

        # out of bound indices
        check([0, 101, 102, 2, 3, 0, 99, 4], [3, -1, -1, 2, 0, 3, -1, 4])

        # no labels at all
        check([], [])

    def test_mixed_integer(self):
        # without labels, duplicates allowed
        mixed = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object)
        tm.assert_numpy_array_equal(
            algos.safe_sort(mixed),
            np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object))

        # with labels
        mixed = np.array(['b', 1, 0, 'a'], dtype=object)
        ordered, new_labels = algos.safe_sort(mixed, [0, 1, 2, 3, 0, -1, 1])
        tm.assert_numpy_array_equal(
            ordered, np.array([0, 1, 'a', 'b'], dtype=object))
        tm.assert_numpy_array_equal(
            new_labels, np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp))

    def test_unsortable(self):
        # GH 13714
        arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object)
        needs_warning_guard = compat.PY2 and not pd._np_version_under1p10
        if needs_warning_guard:
            # RuntimeWarning: tp_compare didn't return -1 or -2 for exception
            with warnings.catch_warnings():
                pytest.raises(TypeError, algos.safe_sort, arr)
        else:
            pytest.raises(TypeError, algos.safe_sort, arr)

    def test_exceptions(self):
        # (exception type, message pattern, keyword arguments)
        cases = [
            (TypeError, "Only list-like objects are allowed",
             dict(values=1)),
            (TypeError, "Only list-like objects or None",
             dict(values=[0, 1, 2], labels=1)),
            (ValueError, "values should be unique",
             dict(values=[0, 1, 2, 1], labels=[0, 1])),
        ]
        for exc_type, pattern, kwargs in cases:
            with tm.assert_raises_regex(exc_type, pattern):
                algos.safe_sort(**kwargs)


class TestFactorize(object):

def test_basic(self):
Expand Down
Loading