CLN: ASV reindex (#18938)

mroeschke · jreback · commit 95e79a77ccfc · 2017-12-26T16:47:32.000-05:00
diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py
@@ -1,89 +1,77 @@
-from .pandas_vb_common import *
-from random import shuffle
+import numpy as np
+import pandas.util.testing as tm
+from pandas import (DataFrame, Series, DatetimeIndex, MultiIndex, Index,
+                    date_range)
+from .pandas_vb_common import setup, lib  # noqa
 
 
-class Reindexing(object):
+class Reindex(object):
+
     goal_time = 0.2
 
     def setup(self):
-        self.rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min')
-        self.df = DataFrame(np.random.rand(10000, 10), index=self.rng,
+        rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min')
+        self.df = DataFrame(np.random.rand(10000, 10), index=rng,
                             columns=range(10))
         self.df['foo'] = 'bar'
-        self.rng2 = Index(self.rng[::2])
-
+        self.rng_subset = Index(rng[::2])
         self.df2 = DataFrame(index=range(10000),
                              data=np.random.rand(10000, 30), columns=range(30))
-
-        # multi-index
         N = 5000
         K = 200
         level1 = tm.makeStringIndex(N).values.repeat(K)
         level2 = np.tile(tm.makeStringIndex(K).values, N)
         index = MultiIndex.from_arrays([level1, level2])
-        self.s1 = Series(np.random.randn((N * K)), index=index)
-        self.s2 = self.s1[::2]
+        self.s = Series(np.random.randn(N * K), index=index)
+        self.s_subset = self.s[::2]
 
     def time_reindex_dates(self):
-        self.df.reindex(self.rng2)
+        self.df.reindex(self.rng_subset)
 
     def time_reindex_columns(self):
         self.df2.reindex(columns=self.df.columns[1:5])
 
     def time_reindex_multiindex(self):
-        self.s1.reindex(self.s2.index)
+        self.s.reindex(self.s_subset.index)
 
 
-#----------------------------------------------------------------------
-# Pad / backfill
+class ReindexMethod(object):
 
-
-class FillMethod(object):
     goal_time = 0.2
+    params = ['pad', 'backfill']
+    param_names = ['method']
 
-    def setup(self):
-        self.rng = date_range('1/1/2000', periods=100000, freq='1min')
-        self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
-        self.ts2 = self.ts[::2]
-        self.ts3 = self.ts2.reindex(self.ts.index)
-        self.ts4 = self.ts3.astype('float32')
-
-    def pad(self, source_series, target_index):
-        try:
-            source_series.reindex(target_index, method='pad')
-        except:
-            source_series.reindex(target_index, fillMethod='pad')
-
-    def backfill(self, source_series, target_index):
-        try:
-            source_series.reindex(target_index, method='backfill')
-        except:
-            source_series.reindex(target_index, fillMethod='backfill')
-
-    def time_backfill_dates(self):
-        self.backfill(self.ts2, self.ts.index)
+    def setup(self, method):
+        N = 100000
+        self.idx = date_range('1/1/2000', periods=N, freq='1min')
+        self.ts = Series(np.random.randn(N), index=self.idx)[::2]
 
-    def time_pad_daterange(self):
-        self.pad(self.ts2, self.ts.index)
+    def time_reindex_method(self, method):
+        self.ts.reindex(self.idx, method=method)
 
-    def time_backfill(self):
-        self.ts3.fillna(method='backfill')
 
-    def time_backfill_float32(self):
-        self.ts4.fillna(method='backfill')
+class Fillna(object):
 
-    def time_pad(self):
-        self.ts3.fillna(method='pad')
+    goal_time = 0.2
+    params = ['pad', 'backfill']
+    param_names = ['method']
 
-    def time_pad_float32(self):
-        self.ts4.fillna(method='pad')
+    def setup(self, method):
+        N = 100000
+        self.idx = date_range('1/1/2000', periods=N, freq='1min')
+        ts = Series(np.random.randn(N), index=self.idx)[::2]
+        self.ts_reindexed = ts.reindex(self.idx)
+        self.ts_float32 = self.ts_reindexed.astype('float32')
 
+    def time_reindexed(self, method):
+        self.ts_reindexed.fillna(method=method)
 
-#----------------------------------------------------------------------
-# align on level
+    def time_float_32(self, method):
+        self.ts_float32.fillna(method=method)
 
 
 class LevelAlign(object):
+
     goal_time = 0.2
 
     def setup(self):
@@ -92,7 +80,6 @@ def setup(self):
             labels=[np.arange(10).repeat(10000),
                     np.tile(np.arange(100).repeat(100), 10),
                     np.tile(np.tile(np.arange(100), 100), 10)])
-        random.shuffle(self.index.values)
         self.df = DataFrame(np.random.randn(len(self.index), 4),
                             index=self.index)
         self.df_level = DataFrame(np.random.randn(100, 4),
@@ -102,103 +89,85 @@ def time_align_level(self):
         self.df.align(self.df_level, level=1, copy=False)
 
     def time_reindex_level(self):
-        self.df_level.reindex(self.df.index, level=1)
+        self.df_level.reindex(self.index, level=1)
 
 
-#----------------------------------------------------------------------
-# drop_duplicates
+class DropDuplicates(object):
 
-
-class Duplicates(object):
     goal_time = 0.2
-
-    def setup(self):
-        self.N = 10000
-        self.K = 10
-        self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
-        self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
-        self.df = DataFrame({'key1': self.key1, 'key2': self.key2,
-                             'value': np.random.randn((self.N * self.K)),})
-        self.col_array_list = list(self.df.values.T)
-
-        self.df2 = self.df.copy()
-        self.df2.ix[:10000, :] = np.nan
+    params = [True, False]
+    param_names = ['inplace']
+
+    def setup(self, inplace):
+        N = 10000
+        K = 10
+        key1 = tm.makeStringIndex(N).values.repeat(K)
+        key2 = tm.makeStringIndex(N).values.repeat(K)
+        self.df = DataFrame({'key1': key1, 'key2': key2,
+                             'value': np.random.randn(N * K)})
+        self.df_nan = self.df.copy()
+        self.df_nan.iloc[:10000, :] = np.nan
 
         self.s = Series(np.random.randint(0, 1000, size=10000))
-        self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10))
-
-        np.random.seed(1234)
-        self.N = 1000000
-        self.K = 10000
-        self.key1 = np.random.randint(0, self.K, size=self.N)
-        self.df_int = DataFrame({'key1': self.key1})
-        self.df_bool = DataFrame({i: np.random.randint(0, 2, size=self.K,
-                                                       dtype=bool)
-                                  for i in range(10)})
+        self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))
 
-    def time_frame_drop_dups(self):
-        self.df.drop_duplicates(['key1', 'key2'])
+        N = 1000000
+        K = 10000
+        key1 = np.random.randint(0, K, size=N)
+        self.df_int = DataFrame({'key1': key1})
+        self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10),
+                                                   dtype=bool))
 
-    def time_frame_drop_dups_inplace(self):
-        self.df.drop_duplicates(['key1', 'key2'], inplace=True)
+    def time_frame_drop_dups(self, inplace):
+        self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)
 
-    def time_frame_drop_dups_na(self):
-        self.df2.drop_duplicates(['key1', 'key2'])
+    def time_frame_drop_dups_na(self, inplace):
+        self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)
 
-    def time_frame_drop_dups_na_inplace(self):
-        self.df2.drop_duplicates(['key1', 'key2'], inplace=True)
+    def time_series_drop_dups_int(self, inplace):
+        self.s.drop_duplicates(inplace=inplace)
 
-    def time_series_drop_dups_int(self):
-        self.s.drop_duplicates()
+    def time_series_drop_dups_string(self, inplace):
+        self.s_str.drop_duplicates(inplace=inplace)
 
-    def time_series_drop_dups_string(self):
-        self.s2.drop_duplicates()
+    def time_frame_drop_dups_int(self, inplace):
+        self.df_int.drop_duplicates(inplace=inplace)
 
-    def time_frame_drop_dups_int(self):
-        self.df_int.drop_duplicates()
-
-    def time_frame_drop_dups_bool(self):
-        self.df_bool.drop_duplicates()
-
-#----------------------------------------------------------------------
-# blog "pandas escaped the zoo"
+    def time_frame_drop_dups_bool(self, inplace):
+        self.df_bool.drop_duplicates(inplace=inplace)
 
 
 class Align(object):
+    # blog "pandas escaped the zoo"
     goal_time = 0.2
 
     def setup(self):
         n = 50000
         indices = tm.makeStringIndex(n)
         subsample_size = 40000
-
-        def sample(values, k):
-            sampler = np.arange(len(values))
-            shuffle(sampler)
-            return values.take(sampler[:k])
-
-        self.x = Series(np.random.randn(50000), indices)
+        self.x = Series(np.random.randn(n), indices)
         self.y = Series(np.random.randn(subsample_size),
-                        index=sample(indices, subsample_size))
+                        index=np.random.choice(indices, subsample_size,
+                                               replace=False))
 
     def time_align_series_irregular_string(self):
-        (self.x + self.y)
+        self.x + self.y
 
 
 class LibFastZip(object):
+
     goal_time = 0.2
 
     def setup(self):
-        self.N = 10000
-        self.K = 10
-        self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
-        self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
-        self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
-        self.col_array_list = list(self.df.values.T)
-
-        self.df2 = self.df.copy()
-        self.df2.ix[:10000, :] = np.nan
-        self.col_array_list2 = list(self.df2.values.T)
+        N = 10000
+        K = 10
+        key1 = tm.makeStringIndex(N).values.repeat(K)
+        key2 = tm.makeStringIndex(N).values.repeat(K)
+        col_array = np.vstack([key1, key2, np.random.randn(N * K)])
+        col_array2 = col_array.copy()
+        col_array2[:, :10000] = np.nan
+        self.col_array_list = list(col_array)
+        self.col_array_list2 = list(col_array2)
 
     def time_lib_fast_zip(self):
         lib.fast_zip(self.col_array_list)