Skip to content

Commit 95e79a7

Browse files
mroeschke authored and jreback committed
CLN: ASV reindex (#18938)
1 parent a088c7b commit 95e79a7

File tree

1 file changed: +86 -117 lines changed

asv_bench/benchmarks/reindex.py

+86-117
Original file line number | Diff line number | Diff line change
@@ -1,89 +1,77 @@
1-
from .pandas_vb_common import *
2-
from random import shuffle
1+
import numpy as np
2+
import pandas.util.testing as tm
3+
from pandas import (DataFrame, Series, DatetimeIndex, MultiIndex, Index,
4+
date_range)
5+
from .pandas_vb_common import setup, lib # noqa
36

47

5-
class Reindexing(object):
8+
class Reindex(object):
9+
610
goal_time = 0.2
711

812
def setup(self):
9-
self.rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min')
10-
self.df = DataFrame(np.random.rand(10000, 10), index=self.rng,
13+
rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min')
14+
self.df = DataFrame(np.random.rand(10000, 10), index=rng,
1115
columns=range(10))
1216
self.df['foo'] = 'bar'
13-
self.rng2 = Index(self.rng[::2])
14-
17+
self.rng_subset = Index(rng[::2])
1518
self.df2 = DataFrame(index=range(10000),
1619
data=np.random.rand(10000, 30), columns=range(30))
17-
18-
# multi-index
1920
N = 5000
2021
K = 200
2122
level1 = tm.makeStringIndex(N).values.repeat(K)
2223
level2 = np.tile(tm.makeStringIndex(K).values, N)
2324
index = MultiIndex.from_arrays([level1, level2])
24-
self.s1 = Series(np.random.randn((N * K)), index=index)
25-
self.s2 = self.s1[::2]
25+
self.s = Series(np.random.randn(N * K), index=index)
26+
self.s_subset = self.s[::2]
2627

2728
def time_reindex_dates(self):
28-
self.df.reindex(self.rng2)
29+
self.df.reindex(self.rng_subset)
2930

3031
def time_reindex_columns(self):
3132
self.df2.reindex(columns=self.df.columns[1:5])
3233

3334
def time_reindex_multiindex(self):
34-
self.s1.reindex(self.s2.index)
35+
self.s.reindex(self.s_subset.index)
3536

3637

37-
#----------------------------------------------------------------------
38-
# Pad / backfill
38+
class ReindexMethod(object):
3939

40-
41-
class FillMethod(object):
4240
goal_time = 0.2
41+
params = ['pad', 'backfill']
42+
param_names = ['method']
4343

44-
def setup(self):
45-
self.rng = date_range('1/1/2000', periods=100000, freq='1min')
46-
self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
47-
self.ts2 = self.ts[::2]
48-
self.ts3 = self.ts2.reindex(self.ts.index)
49-
self.ts4 = self.ts3.astype('float32')
50-
51-
def pad(self, source_series, target_index):
52-
try:
53-
source_series.reindex(target_index, method='pad')
54-
except:
55-
source_series.reindex(target_index, fillMethod='pad')
56-
57-
def backfill(self, source_series, target_index):
58-
try:
59-
source_series.reindex(target_index, method='backfill')
60-
except:
61-
source_series.reindex(target_index, fillMethod='backfill')
62-
63-
def time_backfill_dates(self):
64-
self.backfill(self.ts2, self.ts.index)
44+
def setup(self, method):
45+
N = 100000
46+
self.idx = date_range('1/1/2000', periods=N, freq='1min')
47+
self.ts = Series(np.random.randn(N), index=self.idx)[::2]
6548

66-
def time_pad_daterange(self):
67-
self.pad(self.ts2, self.ts.index)
49+
def time_reindex_method(self, method):
50+
self.ts.reindex(self.idx, method=method)
6851

69-
def time_backfill(self):
70-
self.ts3.fillna(method='backfill')
7152

72-
def time_backfill_float32(self):
73-
self.ts4.fillna(method='backfill')
53+
class Fillna(object):
7454

75-
def time_pad(self):
76-
self.ts3.fillna(method='pad')
55+
goal_time = 0.2
56+
params = ['pad', 'backfill']
57+
param_names = ['method']
7758

78-
def time_pad_float32(self):
79-
self.ts4.fillna(method='pad')
59+
def setup(self, method):
60+
N = 100000
61+
self.idx = date_range('1/1/2000', periods=N, freq='1min')
62+
ts = Series(np.random.randn(N), index=self.idx)[::2]
63+
self.ts_reindexed = ts.reindex(self.idx)
64+
self.ts_float32 = self.ts_reindexed.astype('float32')
8065

66+
def time_reindexed(self, method):
67+
self.ts_reindexed.fillna(method=method)
8168

82-
#----------------------------------------------------------------------
83-
# align on level
69+
def time_float_32(self, method):
70+
self.ts_float32.fillna(method=method)
8471

8572

8673
class LevelAlign(object):
74+
8775
goal_time = 0.2
8876

8977
def setup(self):
@@ -92,7 +80,6 @@ def setup(self):
9280
labels=[np.arange(10).repeat(10000),
9381
np.tile(np.arange(100).repeat(100), 10),
9482
np.tile(np.tile(np.arange(100), 100), 10)])
95-
random.shuffle(self.index.values)
9683
self.df = DataFrame(np.random.randn(len(self.index), 4),
9784
index=self.index)
9885
self.df_level = DataFrame(np.random.randn(100, 4),
@@ -102,103 +89,85 @@ def time_align_level(self):
10289
self.df.align(self.df_level, level=1, copy=False)
10390

10491
def time_reindex_level(self):
105-
self.df_level.reindex(self.df.index, level=1)
92+
self.df_level.reindex(self.index, level=1)
10693

10794

108-
#----------------------------------------------------------------------
109-
# drop_duplicates
95+
class DropDuplicates(object):
11096

111-
112-
class Duplicates(object):
11397
goal_time = 0.2
114-
115-
def setup(self):
116-
self.N = 10000
117-
self.K = 10
118-
self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
119-
self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
120-
self.df = DataFrame({'key1': self.key1, 'key2': self.key2,
121-
'value': np.random.randn((self.N * self.K)),})
122-
self.col_array_list = list(self.df.values.T)
123-
124-
self.df2 = self.df.copy()
125-
self.df2.ix[:10000, :] = np.nan
98+
params = [True, False]
99+
param_names = ['inplace']
100+
101+
def setup(self, inplace):
102+
N = 10000
103+
K = 10
104+
key1 = tm.makeStringIndex(N).values.repeat(K)
105+
key2 = tm.makeStringIndex(N).values.repeat(K)
106+
self.df = DataFrame({'key1': key1, 'key2': key2,
107+
'value': np.random.randn(N * K)})
108+
self.df_nan = self.df.copy()
109+
self.df_nan.iloc[:10000, :] = np.nan
126110

127111
self.s = Series(np.random.randint(0, 1000, size=10000))
128-
self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10))
129-
130-
np.random.seed(1234)
131-
self.N = 1000000
132-
self.K = 10000
133-
self.key1 = np.random.randint(0, self.K, size=self.N)
134-
self.df_int = DataFrame({'key1': self.key1})
135-
self.df_bool = DataFrame({i: np.random.randint(0, 2, size=self.K,
136-
dtype=bool)
137-
for i in range(10)})
112+
self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))
138113

139-
def time_frame_drop_dups(self):
140-
self.df.drop_duplicates(['key1', 'key2'])
114+
N = 1000000
115+
K = 10000
116+
key1 = np.random.randint(0, K, size=N)
117+
self.df_int = DataFrame({'key1': key1})
118+
self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10),
119+
dtype=bool))
141120

142-
def time_frame_drop_dups_inplace(self):
143-
self.df.drop_duplicates(['key1', 'key2'], inplace=True)
121+
def time_frame_drop_dups(self, inplace):
122+
self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)
144123

145-
def time_frame_drop_dups_na(self):
146-
self.df2.drop_duplicates(['key1', 'key2'])
124+
def time_frame_drop_dups_na(self, inplace):
125+
self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)
147126

148-
def time_frame_drop_dups_na_inplace(self):
149-
self.df2.drop_duplicates(['key1', 'key2'], inplace=True)
127+
def time_series_drop_dups_int(self, inplace):
128+
self.s.drop_duplicates(inplace=inplace)
150129

151-
def time_series_drop_dups_int(self):
152-
self.s.drop_duplicates()
130+
def time_series_drop_dups_string(self, inplace):
131+
self.s_str.drop_duplicates(inplace=inplace)
153132

154-
def time_series_drop_dups_string(self):
155-
self.s2.drop_duplicates()
133+
def time_frame_drop_dups_int(self, inplace):
134+
self.df_int.drop_duplicates(inplace=inplace)
156135

157-
def time_frame_drop_dups_int(self):
158-
self.df_int.drop_duplicates()
159-
160-
def time_frame_drop_dups_bool(self):
161-
self.df_bool.drop_duplicates()
162-
163-
#----------------------------------------------------------------------
164-
# blog "pandas escaped the zoo"
136+
def time_frame_drop_dups_bool(self, inplace):
137+
self.df_bool.drop_duplicates(inplace=inplace)
165138

166139

167140
class Align(object):
141+
# blog "pandas escaped the zoo"
168142
goal_time = 0.2
169143

170144
def setup(self):
171145
n = 50000
172146
indices = tm.makeStringIndex(n)
173147
subsample_size = 40000
174-
175-
def sample(values, k):
176-
sampler = np.arange(len(values))
177-
shuffle(sampler)
178-
return values.take(sampler[:k])
179-
180-
self.x = Series(np.random.randn(50000), indices)
148+
self.x = Series(np.random.randn(n), indices)
181149
self.y = Series(np.random.randn(subsample_size),
182-
index=sample(indices, subsample_size))
150+
index=np.random.choice(indices, subsample_size,
151+
replace=False))
183152

184153
def time_align_series_irregular_string(self):
185-
(self.x + self.y)
154+
self.x + self.y
186155

187156

188157
class LibFastZip(object):
158+
189159
goal_time = 0.2
190160

191161
def setup(self):
192-
self.N = 10000
193-
self.K = 10
194-
self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
195-
self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
196-
self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
197-
self.col_array_list = list(self.df.values.T)
198-
199-
self.df2 = self.df.copy()
200-
self.df2.ix[:10000, :] = np.nan
201-
self.col_array_list2 = list(self.df2.values.T)
162+
N = 10000
163+
K = 10
164+
key1 = tm.makeStringIndex(N).values.repeat(K)
165+
key2 = tm.makeStringIndex(N).values.repeat(K)
166+
col_array = np.vstack([key1, key2, np.random.randn(N * K)])
167+
col_array2 = col_array.copy()
168+
col_array2[:, :10000] = np.nan
169+
self.col_array_list = list(col_array)
170+
self.col_array_list2 = list(col_array2)
202171

203172
def time_lib_fast_zip(self):
204173
lib.fast_zip(self.col_array_list)

0 commit comments

Comments
 (0)