1
- from .pandas_vb_common import *
2
- from random import shuffle
1
+ import numpy as np
2
+ import pandas .util .testing as tm
3
+ from pandas import (DataFrame , Series , DatetimeIndex , MultiIndex , Index ,
4
+ date_range )
5
+ from .pandas_vb_common import setup , lib # noqa
3
6
4
7
5
class Reindex(object):
    """Benchmarks for ``reindex`` on a date-indexed frame, on columns, and on
    a MultiIndex-ed series."""

    goal_time = 0.2

    def setup(self):
        """Build the frames, the series and the target indexes that the
        ``time_*`` methods reindex against."""
        # date_range is the supported replacement for the deprecated
        # DatetimeIndex(start=..., periods=..., freq=...) constructor form.
        rng = date_range(start='1/1/1970', periods=10000, freq='1min')
        self.df = DataFrame(np.random.rand(10000, 10), index=rng,
                            columns=range(10))
        # A string column next to the float columns makes the frame
        # mixed-dtype.
        self.df['foo'] = 'bar'
        # Target for time_reindex_dates: every other timestamp of the frame.
        self.rng_subset = Index(rng[::2])
        self.df2 = DataFrame(index=range(10000),
                             data=np.random.rand(10000, 30),
                             columns=range(30))

        # Two-level string MultiIndex with N * K entries.
        N = 5000
        K = 200
        level1 = tm.makeStringIndex(N).values.repeat(K)
        level2 = np.tile(tm.makeStringIndex(K).values, N)
        index = MultiIndex.from_arrays([level1, level2])
        self.s = Series(np.random.randn(N * K), index=index)
        # Target for time_reindex_multiindex: every other entry of the series.
        self.s_subset = self.s[::2]

    def time_reindex_dates(self):
        """Reindex the date-indexed frame onto the half-density date index."""
        self.df.reindex(self.rng_subset)

    def time_reindex_columns(self):
        """Reindex ``df2`` onto a four-label slice of ``df``'s columns."""
        self.df2.reindex(columns=self.df.columns[1:5])

    def time_reindex_multiindex(self):
        """Reindex the MultiIndex-ed series onto the index of its subset."""
        self.s.reindex(self.s_subset.index)
35
36
36
37
37
class ReindexMethod(object):
    """Benchmark ``Series.reindex`` with a fill ``method`` (pad / backfill)."""

    goal_time = 0.2
    params = ['pad', 'backfill']
    param_names = ['method']

    def setup(self, method):
        """Create a full one-minute index and a series that only covers every
        other timestamp of it.

        ``method`` is accepted because the benchmark is parameterized; the
        fixture itself does not depend on it.
        """
        N = 100000
        self.idx = date_range('1/1/2000', periods=N, freq='1min')
        # Keep every second observation so that reindexing onto self.idx
        # leaves gaps for the fill method to handle.
        self.ts = Series(np.random.randn(N), index=self.idx)[::2]

    def time_reindex_method(self, method):
        """Reindex the half-density series onto the full index with
        ``method``."""
        self.ts.reindex(self.idx, method=method)
53
class Fillna(object):
    """Benchmark ``Series.fillna(method=...)`` on float64 and float32 data."""

    goal_time = 0.2
    params = ['pad', 'backfill']
    param_names = ['method']

    def setup(self, method):
        """Build a series with a missing value at every other position, plus
        a float32 copy of it.

        ``method`` is accepted because the benchmark is parameterized; the
        fixture itself does not depend on it.
        """
        N = 100000
        self.idx = date_range('1/1/2000', periods=N, freq='1min')
        ts = Series(np.random.randn(N), index=self.idx)[::2]
        # Reindexing the half-density series back onto the full index
        # introduces a NaN at each timestamp that was dropped by the slice.
        self.ts_reindexed = ts.reindex(self.idx)
        self.ts_float32 = self.ts_reindexed.astype('float32')

    def time_reindexed(self, method):
        """Fill the gaps of the reindexed series with ``method``."""
        self.ts_reindexed.fillna(method=method)

    def time_float_32(self, method):
        """Fill the gaps of the float32 series with ``method``."""
        self.ts_float32.fillna(method=method)
84
71
85
72
86
73
class LevelAlign (object ):
74
+
87
75
goal_time = 0.2
88
76
89
77
def setup (self ):
@@ -92,7 +80,6 @@ def setup(self):
92
80
labels = [np .arange (10 ).repeat (10000 ),
93
81
np .tile (np .arange (100 ).repeat (100 ), 10 ),
94
82
np .tile (np .tile (np .arange (100 ), 100 ), 10 )])
95
- random .shuffle (self .index .values )
96
83
self .df = DataFrame (np .random .randn (len (self .index ), 4 ),
97
84
index = self .index )
98
85
self .df_level = DataFrame (np .random .randn (100 , 4 ),
@@ -102,103 +89,85 @@ def time_align_level(self):
102
89
self .df .align (self .df_level , level = 1 , copy = False )
103
90
104
91
def time_reindex_level (self ):
105
- self .df_level .reindex (self .df . index , level = 1 )
92
+ self .df_level .reindex (self .index , level = 1 )
106
93
107
94
108
class DropDuplicates(object):
    """Benchmarks for ``drop_duplicates`` on frames and series, with and
    without ``inplace``."""

    goal_time = 0.2
    params = [True, False]
    param_names = ['inplace']

    # NOTE(review): with inplace=True each timed call mutates its fixture, so
    # any further call made before setup runs again sees already
    # de-duplicated data -- confirm against the asv setup/number semantics.

    def setup(self, inplace):
        """Create the string-keyed, NaN-containing, integer and boolean
        fixtures used by the ``time_*`` methods.

        ``inplace`` is accepted because the benchmark is parameterized; the
        fixtures themselves do not depend on it.
        """
        N = 10000
        K = 10
        key1 = tm.makeStringIndex(N).values.repeat(K)
        key2 = tm.makeStringIndex(N).values.repeat(K)
        self.df = DataFrame({'key1': key1, 'key2': key2,
                             'value': np.random.randn(N * K)})
        # Same frame with its first 10000 rows blanked out to NaN.
        self.df_nan = self.df.copy()
        self.df_nan.iloc[:10000, :] = np.nan

        self.s = Series(np.random.randint(0, 1000, size=10000))
        self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))

        # Larger integer / boolean fixtures; distinct names keep these sizes
        # from shadowing the N and K used for the string-keyed frame above.
        n_rows = 1000000
        n_keys = 10000
        int_keys = np.random.randint(0, n_keys, size=n_rows)
        self.df_int = DataFrame({'key1': int_keys})
        self.df_bool = DataFrame(np.random.randint(0, 2, size=(n_keys, 10),
                                                   dtype=bool))

    def time_frame_drop_dups(self, inplace):
        """Drop duplicate (key1, key2) rows from the string-keyed frame."""
        self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_frame_drop_dups_na(self, inplace):
        """Drop duplicate (key1, key2) rows from the frame containing NaN."""
        self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_series_drop_dups_int(self, inplace):
        """Drop duplicates from the integer series."""
        self.s.drop_duplicates(inplace=inplace)

    def time_series_drop_dups_string(self, inplace):
        """Drop duplicates from the string series."""
        self.s_str.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_int(self, inplace):
        """Drop duplicate rows from the single-column integer frame."""
        self.df_int.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_bool(self, inplace):
        """Drop duplicate rows from the ten-column boolean frame."""
        self.df_bool.drop_duplicates(inplace=inplace)
165
138
166
139
167
140
class Align(object):
    """Benchmark adding two string-indexed series whose indexes only
    partially overlap."""
    # blog "pandas escaped the zoo"

    goal_time = 0.2

    def setup(self):
        """Create a series over ``n`` string labels and a second series over
        a random 40000-label subset of them."""
        n = 50000
        indices = tm.makeStringIndex(n)
        subsample_size = 40000
        self.x = Series(np.random.randn(n), indices)
        # replace=False draws each label at most once, so y's index has no
        # duplicates coming from the sampling step.
        self.y = Series(np.random.randn(subsample_size),
                        index=np.random.choice(indices, subsample_size,
                                               replace=False))

    def time_align_series_irregular_string(self):
        """Add the two series; the result is discarded, only the work is
        timed."""
        self.x + self.y
186
155
187
156
188
157
class LibFastZip(object):
    """Benchmark ``lib.fast_zip`` on a list of column arrays."""

    goal_time = 0.2

    def setup(self):
        """Build two lists of column arrays: one as generated, and one whose
        first 10000 positions in every column are NaN."""
        N = 10000
        K = 10
        key1 = tm.makeStringIndex(N).values.repeat(K)
        key2 = tm.makeStringIndex(N).values.repeat(K)
        # Stack the two key columns and a random float column as rows of one
        # array, so that list() below yields one array per column.
        col_array = np.vstack([key1, key2, np.random.randn(N * K)])
        col_array2 = col_array.copy()
        col_array2[:, :10000] = np.nan
        self.col_array_list = list(col_array)
        # NOTE(review): no use of col_array_list2 is visible in this part of
        # the file -- presumably a later benchmark consumes it; confirm.
        self.col_array_list2 = list(col_array2)

    def time_lib_fast_zip(self):
        """Zip the NaN-free column arrays."""
        lib.fast_zip(self.col_array_list)
0 commit comments