15
15
16
16
import pytest
17
17
import os
18
-
19
18
from distutils .version import LooseVersion
20
-
21
19
import pandas as pd
20
+ import numpy as np
22
21
from pandas import Index
23
22
from pandas .compat import is_platform_little_endian
24
23
import pandas
25
24
import pandas .util .testing as tm
26
25
from pandas .tseries .offsets import Day , MonthEnd
26
+ import shutil
27
27
28
28
29
29
@pytest .fixture (scope = 'module' )
@@ -307,24 +307,101 @@ def test_pickle_v0_15_2():
307
307
# ---------------------
308
308
# test pickle compression
309
309
# ---------------------
310
# Maps each compression name used by the helpers and tests below to the file
# extension used for it in the inference tests.  ``None`` (no compression)
# maps to ".none", which is not one of the parametrized extensions.
_compression_to_extension = {
    None: ".none",
    'gzip': '.gz',
    'bz2': '.bz2',
    'zip': '.zip',
    'xz': '.xz',
}
317
+
318
+
310
319
def get_random_path():
    """Return a pickle file name built around a 10-character ``tm.rands`` token."""
    token = tm.rands(10)
    return u'__%s__.pickle' % token
312
321
313
322
323
def compress_file(src_path, dest_path, compression):
    """Write a compressed copy of ``src_path`` to ``dest_path``.

    Parameters
    ----------
    src_path : str
        Path of the existing, uncompressed file.
    dest_path : str
        Path of the file to create.
    compression : {None, 'gzip', 'bz2', 'zip', 'xz'}
        Compression scheme.  ``None`` performs a plain file copy.

    Raises
    ------
    ValueError
        If ``compression`` is not one of the recognised values.
    """
    if compression is None:
        shutil.copyfile(src_path, dest_path)
        return

    if compression == 'zip':
        import zipfile
        zip_file = zipfile.ZipFile(dest_path, "w",
                                   compression=zipfile.ZIP_DEFLATED)
        try:
            # the archive member is named after the source file's basename
            zip_file.write(src_path, os.path.basename(src_path))
        finally:
            # closing is what finalises the archive on disk; do it
            # explicitly instead of relying on garbage collection
            zip_file.close()
        return

    if compression == 'gzip':
        import gzip
        f = gzip.open(dest_path, "w")
    elif compression == 'bz2':
        import bz2
        f = bz2.BZ2File(dest_path, "w")
    elif compression == 'xz':
        lzma = pandas.compat.import_lzma()
        f = lzma.LZMAFile(dest_path, "w")
    else:
        msg = 'Unrecognized compression type: {}'.format(compression)
        raise ValueError(msg)

    try:
        # close the source handle deterministically as well
        with open(src_path, "rb") as src:
            f.write(src.read())
    finally:
        f.close()
349
+
350
+
351
def decompress_file(src_path, dest_path, compression):
    """Write the decompressed contents of ``src_path`` to ``dest_path``.

    Parameters
    ----------
    src_path : str
        Path of the existing, compressed file.
    dest_path : str
        Path of the uncompressed file to create.
    compression : {None, 'gzip', 'bz2', 'zip', 'xz'}
        Compression scheme of ``src_path``.  ``None`` performs a plain
        file copy.

    Raises
    ------
    ValueError
        If ``compression`` is not recognised, or if a ZIP archive does not
        contain exactly one member.
    """
    if compression is None:
        shutil.copyfile(src_path, dest_path)
        return

    if compression == 'zip':
        import zipfile
        zip_file = zipfile.ZipFile(src_path)
        try:
            zip_names = zip_file.namelist()
            if len(zip_names) != 1:
                raise ValueError('ZIP file {} error. Only one file per ZIP.'
                                 .format(src_path))
            data = zip_file.read(zip_names[0])
        finally:
            # release the archive handle on both the success and error paths
            zip_file.close()
        with open(dest_path, "wb") as dest:
            dest.write(data)
        return

    if compression == 'gzip':
        import gzip
        f = gzip.open(src_path, "r")
    elif compression == 'bz2':
        import bz2
        f = bz2.BZ2File(src_path, "r")
    elif compression == 'xz':
        lzma = pandas.compat.import_lzma()
        f = lzma.LZMAFile(src_path, "r")
    else:
        msg = 'Unrecognized compression type: {}'.format(compression)
        raise ValueError(msg)

    try:
        # the ``with`` block guarantees the output is flushed and closed
        # before the caller reads it back
        with open(dest_path, "wb") as dest:
            dest.write(f.read())
    finally:
        f.close()
380
+
381
+
314
382
@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz'])
def test_write_explicit(compression):
    """A frame pickled with an explicit ``compression`` must decompress
    (via ``decompress_file``) into a plain pickle equal to the original."""
    # issue 11666
    if compression == 'xz':
        tm._skip_if_no_lzma()

    stem = get_random_path()
    compressed_name = stem + ".compressed"
    raw_name = stem + ".raw"

    with tm.ensure_clean(compressed_name) as p1:
        with tm.ensure_clean(raw_name) as p2:
            expected = tm.makeDataFrame()
            # write to compressed file
            expected.to_pickle(p1, compression=compression)
            # decompress
            decompress_file(p1, p2, compression=compression)
            # read decompressed file
            result = pd.read_pickle(p2, compression=None)
            tm.assert_frame_equal(expected, result)
324
401
325
402
326
403
@pytest .mark .parametrize ('compression' , ['' , 'None' , 'bad' , '7z' ])
327
- def test_compression_explicit_bad (compression ):
404
+ def test_write_explicit_bad (compression ):
328
405
with tm .assertRaisesRegexp (ValueError ,
329
406
"Unrecognized compression type" ):
330
407
with tm .ensure_clean (get_random_path ()) as path :
@@ -333,10 +410,105 @@ def test_compression_explicit_bad(compression):
333
410
334
411
335
412
@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress'])
def test_write_infer(ext):
    """A frame pickled without an explicit ``compression`` to a path ending
    in ``ext`` must decompress into a plain pickle equal to the original."""
    if ext == '.xz':
        tm._skip_if_no_lzma()

    stem = get_random_path()
    compressed_name = stem + ext
    raw_name = stem + ".raw"
    # reverse lookup: compression name whose extension is ``ext``, or None
    # when the extension is not in the table
    compression = next(
        (name for name in _compression_to_extension
         if _compression_to_extension[name] == ext),
        None)

    with tm.ensure_clean(compressed_name) as p1:
        with tm.ensure_clean(raw_name) as p2:
            expected = tm.makeDataFrame()
            # write to compressed file by inferred compression method
            expected.to_pickle(p1)
            # decompress
            decompress_file(p1, p2, compression=compression)
            # read decompressed file
            result = pd.read_pickle(p2, compression=None)
            tm.assert_frame_equal(expected, result)
435
+
436
+
437
@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', "zip"])
def test_read_explicit(compression):
    """A plain pickle compressed with ``compress_file`` must read back equal
    to the original when ``compression`` is passed to ``read_pickle``."""
    # issue 11666
    if compression == 'xz':
        tm._skip_if_no_lzma()

    stem = get_random_path()
    raw_name = stem + ".raw"
    compressed_name = stem + ".compressed"

    with tm.ensure_clean(raw_name) as p1:
        with tm.ensure_clean(compressed_name) as p2:
            expected = tm.makeDataFrame()
            # write to uncompressed file
            expected.to_pickle(p1, compression=None)
            # compress
            compress_file(p1, p2, compression=compression)
            # read compressed file
            result = pd.read_pickle(p2, compression=compression)
            tm.assert_frame_equal(expected, result)
456
+
457
+
458
@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.zip',
                                 '.no_compress'])
def test_read_infer(ext):
    """A plain pickle compressed with ``compress_file`` to a path ending in
    ``ext`` must read back equal to the original when ``read_pickle`` is
    called without an explicit ``compression``."""
    if ext == '.xz':
        tm._skip_if_no_lzma()

    stem = get_random_path()
    raw_name = stem + ".raw"
    compressed_name = stem + ext
    # reverse lookup: compression name whose extension is ``ext``, or None
    # when the extension is not in the table
    compression = next(
        (name for name in _compression_to_extension
         if _compression_to_extension[name] == ext),
        None)

    with tm.ensure_clean(raw_name) as p1:
        with tm.ensure_clean(compressed_name) as p2:
            expected = tm.makeDataFrame()
            # write to uncompressed file
            expected.to_pickle(p1, compression=None)
            # compress
            compress_file(p1, p2, compression=compression)
            # read compressed file by inferred compression method
            result = pd.read_pickle(p2)
            tm.assert_frame_equal(expected, result)
482
+
483
+
484
+
485
+
486
+
487
+
488
+
489
+
490
+
491
+
492
+
493
+
494
+
495
+
496
+
497
+
498
def notest_zip():
    """Manual round-trip exercise for ``compress_file``/``decompress_file``.

    Pickles a random three-column frame to ``data.raw`` inside a hard-coded
    directory, compresses it with every supported scheme and decompresses
    each result again.  Nothing is asserted; the output files are left on
    disk for inspection.

    NOTE(review): presumably named ``notest_`` so the test runner does not
    collect it, since the hard-coded directory only exists on the author's
    machine -- confirm before renaming.
    """
    df = pd.DataFrame({
        'A': np.random.randn(100).repeat(10),
        'B': np.random.randn(100).repeat(10),
        'C': np.random.randn(100).repeat(10)})

    # remember the caller's working directory so the chdir below does not
    # leak into whatever runs after this function
    previous_dir = os.getcwd()
    os.chdir("d:\\ test")
    try:
        df.to_pickle("data.raw")
        compress_file("data.raw", "data.zip", "zip")
        compress_file("data.raw", "data.xz", "xz")
        compress_file("data.raw", "data.bz2", "bz2")
        compress_file("data.raw", "data.gz", "gzip")

        decompress_file("data.zip", "data.zip.raw", "zip")
        decompress_file("data.xz", "data.xz.raw", "xz")
        decompress_file("data.bz2", "data.bz2.raw", "bz2")
        decompress_file("data.gz", "data.gz.raw", "gzip")
    finally:
        os.chdir(previous_dir)
0 commit comments