
Commit d50e430

update docs. re-write all tests to avoid round-trip read/write comparison.
1 parent 86afd25 commit d50e430

File tree: 3 files changed, +243 -11 lines


doc/source/io.rst (+32 lines)
@@ -2908,6 +2908,38 @@ any pickled pandas object (or any other pickled object) from file:

     import os
     os.remove('foo.pkl')

+The ``to_pickle`` and ``read_pickle`` methods can write and read compressed pickle files.
+For the ``read_pickle`` method, the ``compression`` parameter can be one of
+{``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``.
+If 'infer', then gzip, bz2, zip, or xz is used when the filename ends in '.gz', '.bz2',
+'.zip', or '.xz', respectively. If using 'zip', the ZIP file must contain only one data
+file to be read in. Set to ``None`` for no decompression.
+``to_pickle`` works in a similar way, except that the 'zip' format is not supported: if the
+filename ends with '.zip', an exception is raised.
+
+.. versionadded:: 0.20.0
+
+.. ipython:: python
+
+   df = pd.DataFrame({
+       'A': np.random.randn(1000),
+       'B': np.random.randn(1000),
+       'C': np.random.randn(1000)})
+   df.to_pickle("data.pkl.xz")
+   df.to_pickle("data.pkl.compress", compression="gzip")
+   df["A"].to_pickle("s1.pkl.bz2")
+
+   df = pd.read_pickle("data.pkl.xz")
+   df = pd.read_pickle("data.pkl.compress", compression="gzip")
+   s = pd.read_pickle("s1.pkl.bz2")
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove("data.pkl.xz")
+   os.remove("data.pkl.compress")
+   os.remove("s1.pkl.bz2")
+
 .. warning::

    Loading pickled data received from untrusted sources can be unsafe.
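
The documentation added above notes an asymmetry: ``read_pickle`` can read a single-file ZIP archive, while ``to_pickle`` cannot write one. A rough sketch of that behaviour, assuming pandas 0.20.0 or later; the file names and the standard-library ``zipfile`` usage are illustrative, not part of the commit:

    import os
    import zipfile

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': np.random.randn(10)})

    # write an uncompressed pickle, then wrap it in a single-file ZIP archive
    df.to_pickle("data.pkl", compression=None)
    with zipfile.ZipFile("data.pkl.zip", "w", zipfile.ZIP_DEFLATED) as zf:
        zf.write("data.pkl", "data.pkl")

    # read_pickle extracts the lone member; compression is inferred from ".zip"
    df2 = pd.read_pickle("data.pkl.zip")

    # clean up the illustrative files
    os.remove("data.pkl")
    os.remove("data.pkl.zip")

Writing directly with ``df.to_pickle("data.pkl.zip")`` would instead raise, as the added documentation states.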

doc/source/whatsnew/v0.20.0.txt (+28 lines)
@@ -99,6 +99,34 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).

 .. _whatsnew_0200.enhancements.uint64_support:

+Pickle file I/O now supports compression
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``read_pickle`` and ``to_pickle`` can now read from and write to compressed
+pickle files. The compression method can be passed as an explicit parameter or
+inferred from the file extension.
+
+.. ipython:: python
+
+   df = pd.DataFrame({
+       'A': np.random.randn(1000),
+       'B': np.random.randn(1000),
+       'C': np.random.randn(1000)})
+   df.to_pickle("data.pkl.xz")
+   df.to_pickle("data.pkl.compress", compression="gzip")
+   df["A"].to_pickle("s1.pkl.bz2")
+
+   df = pd.read_pickle("data.pkl.xz")
+   df = pd.read_pickle("data.pkl.compress", compression="gzip")
+   s = pd.read_pickle("s1.pkl.bz2")
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove("data.pkl.xz")
+   os.remove("data.pkl.compress")
+   os.remove("s1.pkl.bz2")
+
 UInt64 Support Improved
 ^^^^^^^^^^^^^^^^^^^^^^^
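
Because a plain write-then-read round trip would pass even if the ``compression`` argument were silently ignored, a quick independent check is to look at the bytes pandas actually wrote. A minimal sketch of such a check; the gzip magic number ``1f 8b`` is a standard fact, while the file names and the repetitive sample data are illustrative:

    import os

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': np.arange(100000) % 3})   # repetitive data, so it compresses well
    df.to_pickle("data.pkl.gz")                       # gzip inferred from the ".gz" suffix
    df.to_pickle("data.pkl", compression=None)        # uncompressed baseline

    with open("data.pkl.gz", "rb") as f:
        assert f.read(2) == b"\x1f\x8b"               # gzip magic number

    # the compressed file should be smaller than the raw pickle
    assert os.path.getsize("data.pkl.gz") < os.path.getsize("data.pkl")

    os.remove("data.pkl.gz")
    os.remove("data.pkl")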

pandas/tests/io/test_pickle.py (+183, -11 lines)
@@ -15,15 +15,15 @@

 import pytest
 import os
-
 from distutils.version import LooseVersion
-
 import pandas as pd
+import numpy as np
 from pandas import Index
 from pandas.compat import is_platform_little_endian
 import pandas
 import pandas.util.testing as tm
 from pandas.tseries.offsets import Day, MonthEnd
+import shutil


 @pytest.fixture(scope='module')
@@ -307,24 +307,101 @@ def test_pickle_v0_15_2():
 # ---------------------
 # test pickle compression
 # ---------------------
+_compression_to_extension = {
+    None: ".none",
+    'gzip': '.gz',
+    'bz2': '.bz2',
+    'zip': '.zip',
+    'xz': '.xz',
+}
+
+
 def get_random_path():
     return u'__%s__.pickle' % tm.rands(10)


+def compress_file(src_path, dest_path, compression):
+    if compression is None:
+        shutil.copyfile(src_path, dest_path)
+        return
+
+    if compression == 'gzip':
+        import gzip
+        f = gzip.open(dest_path, "w")
+    elif compression == 'bz2':
+        import bz2
+        f = bz2.BZ2File(dest_path, "w")
+    elif compression == 'zip':
+        import zipfile
+        zip_file = zipfile.ZipFile(dest_path, "w",
+                                   compression=zipfile.ZIP_DEFLATED)
+        zip_file.write(src_path, os.path.basename(src_path))
+    elif compression == 'xz':
+        lzma = pandas.compat.import_lzma()
+        f = lzma.LZMAFile(dest_path, "w")
+    else:
+        msg = 'Unrecognized compression type: {}'.format(compression)
+        raise ValueError(msg)
+
+    if compression != "zip":
+        f.write(open(src_path, "rb").read())
+        f.close()
+
+
+def decompress_file(src_path, dest_path, compression):
+    if compression is None:
+        shutil.copyfile(src_path, dest_path)
+        return
+
+    if compression == 'gzip':
+        import gzip
+        f = gzip.open(src_path, "r")
+    elif compression == 'bz2':
+        import bz2
+        f = bz2.BZ2File(src_path, "r")
+    elif compression == 'zip':
+        import zipfile
+        zip_file = zipfile.ZipFile(src_path)
+        zip_names = zip_file.namelist()
+        if len(zip_names) == 1:
+            f = zip_file.open(zip_names.pop())
+        else:
+            raise ValueError('ZIP file {} error. Only one file per ZIP.'
+                             .format(src_path))
+    elif compression == 'xz':
+        lzma = pandas.compat.import_lzma()
+        f = lzma.LZMAFile(src_path, "r")
+    else:
+        msg = 'Unrecognized compression type: {}'.format(compression)
+        raise ValueError(msg)
+
+    open(dest_path, "wb").write(f.read())
+    f.close()
+
+
 @pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz'])
-def test_compression_explicit(compression):
+def test_write_explicit(compression):
     # issue 11666
     if compression == 'xz':
         tm._skip_if_no_lzma()
-    with tm.ensure_clean(get_random_path()) as path:
+
+    base = get_random_path()
+    path1 = base + ".compressed"
+    path2 = base + ".raw"
+
+    with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
         df = tm.makeDataFrame()
-        df.to_pickle(path, compression=compression)
-        df2 = pd.read_pickle(path, compression=compression)
+        # write to compressed file
+        df.to_pickle(p1, compression=compression)
+        # decompress
+        decompress_file(p1, p2, compression=compression)
+        # read decompressed file
+        df2 = pd.read_pickle(p2, compression=None)
         tm.assert_frame_equal(df, df2)


 @pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z'])
-def test_compression_explicit_bad(compression):
+def test_write_explicit_bad(compression):
     with tm.assertRaisesRegexp(ValueError,
                                "Unrecognized compression type"):
         with tm.ensure_clean(get_random_path()) as path:
@@ -333,10 +410,105 @@ def test_compression_explicit_bad(compression):


 @pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress'])
-def test_compression_infer(ext):
+def test_write_infer(ext):
     if ext == '.xz':
         tm._skip_if_no_lzma()
-    with tm.ensure_clean(get_random_path() + ext) as path:
+
+    base = get_random_path()
+    path1 = base + ext
+    path2 = base + ".raw"
+    compression = None
+    for c in _compression_to_extension:
+        if _compression_to_extension[c] == ext:
+            compression = c
+            break
+
+    with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
         df = tm.makeDataFrame()
-        df.to_pickle(path)
-        tm.assert_frame_equal(df, pd.read_pickle(path))
+        # write to compressed file by inferred compression method
+        df.to_pickle(p1)
+        # decompress
+        decompress_file(p1, p2, compression=compression)
+        # read decompressed file
+        df2 = pd.read_pickle(p2, compression=None)
+        tm.assert_frame_equal(df, df2)
+
+
+@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', "zip"])
+def test_read_explicit(compression):
+    # issue 11666
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    base = get_random_path()
+    path1 = base + ".raw"
+    path2 = base + ".compressed"
+
+    with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
+        df = tm.makeDataFrame()
+        # write to uncompressed file
+        df.to_pickle(p1, compression=None)
+        # compress
+        compress_file(p1, p2, compression=compression)
+        # read compressed file
+        df2 = pd.read_pickle(p2, compression=compression)
+        tm.assert_frame_equal(df, df2)
+
+
+@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.zip',
+                                 '.no_compress'])
+def test_read_infer(ext):
+    if ext == '.xz':
+        tm._skip_if_no_lzma()
+
+    base = get_random_path()
+    path1 = base + ".raw"
+    path2 = base + ext
+    compression = None
+    for c in _compression_to_extension:
+        if _compression_to_extension[c] == ext:
+            compression = c
+            break
+
+    with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
+        df = tm.makeDataFrame()
+        # write to uncompressed file
+        df.to_pickle(p1, compression=None)
+        # compress
+        compress_file(p1, p2, compression=compression)
+        # read compressed file by inferred compression method
+        df2 = pd.read_pickle(p2)
+        tm.assert_frame_equal(df, df2)
+
+
+def notest_zip():
+    df = pd.DataFrame({
+        'A': np.random.randn(100).repeat(10),
+        'B': np.random.randn(100).repeat(10),
+        'C': np.random.randn(100).repeat(10)})
+    os.chdir("d:\\test")
+
+    df.to_pickle("data.raw")
+    compress_file("data.raw", "data.zip", "zip")
+    compress_file("data.raw", "data.xz", "xz")
+    compress_file("data.raw", "data.bz2", "bz2")
+    compress_file("data.raw", "data.gz", "gzip")
+
+    decompress_file("data.zip", "data.zip.raw", "zip")
+    decompress_file("data.xz", "data.xz.raw", "xz")
+    decompress_file("data.bz2", "data.bz2.raw", "bz2")
+    decompress_file("data.gz", "data.gz.raw", "gzip")
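
The decompress-then-compare pattern these rewritten tests use can be reproduced outside the pandas test harness. A standalone sketch of the ``test_write_explicit`` idea for gzip only, using ``tempfile`` in place of ``tm.ensure_clean``; it assumes pandas 0.20.0+ and the names are illustrative:

    import gzip
    import os
    import tempfile

    import numpy as np
    import pandas as pd


    def check_write_gzip():
        df = pd.DataFrame({'A': np.random.randn(100)})
        with tempfile.TemporaryDirectory() as tmp:
            compressed = os.path.join(tmp, "df.pkl.gz")
            raw = os.path.join(tmp, "df.pkl")

            # let pandas write the gzip-compressed pickle
            df.to_pickle(compressed, compression="gzip")

            # decompress with the standard library, independently of read_pickle
            with gzip.open(compressed, "rb") as src, open(raw, "wb") as dst:
                dst.write(src.read())

            # the decompressed payload should be a plain, readable pickle
            df2 = pd.read_pickle(raw, compression=None)
            pd.testing.assert_frame_equal(df, df2)


    check_write_gzip()

Decompressing with the standard library rather than with ``read_pickle`` is what keeps this from being a pure round trip: the check fails if the writer did not really produce gzip output.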
