Commit 0cfc950

goldenbull authored and jreback committed
ENH: add compression support for 'read_pickle' and 'to_pickle'
closes #11666

Author: goldenbull <[email protected]>
Author: Chen Jinniu <[email protected]>

Closes #13317 from goldenbull/pickle_io_compression and squashes the following commits:

e9c5fd2 [goldenbull] docs update
d50e430 [goldenbull] update docs. re-write all tests to avoid round-trip read/write comparison.
86afd25 [goldenbull] change test to new pytest parameterized style
945e7bb [goldenbull] Merge remote-tracking branch 'origin/master' into pickle_io_compression
ccbeaa9 [goldenbull] move pickle compression tests into a new class
9a07250 [goldenbull] Remove prepared compressed data. _get_handle will take care of compressed I/O
1cb810b [goldenbull] add zip decompression support. refactor using lambda.
b8c4175 [goldenbull] add compressed pickle data file to io/tests
6df6611 [goldenbull] pickle compression code update
81d55a0 [Chen Jinniu] Merge branch 'master' into pickle_io_compression
025a0cd [goldenbull] add compression support for pickle
1 parent a4bba28 commit 0cfc950

File tree

6 files changed (+324, -19 lines)


Diff for: doc/source/io.rst (+39 lines)
@@ -3046,6 +3046,45 @@ any pickled pandas object (or any other pickled object) from file:

 These methods were previously ``pd.save`` and ``pd.load``, prior to 0.12.0, and are now deprecated.

+.. _io.pickle.compression:
+
+Read/Write compressed pickle files
+''''''''''''''''''''''''''''''''''
+
+.. versionadded:: 0.20.0
+
+:func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` can read
+and write compressed pickle files. The compression types ``gzip``, ``bz2`` and ``xz`` support
+both reading and writing. The ``zip`` format supports reading only, and the archive must
+contain only one data file to be read in.
+The compression type can be passed explicitly or be inferred from the file extension.
+If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if the filename ends in ``'.gz'``,
+``'.bz2'``, ``'.zip'``, or ``'.xz'``, respectively.
+
+.. ipython:: python
+
+   df = pd.DataFrame({
+       'A': np.random.randn(1000),
+       'B': np.random.randn(1000),
+       'C': np.random.randn(1000)})
+   df.to_pickle("data.pkl.compress", compression="gzip")  # explicit compression type
+   df.to_pickle("data.pkl.xz", compression="infer")       # infer compression type from extension
+   df.to_pickle("data.pkl.gz")                            # default, using "infer"
+   df["A"].to_pickle("s1.pkl.bz2")
+
+   df = pd.read_pickle("data.pkl.compress", compression="gzip")
+   df = pd.read_pickle("data.pkl.xz", compression="infer")
+   df = pd.read_pickle("data.pkl.gz")
+   s = pd.read_pickle("s1.pkl.bz2")
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove("data.pkl.compress")
+   os.remove("data.pkl.xz")
+   os.remove("data.pkl.gz")
+   os.remove("s1.pkl.bz2")
+
 .. _io.msgpack:

 msgpack
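Because the ``zip`` path is read-only, round-tripping through a zip archive means writing the archive yourself and letting ``read_pickle`` infer the format from the ``.zip`` extension. A minimal sketch of that workflow, under the behaviour documented above (file names are illustrative):

import zipfile

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': np.random.randn(10)})
df.to_pickle("inner.pkl")                                 # plain, uncompressed pickle
with zipfile.ZipFile("data.pkl.zip", "w", zipfile.ZIP_DEFLATED) as zf:
    zf.write("inner.pkl")                                 # the archive must hold exactly one file
df2 = pd.read_pickle("data.pkl.zip")                      # 'zip' compression inferred from the extension
assert df2.equals(df)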

Diff for: doc/source/whatsnew/v0.20.0.txt (+34 lines)
@@ -94,6 +94,40 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).

    df = pd.read_table(url, compression='bz2')  # explicitly specify compression
    df.head(2)

+.. _whatsnew_0200.enhancements.pickle_compression:
+
+Pickle file I/O now supports compression
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle`
+can now read from and write to compressed pickle files. The compression method
+can be an explicit parameter or be inferred from the file extension.
+See :ref:`Read/Write compressed pickle files <io.pickle.compression>`.
+
+.. ipython:: python
+
+   df = pd.DataFrame({
+       'A': np.random.randn(1000),
+       'B': np.random.randn(1000),
+       'C': np.random.randn(1000)})
+   df.to_pickle("data.pkl.compress", compression="gzip")  # explicit compression type
+   df.to_pickle("data.pkl.xz", compression="infer")       # infer compression type from extension
+   df.to_pickle("data.pkl.gz")                            # default, using "infer"
+   df["A"].to_pickle("s1.pkl.bz2")
+
+   df = pd.read_pickle("data.pkl.compress", compression="gzip")
+   df = pd.read_pickle("data.pkl.xz", compression="infer")
+   df = pd.read_pickle("data.pkl.gz")
+   s = pd.read_pickle("s1.pkl.bz2")
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove("data.pkl.compress")
+   os.remove("data.pkl.xz")
+   os.remove("data.pkl.gz")
+   os.remove("s1.pkl.bz2")
+
 .. _whatsnew_0200.enhancements.uint64_support:

 UInt64 Support Improved

Diff for: pandas/core/generic.py (+6, -2 lines)
@@ -1355,17 +1355,21 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
                      if_exists=if_exists, index=index, index_label=index_label,
                      chunksize=chunksize, dtype=dtype)

-    def to_pickle(self, path):
+    def to_pickle(self, path, compression='infer'):
         """
         Pickle (serialize) object to input file path.

         Parameters
         ----------
         path : string
             File path
+        compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
+            a string representing the compression to use in the output file
+
+            .. versionadded:: 0.20.0
         """
         from pandas.io.pickle import to_pickle
-        return to_pickle(self, path)
+        return to_pickle(self, path, compression=compression)

     def to_clipboard(self, excel=None, sep=None, **kwargs):
         """

Diff for: pandas/io/common.py (+10, -4 lines)
@@ -305,7 +305,7 @@ def _infer_compression(filepath_or_buffer, compression):


 def _get_handle(path_or_buf, mode, encoding=None, compression=None,
-                memory_map=False):
+                memory_map=False, is_text=True):
     """
     Get file handle for given path/buffer and mode.

@@ -320,7 +320,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
         Supported compression protocols are gzip, bz2, zip, and xz
     memory_map : boolean, default False
         See parsers._parser_params for more information.
-
+    is_text : boolean, default True
+        whether file/buffer is in text format (csv, json, etc.), or in binary
+        mode (pickle, etc.)

     Returns
     -------
     f : file-like
@@ -394,13 +396,17 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
         elif encoding:
             # Python 3 and encoding
             f = open(path_or_buf, mode, encoding=encoding)
-        else:
+        elif is_text:
             # Python 3 and no explicit encoding
             f = open(path_or_buf, mode, errors='replace')
+        else:
+            # Python 3 and binary mode
+            f = open(path_or_buf, mode)
         handles.append(f)

     # in Python 3, convert BytesIO or fileobjects passed with an encoding
-    if compat.PY3 and (compression or isinstance(f, need_text_wrapping)):
+    if compat.PY3 and is_text and \
+            (compression or isinstance(f, need_text_wrapping)):
         from io import TextIOWrapper
         f = TextIOWrapper(f, encoding=encoding)
         handles.append(f)
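The new ``is_text`` flag controls whether the (possibly decompressed) stream is wrapped in a ``TextIOWrapper``: csv/json readers want ``str``, while pickle needs the raw bytes. A standalone sketch of the same idea using only the standard library (``open_handle`` and its arguments are illustrative helpers, not pandas internals):

import bz2
import gzip
import io
import lzma

# the stdlib openers all return binary streams
_openers = {'gzip': gzip.open, 'bz2': bz2.BZ2File, 'xz': lzma.LZMAFile}


def open_handle(path, mode, compression=None, encoding=None, is_text=True):
    """Open path in binary mode ('rb'/'wb'), optionally decompressing,
    and add a text wrapper only when the caller works with str."""
    if compression is not None:
        f = _openers[compression](path, mode)
    else:
        f = open(path, mode)
    if is_text:
        # text callers (csv, json) get decoded characters; pickle skips this wrapper
        f = io.TextIOWrapper(f, encoding=encoding)
    return f

Passing ``is_text=False`` together with a compression codec hands the binary stream straight to pickle, which is what the pickle I/O changes below rely on.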

Diff for: pandas/io/pickle.py (+41, -11 lines)
@@ -4,9 +4,10 @@

 from numpy.lib.format import read_array, write_array
 from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
 from pandas.types.common import is_datetime64_dtype, _NS_DTYPE
+from pandas.io.common import _get_handle, _infer_compression


-def to_pickle(obj, path):
+def to_pickle(obj, path, compression='infer'):
     """
     Pickle (serialize) object to input file path

@@ -15,12 +16,23 @@ def to_pickle(obj, path):
     obj : any object
     path : string
         File path
+    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
+        a string representing the compression to use in the output file
+
+        .. versionadded:: 0.20.0
     """
-    with open(path, 'wb') as f:
+    inferred_compression = _infer_compression(path, compression)
+    f, fh = _get_handle(path, 'wb',
+                        compression=inferred_compression,
+                        is_text=False)
+    try:
         pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
+    finally:
+        for _f in fh:
+            _f.close()


-def read_pickle(path):
+def read_pickle(path, compression='infer'):
     """
     Load pickled pandas object (or any other pickled object) from the specified
     file path

@@ -32,12 +44,32 @@ def read_pickle(path):
     ----------
     path : string
         File path
+    compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer', then use
+        gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2',
+        '.xz', or '.zip' respectively, and no decompression otherwise.
+        Set to None for no decompression.
+
+        .. versionadded:: 0.20.0

     Returns
     -------
     unpickled : type of object stored in file
     """

+    inferred_compression = _infer_compression(path, compression)
+
+    def read_wrapper(func):
+        # wrapper file handle open/close operation
+        f, fh = _get_handle(path, 'rb',
+                            compression=inferred_compression,
+                            is_text=False)
+        try:
+            return func(f)
+        finally:
+            for _f in fh:
+                _f.close()
+
     def try_read(path, encoding=None):
         # try with cPickle
         # try with current pickle, if we have a Type Error then

@@ -48,26 +80,24 @@ def try_read(path, encoding=None):
         # cpickle
         # GH 6899
         try:
-            with open(path, 'rb') as fh:
-                return pkl.load(fh)
+            return read_wrapper(lambda f: pkl.load(f))
         except Exception:
             # reg/patched pickle
             try:
-                with open(path, 'rb') as fh:
-                    return pc.load(fh, encoding=encoding, compat=False)
-
+                return read_wrapper(
+                    lambda f: pc.load(f, encoding=encoding, compat=False))
             # compat pickle
             except:
-                with open(path, 'rb') as fh:
-                    return pc.load(fh, encoding=encoding, compat=True)
-
+                return read_wrapper(
+                    lambda f: pc.load(f, encoding=encoding, compat=True))
     try:
         return try_read(path)
     except:
         if PY3:
             return try_read(path, encoding='latin1')
         raise

+
 # compat with sparse pickle / unpickle
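Taken together, a short usage sketch of the resulting API (file names illustrative): ``compression='infer'`` falls back to no compression when the extension is not recognized, and ``compression=None`` disables it explicitly.

import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(100))
s.to_pickle("s.pkl.bz2")                      # '.bz2' -> bz2 compression inferred
s.to_pickle("s.pkl", compression=None)        # plain, uncompressed pickle
assert pd.read_pickle("s.pkl.bz2").equals(pd.read_pickle("s.pkl"))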