Skip to content

add compression support for 'read_pickle' and 'to_pickle' #13317

Closed
wants to merge 11 commits into from
39 changes: 39 additions & 0 deletions doc/source/io.rst
@@ -2926,6 +2926,45 @@ any pickled pandas object (or any other pickled object) from file:

These methods were previously ``pd.save`` and ``pd.load``, prior to 0.12.0, and are now deprecated.

.. _io.pickle.compression:

Read/Write compressed pickle files
''''''''''''''''''''''''''''''''''

.. versionadded:: 0.20.0

:func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` can
read and write compressed pickle files. The compression types ``gzip``, ``bz2``, and
``xz`` support both reading and writing. The ``zip`` format supports reading only,
and the archive must contain exactly one data file. The compression type can be
passed as an explicit parameter or be inferred from the file extension: if
``'infer'``, then ``gzip``, ``bz2``, ``zip``, or ``xz`` is used when the filename
ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or ``'.xz'``, respectively. A short
``zip`` example follows the snippets below.

.. ipython:: python

   df = pd.DataFrame({
       'A': np.random.randn(1000),
       'B': np.random.randn(1000),
       'C': np.random.randn(1000)})
   df.to_pickle("data.pkl.compress", compression="gzip")  # explicit compression type
   df.to_pickle("data.pkl.xz", compression="infer")       # infer compression type from extension
   df.to_pickle("data.pkl.gz")                            # default, using "infer"
   df["A"].to_pickle("s1.pkl.bz2")

   df = pd.read_pickle("data.pkl.compress", compression="gzip")
   df = pd.read_pickle("data.pkl.xz", compression="infer")
   df = pd.read_pickle("data.pkl.gz")
   s = pd.read_pickle("s1.pkl.bz2")

.. ipython:: python
   :suppress:

   import os
   os.remove("data.pkl.compress")
   os.remove("data.pkl.xz")
   os.remove("data.pkl.gz")
   os.remove("s1.pkl.bz2")

.. _io.msgpack:

msgpack
34 changes: 34 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
@@ -97,6 +97,40 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).
   df = pd.read_table(url, compression='bz2')  # explicitly specify compression
   df.head(2)

.. _whatsnew_0200.enhancements.pickle_compression:

Pickle file I/O now supports compression
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle`
can now read from and write to compressed pickle files. The compression type
can be passed as an explicit parameter or be inferred from the file extension.
See :ref:`Read/Write compressed pickle files <io.pickle.compression>`.

.. ipython:: python

   df = pd.DataFrame({
       'A': np.random.randn(1000),
       'B': np.random.randn(1000),
       'C': np.random.randn(1000)})
   df.to_pickle("data.pkl.compress", compression="gzip")  # explicit compression type
   df.to_pickle("data.pkl.xz", compression="infer")       # infer compression type from extension
   df.to_pickle("data.pkl.gz")                            # default, using "infer"
   df["A"].to_pickle("s1.pkl.bz2")

   df = pd.read_pickle("data.pkl.compress", compression="gzip")
   df = pd.read_pickle("data.pkl.xz", compression="infer")
   df = pd.read_pickle("data.pkl.gz")
   s = pd.read_pickle("s1.pkl.bz2")

.. ipython:: python
   :suppress:

   import os
   os.remove("data.pkl.compress")
   os.remove("data.pkl.xz")
   os.remove("data.pkl.gz")
   os.remove("s1.pkl.bz2")

.. _whatsnew_0200.enhancements.uint64_support:

UInt64 Support Improved
8 changes: 6 additions & 2 deletions pandas/core/generic.py
@@ -1278,17 +1278,21 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
                         if_exists=if_exists, index=index, index_label=index_label,
                         chunksize=chunksize, dtype=dtype)

    def to_pickle(self, path):
    def to_pickle(self, path, compression='infer'):
        """
        Pickle (serialize) object to input file path.

        Parameters
        ----------
        path : string
            File path
        compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
Contributor

need a versionadded tag

            a string representing the compression to use in the output file

            .. versionadded:: 0.20.0
        """
        from pandas.io.pickle import to_pickle
        return to_pickle(self, path)
        return to_pickle(self, path, compression=compression)

    def to_clipboard(self, excel=None, sep=None, **kwargs):
        """
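As an editorial aside, a quick round-trip check of the new ``compression``
keyword documented above (the file name and the equality check are
illustrative, not from the PR):

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': np.random.randn(100)})
df.to_pickle("tmp.pkl.xz", compression="infer")  # "infer" selects xz from the extension
out = pd.read_pickle("tmp.pkl.xz")               # inference applies on the read side too
assert df.equals(out)                            # the round trip preserves the frame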
14 changes: 10 additions & 4 deletions pandas/io/common.py
@@ -305,7 +305,7 @@ def _infer_compression(filepath_or_buffer, compression):


def _get_handle(path_or_buf, mode, encoding=None, compression=None,
                memory_map=False):
                memory_map=False, is_text=True):
    """
    Get file handle for given path/buffer and mode.

@@ -320,7 +320,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
        Supported compression protocols are gzip, bz2, zip, and xz
    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        whether the file/buffer is in text format (csv, json, etc.) or in
        binary mode (pickle, etc.)

    Returns
    -------
    f : file-like
@@ -394,13 +396,17 @@
        elif encoding:
            # Python 3 and encoding
            f = open(path_or_buf, mode, encoding=encoding)
        else:
        elif is_text:
            # Python 3 and no explicit encoding
            f = open(path_or_buf, mode, errors='replace')
        else:
            # Python 3 and binary mode
            f = open(path_or_buf, mode)
    handles.append(f)

    # in Python 3, convert BytesIO or fileobjects passed with an encoding
    if compat.PY3 and is_text and\
Contributor
Is there an opportunity to simplify things? What is the relationship between need_text_wrapping, compression, and is_text --- I will think about this, but I think currently, the logic for when TextIOWrapper gets applied is confusing.

Contributor Author

_get_handle needs to deal with various situations:

  • py2 or py3
  • binary (pickle, msgpack) or text (csv)
  • if text, what's the encoding
  • compression
  • memory map
  • open for read or write

maybe we can split _get_handle into two or more functions to make each single function simpler?

Contributor

I opened #15008: would love it if you could migrate your above comment to that issue. This way we can keep this pull request focused and minimalist.

            (compression or isinstance(f, need_text_wrapping)):
        from io import TextIOWrapper
        f = TextIOWrapper(f, encoding=encoding)
        handles.append(f)
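Editorial aside on the thread above: one possible shape for the split the
author floats, sketched with hypothetical helper names (not code from this
PR):

from io import TextIOWrapper

def _should_wrap_text(f, compression, is_text, need_text_wrapping):
    # On Python 3, compressed streams and raw byte buffers yield bytes; wrap
    # them in a text layer only when the caller wants text (csv, json, ...).
    return is_text and (compression or isinstance(f, need_text_wrapping))

def _wrap_text(f, encoding, handles):
    # Add the encoding layer and track the wrapper so it is closed together
    # with the underlying handle.
    f = TextIOWrapper(f, encoding=encoding)
    handles.append(f)
    return f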
52 changes: 41 additions & 11 deletions pandas/io/pickle.py
@@ -4,9 +4,10 @@
from numpy.lib.format import read_array, write_array
from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
from pandas.types.common import is_datetime64_dtype, _NS_DTYPE
from pandas.io.common import _get_handle, _infer_compression


def to_pickle(obj, path):
def to_pickle(obj, path, compression='infer'):
"""
Pickle (serialize) object to input file path

@@ -15,12 +16,23 @@ def to_pickle(obj, path):
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
Contributor

add a versionadded

        a string representing the compression to use in the output file

        .. versionadded:: 0.20.0
    """
    with open(path, 'wb') as f:
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    try:
        pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
    finally:
        for _f in fh:
            _f.close()


def read_pickle(path):
def read_pickle(path, compression='infer'):
"""
Load pickled pandas object (or any other pickled object) from the specified
file path
@@ -32,12 +44,32 @@ def read_pickle(path):
    ----------
    path : string
Contributor

It shouldn't be too hard to get read_pickle to also support reading from URLs. @goldenbull -- not sure if you are also interested in adding this feature. Ultimately all read methods should support compression and URL reading.

Contributor Author

the original API doesn't support URL reading; this feature can be added in the future.

Member

Would be a nice enhancement (and you're welcome to work on this!), but let's leave that for another PR.

Contributor

> let's leave that for another PR

It would fit well with a pull request to accomplish #15008.

        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2',
        '.xz', or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : type of object stored in file
    """

    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrap the file-handle open/close bookkeeping around func
        f, fh = _get_handle(path, 'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
@@ -48,26 +80,24 @@ def try_read(path, encoding=None):
        # cpickle
        # GH 6899
Contributor

so these routines need to be changed to read in the file once, e.g.

f, fh = _get_handle(....)
try:
    buffer = BytesIO(f.read())
finally:
    for _f in fh:
        _f.close()

then the operations are all

try:
    buffer.seek(0)
    pc.load(buffer, ....)
except:
    ...

etc, IOW, all we do is seek to the beginning of the buffer each time, rather than read the file in (potentially) 4 times.

        try:
            with open(path, 'rb') as fh:
                return pkl.load(fh)
            return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                with open(path, 'rb') as fh:
                    return pc.load(fh, encoding=encoding, compat=False)
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except:
                with open(path, 'rb') as fh:
                    return pc.load(fh, encoding=encoding, compat=True)
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except:
        if PY3:
            return try_read(path, encoding='latin1')
        raise


# compat with sparse pickle / unpickle

