-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
add compression support for 'read_pickle' and 'to_pickle' #13317
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
025a0cd
81d55a0
6df6611
b8c4175
1cb810b
9a07250
ccbeaa9
945e7bb
86afd25
d50e430
e9c5fd2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -305,7 +305,7 @@ def _infer_compression(filepath_or_buffer, compression): | |
|
||
|
||
def _get_handle(path_or_buf, mode, encoding=None, compression=None, | ||
memory_map=False): | ||
memory_map=False, is_text=True): | ||
""" | ||
Get file handle for given path/buffer and mode. | ||
|
||
|
@@ -320,7 +320,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, | |
Supported compression protocols are gzip, bz2, zip, and xz | ||
memory_map : boolean, default False | ||
See parsers._parser_params for more information. | ||
|
||
is_text : boolean, default True | ||
whether file/buffer is in text format (csv, json, etc.), or in binary | ||
mode (pickle, etc.) | ||
Returns | ||
------- | ||
f : file-like | ||
|
@@ -394,13 +396,17 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, | |
elif encoding: | ||
# Python 3 and encoding | ||
f = open(path_or_buf, mode, encoding=encoding) | ||
else: | ||
elif is_text: | ||
# Python 3 and no explicit encoding | ||
f = open(path_or_buf, mode, errors='replace') | ||
else: | ||
# Python 3 and binary mode | ||
f = open(path_or_buf, mode) | ||
handles.append(f) | ||
|
||
# in Python 3, convert BytesIO or fileobjects passed with an encoding | ||
if compat.PY3 and (compression or isinstance(f, need_text_wrapping)): | ||
if compat.PY3 and is_text and\ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there an opportunity to simplify things? What is the relationship between There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
maybe we can split There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I opened #15008: would love if you could migrate your above comment to that issue. This way we can keep this pull request focussed and minimalist. |
||
(compression or isinstance(f, need_text_wrapping)): | ||
from io import TextIOWrapper | ||
f = TextIOWrapper(f, encoding=encoding) | ||
handles.append(f) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,9 +4,10 @@ | |
from numpy.lib.format import read_array, write_array | ||
from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3 | ||
from pandas.types.common import is_datetime64_dtype, _NS_DTYPE | ||
from pandas.io.common import _get_handle, _infer_compression | ||
|
||
|
||
def to_pickle(obj, path): | ||
def to_pickle(obj, path, compression='infer'): | ||
""" | ||
Pickle (serialize) object to input file path | ||
|
||
|
@@ -15,12 +16,23 @@ def to_pickle(obj, path): | |
obj : any object | ||
path : string | ||
File path | ||
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a versionadded |
||
a string representing the compression to use in the output file | ||
|
||
.. versionadded:: 0.20.0 | ||
""" | ||
with open(path, 'wb') as f: | ||
inferred_compression = _infer_compression(path, compression) | ||
f, fh = _get_handle(path, 'wb', | ||
compression=inferred_compression, | ||
is_text=False) | ||
try: | ||
pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL) | ||
finally: | ||
for _f in fh: | ||
_f.close() | ||
|
||
|
||
def read_pickle(path): | ||
def read_pickle(path, compression='infer'): | ||
""" | ||
Load pickled pandas object (or any other pickled object) from the specified | ||
file path | ||
|
@@ -32,12 +44,32 @@ def read_pickle(path): | |
---------- | ||
path : string | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It shouldn't be too hard to get There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the original API doesn't support URL reading, this feature can be added in the future. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would be a nice enhancement (and welcome to work on this!), but let's leave that for another PR There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It would fit well with a pull request to accomplish #15008. |
||
File path | ||
compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer' | ||
For on-the-fly decompression of on-disk data. If 'infer', then use | ||
gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2', '.xz', | ||
or '.zip' respectively, and no decompression otherwise. | ||
Set to None for no decompression. | ||
|
||
.. versionadded:: 0.20.0 | ||
|
||
Returns | ||
------- | ||
unpickled : type of object stored in file | ||
""" | ||
|
||
inferred_compression = _infer_compression(path, compression) | ||
|
||
def read_wrapper(func): | ||
# wrapper file handle open/close operation | ||
f, fh = _get_handle(path, 'rb', | ||
compression=inferred_compression, | ||
is_text=False) | ||
try: | ||
return func(f) | ||
finally: | ||
for _f in fh: | ||
_f.close() | ||
|
||
def try_read(path, encoding=None): | ||
# try with cPickle | ||
# try with current pickle, if we have a Type Error then | ||
|
@@ -48,26 +80,24 @@ def try_read(path, encoding=None): | |
# cpickle | ||
# GH 6899 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so these routines need to be changed to read in the file once e.g.
then the operations are all
etc, IOW, all we do is seek to the beginning of the buffer each time, rather than read the file in (potentially) 4 times. |
||
try: | ||
with open(path, 'rb') as fh: | ||
return pkl.load(fh) | ||
return read_wrapper(lambda f: pkl.load(f)) | ||
except Exception: | ||
# reg/patched pickle | ||
try: | ||
with open(path, 'rb') as fh: | ||
return pc.load(fh, encoding=encoding, compat=False) | ||
|
||
return read_wrapper( | ||
lambda f: pc.load(f, encoding=encoding, compat=False)) | ||
# compat pickle | ||
except: | ||
with open(path, 'rb') as fh: | ||
return pc.load(fh, encoding=encoding, compat=True) | ||
|
||
return read_wrapper( | ||
lambda f: pc.load(f, encoding=encoding, compat=True)) | ||
try: | ||
return try_read(path) | ||
except: | ||
if PY3: | ||
return try_read(path, encoding='latin1') | ||
raise | ||
|
||
|
||
# compat with sparse pickle / unpickle | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
need a versionadded tag