Refactor compression code to expand URL support #14576

Closed
wants to merge 2 commits into from
Changes from all commits
6 changes: 3 additions & 3 deletions pandas/formats/format.py
@@ -1455,9 +1455,9 @@ def save(self):
f = self.path_or_buf
close = False
else:
f = _get_handle(self.path_or_buf, self.mode,
encoding=self.encoding,
compression=self.compression)
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=self.encoding,
compression=self.compression)
close = True

try:
210 changes: 126 additions & 84 deletions pandas/io/common.py
@@ -1,11 +1,9 @@
"""Common IO api utilities"""

import sys
import os
import csv
import codecs
import mmap
import zipfile
from contextlib import contextmanager, closing

from pandas.compat import StringIO, BytesIO, string_types, text_type
@@ -141,39 +139,6 @@ def _is_s3_url(url):
return False


def maybe_read_encoded_stream(reader, encoding=None, compression=None):
"""read an encoded stream from the reader and transform the bytes to
unicode if required based on the encoding

Parameters
----------
reader : a streamable file-like object
encoding : optional, the encoding to attempt to read

Returns
-------
a tuple of (a stream of decoded bytes, the encoding which was used)

"""

if compat.PY3 or encoding is not None: # pragma: no cover
if encoding:
errors = 'strict'
else:
errors = 'replace'
encoding = 'utf-8'

if compression == 'gzip':
reader = BytesIO(reader.read())
else:
reader = StringIO(reader.read().decode(encoding, errors))
else:
if compression == 'gzip':
reader = BytesIO(reader.read())
encoding = None
return reader, encoding


def _expand_user(filepath_or_buffer):
"""Return the argument with an initial component of ~ or ~user
replaced by that user's home directory.
@@ -237,18 +202,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
"""

if _is_url(filepath_or_buffer):
req = _urlopen(str(filepath_or_buffer))
if compression == 'infer':
content_encoding = req.headers.get('Content-Encoding', None)
if content_encoding == 'gzip':
compression = 'gzip'
else:
compression = None
# cat on the compression to the tuple returned by the function
to_return = (list(maybe_read_encoded_stream(req, encoding,
compression)) +
[compression])
return tuple(to_return)
url = str(filepath_or_buffer)
req = _urlopen(url)
content_encoding = req.headers.get('Content-Encoding', None)
if content_encoding == 'gzip':
# Override compression based on Content-Encoding header
compression = 'gzip'
@dhimmel (Contributor, Author) commented on Dec 3, 2016:

> i think inferring based on the file name is usually enough - but u can also use the Content if it's available

@jreback, Content-Encoding is not a versatile way to infer compression -- gzip is the only compression encoding that pandas and the Content-Encoding specification both support. I've modified it so inference on a URL uses the extension, just as a path would. However, if Content-Encoding is gzip, then compression is overridden. I don't expect this situation to arise very often. Does this make sense or should we just totally ignore Content-Encoding?

A contributor replied:
Yeah, I agree with your assessment. Since this was here, we can leave it. But I agree the filename inference is better / more standard.
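For reference, a minimal sketch of the behavior settled on here (the URL is hypothetical; this is not code from the PR):

```python
import pandas as pd

# Compression is inferred from the URL's extension, exactly as for a local path.
df = pd.read_csv('https://example.com/data.csv.gz')

# If the server responds with a Content-Encoding: gzip header, that overrides
# the extension-based inference, so the payload is still decompressed.
df = pd.read_csv('https://example.com/data.csv')
```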

reader = BytesIO(req.read())
return reader, encoding, compression

if _is_s3_url(filepath_or_buffer):
from pandas.io.s3 import get_filepath_or_buffer
@@ -276,64 +237,145 @@ def file_path_to_url(path):
return urljoin('file:', pathname2url(path))


# ZipFile is not a context manager for <= 2.6
# must be tuple index here since 2.6 doesn't use namedtuple for version_info
if sys.version_info[1] <= 6:
@contextmanager
def ZipFile(*args, **kwargs):
with closing(zipfile.ZipFile(*args, **kwargs)) as zf:
yield zf
else:
ZipFile = zipfile.ZipFile
_compression_to_extension = {
'gzip': '.gz',
'bz2': '.bz2',
'zip': '.zip',
'xz': '.xz',
}


def _infer_compression(filepath_or_buffer, compression):
"""
If compression='infer', infer compression. If compression
"""

# No compression has been explicitly specified
if compression is None:
return None

# Cannot infer compression of a buffer. Hence assume no compression.
is_path = isinstance(filepath_or_buffer, compat.string_types)
if compression == 'infer' and not is_path:
return None

# Infer compression from the filename/URL extension
if compression == 'infer':
for compression, extension in _compression_to_extension.items():
if filepath_or_buffer.endswith(extension):
return compression
return None

# Compression has been specified. Check that it's valid
if compression in _compression_to_extension:
return compression

msg = 'Unrecognized compression type: {}'.format(compression)
valid = ['infer', None] + sorted(_compression_to_extension)
msg += '\nValid compression types are {}'.format(valid)
raise ValueError(msg)
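A quick illustration of the intended semantics (calls sketched from the code above, not taken from the test suite):

```python
import io
from pandas.io.common import _infer_compression

_infer_compression('data.csv.gz', 'infer')         # 'gzip' (from the extension)
_infer_compression('data.csv', 'infer')            # None (no known extension)
_infer_compression(io.StringIO('a,b\n'), 'infer')  # None (cannot infer from a buffer)
_infer_compression('data.csv', 'bz2')              # 'bz2' (explicit and valid)
_infer_compression('data.csv', 'rar')              # raises ValueError
```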


def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
"""Gets file handle for given path and mode.
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
memory_map=False):
"""
if compression is not None:
if encoding is not None and not compat.PY3:
msg = 'encoding + compression not yet supported in Python 2'
Get file handle for given path/buffer and mode.

Parameters
----------
path_or_buf :
a path (str) or buffer
mode : str
mode to open path_or_buf with
encoding : str or None
compression : str or None
Supported compression protocols are gzip, bz2, zip, and xz
memory_map : boolean, default False
See parsers._parser_params for more information.

Returns
-------
f : file-like
A file-like object
handles : list of file-like objects
A list of the file-like objects that were opened in this function.
"""

handles = list()
f = path_or_buf
is_path = isinstance(path_or_buf, compat.string_types)

if compression:

if compat.PY2 and not is_path and encoding:
msg = 'compression with encoding is not yet supported in Python 2'
raise ValueError(msg)

# GZ Compression
if compression == 'gzip':
import gzip
f = gzip.GzipFile(path, mode)
if is_path:
f = gzip.open(path_or_buf, mode)
else:
f = gzip.GzipFile(fileobj=path_or_buf)

# BZ Compression
elif compression == 'bz2':
import bz2
f = bz2.BZ2File(path, mode)
if is_path:
f = bz2.BZ2File(path_or_buf, mode)
elif compat.PY2:
# Python 2's bz2 module can't take file objects, so have to
# run through decompress manually
f = StringIO(bz2.decompress(path_or_buf.read()))
A contributor commented:
I think you need to close the original file.

@dhimmel (Contributor, Author) replied:
By following with path_or_buf.close()?

A member replied:
As this is a user-provided file object, we shouldn't close it? @jreback, or is this a special case because of the decompressing?

@dhimmel (Contributor, Author) replied:
I ended up closing it... since after passing through bz2.decompress there will be no need for the original file. Not sure if this violates the design of how pandas deals with handles.
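Pulled out of context, the Python 2 path and the close that was settled on look like this (a sketch; `buf` stands for the user-supplied file object):

```python
import bz2
from pandas.compat import StringIO

# bz2 in Python 2 cannot wrap an already-open file object, so the stream is
# read and decompressed eagerly into an in-memory buffer...
f = StringIO(bz2.decompress(buf.read()))
# ...after which the original handle holds nothing we still need, so it is
# closed here rather than left to the caller.
buf.close()
```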

path_or_buf.close()
else:
f = bz2.BZ2File(path_or_buf)

# ZIP Compression
elif compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(path)
zip_file = zipfile.ZipFile(path_or_buf)
zip_names = zip_file.namelist()

if len(zip_names) == 1:
file_name = zip_names.pop()
f = zip_file.open(file_name)
f = zip_file.open(zip_names.pop())
elif len(zip_names) == 0:
raise ValueError('Zero files found in ZIP file {}'
.format(path))
.format(path_or_buf))
else:
raise ValueError('Multiple files found in ZIP file.'
' Only one file per ZIP :{}'
' Only one file per ZIP: {}'
.format(zip_names))

# XZ Compression
elif compression == 'xz':
lzma = compat.import_lzma()
f = lzma.LZMAFile(path, mode)
f = lzma.LZMAFile(path_or_buf, mode)

# Unrecognized Compression
A contributor commented:
ahh ok, we are doing the validation, nvm then. (assume we have some tests that hit this path)

@dhimmel (Contributor, Author) replied on Dec 4, 2016:

> assume we have some tests that hit this path

Actually, I don't think there were any tests for invalid compression (regardless of URL or not), so I added one in f2ce8f8.
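The added check presumably looks something like this (the exact test in f2ce8f8 may differ):

```python
import pytest
from pandas.io.common import _infer_compression

def test_invalid_compression():
    # Anything outside {'infer', None, 'gzip', 'bz2', 'zip', 'xz'} should raise.
    with pytest.raises(ValueError):
        _infer_compression('data.csv', compression='unzip')
```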

else:
raise ValueError('Unrecognized compression type: %s' %
compression)
if compat.PY3:
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding)
return f
else:
if compat.PY3:
if encoding:
f = open(path, mode, encoding=encoding)
else:
f = open(path, mode, errors='replace')
msg = 'Unrecognized compression type: {}'.format(compression)
raise ValueError(msg)

handles.append(f)

elif is_path:
if compat.PY2:
# Python 2
f = open(path_or_buf, mode)
elif encoding:
# Python 3 and encoding
f = open(path_or_buf, mode, encoding=encoding)
else:
f = open(path, mode)
# Python 3 and no explicit encoding
A contributor commented:
I don't think you need all of this, just

open(path_or_buf, mode, encoding=encoding)

encoding=None is valid on PY2.

we don't want to suppress issues with encoding, so we don't open with errors

A contributor replied:
oh, I c, we actually had this code from before...hmm. ok then.

f = open(path_or_buf, mode, errors='replace')
handles.append(f)

# in Python 3, convert BytesIO or fileobjects passed with an encoding
if compat.PY3 and (compression or isinstance(f, compat.BytesIO)):
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding)
handles.append(f)

if memory_map and hasattr(f, 'fileno'):
try:
@@ -347,7 +389,7 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
# leave the file handler as is then
pass

return f
return f, handles
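With the extra return value, callers now own cleanup of everything `_get_handle` opened; roughly (a sketch of the new contract, not code from this PR):

```python
from pandas.io.common import _get_handle

f, handles = _get_handle('data.csv.gz', 'r', encoding='utf-8',
                         compression='gzip')
try:
    text = f.read()
finally:
    # Close the handles this call created, innermost (most recently
    # appended) wrapper first.
    for h in reversed(handles):
        h.close()
```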


class MMapWrapper(BaseIterator):
6 changes: 4 additions & 2 deletions pandas/io/json.py
@@ -259,8 +259,10 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
exists = False

if exists:
with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh:
json = fh.read()
fh, handles = _get_handle(filepath_or_buffer, 'r',
encoding=encoding)
json = fh.read()
fh.close()
else:
json = filepath_or_buffer
elif hasattr(filepath_or_buffer, 'read'):