ENH: allow gzip de-compression for files specified by a url #10649

Merged: 1 commit merged on Jul 24, 2015
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.17.0.txt
@@ -29,6 +29,7 @@ New features

- SQL io functions now accept a SQLAlchemy connectable. (:issue:`7877`)
- Enable writing complex values to HDF stores when using table format (:issue:`10447`)
- Enable reading gzip compressed files via URL, either by explicitly setting the compression parameter or by inferring from the presence of the HTTP Content-Encoding header in the response (:issue:`8685`)

.. _whatsnew_0170.enhancements.other:

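For context, here is a minimal usage sketch of the behavior described in the whatsnew entry above. The URL is hypothetical; compression="gzip" forces decompression of the downloaded stream, while compression="infer" falls back to the HTTP Content-Encoding header for URLs.

```python
import pandas as pd

# Hypothetical URL pointing at a gzip-compressed, tab-separated file.
url = "https://example.com/data/salary.table.gz"

# Explicitly ask for gzip decompression of the downloaded stream...
df = pd.read_table(url, compression="gzip")

# ...or let pandas infer it from the HTTP Content-Encoding response header.
df = pd.read_table(url, compression="infer")
```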
29 changes: 21 additions & 8 deletions pandas/io/common.py
@@ -73,7 +73,7 @@ def _is_s3_url(url):
return False


def maybe_read_encoded_stream(reader, encoding=None):
def maybe_read_encoded_stream(reader, encoding=None, compression=None):
"""read an encoded stream from the reader and transform the bytes to
unicode if required based on the encoding

@@ -94,8 +94,14 @@ def maybe_read_encoded_stream(reader, encoding=None):
else:
errors = 'replace'
encoding = 'utf-8'
reader = StringIO(reader.read().decode(encoding, errors))

if compression == 'gzip':
reader = BytesIO(reader.read())
else:
reader = StringIO(reader.read().decode(encoding, errors))
else:
if compression == 'gzip':
reader = BytesIO(reader.read())
encoding = None
return reader, encoding

@@ -118,7 +124,8 @@ def _expand_user(filepath_or_buffer):
return filepath_or_buffer


def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
compression=None):
"""
If the filepath_or_buffer is a url, translate and return the buffer
passthru otherwise.
@@ -130,12 +137,19 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):

Returns
-------
a filepath_or_buffer, the encoding
a filepath_or_buffer, the encoding, the compression
"""

if _is_url(filepath_or_buffer):
req = _urlopen(str(filepath_or_buffer))
Contributor:

You can also do compression = req.headers['Content-Encoding'] here, if it's being served as gzip-encoded content.

Author:

Yeah, that's a great idea. I couldn't set a Content-Encoding header on a file in GitHub, but I can in S3, so I'll add that code in and a new test.

return maybe_read_encoded_stream(req, encoding)
if compression == 'infer':
content_encoding = req.headers.get('Content-Encoding', None)
if content_encoding == 'gzip':
compression = 'gzip'
# append the compression to the tuple returned by the function
to_return = list(maybe_read_encoded_stream(req, encoding, compression)) + \
[compression]
return tuple(to_return)

if _is_s3_url(filepath_or_buffer):
try:
@@ -156,10 +170,9 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
k.key = parsed_url.path
filepath_or_buffer = BytesIO(k.get_contents_as_string(
encoding=encoding))
return filepath_or_buffer, None

return filepath_or_buffer, None, compression

return _expand_user(filepath_or_buffer), None
return _expand_user(filepath_or_buffer), None, compression


def file_path_to_url(path):
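Stripped of the pandas plumbing, the inference added to get_filepath_or_buffer above amounts to the following standalone sketch. Python 3's urllib is used purely for illustration, and the helper name fetch_maybe_gzipped is made up; it is not part of the PR.

```python
import gzip
from io import BytesIO
from urllib.request import urlopen


def fetch_maybe_gzipped(url, compression="infer"):
    # Download the URL and return a readable stream, decompressing with gzip
    # when requested explicitly or advertised by the server.
    resp = urlopen(url)
    if compression == "infer":
        # Same idea as the diff above: trust the HTTP Content-Encoding header.
        if resp.headers.get("Content-Encoding") == "gzip":
            compression = "gzip"
    buf = BytesIO(resp.read())  # keep raw bytes; gzip needs a binary buffer
    if compression == "gzip":
        return gzip.GzipFile(fileobj=buf)
    return buf
```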
2 changes: 1 addition & 1 deletion pandas/io/json.py
@@ -172,7 +172,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
result : Series or DataFrame
"""

filepath_or_buffer, _ = get_filepath_or_buffer(path_or_buf)
filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf)
if isinstance(filepath_or_buffer, compat.string_types):
try:
exists = os.path.exists(filepath_or_buffer)
2 changes: 1 addition & 1 deletion pandas/io/packers.py
@@ -126,7 +126,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
obj : type of object stored in file

"""
path_or_buf, _ = get_filepath_or_buffer(path_or_buf)
path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
if iterator:
return Iterator(path_or_buf)

18 changes: 11 additions & 7 deletions pandas/io/parsers.py
@@ -26,6 +26,7 @@
import pandas.tslib as tslib
import pandas.parser as _parser


class ParserWarning(Warning):
pass

@@ -234,8 +235,10 @@ def _read(filepath_or_buffer, kwds):
if skipfooter is not None:
kwds['skip_footer'] = skipfooter

filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer,
encoding)
filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer,
encoding,
compression=kwds.get('compression', None))
kwds['compression'] = compression

if kwds.get('date_parser', None) is not None:
if isinstance(kwds['parse_dates'], bool):
@@ -402,8 +405,9 @@ def parser_f(filepath_or_buffer,
delimiter = sep

if delim_whitespace and delimiter is not default_sep:
raise ValueError("Specified a delimiter with both sep and"\
" delim_whitespace=True; you can only specify one.")
raise ValueError("Specified a delimiter with both sep and"
" delim_whitespace=True; you can only"
" specify one.")

if engine is not None:
engine_specified = True
@@ -1711,7 +1715,7 @@ def _infer_columns(self):
num_original_columns = ncols
if not names:
if self.prefix:
columns = [['%s%d' % (self.prefix,i) for i in range(ncols)]]
columns = [['%s%d' % (self.prefix, i) for i in range(ncols)]]
else:
columns = [lrange(ncols)]
columns = self._handle_usecols(columns, columns[0])
@@ -2233,8 +2237,8 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None):
if index_col is None or index_col is False:
index = Index([])
else:
index = [ np.empty(0, dtype=dtype.get(index_name, np.object))
for index_name in index_names ]
index = [np.empty(0, dtype=dtype.get(index_name, np.object))
for index_name in index_names]
index = MultiIndex.from_arrays(index, names=index_names)
index_col.sort()
for i, n in enumerate(index_col):
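To make the calling convention explicit: after this change get_filepath_or_buffer returns a three-tuple, and _read writes the (possibly header-inferred) compression back into the keyword dict so the chosen parser engine decompresses the stream. A rough sketch of that flow, with a hypothetical wrapper name and simplified arguments, is shown below.

```python
from pandas.io.common import get_filepath_or_buffer


def _read_sketch(filepath_or_buffer, kwds):
    # Hypothetical simplification of pandas.io.parsers._read after this PR.
    filepath_or_buffer, _, compression = get_filepath_or_buffer(
        filepath_or_buffer,
        kwds.get("encoding"),
        compression=kwds.get("compression"),
    )
    # Forward whatever compression was resolved (explicit, or inferred from
    # the Content-Encoding header) to the parser engine.
    kwds["compression"] = compression
    return filepath_or_buffer, kwds
```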
2 changes: 1 addition & 1 deletion pandas/io/stata.py
@@ -932,7 +932,7 @@ def __init__(self, path_or_buf, convert_dates=True,

self._native_byteorder = _set_endianness(sys.byteorder)
if isinstance(path_or_buf, str):
path_or_buf, encoding = get_filepath_or_buffer(
path_or_buf, encoding, _ = get_filepath_or_buffer(
path_or_buf, encoding=self._default_encoding
)

Binary file added pandas/io/tests/data/salary.table.gz
Binary file not shown.
4 changes: 2 additions & 2 deletions pandas/io/tests/test_common.py
@@ -29,12 +29,12 @@ def test_expand_user_normal_path(self):

def test_get_filepath_or_buffer_with_path(self):
filename = '~/sometest'
filepath_or_buffer, _ = common.get_filepath_or_buffer(filename)
filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename)
self.assertNotEqual(filepath_or_buffer, filename)
self.assertNotIn('~', filepath_or_buffer)
self.assertEqual(os.path.expanduser(filename), filepath_or_buffer)

def test_get_filepath_or_buffer_with_buffer(self):
input_buffer = StringIO()
filepath_or_buffer, _ = common.get_filepath_or_buffer(input_buffer)
filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer)
self.assertEqual(filepath_or_buffer, input_buffer)
21 changes: 21 additions & 0 deletions pandas/io/tests/test_parsers.py
@@ -3070,6 +3070,7 @@ def test_whitespace_lines(self):
df = self.read_csv(StringIO(data))
tm.assert_almost_equal(df.values, expected)

Contributor:

I would put both of these in a new TestNetwork class (below TestS3), or just throwing them in there is fine too. You can put some of the boilerplate in the setUp call, e.g. the file path to the salary table.


class TestFwfColspaceSniffing(tm.TestCase):
def test_full_file(self):
# File with all values
@@ -4060,6 +4061,26 @@ def test_convert_sql_column_decimals(self):
assert_same_values_and_dtype(result, expected)


class TestUrlGz(tm.TestCase):
def setUp(self):
dirpath = tm.get_data_path()
localtable = os.path.join(dirpath, 'salary.table')
self.local_table = read_table(localtable)

@tm.network
def test_url_gz(self):
url = ('https://raw.github.com/mdagost/pandas/url_gzip_fix/'
'pandas/io/tests/data/salary.table.gz')
url_table = read_table(url, compression="gzip", engine="python")
tm.assert_frame_equal(url_table, self.local_table)

@tm.network
def test_url_gz_infer(self):
url = ('https://s3.amazonaws.com/pandas-url-test/salary.table.gz')
url_table = read_table(url, compression="infer", engine="python")
tm.assert_frame_equal(url_table, self.local_table)


class TestS3(tm.TestCase):
def setUp(self):
try: