[go: nahoru, domu]

Skip to content

Commit

Permalink
Merge pull request #2438 from SuryashankarDas:patch
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 332803829
  • Loading branch information
Copybara-Service committed Sep 21, 2020
2 parents 6282bc8 + 3802bc4 commit bb2ce95
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 23 deletions.
52 changes: 48 additions & 4 deletions tensorflow_datasets/core/download/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import io
import os
import re
from typing import Any, ContextManager, Iterable, Iterator, Tuple, Union
from typing import Any, ContextManager, Iterable, Iterator, Optional, Tuple, Union
import promise
import requests

Expand All @@ -44,12 +44,56 @@ def get_downloader(*args: Any, **kwargs: Any) -> '_Downloader':
return _Downloader(*args, **kwargs)


def _filename_from_content_disposition(
content_disposition: str,
) -> Optional[str]:
"""Extract the filename from the content disposition.
Parse the content_definition as defined in:
https://tools.ietf.org/html/rfc2616
Note:
* If both encoded (`filename*=`) and ascii (filename=) name are defined,
the function returns the ascii name, as encoding might create issue on
some systems
* If only the encoded name is defined (e.g.
`filename*=UTF-8''%e2%82%ac.txt`), the function return None as this is
not yet supported.
Args:
content_disposition: String to parse.
Returns:
filename: The filename, or None if filename could not be parsed
"""
match = re.findall(
# Regex (see unittests for examples):
# ` *` : Strip eventual whitespaces
# `['"]?` : Filename is optionally wrapped in quote
# `([^;\r\n"']+)` : Filename can be any symbol except those
# `;?` : Stop when encountering optional `;`
r"""filename= *['"]?([^;\r\n"']+)['"]? *;?""",
content_disposition,
flags=re.IGNORECASE,
)
if not match:
return None
elif len(match) != 1:
raise ValueError(
f'Error while parsing filename for: {content_disposition}\n'
f'Multiple filename detected: {list(match)}'
)
return match[0].rstrip()


def _get_filename(response: Response) -> str:
    """Return the filename to use for a downloaded response.

    The `content-disposition` header is tried first, through
    `_filename_from_content_disposition` (the single place where the header
    is parsed, so its case-insensitive matching and multiple-filename
    validation always apply). If the header is absent or yields no filename,
    the name is derived from the response url.

    Args:
      response: Response object exposing `headers` and `url`.

    Returns:
      The filename from the header when available, otherwise
      `utils.basename_from_url(response.url)`.
    """
    content_disposition = response.headers.get('content-disposition', None)
    if content_disposition:
        filename = _filename_from_content_disposition(content_disposition)
        if filename:
            return filename
    # Otherwise, fallback on extracting the name from the url.
    return utils.basename_from_url(response.url)


Expand Down
113 changes: 94 additions & 19 deletions tensorflow_datasets/core/download/downloader_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
import io
import os
import tempfile
from typing import Optional

from absl.testing import absltest
import pytest
import tensorflow.compat.v2 as tf
from tensorflow_datasets import testing
from tensorflow_datasets.core.download import downloader
Expand Down Expand Up @@ -138,22 +140,95 @@ def test_ftp_error(self):
promise.get()


class GetFilenameTest(testing.TestCase):
    """Checks `downloader._get_filename` with and without headers."""

    def test_no_headers(self):
        # Without a content-disposition header the name comes from the url.
        response = _FakeResponse('http://foo.bar/baz.zip', b'content')
        self.assertEqual(downloader._get_filename(response), 'baz.zip')

    def test_headers(self):
        # The header carries both the ascii and the encoded filename.
        headers = {
            'content-disposition': (
                'attachment;filename="hello.zip";'
                'filename*=UTF-8\'\'hello.zip'
            ),
        }
        response = _FakeResponse(
            'http://foo.bar/baz.zip', b'content', headers=headers
        )
        self.assertEqual(downloader._get_filename(response), 'hello.zip')


# Script entry point: delegate to the test runner provided by `testing`.
if __name__ == '__main__':
    testing.test_main()
# (content_disposition, expected_filename) pairs consumed by the parametrized
# test of `downloader._filename_from_content_disposition`.
# Grammar examples inspired from: https://tools.ietf.org/html/rfc6266#section-5
_CONTENT_DISPOSITION_FILENAME_PAIRS = [
    ("""attachment; filename=filename.txt""", 'filename.txt'),
    # Surrounding spaces should be stripped
    ("""attachment; filename= filename.txt """, 'filename.txt'),
    ("""attachment; filename= filename.txt ;""", 'filename.txt'),
    # If both encoded and ascii names are present, only keep the ascii one
    (
        """attachment; filename="EURO rates"; filename*=utf-8''%e2%82%ac%20rates""",
        'EURO rates',
    ),
    (
        """attachment; filename=EURO rates; filename*=utf-8''%e2%82%ac%20rates""",
        'EURO rates',
    ),
    (
        """attachment; filename=EXAMPLE-Im ößä.dat; filename*=iso-8859-1''EXAMPLE-%20I%27m%20%F6%DF%E4.dat""",
        'EXAMPLE-Im ößä.dat',
    ),
    (
        """attachment;filename="hello.zip";filename*=UTF-8''hello.zip""",
        'hello.zip',
    ),
    (
        """attachment;filename=hello.zip;filename*=UTF-8''hello.zip""",
        'hello.zip',
    ),
    # Should be case insensitive (first example has an unbalanced quote)
    ("""INLINE; FILENAME= "an example.html""", 'an example.html'),
    ("""Attachment; filename=example.html""", 'example.html'),
    # Encoded-only names are not supported for now: expect None
    ("""attachment; filename*=UTF-8''filename.txt""", None),
    ("""attachment; filename*=iso-8859-1'en'%A3%20rates""", None),
    # Multi-line header values are also supported
    (
        """attachment;
filename="hello.zip";
filename*=UTF-8''hello.zip""",
        'hello.zip',
    ),
    ("""attachment;filename*=UTF-8''hello.zip""", None),
    (
        """attachment;
filename*= UTF-8''%e2%82%ac%20rates.zip""",
        None,
    ),
]


@pytest.mark.parametrize(
    ('content_disposition', 'filename'), _CONTENT_DISPOSITION_FILENAME_PAIRS
)
def test_filename_from_content_disposition(
    content_disposition: str,
    filename: Optional[str],
):
    """Each header value must parse to its paired expected filename."""
    parsed = downloader._filename_from_content_disposition(content_disposition)
    assert parsed == filename


@pytest.mark.parametrize(
    ('content_disposition', 'filename'),
    [
        (
            # Filename should be parsed from the ascii name, not UTF-8
            """attachment;filename="hello.zip";filename*=UTF-8''other.zip""",
            'hello.zip'
        ),
        (
            # If ascii filename can't be parsed, filename parsed from url
            """attachment;filename*=UTF-8''other.zip""",
            'baz.zip'
        ),
        (
            # No headers, filename parsed from url
            None,
            'baz.zip'
        ),
    ],
)
def test_filename_from_headers(
    content_disposition: Optional[str],
    filename: Optional[str],
):
    """`_get_filename` must return the expected name for each header setup.

    Args:
      content_disposition: Header value to attach to the fake response, or
        None to build the response without headers.
      filename: Name `downloader._get_filename` is expected to return.
    """
    if content_disposition:
        headers = {
            'content-disposition': content_disposition,
        }
    else:
        headers = None
    resp = _FakeResponse('http://foo.bar/baz.zip', b'content', headers=headers)
    # Compare with `==`: `assert value, filename` would only use `filename`
    # as the failure message and pass for any non-empty result.
    assert downloader._get_filename(resp) == filename

0 comments on commit bb2ce95

Please sign in to comment.