Merge pull request #2438 from SuryashankarDas:patch

PiperOrigin-RevId: 332803829
tensorflow · Sep 21, 2020 · bb2ce95 · bb2ce95
2 parents 6282bc8 + 3802bc4
commit bb2ce95
Show file tree

Hide file tree

Showing 2 changed files with 142 additions and 23 deletions.
diff --git a/tensorflow_datasets/core/download/downloader.py b/tensorflow_datasets/core/download/downloader.py
@@ -22,7 +22,7 @@
 import io
 import os
 import re
-from typing import Any, ContextManager, Iterable, Iterator, Tuple, Union
+from typing import Any, ContextManager, Iterable, Iterator, Optional, Tuple, Union
 import promise
 import requests
 
@@ -44,12 +44,56 @@ def get_downloader(*args: Any, **kwargs: Any) -> '_Downloader':
   return _Downloader(*args, **kwargs)
 
 
+def _filename_from_content_disposition(
+    content_disposition: str,
+) -> Optional[str]:
+  """Extract the filename from the content disposition.
+
+  Parse the content_disposition as defined in:
+  https://tools.ietf.org/html/rfc2616
+
+  Note:
+
+   * If both encoded (`filename*=`) and ascii (`filename=`) names are defined,
+     the function returns the ascii name, as encoding might create issues on
+     some systems.
+   * If only the encoded name is defined (e.g.
+     `filename*=UTF-8''%e2%82%ac.txt`), the function returns None as this is
+     not yet supported.
+
+  Args:
+      content_disposition: String to parse.
+
+  Returns:
+      filename: The filename, or None if filename could not be parsed
+  """
+  match = re.findall(
+      # Regex (see unittests for examples):
+      # ` *` : Strip optional whitespace
+      # `['"]?` : Filename is optionally wrapped in quotes
+      # `([^;\r\n"']+)` : Filename is any run of symbols except those
+      # `;?` : Stop when encountering optional `;`
+      r"""filename= *['"]?([^;\r\n"']+)['"]? *;?""",
+      content_disposition,
+      flags=re.IGNORECASE,
+  )
+  if not match:
+    return None
+  elif len(match) != 1:
+    raise ValueError(
+        f'Error while parsing filename for: {content_disposition}\n'
+        f'Multiple filename detected: {list(match)}'
+    )
+  return match[0].rstrip()
+
+
 def _get_filename(response: Response) -> str:
   content_disposition = response.headers.get('content-disposition', None)
   if content_disposition:
-    match = re.findall('filename="(.+?)"', content_disposition)
-    if match:
-      return match[0]
+    filename = _filename_from_content_disposition(content_disposition)
+    if filename:
+      return filename
+  # Otherwise, fall back on extracting the name from the url.
   return utils.basename_from_url(response.url)
 
 

diff --git a/tensorflow_datasets/core/download/downloader_test.py b/tensorflow_datasets/core/download/downloader_test.py
@@ -19,8 +19,10 @@
 import io
 import os
 import tempfile
+from typing import Optional
 
 from absl.testing import absltest
+import pytest
 import tensorflow.compat.v2 as tf
 from tensorflow_datasets import testing
 from tensorflow_datasets.core.download import downloader
@@ -138,22 +140,95 @@ def test_ftp_error(self):
       promise.get()
 
 
-class GetFilenameTest(testing.TestCase):
-
-  def test_no_headers(self):
-    resp = _FakeResponse('http://foo.bar/baz.zip', b'content')
-    res = downloader._get_filename(resp)
-    self.assertEqual(res, 'baz.zip')
-
-  def test_headers(self):
-    cdisp = ('attachment;filename="hello.zip";'
-             'filename*=UTF-8\'\'hello.zip')
-    resp = _FakeResponse('http://foo.bar/baz.zip', b'content', headers={
-        'content-disposition': cdisp,
-    })
-    res = downloader._get_filename(resp)
-    self.assertEqual(res, 'hello.zip')
-
-
-if __name__ == '__main__':
-  testing.test_main()
+# Grammar examples inspired from: https://tools.ietf.org/html/rfc6266#section-5
+_CONTENT_DISPOSITION_FILENAME_PAIRS = [
+    ("""attachment; filename=filename.txt""", 'filename.txt'),
+    # Should strip space
+    ("""attachment; filename=  filename.txt  """, 'filename.txt'),
+    ("""attachment; filename=  filename.txt  ;""", 'filename.txt'),
+    # If both encoded and ascii are present, only keep ascii
+    (
+        """attachment; filename="EURO rates"; filename*=utf-8''%e2%82%ac%20rates""",
+        'EURO rates',
+    ),
+    (
+        """attachment; filename=EURO rates; filename*=utf-8''%e2%82%ac%20rates""",
+        'EURO rates',
+    ),
+    (
+        """attachment; filename=EXAMPLE-Im ößä.dat; filename*=iso-8859-1''EXAMPLE-%20I%27m%20%F6%DF%E4.dat""",
+        'EXAMPLE-Im ößä.dat',
+    ),
+    (
+        """attachment;filename="hello.zip";filename*=UTF-8''hello.zip""",
+        'hello.zip',
+    ),
+    (
+        """attachment;filename=hello.zip;filename*=UTF-8''hello.zip""",
+        'hello.zip',
+    ),
+    # Should be case insensitive
+    ("""INLINE; FILENAME= "an example.html""", 'an example.html'),
+    ("""Attachment; filename=example.html""", 'example.html'),
+    # Only encoded not supported for now
+    ("""attachment; filename*=UTF-8''filename.txt""", None),
+    ("""attachment; filename*=iso-8859-1'en'%A3%20rates""", None),
+    # Multi-line also supported
+    (
+        """attachment;
+            filename="hello.zip";
+            filename*=UTF-8''hello.zip""",
+        'hello.zip',
+    ),
+    ("""attachment;filename*=UTF-8''hello.zip""", None),
+    (
+        """attachment;
+            filename*= UTF-8''%e2%82%ac%20rates.zip""",
+        None,
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    ('content_disposition', 'filename'), _CONTENT_DISPOSITION_FILENAME_PAIRS
+)
+def test_filename_from_content_disposition(
+    content_disposition: str,
+    filename: Optional[str],
+):
+  get_filename = downloader._filename_from_content_disposition
+  assert get_filename(content_disposition) == filename
+
+
+@pytest.mark.parametrize(
+    ('content_disposition', 'filename'),
+    [
+        (
+            # Filename should be parsed from the ascii name, not UTF-8
+            """attachment;filename="hello.zip";filename*=UTF-8''other.zip""",
+            'hello.zip'
+        ),
+        (
+            # If ascii filename can't be parsed, filename parsed from url
+            """attachment;filename*=UTF-8''other.zip""",
+            'baz.zip'
+        ),
+        (
+            # No headers, filename parsed from url
+            None,
+            'baz.zip'
+        ),
+    ],
+)
+def test_filename_from_headers(
+    content_disposition: Optional[str],
+    filename: Optional[str],
+):
+  """Checks `_get_filename` with and without a content-disposition header."""
+  if content_disposition:
+    headers = {'content-disposition': content_disposition}
+  else:
+    headers = None
+  resp = _FakeResponse('http://foo.bar/baz.zip', b'content', headers=headers)
+  # Compare with `==`: `assert value, expected` would only check truthiness.
+  assert downloader._get_filename(resp) == filename