[URL] Canonicalize non-special URLs Add CanonicalizeNonSpecialURL functions, putting the followings into together: - https://crrev.com/c/5028539 (Parser for non-special URLs) - https://crrev.com/c/5092012 (Canonicalize non-special hosts) - https://crrev.com/c/5095842 (Canonicalize non-special paths) This CL is separate from the main CL (https://crrev.com/c/4929491). Bug: 1416006 Change-Id: If7125ff5f052e465dcaa5876061690697899872b Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5114580 Reviewed-by: Timothy Gu <timothygu@chromium.org> Commit-Queue: Hayato Ito <hayato@chromium.org> Cr-Commit-Position: refs/heads/main@{#1236752}

commit: 735a5cbcef5c4d88e84b2258ad6cfdf05300e2a3 [log] [tgz]
author: Hayato Ito <hayato@chromium.org> Wed Dec 13 02:49:38 2023
committer: Chromium LUCI CQ <chromium-scoped@luci-project-accounts.iam.gserviceaccount.com> Wed Dec 13 02:49:38 2023
tree: 6e3029ea7a32ca170f4318e4f6c210040ffc29b0
parent: 34a37a021f24ad030dd1a578ea5d786cef84a957 [diff]
diff --git a/url/BUILD.gn b/url/BUILD.gn
index c525c16..0ac17c0 100644
--- a/url/BUILD.gn
+++ b/url/BUILD.gn

@@ -41,6 +41,7 @@
     "url_canon_ip.cc",
     "url_canon_ip.h",
     "url_canon_mailtourl.cc",
+    "url_canon_non_special_url.cc",
     "url_canon_path.cc",
     "url_canon_pathurl.cc",
     "url_canon_query.cc",

diff --git a/url/url_canon.h b/url/url_canon.h
index 913b368..71343b8 100644
--- a/url/url_canon.h
+++ b/url/url_canon.h

@@ -714,6 +714,22 @@
                              CanonOutput* output,
                              Parsed* new_parsed);
 
+// Use for non-special URLs.
+COMPONENT_EXPORT(URL)
+bool CanonicalizeNonSpecialURL(const char* spec,
+                               int spec_len,
+                               const Parsed& parsed,
+                               CharsetConverter* query_converter,
+                               CanonOutput& output,
+                               Parsed& new_parsed);
+COMPONENT_EXPORT(URL)
+bool CanonicalizeNonSpecialURL(const char16_t* spec,
+                               int spec_len,
+                               const Parsed& parsed,
+                               CharsetConverter* query_converter,
+                               CanonOutput& output,
+                               Parsed& new_parsed);
+
 // Use for file URLs.
 COMPONENT_EXPORT(URL)
 bool CanonicalizeFileURL(const char* spec,

diff --git a/url/url_canon_fileurl.cc b/url/url_canon_fileurl.cc
index 802fe42..a5af5fbe 100644
--- a/url/url_canon_fileurl.cc
+++ b/url/url_canon_fileurl.cc

@@ -127,6 +127,8 @@
                            CharsetConverter* query_converter,
                            CanonOutput* output,
                            Parsed* new_parsed) {
+  DCHECK(!parsed.has_opaque_path);
+
   // Things we don't set in file: URLs.
   new_parsed->username = Component();
   new_parsed->password = Component();

diff --git a/url/url_canon_non_special_url.cc b/url/url_canon_non_special_url.cc
new file mode 100644
index 0000000..eb567b4
--- /dev/null
+++ b/url/url_canon_non_special_url.cc

@@ -0,0 +1,132 @@
+// Copyright 2023 The Chromium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Functions to canonicalize non-special URLs.
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+
+namespace url {
+
+namespace {
+
+template <typename CHAR>
+bool DoCanonicalizeNonSpecialURL(const URLComponentSource<CHAR>& source,
+                                 const Parsed& parsed,
+                                 CharsetConverter* query_converter,
+                                 CanonOutput& output,
+                                 Parsed& new_parsed) {
+  // The implementation is similar to `DoCanonicalizeStandardURL()`, but there
+  // are many subtle differences. So we have a different function for
+  // canonicalizing non-special URLs.
+
+  DCHECK(!parsed.has_opaque_path);
+
+  // Scheme: this will append the colon.
+  bool success = CanonicalizeScheme(source.scheme, parsed.scheme, &output,
+                                    &new_parsed.scheme);
+  bool have_authority =
+      (parsed.username.is_valid() || parsed.password.is_valid() ||
+       parsed.host.is_valid() || parsed.port.is_valid());
+
+  // Non-special URL examples which should be carefully handled:
+  //
+  // | URL      | parsed.user   | parsed.host   | have_authority | Valid URL? |
+  // |----------+---------------+---------------+----------------+------------|
+  // | git:/a   | invalid       | invalid       | false          | valid      |
+  // | git://@/ | valid (empty) | invalid       | true           | invalid    |
+  // | git:///  | invalid       | valid (empty) | true           | valid      |
+
+  if (have_authority) {
+    // Only write the authority separators when we have a scheme.
+    if (parsed.scheme.is_valid()) {
+      output.push_back('/');
+      output.push_back('/');
+    }
+
+    // User info: the canonicalizer will handle the : and @.
+    success &= CanonicalizeUserInfo(source.username, parsed.username,
+                                    source.password, parsed.password, &output,
+                                    &new_parsed.username, &new_parsed.password);
+
+    // Host
+    if (parsed.host.is_valid()) {
+      success &= CanonicalizeNonSpecialHost(source.host, parsed.host, output,
+                                            new_parsed.host);
+    } else {
+      // URL is invalid if `have_authority` is true, but `parsed.host` is
+      // invalid. Example: "git://@/".
+      success = false;
+    }
+
+    // Port
+    success &= CanonicalizePort(source.port, parsed.port, PORT_UNSPECIFIED,
+                                &output, &new_parsed.port);
+  } else {
+    // No authority, clear the components.
+    new_parsed.host.reset();
+    new_parsed.username.reset();
+    new_parsed.password.reset();
+    new_parsed.port.reset();
+  }
+
+  // Path
+  if (parsed.path.is_valid()) {
+    success &=
+        CanonicalizePath(source.path, parsed.path, CanonMode::kNonSpecialURL,
+                         &output, &new_parsed.path);
+  } else {
+    new_parsed.path.reset();
+  }
+
+  // Query
+  CanonicalizeQuery(source.query, parsed.query, query_converter, &output,
+                    &new_parsed.query);
+
+  // Ref: ignore failure for this, since the page can probably still be loaded.
+  CanonicalizeRef(source.ref, parsed.ref, &output, &new_parsed.ref);
+
+  // Carry over the flag for potentially dangling markup:
+  if (parsed.potentially_dangling_markup) {
+    new_parsed.potentially_dangling_markup = true;
+  }
+
+  return success;
+}
+
+}  // namespace
+
+bool CanonicalizeNonSpecialURL(const char* spec,
+                               int spec_len,
+                               const Parsed& parsed,
+                               CharsetConverter* query_converter,
+                               CanonOutput& output,
+                               Parsed& new_parsed) {
+  // Carry over the flag.
+  new_parsed.has_opaque_path = parsed.has_opaque_path;
+
+  if (parsed.has_opaque_path) {
+    return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
+  }
+  return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
+                                     query_converter, output, new_parsed);
+}
+
+bool CanonicalizeNonSpecialURL(const char16_t* spec,
+                               int spec_len,
+                               const Parsed& parsed,
+                               CharsetConverter* query_converter,
+                               CanonOutput& output,
+                               Parsed& new_parsed) {
+  // Carry over the flag.
+  new_parsed.has_opaque_path = parsed.has_opaque_path;
+
+  if (parsed.has_opaque_path) {
+    return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
+  }
+  return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
+                                     query_converter, output, new_parsed);
+}
+
+}  // namespace url

diff --git a/url/url_canon_stdurl.cc b/url/url_canon_stdurl.cc
index 8096b56..304ca4c0 100644
--- a/url/url_canon_stdurl.cc
+++ b/url/url_canon_stdurl.cc

@@ -20,6 +20,8 @@
                                CharsetConverter* query_converter,
                                CanonOutput* output,
                                Parsed* new_parsed) {
+  DCHECK(!parsed.has_opaque_path);
+
   // Scheme: this will append the colon.
   bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
                                     output, &new_parsed->scheme);

diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc
index dff0f71..fd10ca0 100644
--- a/url/url_canon_unittest.cc
+++ b/url/url_canon_unittest.cc

@@ -1778,6 +1778,103 @@
   }
 }
 
+TEST(URLCanonTest, CanonicalizeNonSpecialURL) {
+  // The individual component canonicalize tests should have caught the cases
+  // for each of those components. Here, we just need to test that the various
+  // parts are included or excluded properly, and have the correct separators.
+  struct URLCase {
+    const std::string_view input;
+    const std::string_view expected;
+    bool expected_success;
+  } cases[] = {
+      // Basic cases.
+      {"git://host:80/path?a=b#ref", "git://host:80/path?a=b#ref", true},
+      {"git://host", "git://host", true},
+      {"git://host/", "git://host/", true},
+      {"git://HosT/", "git://HosT/", true},
+      {"git://..", "git://..", true},
+      {"git://../", "git://../", true},
+      {"git://../..", "git://../", true},
+
+      // Empty hosts.
+      {"git://", "git://", true},
+      {"git:///", "git:///", true},
+      {"git:////", "git:////", true},
+      {"git:///a", "git:///a", true},
+      {"git:///a/../b", "git:///b", true},
+      {"git:///..", "git:///", true},
+
+      // No hosts.
+      {"git:/", "git:/", true},
+      {"git:/a", "git:/a", true},
+      {"git:/a/../b", "git:/b", true},
+      {"git:/..", "git:/", true},
+      {"git:/../", "git:/", true},
+      {"git:/../..", "git:/", true},
+
+      // Users.
+      {"git://@host", "git://host", true},
+      {"git:// @host", "git://%20@host", true},
+      {"git://\\@host", "git://%5C@host", true},
+
+      // Paths.
+      {"git://host/path", "git://host/path", true},
+      {"git://host/p ath", "git://host/p%20ath", true},
+      {"git://host/a/../b", "git://host/b", true},
+      {"git://host/..", "git://host/", true},
+      {"git://host/../", "git://host/", true},
+      {"git://host/../..", "git://host/", true},
+      {"git://host/.", "git://host/", true},
+      {"git://host/./", "git://host/", true},
+      {"git://host/./.", "git://host/", true},
+      // Backslashes.
+      {"git://host/a\\..\\b", "git://host/a\\..\\b", true},
+
+      // IPv6.
+      {"git://[1:2:0:0:5:0:0:0]", "git://[1:2:0:0:5::]", true},
+      {"git://[1:2:0:0:5:0:0:0]/", "git://[1:2:0:0:5::]/", true},
+      {"git://[1:2:0:0:5:0:0:0]/path", "git://[1:2:0:0:5::]/path", true},
+
+      // IPv4 is unsupported.
+      {"git://127.00.0.1", "git://127.00.0.1", true},
+      {"git://127.1000.0.1", "git://127.1000.0.1", true},
+
+      // Invalid URLs.
+      {"git://@", "git://", false},
+      // Forbidden host code points.
+      {"git://<", "git://", false},
+      {"git:// /", "git:///", false},
+      // Backslashes cannot be used as host terminators.
+      {"git://host\\a/../b", "git://host/b", false},
+
+      // Opaque paths.
+      {"git:", "git:", true},
+      {"git:opaque", "git:opaque", true},
+      {"git:o p a q u e", "git:o p a q u e", true},
+      {"git: <", "git: <", true},
+      {"git:opaque/a/../b", "git:opaque/a/../b", true},
+      {"git:opaque\\a\\..\\b", "git:opaque\\a\\..\\b", true},
+      {"git:\\a", "git:\\a", true},
+      // Like URNs.
+      {"git:a:b:c:123", "git:a:b:c:123", true},
+  };
+
+  for (const auto& i : cases) {
+    SCOPED_TRACE(i.input);
+    Parsed parsed;
+    ParseNonSpecialURL(i.input.data(), i.input.size(), &parsed);
+    Parsed out_parsed;
+    std::string out_str;
+    StdStringCanonOutput output(&out_str);
+    bool success = CanonicalizeNonSpecialURL(
+        i.input.data(), i.input.size(), parsed,
+        /*query_converter=*/nullptr, output, out_parsed);
+    output.Complete();
+    EXPECT_EQ(success, i.expected_success);
+    EXPECT_EQ(out_str, i.expected);
+  }
+}
+
 // The codepath here is the same as for regular canonicalization, so we just
 // need to test that things are replaced or not correctly.
 TEST(URLCanonTest, ReplaceStandardURL) {

diff --git a/url/url_parse_unittest.cc b/url/url_parse_unittest.cc
index 6edc3f9..8d8c83e3 100644
--- a/url/url_parse_unittest.cc
+++ b/url/url_parse_unittest.cc

@@ -686,6 +686,10 @@
      nullptr},
     {"git://ho\\st/", "git", nullptr, nullptr, "ho\\st", -1, "/", nullptr,
      nullptr},
+    // Empty users
+    {"git://@host", "git", "", nullptr, "host", -1, nullptr, nullptr, nullptr},
+    // Empty user and invalid host. "git://@" is an invalid URL.
+    {"git://@", "git", "", nullptr, nullptr, -1, nullptr, nullptr, nullptr},
     // Empty host cases
     {"git://", "git", nullptr, nullptr, "", -1, nullptr, nullptr, nullptr},
     {"git:///", "git", nullptr, nullptr, "", -1, "/", nullptr, nullptr},
commit	735a5cbcef5c4d88e84b2258ad6cfdf05300e2a3	[log] [tgz]
author	Hayato Ito <hayato@chromium.org>	Wed Dec 13 02:49:38 2023
committer	Chromium LUCI CQ <chromium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Wed Dec 13 02:49:38 2023
tree	6e3029ea7a32ca170f4318e4f6c210040ffc29b0
parent	34a37a021f24ad030dd1a578ea5d786cef84a957 [diff]