[URL] Canonicalize non-special URLs
Add CanonicalizeNonSpecialURL functions, putting the followings into
together:
- https://crrev.com/c/5028539 (Parser for non-special URLs)
- https://crrev.com/c/5092012 (Canonicalize non-special hosts)
- https://crrev.com/c/5095842 (Canonicalize non-special paths)
This CL is separate from the main CL (https://crrev.com/c/4929491).
Bug: 1416006
Change-Id: If7125ff5f052e465dcaa5876061690697899872b
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5114580
Reviewed-by: Timothy Gu <timothygu@chromium.org>
Commit-Queue: Hayato Ito <hayato@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1236752}
diff --git a/url/BUILD.gn b/url/BUILD.gn
index c525c16..0ac17c0 100644
--- a/url/BUILD.gn
+++ b/url/BUILD.gn
@@ -41,6 +41,7 @@
"url_canon_ip.cc",
"url_canon_ip.h",
"url_canon_mailtourl.cc",
+ "url_canon_non_special_url.cc",
"url_canon_path.cc",
"url_canon_pathurl.cc",
"url_canon_query.cc",
diff --git a/url/url_canon.h b/url/url_canon.h
index 913b368..71343b8 100644
--- a/url/url_canon.h
+++ b/url/url_canon.h
@@ -714,6 +714,22 @@
CanonOutput* output,
Parsed* new_parsed);
+// Use for non-special URLs.
+COMPONENT_EXPORT(URL)
+bool CanonicalizeNonSpecialURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput& output,
+ Parsed& new_parsed);
+COMPONENT_EXPORT(URL)
+bool CanonicalizeNonSpecialURL(const char16_t* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput& output,
+ Parsed& new_parsed);
+
// Use for file URLs.
COMPONENT_EXPORT(URL)
bool CanonicalizeFileURL(const char* spec,
diff --git a/url/url_canon_fileurl.cc b/url/url_canon_fileurl.cc
index 802fe42..a5af5fbe 100644
--- a/url/url_canon_fileurl.cc
+++ b/url/url_canon_fileurl.cc
@@ -127,6 +127,8 @@
CharsetConverter* query_converter,
CanonOutput* output,
Parsed* new_parsed) {
+ DCHECK(!parsed.has_opaque_path);
+
// Things we don't set in file: URLs.
new_parsed->username = Component();
new_parsed->password = Component();
diff --git a/url/url_canon_non_special_url.cc b/url/url_canon_non_special_url.cc
new file mode 100644
index 0000000..eb567b4
--- /dev/null
+++ b/url/url_canon_non_special_url.cc
@@ -0,0 +1,132 @@
+// Copyright 2023 The Chromium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Functions to canonicalize non-special URLs.
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+
+namespace url {
+
+namespace {
+
+template <typename CHAR>
+bool DoCanonicalizeNonSpecialURL(const URLComponentSource<CHAR>& source,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput& output,
+ Parsed& new_parsed) {
+ // The implementation is similar to `DoCanonicalizeStandardURL()`, but there
+ // are many subtle differences. So we have a different function for
+ // canonicalizing non-special URLs.
+
+ DCHECK(!parsed.has_opaque_path);
+
+ // Scheme: this will append the colon.
+ bool success = CanonicalizeScheme(source.scheme, parsed.scheme, &output,
+ &new_parsed.scheme);
+ bool have_authority =
+ (parsed.username.is_valid() || parsed.password.is_valid() ||
+ parsed.host.is_valid() || parsed.port.is_valid());
+
+ // Non-special URL examples which should be carefully handled:
+ //
+ // | URL | parsed.user | parsed.host | have_authority | Valid URL? |
+ // |----------+---------------+---------------+----------------+------------|
+ // | git:/a | invalid | invalid | false | valid |
+ // | git://@/ | valid (empty) | invalid | true | invalid |
+ // | git:/// | invalid | valid (empty) | true | valid |
+
+ if (have_authority) {
+ // Only write the authority separators when we have a scheme.
+ if (parsed.scheme.is_valid()) {
+ output.push_back('/');
+ output.push_back('/');
+ }
+
+ // User info: the canonicalizer will handle the : and @.
+ success &= CanonicalizeUserInfo(source.username, parsed.username,
+ source.password, parsed.password, &output,
+ &new_parsed.username, &new_parsed.password);
+
+ // Host
+ if (parsed.host.is_valid()) {
+ success &= CanonicalizeNonSpecialHost(source.host, parsed.host, output,
+ new_parsed.host);
+ } else {
+ // URL is invalid if `have_authority` is true, but `parsed.host` is
+ // invalid. Example: "git://@/".
+ success = false;
+ }
+
+ // Port
+ success &= CanonicalizePort(source.port, parsed.port, PORT_UNSPECIFIED,
+ &output, &new_parsed.port);
+ } else {
+ // No authority, clear the components.
+ new_parsed.host.reset();
+ new_parsed.username.reset();
+ new_parsed.password.reset();
+ new_parsed.port.reset();
+ }
+
+ // Path
+ if (parsed.path.is_valid()) {
+ success &=
+ CanonicalizePath(source.path, parsed.path, CanonMode::kNonSpecialURL,
+ &output, &new_parsed.path);
+ } else {
+ new_parsed.path.reset();
+ }
+
+ // Query
+ CanonicalizeQuery(source.query, parsed.query, query_converter, &output,
+ &new_parsed.query);
+
+ // Ref: ignore failure for this, since the page can probably still be loaded.
+ CanonicalizeRef(source.ref, parsed.ref, &output, &new_parsed.ref);
+
+ // Carry over the flag for potentially dangling markup:
+ if (parsed.potentially_dangling_markup) {
+ new_parsed.potentially_dangling_markup = true;
+ }
+
+ return success;
+}
+
+} // namespace
+
+bool CanonicalizeNonSpecialURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput& output,
+ Parsed& new_parsed) {
+ // Carry over the flag.
+ new_parsed.has_opaque_path = parsed.has_opaque_path;
+
+ if (parsed.has_opaque_path) {
+ return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
+ }
+ return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
+ query_converter, output, new_parsed);
+}
+
+bool CanonicalizeNonSpecialURL(const char16_t* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput& output,
+ Parsed& new_parsed) {
+ // Carry over the flag.
+ new_parsed.has_opaque_path = parsed.has_opaque_path;
+
+ if (parsed.has_opaque_path) {
+ return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
+ }
+ return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
+ query_converter, output, new_parsed);
+}
+
+} // namespace url
diff --git a/url/url_canon_stdurl.cc b/url/url_canon_stdurl.cc
index 8096b56..304ca4c0 100644
--- a/url/url_canon_stdurl.cc
+++ b/url/url_canon_stdurl.cc
@@ -20,6 +20,8 @@
CharsetConverter* query_converter,
CanonOutput* output,
Parsed* new_parsed) {
+ DCHECK(!parsed.has_opaque_path);
+
// Scheme: this will append the colon.
bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
output, &new_parsed->scheme);
diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc
index dff0f71..fd10ca0 100644
--- a/url/url_canon_unittest.cc
+++ b/url/url_canon_unittest.cc
@@ -1778,6 +1778,103 @@
}
}
+TEST(URLCanonTest, CanonicalizeNonSpecialURL) {
+ // The individual component canonicalize tests should have caught the cases
+ // for each of those components. Here, we just need to test that the various
+ // parts are included or excluded properly, and have the correct separators.
+ struct URLCase {
+ const std::string_view input;
+ const std::string_view expected;
+ bool expected_success;
+ } cases[] = {
+ // Basic cases.
+ {"git://host:80/path?a=b#ref", "git://host:80/path?a=b#ref", true},
+ {"git://host", "git://host", true},
+ {"git://host/", "git://host/", true},
+ {"git://HosT/", "git://HosT/", true},
+ {"git://..", "git://..", true},
+ {"git://../", "git://../", true},
+ {"git://../..", "git://../", true},
+
+ // Empty hosts.
+ {"git://", "git://", true},
+ {"git:///", "git:///", true},
+ {"git:////", "git:////", true},
+ {"git:///a", "git:///a", true},
+ {"git:///a/../b", "git:///b", true},
+ {"git:///..", "git:///", true},
+
+ // No hosts.
+ {"git:/", "git:/", true},
+ {"git:/a", "git:/a", true},
+ {"git:/a/../b", "git:/b", true},
+ {"git:/..", "git:/", true},
+ {"git:/../", "git:/", true},
+ {"git:/../..", "git:/", true},
+
+ // Users.
+ {"git://@host", "git://host", true},
+ {"git:// @host", "git://%20@host", true},
+ {"git://\\@host", "git://%5C@host", true},
+
+ // Paths.
+ {"git://host/path", "git://host/path", true},
+ {"git://host/p ath", "git://host/p%20ath", true},
+ {"git://host/a/../b", "git://host/b", true},
+ {"git://host/..", "git://host/", true},
+ {"git://host/../", "git://host/", true},
+ {"git://host/../..", "git://host/", true},
+ {"git://host/.", "git://host/", true},
+ {"git://host/./", "git://host/", true},
+ {"git://host/./.", "git://host/", true},
+ // Backslashes.
+ {"git://host/a\\..\\b", "git://host/a\\..\\b", true},
+
+ // IPv6.
+ {"git://[1:2:0:0:5:0:0:0]", "git://[1:2:0:0:5::]", true},
+ {"git://[1:2:0:0:5:0:0:0]/", "git://[1:2:0:0:5::]/", true},
+ {"git://[1:2:0:0:5:0:0:0]/path", "git://[1:2:0:0:5::]/path", true},
+
+ // IPv4 is unsupported.
+ {"git://127.00.0.1", "git://127.00.0.1", true},
+ {"git://127.1000.0.1", "git://127.1000.0.1", true},
+
+ // Invalid URLs.
+ {"git://@", "git://", false},
+ // Forbidden host code points.
+ {"git://<", "git://", false},
+ {"git:// /", "git:///", false},
+ // Backslashes cannot be used as host terminators.
+ {"git://host\\a/../b", "git://host/b", false},
+
+ // Opaque paths.
+ {"git:", "git:", true},
+ {"git:opaque", "git:opaque", true},
+ {"git:o p a q u e", "git:o p a q u e", true},
+ {"git: <", "git: <", true},
+ {"git:opaque/a/../b", "git:opaque/a/../b", true},
+ {"git:opaque\\a\\..\\b", "git:opaque\\a\\..\\b", true},
+ {"git:\\a", "git:\\a", true},
+ // Like URNs.
+ {"git:a:b:c:123", "git:a:b:c:123", true},
+ };
+
+ for (const auto& i : cases) {
+ SCOPED_TRACE(i.input);
+ Parsed parsed;
+ ParseNonSpecialURL(i.input.data(), i.input.size(), &parsed);
+ Parsed out_parsed;
+ std::string out_str;
+ StdStringCanonOutput output(&out_str);
+ bool success = CanonicalizeNonSpecialURL(
+ i.input.data(), i.input.size(), parsed,
+ /*query_converter=*/nullptr, output, out_parsed);
+ output.Complete();
+ EXPECT_EQ(success, i.expected_success);
+ EXPECT_EQ(out_str, i.expected);
+ }
+}
+
// The codepath here is the same as for regular canonicalization, so we just
// need to test that things are replaced or not correctly.
TEST(URLCanonTest, ReplaceStandardURL) {
diff --git a/url/url_parse_unittest.cc b/url/url_parse_unittest.cc
index 6edc3f9..8d8c83e3 100644
--- a/url/url_parse_unittest.cc
+++ b/url/url_parse_unittest.cc
@@ -686,6 +686,10 @@
nullptr},
{"git://ho\\st/", "git", nullptr, nullptr, "ho\\st", -1, "/", nullptr,
nullptr},
+ // Empty users
+ {"git://@host", "git", "", nullptr, "host", -1, nullptr, nullptr, nullptr},
+ // Empty user and invalid host. "git://@" is an invalid URL.
+ {"git://@", "git", "", nullptr, nullptr, -1, nullptr, nullptr, nullptr},
// Empty host cases
{"git://", "git", nullptr, nullptr, "", -1, nullptr, nullptr, nullptr},
{"git:///", "git", nullptr, nullptr, "", -1, "/", nullptr, nullptr},