| // Copyright 2023 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // Functions to canonicalize non-special URLs. |
| |
| #include "url/url_canon.h" |
| #include "url/url_canon_internal.h" |
| |
| namespace url { |
| |
| namespace { |
| |
| template <typename CHAR> |
| bool DoCanonicalizeNonSpecialURL(const URLComponentSource<CHAR>& source, |
| const Parsed& parsed, |
| CharsetConverter* query_converter, |
| CanonOutput& output, |
| Parsed& new_parsed) { |
| // The implementation is similar to `DoCanonicalizeStandardURL()`, but there |
| // are many subtle differences. So we have a different function for |
| // canonicalizing non-special URLs. |
| // |
| // Since canonicalization is also used from url::ReplaceComponents(), |
| // we have to handle an invalid URL replacement here, such as: |
| // |
| // > const url = "git:///"; |
| // > url.username = "x"; |
| // > url.href |
| // "git:///" (this should not be "git://x@"). |
| |
| DCHECK(!parsed.has_opaque_path); |
| |
| // Scheme: this will append the colon. |
| bool success = CanonicalizeScheme(source.scheme, parsed.scheme, &output, |
| &new_parsed.scheme); |
| bool have_authority = |
| (parsed.username.is_valid() || parsed.password.is_valid() || |
| parsed.host.is_valid() || parsed.port.is_valid()); |
| |
| // Non-special URL examples which should be carefully handled: |
| // |
| // | URL | parsed.user | parsed.host | have_authority | Valid URL? | |
| // |----------+---------------+---------------+----------------+------------| |
| // | git:/a | invalid | invalid | false | valid | |
| // | git://@/ | valid (empty) | invalid | true | invalid | |
| // | git:/// | invalid | valid (empty) | true | valid | |
| |
| if (have_authority) { |
| // Only write the authority separators when we have a scheme. |
| if (parsed.scheme.is_valid()) { |
| output.push_back('/'); |
| output.push_back('/'); |
| } |
| |
| // Username and Password |
| // |
| // URL Standard: |
| // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port |
| // - https://url.spec.whatwg.org/#dom-url-username |
| // - https://url.spec.whatwg.org/#dom-url-password |
| if (parsed.host.is_nonempty()) { |
| // User info: the canonicalizer will handle the : and @. |
| success &= CanonicalizeUserInfo( |
| source.username, parsed.username, source.password, parsed.password, |
| &output, &new_parsed.username, &new_parsed.password); |
| } else { |
| new_parsed.username.reset(); |
| new_parsed.password.reset(); |
| } |
| |
| // Host |
| if (parsed.host.is_valid()) { |
| success &= CanonicalizeNonSpecialHost(source.host, parsed.host, output, |
| new_parsed.host); |
| } else { |
| new_parsed.host.reset(); |
| // URL is invalid if `have_authority` is true, but `parsed.host` is |
| // invalid. Example: "git://@/". |
| success = false; |
| } |
| |
| // Port |
| // |
| // URL Standard: |
| // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port |
| // - https://url.spec.whatwg.org/#dom-url-port |
| if (parsed.host.is_nonempty()) { |
| success &= CanonicalizePort(source.port, parsed.port, PORT_UNSPECIFIED, |
| &output, &new_parsed.port); |
| } else { |
| new_parsed.port.reset(); |
| } |
| } else { |
| // No authority, clear the components. |
| new_parsed.host.reset(); |
| new_parsed.username.reset(); |
| new_parsed.password.reset(); |
| new_parsed.port.reset(); |
| } |
| |
| // Path |
| if (parsed.path.is_valid()) { |
| success &= |
| CanonicalizePath(source.path, parsed.path, CanonMode::kNonSpecialURL, |
| &output, &new_parsed.path); |
| } else { |
| new_parsed.path.reset(); |
| } |
| |
| // Query |
| CanonicalizeQuery(source.query, parsed.query, query_converter, &output, |
| &new_parsed.query); |
| |
| // Ref: ignore failure for this, since the page can probably still be loaded. |
| CanonicalizeRef(source.ref, parsed.ref, &output, &new_parsed.ref); |
| |
| // Carry over the flag for potentially dangling markup: |
| if (parsed.potentially_dangling_markup) { |
| new_parsed.potentially_dangling_markup = true; |
| } |
| |
| return success; |
| } |
| |
| } // namespace |
| |
| bool CanonicalizeNonSpecialURL(const char* spec, |
| int spec_len, |
| const Parsed& parsed, |
| CharsetConverter* query_converter, |
| CanonOutput& output, |
| Parsed& new_parsed) { |
| // Carry over the flag. |
| new_parsed.has_opaque_path = parsed.has_opaque_path; |
| |
| if (parsed.has_opaque_path) { |
| return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed); |
| } |
| return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed, |
| query_converter, output, new_parsed); |
| } |
| |
| bool CanonicalizeNonSpecialURL(const char16_t* spec, |
| int spec_len, |
| const Parsed& parsed, |
| CharsetConverter* query_converter, |
| CanonOutput& output, |
| Parsed& new_parsed) { |
| // Carry over the flag. |
| new_parsed.has_opaque_path = parsed.has_opaque_path; |
| |
| if (parsed.has_opaque_path) { |
| return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed); |
| } |
| return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed, |
| query_converter, output, new_parsed); |
| } |
| |
| bool ReplaceNonSpecialURL(const char* base, |
| const Parsed& base_parsed, |
| const Replacements<char>& replacements, |
| CharsetConverter* query_converter, |
| CanonOutput& output, |
| Parsed& new_parsed) { |
| if (base_parsed.has_opaque_path) { |
| return ReplacePathURL(base, base_parsed, replacements, &output, |
| &new_parsed); |
| } |
| |
| URLComponentSource<char> source(base); |
| Parsed parsed(base_parsed); |
| SetupOverrideComponents(base, replacements, &source, &parsed); |
| return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output, |
| new_parsed); |
| } |
| |
| // For 16-bit replacements, we turn all the replacements into UTF-8 so the |
| // regular code path can be used. |
| bool ReplaceNonSpecialURL(const char* base, |
| const Parsed& base_parsed, |
| const Replacements<char16_t>& replacements, |
| CharsetConverter* query_converter, |
| CanonOutput& output, |
| Parsed& new_parsed) { |
| if (base_parsed.has_opaque_path) { |
| return ReplacePathURL(base, base_parsed, replacements, &output, |
| &new_parsed); |
| } |
| |
| RawCanonOutput<1024> utf8; |
| URLComponentSource<char> source(base); |
| Parsed parsed(base_parsed); |
| SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); |
| return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output, |
| new_parsed); |
| } |
| |
| } // namespace url |