Avi Drissman | 3e1a26c | 2022-09-15 20:26:03 | [diff] [blame] | 1 | // Copyright 2013 The Chromium Authors |
tfarina@chromium.org | 51bcc5d | 2013-04-24 01:41:37 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 4 | |
Hans Wennborg | 0e22368 | 2020-04-27 21:51:29 | [diff] [blame] | 5 | #include "base/check.h" |
Erik Chen | 0fcb876 | 2022-02-10 17:35:06 | [diff] [blame] | 6 | #include "base/cpu_reduction_experiment.h" |
tfarina@chromium.org | 318076b | 2013-04-18 21:19:45 | [diff] [blame] | 7 | #include "url/url_canon.h" |
| 8 | #include "url/url_canon_internal.h" |
Hayato Ito | 746863d | 2023-09-21 06:51:37 | [diff] [blame] | 9 | #include "url/url_features.h" |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 10 | |
vitalybuka@chromium.org | 0318f92 | 2014-04-22 00:09:23 | [diff] [blame] | 11 | namespace url { |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 12 | |
| 13 | namespace { |
| 14 | |
// Canonical form of every 7-bit character accepted in a host name. An entry
// of 0 means the character is not allowed; the sentinel kEsc means the
// character is allowed but must be percent-escaped in the output. Upper-case
// ASCII letters map to their lower-case form. At present, ' ' (SPACE) and
// '*' (asterisk) are still non-compliant to the URL Standard. See
// https://crbug.com/1416013 for details.
constexpr unsigned char kEsc = 0xff;
// clang-format off
constexpr unsigned char kHostCharLookup[128] = {
    // 0x00-0x0f: control characters, all invalid.
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    // 0x10-0x1f: control characters, all invalid.
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    // 0x20-0x2f:  ' ' ! " # $ % & ' ( ) * + , - . /
    kEsc, '!', '"', 0,   '$', 0,   '&', '\'', '(', ')', kEsc, '+', ',', '-', '.', 0,
    // 0x30-0x3f:  0 1 2 3 4 5 6 7 8 9 : ; < = > ?
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', 0,   '=', 0,   0,
    // 0x40-0x4f:  @ A B C D E F G H I J K L M N O
    0,   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50-0x5f:  P Q R S T U V W X Y Z [ \ ] ^ _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0,   ']', 0,   '_',
    // 0x60-0x6f:  ` a b c d e f g h i j k l m n o
    '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x70-0x7f:  p q r s t u v w x y z { | } ~ DEL
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', 0,   '}', '~', 0,
};
// clang-format on
| 39 | |
// Bit flag marking a forbidden host code point, as defined by
// https://url.spec.whatwg.org/#forbidden-host-code-point
const uint8_t kForbiddenHost = 0x1;

// TODO(crbug.com/1416006): Merge other lookup tables into this table. That can
// be probably done after https://crbug.com/1416013 is resolved.
//
// Per-character flag table for the 7-bit range. This table is currently only
// used for an opaque-host in non-special URLs.
// clang-format off
const uint8_t kHostCharacterTable[128] = {
    kForbiddenHost,  // 0x00 (NUL)
    0,               // 0x01
    0,               // 0x02
    0,               // 0x03
    0,               // 0x04
    0,               // 0x05
    0,               // 0x06
    0,               // 0x07
    0,               // 0x08
    kForbiddenHost,  // 0x09 (TAB)
    kForbiddenHost,  // 0x0A (LF)
    0,               // 0x0B
    0,               // 0x0C
    kForbiddenHost,  // 0x0D (CR)
    0,               // 0x0E
    0,               // 0x0F
    0,               // 0x10
    0,               // 0x11
    0,               // 0x12
    0,               // 0x13
    0,               // 0x14
    0,               // 0x15
    0,               // 0x16
    0,               // 0x17
    0,               // 0x18
    0,               // 0x19
    0,               // 0x1A
    0,               // 0x1B
    0,               // 0x1C
    0,               // 0x1D
    0,               // 0x1E
    0,               // 0x1F
    kForbiddenHost,  // ' '
    0,               // '!'
    0,               // '"'
    kForbiddenHost,  // '#'
    0,               // '$'
    0,               // '%'
    0,               // '&'
    0,               // '\''
    0,               // '('
    0,               // ')'
    0,               // '*'
    0,               // '+'
    0,               // ','
    0,               // '-'
    0,               // '.'
    kForbiddenHost,  // '/'
    0,               // '0'
    0,               // '1'
    0,               // '2'
    0,               // '3'
    0,               // '4'
    0,               // '5'
    0,               // '6'
    0,               // '7'
    0,               // '8'
    0,               // '9'
    kForbiddenHost,  // ':'
    0,               // ';'
    kForbiddenHost,  // '<'
    0,               // '='
    kForbiddenHost,  // '>'
    kForbiddenHost,  // '?'
    kForbiddenHost,  // '@'
    0,               // 'A'
    0,               // 'B'
    0,               // 'C'
    0,               // 'D'
    0,               // 'E'
    0,               // 'F'
    0,               // 'G'
    0,               // 'H'
    0,               // 'I'
    0,               // 'J'
    0,               // 'K'
    0,               // 'L'
    0,               // 'M'
    0,               // 'N'
    0,               // 'O'
    0,               // 'P'
    0,               // 'Q'
    0,               // 'R'
    0,               // 'S'
    0,               // 'T'
    0,               // 'U'
    0,               // 'V'
    0,               // 'W'
    0,               // 'X'
    0,               // 'Y'
    0,               // 'Z'
    kForbiddenHost,  // '['
    kForbiddenHost,  // '\\'
    kForbiddenHost,  // ']'
    kForbiddenHost,  // '^'
    0,               // '_'
    0,               // '`'
    0,               // 'a'
    0,               // 'b'
    0,               // 'c'
    0,               // 'd'
    0,               // 'e'
    0,               // 'f'
    0,               // 'g'
    0,               // 'h'
    0,               // 'i'
    0,               // 'j'
    0,               // 'k'
    0,               // 'l'
    0,               // 'm'
    0,               // 'n'
    0,               // 'o'
    0,               // 'p'
    0,               // 'q'
    0,               // 'r'
    0,               // 's'
    0,               // 't'
    0,               // 'u'
    0,               // 'v'
    0,               // 'w'
    0,               // 'x'
    0,               // 'y'
    0,               // 'z'
    0,               // '{'
    kForbiddenHost,  // '|'
    0,               // '}'
    0,               // '~'
    0,               // 0x7F (DEL)
};
// clang-format on

// Returns true if |ch| is a forbidden host code point.
//
// The parameter is deliberately wider than 8 bits: callers pass UTF-16 code
// units, and an 8-bit parameter would silently truncate them (e.g. U+0120
// would become 0x20 and be misreported as a forbidden SPACE). Anything above
// 0x7F is outside the table and is never forbidden.
bool IsForbiddenHostCodePoint(uint32_t ch) {
  return ch <= 0x7F && (kHostCharacterTable[ch] & kForbiddenHost) != 0;
}
| 182 | |
Ian Clelland | 02c6822 | 2018-05-18 20:47:50 | [diff] [blame] | 183 | // RFC1034 maximum FQDN length. |
Peter Kasting | cfdf32c | 2022-08-17 20:21:02 | [diff] [blame] | 184 | constexpr size_t kMaxHostLength = 253; |
Ian Clelland | 02c6822 | 2018-05-18 20:47:50 | [diff] [blame] | 185 | |
| 186 | // Generous padding to account for the fact that UTS#46 normalization can cause |
| 187 | // a long string to actually shrink and fit within the 253 character RFC1034 |
| 188 | // FQDN length limit. Note that this can still be too short for pathological |
| 189 | // cases: An arbitrary number of characters (e.g. U+00AD SOFT HYPHEN) can be |
| 190 | // removed from the input by UTS#46 processing. However, this should be |
| 191 | // sufficient for all normally-encountered, non-abusive hostname strings. |
Peter Kasting | cfdf32c | 2022-08-17 20:21:02 | [diff] [blame] | 192 | constexpr size_t kMaxHostBufferLength = kMaxHostLength * 5; |
Ian Clelland | 02c6822 | 2018-05-18 20:47:50 | [diff] [blame] | 193 | |
Peter Kasting | cfdf32c | 2022-08-17 20:21:02 | [diff] [blame] | 194 | constexpr size_t kTempHostBufferLen = 1024; |
| 195 | using StackBuffer = RawCanonOutputT<char, kTempHostBufferLen>; |
| 196 | using StackBufferW = RawCanonOutputT<char16_t, kTempHostBufferLen>; |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 197 | |
| 198 | // Scans a host name and fills in the output flags according to what we find. |
| 199 | // |has_non_ascii| will be true if there are any non-7-bit characters, and |
| 200 | // |has_escaped| will be true if there is a percent sign. |
| 201 | template<typename CHAR, typename UCHAR> |
vitalybuka@chromium.org | 0318f92 | 2014-04-22 00:09:23 | [diff] [blame] | 202 | void ScanHostname(const CHAR* spec, |
| 203 | const Component& host, |
| 204 | bool* has_non_ascii, |
| 205 | bool* has_escaped) { |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 206 | int end = host.end(); |
| 207 | *has_non_ascii = false; |
| 208 | *has_escaped = false; |
| 209 | for (int i = host.begin; i < end; i++) { |
| 210 | if (static_cast<UCHAR>(spec[i]) >= 0x80) |
| 211 | *has_non_ascii = true; |
| 212 | else if (spec[i] == '%') |
| 213 | *has_escaped = true; |
| 214 | } |
| 215 | } |
| 216 | |
| 217 | // Canonicalizes a host name that is entirely 8-bit characters (even though |
| 218 | // the type holding them may be 16 bits. Escaped characters will be unescaped. |
| 219 | // Non-7-bit characters (for example, UTF-8) will be passed unchanged. |
| 220 | // |
| 221 | // The |*has_non_ascii| flag will be true if there are non-7-bit characters in |
| 222 | // the output. |
| 223 | // |
| 224 | // This function is used in two situations: |
| 225 | // |
| 226 | // * When the caller knows there is no non-ASCII or percent escaped |
| 227 | // characters. This is what DoHost does. The result will be a completely |
| 228 | // canonicalized host since we know nothing weird can happen (escaped |
| 229 | // characters could be unescaped to non-7-bit, so they have to be treated |
| 230 | // with suspicion at this point). It does not use the |has_non_ascii| flag. |
| 231 | // |
| 232 | // * When the caller has an 8-bit string that may need unescaping. |
| 233 | // DoComplexHost calls us this situation to do unescaping and validation. |
| 234 | // After this, it may do other IDN operations depending on the value of the |
| 235 | // |*has_non_ascii| flag. |
| 236 | // |
| 237 | // The return value indicates if the output is a potentially valid host name. |
Peter Kasting | 8bb45c2 | 2022-06-16 19:39:27 | [diff] [blame] | 238 | template <typename INCHAR, typename OUTCHAR> |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 239 | bool DoSimpleHost(const INCHAR* host, |
Peter Kasting | 8bb45c2 | 2022-06-16 19:39:27 | [diff] [blame] | 240 | size_t host_len, |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 241 | CanonOutputT<OUTCHAR>* output, |
| 242 | bool* has_non_ascii) { |
| 243 | *has_non_ascii = false; |
| 244 | |
| 245 | bool success = true; |
Peter Kasting | 8bb45c2 | 2022-06-16 19:39:27 | [diff] [blame] | 246 | for (size_t i = 0; i < host_len; ++i) { |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 247 | unsigned int source = host[i]; |
| 248 | if (source == '%') { |
| 249 | // Unescape first, if possible. |
| 250 | // Source will be used only if decode operation was successful. |
| 251 | if (!DecodeEscaped(host, &i, host_len, |
| 252 | reinterpret_cast<unsigned char*>(&source))) { |
| 253 | // Invalid escaped character. There is nothing that can make this |
| 254 | // host valid. We append an escaped percent so the URL looks reasonable |
| 255 | // and mark as failed. |
| 256 | AppendEscapedChar('%', output); |
| 257 | success = false; |
| 258 | continue; |
| 259 | } |
| 260 | } |
| 261 | |
| 262 | if (source < 0x80) { |
| 263 | // We have ASCII input, we can use our lookup table. |
Hayato Ito | b961a56 | 2023-12-06 05:59:34 | [diff] [blame] | 264 | unsigned char replacement = kHostCharLookup[source]; |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 265 | if (!replacement) { |
| 266 | // Invalid character, add it as percent-escaped and mark as failed. |
| 267 | AppendEscapedChar(source, output); |
| 268 | success = false; |
| 269 | } else if (replacement == kEsc) { |
| 270 | // This character is valid but should be escaped. |
| 271 | AppendEscapedChar(source, output); |
| 272 | } else { |
| 273 | // Common case, the given character is valid in a hostname, the lookup |
| 274 | // table tells us the canonical representation of that character (lower |
| 275 | // cased). |
| 276 | output->push_back(replacement); |
| 277 | } |
| 278 | } else { |
| 279 | // It's a non-ascii char. Just push it to the output. |
| 280 | // In case where we have char16 input, and char output it's safe to |
| 281 | // cast char16->char only if input string was converted to ASCII. |
| 282 | output->push_back(static_cast<OUTCHAR>(source)); |
| 283 | *has_non_ascii = true; |
| 284 | } |
| 285 | } |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 286 | return success; |
| 287 | } |
| 288 | |
| 289 | // Canonicalizes a host that requires IDN conversion. Returns true on success |
Peter Kasting | 8bb45c2 | 2022-06-16 19:39:27 | [diff] [blame] | 290 | bool DoIDNHost(const char16_t* src, size_t src_len, CanonOutput* output) { |
brettw | 1141951b | 2015-11-26 00:29:35 | [diff] [blame] | 291 | int original_output_len = output->length(); // So we can rewind below. |
| 292 | |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 293 | // We need to escape URL before doing IDN conversion, since punicode strings |
| 294 | // cannot be escaped after they are created. |
| 295 | RawCanonOutputW<kTempHostBufferLen> url_escaped_host; |
| 296 | bool has_non_ascii; |
| 297 | DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii); |
Ian Clelland | 02c6822 | 2018-05-18 20:47:50 | [diff] [blame] | 298 | if (url_escaped_host.length() > kMaxHostBufferLength) { |
| 299 | AppendInvalidNarrowString(src, 0, src_len, output); |
| 300 | return false; |
| 301 | } |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 302 | |
| 303 | StackBufferW wide_output; |
David Benjamin | cc4d2b2 | 2023-10-02 23:12:04 | [diff] [blame] | 304 | if (!IDNToASCII(url_escaped_host.view(), &wide_output)) { |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 305 | // Some error, give up. This will write some reasonable looking |
| 306 | // representation of the string to the output. |
| 307 | AppendInvalidNarrowString(src, 0, src_len, output); |
| 308 | return false; |
| 309 | } |
| 310 | |
| 311 | // Now we check the ASCII output like a normal host. It will also handle |
| 312 | // unescaping. Although we unescaped everything before this function call, if |
| 313 | // somebody does %00 as fullwidth, ICU will convert this to ASCII. |
Peter Kasting | cfdf32c | 2022-08-17 20:21:02 | [diff] [blame] | 314 | bool success = DoSimpleHost(wide_output.data(), wide_output.length(), output, |
Peter Kasting | 8bb45c2 | 2022-06-16 19:39:27 | [diff] [blame] | 315 | &has_non_ascii); |
brettw | 1141951b | 2015-11-26 00:29:35 | [diff] [blame] | 316 | if (has_non_ascii) { |
| 317 | // ICU generated something that DoSimpleHost didn't think looked like |
| 318 | // ASCII. This is quite rare, but ICU might convert some characters to |
| 319 | // percent signs which might generate new escape sequences which might in |
| 320 | // turn be invalid. An example is U+FE6A "small percent" which ICU will |
| 321 | // name prep into an ASCII percent and then we can interpret the following |
| 322 | // characters as escaped characters. |
| 323 | // |
| 324 | // If DoSimpleHost didn't think the output was ASCII, just escape the |
| 325 | // thing we gave ICU and give up. DoSimpleHost will have handled a further |
| 326 | // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates |
| 327 | // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't |
| 328 | // do more (like handle escaped non-ASCII sequences). Handling the escaped |
| 329 | // ASCII isn't strictly necessary, but DoSimpleHost handles this case |
| 330 | // anyway so we handle it/ |
| 331 | output->set_length(original_output_len); |
Peter Kasting | cfdf32c | 2022-08-17 20:21:02 | [diff] [blame] | 332 | AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(), |
brettw | 1141951b | 2015-11-26 00:29:35 | [diff] [blame] | 333 | output); |
| 334 | return false; |
| 335 | } |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 336 | return success; |
| 337 | } |
| 338 | |
| 339 | // 8-bit convert host to its ASCII version: this converts the UTF-8 input to |
| 340 | // UTF-16. The has_escaped flag should be set if the input string requires |
| 341 | // unescaping. |
Peter Kasting | 8bb45c2 | 2022-06-16 19:39:27 | [diff] [blame] | 342 | bool DoComplexHost(const char* host, |
| 343 | size_t host_len, |
| 344 | bool has_non_ascii, |
| 345 | bool has_escaped, |
| 346 | CanonOutput* output) { |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 347 | // Save the current position in the output. We may write stuff and rewind it |
| 348 | // below, so we need to know where to rewind to. |
Peter Kasting | cfdf32c | 2022-08-17 20:21:02 | [diff] [blame] | 349 | size_t begin_length = output->length(); |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 350 | |
| 351 | // Points to the UTF-8 data we want to convert. This will either be the |
| 352 | // input or the unescaped version written to |*output| if necessary. |
| 353 | const char* utf8_source; |
Peter Kasting | 8bb45c2 | 2022-06-16 19:39:27 | [diff] [blame] | 354 | size_t utf8_source_len; |
Nico Weber | 465eee6 | 2021-03-18 07:33:52 | [diff] [blame] | 355 | bool are_all_escaped_valid = true; |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 356 | if (has_escaped) { |
| 357 | // Unescape before converting to UTF-16 for IDN. We write this into the |
| 358 | // output because it most likely does not require IDNization, and we can |
| 359 | // save another huge stack buffer. It will be replaced below if it requires |
| 360 | // IDN. This will also update our non-ASCII flag so we know whether the |
| 361 | // unescaped input requires IDN. |
| 362 | if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) { |
| 363 | // Error with some escape sequence. We'll call the current output |
Nico Weber | 465eee6 | 2021-03-18 07:33:52 | [diff] [blame] | 364 | // complete. DoSimpleHost will have written some "reasonable" output |
| 365 | // for the invalid escapes, but the output could be non-ASCII and |
| 366 | // needs to go through re-encoding below. |
| 367 | are_all_escaped_valid = false; |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 368 | } |
| 369 | |
| 370 | // Unescaping may have left us with ASCII input, in which case the |
| 371 | // unescaped version we wrote to output is complete. |
| 372 | if (!has_non_ascii) { |
Nico Weber | 465eee6 | 2021-03-18 07:33:52 | [diff] [blame] | 373 | return are_all_escaped_valid; |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 374 | } |
| 375 | |
| 376 | // Save the pointer into the data was just converted (it may be appended to |
| 377 | // other data in the output buffer). |
| 378 | utf8_source = &output->data()[begin_length]; |
Peter Kasting | cfdf32c | 2022-08-17 20:21:02 | [diff] [blame] | 379 | utf8_source_len = output->length() - begin_length; |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 380 | } else { |
| 381 | // We don't need to unescape, use input for IDNization later. (We know the |
| 382 | // input has non-ASCII, or the simple version would have been called |
| 383 | // instead of us.) |
| 384 | utf8_source = host; |
| 385 | utf8_source_len = host_len; |
| 386 | } |
| 387 | |
| 388 | // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion. |
| 389 | // Above, we may have used the output to write the unescaped values to, so |
| 390 | // we have to rewind it to where we started after we convert it to UTF-16. |
| 391 | StackBufferW utf16; |
| 392 | if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) { |
| 393 | // In this error case, the input may or may not be the output. |
| 394 | StackBuffer utf8; |
Peter Kasting | 8bb45c2 | 2022-06-16 19:39:27 | [diff] [blame] | 395 | for (size_t i = 0; i < utf8_source_len; i++) |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 396 | utf8.push_back(utf8_source[i]); |
| 397 | output->set_length(begin_length); |
Peter Kasting | cfdf32c | 2022-08-17 20:21:02 | [diff] [blame] | 398 | AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output); |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 399 | return false; |
| 400 | } |
| 401 | output->set_length(begin_length); |
| 402 | |
| 403 | // This will call DoSimpleHost which will do normal ASCII canonicalization |
| 404 | // and also check for IP addresses in the outpt. |
Peter Kasting | cfdf32c | 2022-08-17 20:21:02 | [diff] [blame] | 405 | return DoIDNHost(utf16.data(), utf16.length(), output) && |
Nico Weber | 465eee6 | 2021-03-18 07:33:52 | [diff] [blame] | 406 | are_all_escaped_valid; |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 407 | } |
| 408 | |
| 409 | // UTF-16 convert host to its ASCII version. The set up is already ready for |
| 410 | // the backend, so we just pass through. The has_escaped flag should be set if |
| 411 | // the input string requires unescaping. |
Jan Wilken Dörrie | 5aad5c2 | 2021-03-08 21:44:12 | [diff] [blame] | 412 | bool DoComplexHost(const char16_t* host, |
Peter Kasting | 8bb45c2 | 2022-06-16 19:39:27 | [diff] [blame] | 413 | size_t host_len, |
Jan Wilken Dörrie | 5aad5c2 | 2021-03-08 21:44:12 | [diff] [blame] | 414 | bool has_non_ascii, |
| 415 | bool has_escaped, |
| 416 | CanonOutput* output) { |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 417 | if (has_escaped) { |
| 418 | // Yikes, we have escaped characters with wide input. The escaped |
| 419 | // characters should be interpreted as UTF-8. To solve this problem, |
| 420 | // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN. |
| 421 | // |
| 422 | // We don't bother to optimize the conversion in the ASCII case (which |
| 423 | // *could* just be a copy) and use the UTF-8 path, because it should be |
| 424 | // very rare that host names have escaped characters, and it is relatively |
| 425 | // fast to do the conversion anyway. |
| 426 | StackBuffer utf8; |
| 427 | if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) { |
| 428 | AppendInvalidNarrowString(host, 0, host_len, output); |
| 429 | return false; |
| 430 | } |
| 431 | |
| 432 | // Once we convert to UTF-8, we can use the 8-bit version of the complex |
| 433 | // host handling code above. |
Peter Kasting | cfdf32c | 2022-08-17 20:21:02 | [diff] [blame] | 434 | return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii, has_escaped, |
| 435 | output); |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 436 | } |
| 437 | |
| 438 | // No unescaping necessary, we can safely pass the input to ICU. This |
| 439 | // function will only get called if we either have escaped or non-ascii |
| 440 | // input, so it's safe to just use ICU now. Even if the input is ASCII, |
| 441 | // this function will do the right thing (just slower than we could). |
| 442 | return DoIDNHost(host, host_len, output); |
| 443 | } |
| 444 | |
brettw | 5a36380ef | 2016-10-27 19:51:56 | [diff] [blame] | 445 | template <typename CHAR, typename UCHAR> |
| 446 | bool DoHostSubstring(const CHAR* spec, |
| 447 | const Component& host, |
| 448 | CanonOutput* output) { |
Peter Kasting | 8bb45c2 | 2022-06-16 19:39:27 | [diff] [blame] | 449 | DCHECK(host.is_valid()); |
| 450 | |
brettw | 5a36380ef | 2016-10-27 19:51:56 | [diff] [blame] | 451 | bool has_non_ascii, has_escaped; |
| 452 | ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped); |
| 453 | |
| 454 | if (has_non_ascii || has_escaped) { |
Peter Kasting | 8bb45c2 | 2022-06-16 19:39:27 | [diff] [blame] | 455 | return DoComplexHost(&spec[host.begin], static_cast<size_t>(host.len), |
| 456 | has_non_ascii, has_escaped, output); |
brettw | 5a36380ef | 2016-10-27 19:51:56 | [diff] [blame] | 457 | } |
| 458 | |
Peter Kasting | 8bb45c2 | 2022-06-16 19:39:27 | [diff] [blame] | 459 | const bool success = DoSimpleHost( |
| 460 | &spec[host.begin], static_cast<size_t>(host.len), output, &has_non_ascii); |
brettw | 5a36380ef | 2016-10-27 19:51:56 | [diff] [blame] | 461 | DCHECK(!has_non_ascii); |
| 462 | return success; |
| 463 | } |
| 464 | |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 465 | template <typename CharT> |
| 466 | bool DoOpaqueHost(const std::basic_string_view<CharT> host, |
| 467 | CanonOutput& output) { |
| 468 | // URL Standard: https://url.spec.whatwg.org/#concept-opaque-host-parser |
| 469 | |
| 470 | size_t host_len = host.size(); |
| 471 | |
| 472 | for (size_t i = 0; i < host_len; ++i) { |
| 473 | char16_t ch = host[i]; |
| 474 | // The characters '[', ':', and ']', are checked later in |
| 475 | // `CanonicalizeIPv6Address` function. |
| 476 | if (ch != '[' && ch != ']' && ch != ':' && IsForbiddenHostCodePoint(ch)) { |
| 477 | return false; |
| 478 | } |
| 479 | |
| 480 | // Implementation note: |
| 481 | // |
| 482 | // URL Standard: Step 3 in |
| 483 | // https://url.spec.whatwg.org/#concept-opaque-host-parser |
| 484 | // |
| 485 | // > 3. If input contains a U+0025 (%) and the two code points following |
| 486 | // > it are not ASCII hex digits, invalid-URL-unit validation error. |
| 487 | // |
| 488 | // `invalid-URL-unit` is NOT marked as failure. We don't need to consider |
| 489 | // step 3 here. |
| 490 | |
| 491 | // URL Standard: Step 4 in |
| 492 | // https://url.spec.whatwg.org/#concept-opaque-host-parser |
| 493 | // |
| 494 | // > 4. Return the result of running UTF-8 percent-encode on input using |
| 495 | // > the C0 control percent-encode set. |
| 496 | if (IsInC0ControlPercentEncodeSet(ch)) { |
| 497 | AppendUTF8EscapedChar(host.data(), &i, host_len, &output); |
| 498 | } else { |
| 499 | output.push_back(ch); |
| 500 | } |
| 501 | } |
| 502 | return true; |
| 503 | } |
| 504 | |
| 505 | template <typename CHAR, typename UCHAR, CanonMode canon_mode> |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 506 | void DoHost(const CHAR* spec, |
vitalybuka@chromium.org | 0318f92 | 2014-04-22 00:09:23 | [diff] [blame] | 507 | const Component& host, |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 508 | CanonOutput& output, |
| 509 | CanonHostInfo& host_info) { |
| 510 | // URL Standard: https://url.spec.whatwg.org/#host-parsing |
| 511 | |
Hayato Ito | bbdc90b | 2024-01-31 02:57:28 | [diff] [blame] | 512 | // Keep track of output's initial length, so we can rewind later. |
| 513 | const int output_begin = output.length(); |
| 514 | |
Tom Sepez | 40fbf43e | 2022-11-15 00:11:03 | [diff] [blame] | 515 | if (host.is_empty()) { |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 516 | // Empty hosts don't need anything. |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 517 | host_info.family = CanonHostInfo::NEUTRAL; |
Hayato Ito | bbdc90b | 2024-01-31 02:57:28 | [diff] [blame] | 518 | // Carry over the valid empty host for non-special URLs. |
| 519 | // |
| 520 | // Component(0, 0) should be considered invalid here for historical reasons. |
| 521 | // |
| 522 | // TODO(crbug.com/1416006): Update the callers so that they don't pass |
| 523 | // Component(0, 0) as an invalid `host`. |
| 524 | if (host.begin != 0 && host.len == 0) { |
| 525 | host_info.out_host = Component(output_begin, 0); |
| 526 | } else { |
| 527 | host_info.out_host = Component(); |
| 528 | } |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 529 | return; |
| 530 | } |
| 531 | |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 532 | bool success; |
| 533 | if constexpr (canon_mode == CanonMode::kSpecialURL) { |
| 534 | success = DoHostSubstring<CHAR, UCHAR>(spec, host, &output); |
| 535 | } else { |
| 536 | // URL Standard: https://url.spec.whatwg.org/#concept-opaque-host-parser |
| 537 | success = DoOpaqueHost(host.as_string_view_on(spec), output); |
| 538 | } |
| 539 | |
| 540 | if (success) { |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 541 | // After all the other canonicalization, check if we ended up with an IP |
qyearsley | 2bc727d | 2015-08-14 20:17:15 | [diff] [blame] | 542 | // address. IP addresses are small, so writing into this temporary buffer |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 543 | // should not cause an allocation. |
| 544 | RawCanonOutput<64> canon_ip; |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 545 | |
| 546 | if constexpr (canon_mode == CanonMode::kSpecialURL) { |
| 547 | CanonicalizeIPAddress(output.data(), |
| 548 | MakeRange(output_begin, output.length()), &canon_ip, |
| 549 | &host_info); |
| 550 | } else { |
| 551 | // Non-special URLs support only IPv6. |
| 552 | CanonicalizeIPv6Address(output.data(), |
| 553 | MakeRange(output_begin, output.length()), |
| 554 | canon_ip, host_info); |
| 555 | } |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 556 | |
| 557 | // If we got an IPv4/IPv6 address, copy the canonical form back to the |
qyearsley | 2bc727d | 2015-08-14 20:17:15 | [diff] [blame] | 558 | // real buffer. Otherwise, it's a hostname or broken IP, in which case |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 559 | // we just leave it in place. |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 560 | if (host_info.IsIPAddress()) { |
| 561 | output.set_length(output_begin); |
| 562 | output.Append(canon_ip.view()); |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 563 | } |
brettw | 5a36380ef | 2016-10-27 19:51:56 | [diff] [blame] | 564 | } else { |
| 565 | // Canonicalization failed. Set BROKEN to notify the caller. |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 566 | host_info.family = CanonHostInfo::BROKEN; |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 567 | } |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 568 | host_info.out_host = MakeRange(output_begin, output.length()); |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 569 | } |
| 570 | |
| 571 | } // namespace |
| 572 | |
| 573 | bool CanonicalizeHost(const char* spec, |
vitalybuka@chromium.org | 0318f92 | 2014-04-22 00:09:23 | [diff] [blame] | 574 | const Component& host, |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 575 | CanonOutput* output, |
vitalybuka@chromium.org | 0318f92 | 2014-04-22 00:09:23 | [diff] [blame] | 576 | Component* out_host) { |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 577 | DCHECK(output); |
| 578 | DCHECK(out_host); |
| 579 | return CanonicalizeSpecialHost(spec, host, *output, *out_host); |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 580 | } |
| 581 | |
Jan Wilken Dörrie | 5aad5c2 | 2021-03-08 21:44:12 | [diff] [blame] | 582 | bool CanonicalizeHost(const char16_t* spec, |
vitalybuka@chromium.org | 0318f92 | 2014-04-22 00:09:23 | [diff] [blame] | 583 | const Component& host, |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 584 | CanonOutput* output, |
vitalybuka@chromium.org | 0318f92 | 2014-04-22 00:09:23 | [diff] [blame] | 585 | Component* out_host) { |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 586 | DCHECK(output); |
| 587 | DCHECK(out_host); |
| 588 | return CanonicalizeSpecialHost(spec, host, *output, *out_host); |
| 589 | } |
| 590 | |
| 591 | bool CanonicalizeSpecialHost(const char* spec, |
| 592 | const Component& host, |
| 593 | CanonOutput& output, |
| 594 | Component& out_host) { |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 595 | CanonHostInfo host_info; |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 596 | DoHost<char, unsigned char, CanonMode::kSpecialURL>(spec, host, output, |
| 597 | host_info); |
| 598 | out_host = host_info.out_host; |
| 599 | return (host_info.family != CanonHostInfo::BROKEN); |
| 600 | } |
| 601 | |
| 602 | bool CanonicalizeSpecialHost(const char16_t* spec, |
| 603 | const Component& host, |
| 604 | CanonOutput& output, |
| 605 | Component& out_host) { |
| 606 | CanonHostInfo host_info; |
| 607 | DoHost<char16_t, char16_t, CanonMode::kSpecialURL>(spec, host, output, |
| 608 | host_info); |
| 609 | out_host = host_info.out_host; |
| 610 | return (host_info.family != CanonHostInfo::BROKEN); |
| 611 | } |
| 612 | |
| 613 | bool CanonicalizeNonSpecialHost(const char* spec, |
| 614 | const Component& host, |
| 615 | CanonOutput& output, |
| 616 | Component& out_host) { |
| 617 | CanonHostInfo host_info; |
| 618 | DoHost<char, unsigned char, CanonMode::kNonSpecialURL>(spec, host, output, |
| 619 | host_info); |
| 620 | out_host = host_info.out_host; |
| 621 | return (host_info.family != CanonHostInfo::BROKEN); |
| 622 | } |
| 623 | |
| 624 | bool CanonicalizeNonSpecialHost(const char16_t* spec, |
| 625 | const Component& host, |
| 626 | CanonOutput& output, |
| 627 | Component& out_host) { |
| 628 | CanonHostInfo host_info; |
| 629 | DoHost<char16_t, char16_t, CanonMode::kNonSpecialURL>(spec, host, output, |
| 630 | host_info); |
| 631 | out_host = host_info.out_host; |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 632 | return (host_info.family != CanonHostInfo::BROKEN); |
| 633 | } |
| 634 | |
| 635 | void CanonicalizeHostVerbose(const char* spec, |
vitalybuka@chromium.org | 0318f92 | 2014-04-22 00:09:23 | [diff] [blame] | 636 | const Component& host, |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 637 | CanonOutput* output, |
vitalybuka@chromium.org | 0318f92 | 2014-04-22 00:09:23 | [diff] [blame] | 638 | CanonHostInfo* host_info) { |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 639 | DCHECK(output); |
| 640 | DCHECK(host_info); |
| 641 | CanonicalizeSpecialHostVerbose(spec, host, *output, *host_info); |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 642 | } |
| 643 | |
Jan Wilken Dörrie | 5aad5c2 | 2021-03-08 21:44:12 | [diff] [blame] | 644 | void CanonicalizeHostVerbose(const char16_t* spec, |
vitalybuka@chromium.org | 0318f92 | 2014-04-22 00:09:23 | [diff] [blame] | 645 | const Component& host, |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 646 | CanonOutput* output, |
vitalybuka@chromium.org | 0318f92 | 2014-04-22 00:09:23 | [diff] [blame] | 647 | CanonHostInfo* host_info) { |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 648 | DCHECK(output); |
| 649 | DCHECK(host_info); |
| 650 | CanonicalizeSpecialHostVerbose(spec, host, *output, *host_info); |
| 651 | } |
| 652 | |
| 653 | void CanonicalizeSpecialHostVerbose(const char* spec, |
| 654 | const Component& host, |
| 655 | CanonOutput& output, |
| 656 | CanonHostInfo& host_info) { |
| 657 | DoHost<char, unsigned char, CanonMode::kSpecialURL>(spec, host, output, |
| 658 | host_info); |
| 659 | } |
| 660 | |
| 661 | void CanonicalizeSpecialHostVerbose(const char16_t* spec, |
| 662 | const Component& host, |
| 663 | CanonOutput& output, |
| 664 | CanonHostInfo& host_info) { |
| 665 | DoHost<char16_t, char16_t, CanonMode::kSpecialURL>(spec, host, output, |
| 666 | host_info); |
brettw@chromium.org | e7bba5f | 2013-04-10 20:10:52 | [diff] [blame] | 667 | } |
| 668 | |
brettw | 5a36380ef | 2016-10-27 19:51:56 | [diff] [blame] | 669 | bool CanonicalizeHostSubstring(const char* spec, |
| 670 | const Component& host, |
| 671 | CanonOutput* output) { |
| 672 | return DoHostSubstring<char, unsigned char>(spec, host, output); |
| 673 | } |
| 674 | |
Jan Wilken Dörrie | 5aad5c2 | 2021-03-08 21:44:12 | [diff] [blame] | 675 | bool CanonicalizeHostSubstring(const char16_t* spec, |
brettw | 5a36380ef | 2016-10-27 19:51:56 | [diff] [blame] | 676 | const Component& host, |
| 677 | CanonOutput* output) { |
Jan Wilken Dörrie | 5aad5c2 | 2021-03-08 21:44:12 | [diff] [blame] | 678 | return DoHostSubstring<char16_t, char16_t>(spec, host, output); |
brettw | 5a36380ef | 2016-10-27 19:51:56 | [diff] [blame] | 679 | } |
| 680 | |
Hayato Ito | 203c302 | 2023-12-11 04:58:57 | [diff] [blame] | 681 | void CanonicalizeNonSpecialHostVerbose(const char* spec, |
| 682 | const Component& host, |
| 683 | CanonOutput& output, |
| 684 | CanonHostInfo& host_info) { |
| 685 | DoHost<char, unsigned char, CanonMode::kNonSpecialURL>(spec, host, output, |
| 686 | host_info); |
| 687 | } |
| 688 | |
| 689 | void CanonicalizeNonSpecialHostVerbose(const char16_t* spec, |
| 690 | const Component& host, |
| 691 | CanonOutput& output, |
| 692 | CanonHostInfo& host_info) { |
| 693 | DoHost<char16_t, char16_t, CanonMode::kNonSpecialURL>(spec, host, output, |
| 694 | host_info); |
| 695 | } |
| 696 | |
vitalybuka@chromium.org | 0318f92 | 2014-04-22 00:09:23 | [diff] [blame] | 697 | } // namespace url |