[go: nahoru, domu]

blob: 21524907383ca0f0e1588e1f4b6d5ed3cad38b9f [file] [log] [blame]
Avi Drissman3e1a26c2022-09-15 20:26:031// Copyright 2013 The Chromium Authors
tfarina@chromium.org51bcc5d2013-04-24 01:41:372// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
brettw@chromium.orge7bba5f2013-04-10 20:10:524
Hans Wennborg0e223682020-04-27 21:51:295#include "base/check.h"
Erik Chen0fcb8762022-02-10 17:35:066#include "base/cpu_reduction_experiment.h"
tfarina@chromium.org318076b2013-04-18 21:19:457#include "url/url_canon.h"
8#include "url/url_canon_internal.h"
Hayato Ito746863d2023-09-21 06:51:379#include "url/url_features.h"
brettw@chromium.orge7bba5f2013-04-10 20:10:5210
vitalybuka@chromium.org0318f922014-04-22 00:09:2311namespace url {
brettw@chromium.orge7bba5f2013-04-10 20:10:5212
13namespace {
14
// Canonicalization lookup table for the 7-bit ASCII range of a host name.
// Every entry is one of:
//   * 0     - the character is disallowed in the input,
//   * kEsc  - magic value: the character should be percent-escaped,
//   * other - the canonical version of the character (upper-case ASCII
//             letters map to lower case; all other allowed characters map to
//             themselves).
//
// At present, the entries for ' ' (SPACE) and '*' (asterisk) are still
// non-compliant to the URL Standard. See https://crbug.com/1416013 for
// details.
const unsigned char kEsc = 0xff;
// clang-format off
const unsigned char kHostCharLookup[0x80] = {
    // 0x00-0x0f: all invalid.
    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,
    // 0x10-0x1f: all invalid.
    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,
    // 0x20-0x2f:
    // ' '   !     "     #     $     %     &     '
    kEsc, '!',  '"',  0,    '$',  0,    '&',  '\'',
    // (     )     *     +     ,     -     .     /
    '(',  ')',  kEsc, '+',  ',',  '-',  '.',  0,
    // 0x30-0x3f:
    // 0     1     2     3     4     5     6     7
    '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
    // 8     9     :     ;     <     =     >     ?
    '8',  '9',  ':',  ';',  0,    '=',  0,    0,
    // 0x40-0x4f:
    // @     A     B     C     D     E     F     G
    0,    'a',  'b',  'c',  'd',  'e',  'f',  'g',
    // H     I     J     K     L     M     N     O
    'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
    // 0x50-0x5f:
    // P     Q     R     S     T     U     V     W
    'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
    // X     Y     Z     [     \     ]     ^     _
    'x',  'y',  'z',  '[',  0,    ']',  0,    '_',
    // 0x60-0x6f:
    // `     a     b     c     d     e     f     g
    '`',  'a',  'b',  'c',  'd',  'e',  'f',  'g',
    // h     i     j     k     l     m     n     o
    'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
    // 0x70-0x7f:
    // p     q     r     s     t     u     v     w
    'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
    // x     y     z     {     |     }     ~     DEL
    'x',  'y',  'z',  '{',  0,    '}',  '~',  0,
};
// clang-format on
39
// Bit flag stored in kHostCharacterTable for code points that are forbidden
// host code points:
// https://url.spec.whatwg.org/#forbidden-host-code-point
const uint8_t kForbiddenHost = 0x1;

// TODO(crbug.com/1416006): Merge other lookup tables into this table. That can
// be probably done after https://crbug.com/1416013 is resolved.
//
// Per-character flags for the 7-bit ASCII range, indexed by code point.
//
// This table is currently only used for an opaque-host in non-special URLs.
const uint8_t kHostCharacterTable[128] = {
    kForbiddenHost,  // 0x00 (NUL)
    0,               // 0x01
    0,               // 0x02
    0,               // 0x03
    0,               // 0x04
    0,               // 0x05
    0,               // 0x06
    0,               // 0x07
    0,               // 0x08
    kForbiddenHost,  // 0x09 (TAB)
    kForbiddenHost,  // 0x0A (LF)
    0,               // 0x0B
    0,               // 0x0C
    kForbiddenHost,  // 0x0D (CR)
    0,               // 0x0E
    0,               // 0x0F
    0,               // 0x10
    0,               // 0x11
    0,               // 0x12
    0,               // 0x13
    0,               // 0x14
    0,               // 0x15
    0,               // 0x16
    0,               // 0x17
    0,               // 0x18
    0,               // 0x19
    0,               // 0x1A
    0,               // 0x1B
    0,               // 0x1C
    0,               // 0x1D
    0,               // 0x1E
    0,               // 0x1F
    kForbiddenHost,  // ' '
    0,               // '!'
    0,               // '"'
    kForbiddenHost,  // '#'
    0,               // '$'
    0,               // '%'
    0,               // '&'
    0,               // '\''
    0,               // '('
    0,               // ')'
    0,               // '*'
    0,               // '+'
    0,               // ','
    0,               // '-'
    0,               // '.'
    kForbiddenHost,  // '/'
    0,               // '0'
    0,               // '1'
    0,               // '2'
    0,               // '3'
    0,               // '4'
    0,               // '5'
    0,               // '6'
    0,               // '7'
    0,               // '8'
    0,               // '9'
    kForbiddenHost,  // ':'
    0,               // ';'
    kForbiddenHost,  // '<'
    0,               // '='
    kForbiddenHost,  // '>'
    kForbiddenHost,  // '?'
    kForbiddenHost,  // '@'
    0,               // 'A'
    0,               // 'B'
    0,               // 'C'
    0,               // 'D'
    0,               // 'E'
    0,               // 'F'
    0,               // 'G'
    0,               // 'H'
    0,               // 'I'
    0,               // 'J'
    0,               // 'K'
    0,               // 'L'
    0,               // 'M'
    0,               // 'N'
    0,               // 'O'
    0,               // 'P'
    0,               // 'Q'
    0,               // 'R'
    0,               // 'S'
    0,               // 'T'
    0,               // 'U'
    0,               // 'V'
    0,               // 'W'
    0,               // 'X'
    0,               // 'Y'
    0,               // 'Z'
    kForbiddenHost,  // '['
    kForbiddenHost,  // '\\'
    kForbiddenHost,  // ']'
    kForbiddenHost,  // '^'
    0,               // '_'
    0,               // '`'
    0,               // 'a'
    0,               // 'b'
    0,               // 'c'
    0,               // 'd'
    0,               // 'e'
    0,               // 'f'
    0,               // 'g'
    0,               // 'h'
    0,               // 'i'
    0,               // 'j'
    0,               // 'k'
    0,               // 'l'
    0,               // 'm'
    0,               // 'n'
    0,               // 'o'
    0,               // 'p'
    0,               // 'q'
    0,               // 'r'
    0,               // 's'
    0,               // 't'
    0,               // 'u'
    0,               // 'v'
    0,               // 'w'
    0,               // 'x'
    0,               // 'y'
    0,               // 'z'
    0,               // '{'
    kForbiddenHost,  // '|'
    0,               // '}'
    0,               // '~'
    0,               // 0x7F (DEL)
};

// Returns true if |ch| is a forbidden host code point according to
// kHostCharacterTable. Code points outside the 7-bit ASCII range are never
// forbidden.
//
// The parameter is deliberately wider than a byte: callers pass UTF-16 code
// units, and a byte-sized parameter would truncate them *before* the range
// check (e.g. U+0120 would be misread as 0x20, SPACE, and rejected). Narrower
// unsigned arguments convert implicitly without any change in value.
bool IsForbiddenHostCodePoint(uint32_t ch) {
  return ch <= 0x7F && (kHostCharacterTable[ch] & kForbiddenHost) != 0;
}
182
// Maximum length of a fully-qualified domain name, per RFC1034.
constexpr size_t kMaxHostLength = 253;

// UTS#46 normalization can cause a long string to actually shrink and fit
// within the 253 character RFC1034 FQDN length limit, so inputs are allowed a
// generously padded length before they are rejected. Note that this can still
// be too short for pathological cases: an arbitrary number of characters (e.g.
// U+00AD SOFT HYPHEN) can be removed from the input by UTS#46 processing.
// However, this should be sufficient for all normally-encountered,
// non-abusive hostname strings.
constexpr size_t kMaxHostBufferLength = 5 * kMaxHostLength;
Ian Clelland02c68222018-05-18 20:47:50193
Peter Kastingcfdf32c2022-08-17 20:21:02194constexpr size_t kTempHostBufferLen = 1024;
195using StackBuffer = RawCanonOutputT<char, kTempHostBufferLen>;
196using StackBufferW = RawCanonOutputT<char16_t, kTempHostBufferLen>;
brettw@chromium.orge7bba5f2013-04-10 20:10:52197
198// Scans a host name and fills in the output flags according to what we find.
199// |has_non_ascii| will be true if there are any non-7-bit characters, and
200// |has_escaped| will be true if there is a percent sign.
201template<typename CHAR, typename UCHAR>
vitalybuka@chromium.org0318f922014-04-22 00:09:23202void ScanHostname(const CHAR* spec,
203 const Component& host,
204 bool* has_non_ascii,
205 bool* has_escaped) {
brettw@chromium.orge7bba5f2013-04-10 20:10:52206 int end = host.end();
207 *has_non_ascii = false;
208 *has_escaped = false;
209 for (int i = host.begin; i < end; i++) {
210 if (static_cast<UCHAR>(spec[i]) >= 0x80)
211 *has_non_ascii = true;
212 else if (spec[i] == '%')
213 *has_escaped = true;
214 }
215}
216
217// Canonicalizes a host name that is entirely 8-bit characters (even though
218// the type holding them may be 16 bits. Escaped characters will be unescaped.
219// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
220//
221// The |*has_non_ascii| flag will be true if there are non-7-bit characters in
222// the output.
223//
224// This function is used in two situations:
225//
226// * When the caller knows there is no non-ASCII or percent escaped
227// characters. This is what DoHost does. The result will be a completely
228// canonicalized host since we know nothing weird can happen (escaped
229// characters could be unescaped to non-7-bit, so they have to be treated
230// with suspicion at this point). It does not use the |has_non_ascii| flag.
231//
232// * When the caller has an 8-bit string that may need unescaping.
233// DoComplexHost calls us this situation to do unescaping and validation.
234// After this, it may do other IDN operations depending on the value of the
235// |*has_non_ascii| flag.
236//
237// The return value indicates if the output is a potentially valid host name.
Peter Kasting8bb45c22022-06-16 19:39:27238template <typename INCHAR, typename OUTCHAR>
brettw@chromium.orge7bba5f2013-04-10 20:10:52239bool DoSimpleHost(const INCHAR* host,
Peter Kasting8bb45c22022-06-16 19:39:27240 size_t host_len,
brettw@chromium.orge7bba5f2013-04-10 20:10:52241 CanonOutputT<OUTCHAR>* output,
242 bool* has_non_ascii) {
243 *has_non_ascii = false;
244
245 bool success = true;
Peter Kasting8bb45c22022-06-16 19:39:27246 for (size_t i = 0; i < host_len; ++i) {
brettw@chromium.orge7bba5f2013-04-10 20:10:52247 unsigned int source = host[i];
248 if (source == '%') {
249 // Unescape first, if possible.
250 // Source will be used only if decode operation was successful.
251 if (!DecodeEscaped(host, &i, host_len,
252 reinterpret_cast<unsigned char*>(&source))) {
253 // Invalid escaped character. There is nothing that can make this
254 // host valid. We append an escaped percent so the URL looks reasonable
255 // and mark as failed.
256 AppendEscapedChar('%', output);
257 success = false;
258 continue;
259 }
260 }
261
262 if (source < 0x80) {
263 // We have ASCII input, we can use our lookup table.
Hayato Itob961a562023-12-06 05:59:34264 unsigned char replacement = kHostCharLookup[source];
brettw@chromium.orge7bba5f2013-04-10 20:10:52265 if (!replacement) {
266 // Invalid character, add it as percent-escaped and mark as failed.
267 AppendEscapedChar(source, output);
268 success = false;
269 } else if (replacement == kEsc) {
270 // This character is valid but should be escaped.
271 AppendEscapedChar(source, output);
272 } else {
273 // Common case, the given character is valid in a hostname, the lookup
274 // table tells us the canonical representation of that character (lower
275 // cased).
276 output->push_back(replacement);
277 }
278 } else {
279 // It's a non-ascii char. Just push it to the output.
280 // In case where we have char16 input, and char output it's safe to
281 // cast char16->char only if input string was converted to ASCII.
282 output->push_back(static_cast<OUTCHAR>(source));
283 *has_non_ascii = true;
284 }
285 }
brettw@chromium.orge7bba5f2013-04-10 20:10:52286 return success;
287}
288
289// Canonicalizes a host that requires IDN conversion. Returns true on success
Peter Kasting8bb45c22022-06-16 19:39:27290bool DoIDNHost(const char16_t* src, size_t src_len, CanonOutput* output) {
brettw1141951b2015-11-26 00:29:35291 int original_output_len = output->length(); // So we can rewind below.
292
brettw@chromium.orge7bba5f2013-04-10 20:10:52293 // We need to escape URL before doing IDN conversion, since punicode strings
294 // cannot be escaped after they are created.
295 RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
296 bool has_non_ascii;
297 DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
Ian Clelland02c68222018-05-18 20:47:50298 if (url_escaped_host.length() > kMaxHostBufferLength) {
299 AppendInvalidNarrowString(src, 0, src_len, output);
300 return false;
301 }
brettw@chromium.orge7bba5f2013-04-10 20:10:52302
303 StackBufferW wide_output;
David Benjamincc4d2b22023-10-02 23:12:04304 if (!IDNToASCII(url_escaped_host.view(), &wide_output)) {
brettw@chromium.orge7bba5f2013-04-10 20:10:52305 // Some error, give up. This will write some reasonable looking
306 // representation of the string to the output.
307 AppendInvalidNarrowString(src, 0, src_len, output);
308 return false;
309 }
310
311 // Now we check the ASCII output like a normal host. It will also handle
312 // unescaping. Although we unescaped everything before this function call, if
313 // somebody does %00 as fullwidth, ICU will convert this to ASCII.
Peter Kastingcfdf32c2022-08-17 20:21:02314 bool success = DoSimpleHost(wide_output.data(), wide_output.length(), output,
Peter Kasting8bb45c22022-06-16 19:39:27315 &has_non_ascii);
brettw1141951b2015-11-26 00:29:35316 if (has_non_ascii) {
317 // ICU generated something that DoSimpleHost didn't think looked like
318 // ASCII. This is quite rare, but ICU might convert some characters to
319 // percent signs which might generate new escape sequences which might in
320 // turn be invalid. An example is U+FE6A "small percent" which ICU will
321 // name prep into an ASCII percent and then we can interpret the following
322 // characters as escaped characters.
323 //
324 // If DoSimpleHost didn't think the output was ASCII, just escape the
325 // thing we gave ICU and give up. DoSimpleHost will have handled a further
326 // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates
327 // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't
328 // do more (like handle escaped non-ASCII sequences). Handling the escaped
329 // ASCII isn't strictly necessary, but DoSimpleHost handles this case
330 // anyway so we handle it/
331 output->set_length(original_output_len);
Peter Kastingcfdf32c2022-08-17 20:21:02332 AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
brettw1141951b2015-11-26 00:29:35333 output);
334 return false;
335 }
brettw@chromium.orge7bba5f2013-04-10 20:10:52336 return success;
337}
338
339// 8-bit convert host to its ASCII version: this converts the UTF-8 input to
340// UTF-16. The has_escaped flag should be set if the input string requires
341// unescaping.
Peter Kasting8bb45c22022-06-16 19:39:27342bool DoComplexHost(const char* host,
343 size_t host_len,
344 bool has_non_ascii,
345 bool has_escaped,
346 CanonOutput* output) {
brettw@chromium.orge7bba5f2013-04-10 20:10:52347 // Save the current position in the output. We may write stuff and rewind it
348 // below, so we need to know where to rewind to.
Peter Kastingcfdf32c2022-08-17 20:21:02349 size_t begin_length = output->length();
brettw@chromium.orge7bba5f2013-04-10 20:10:52350
351 // Points to the UTF-8 data we want to convert. This will either be the
352 // input or the unescaped version written to |*output| if necessary.
353 const char* utf8_source;
Peter Kasting8bb45c22022-06-16 19:39:27354 size_t utf8_source_len;
Nico Weber465eee62021-03-18 07:33:52355 bool are_all_escaped_valid = true;
brettw@chromium.orge7bba5f2013-04-10 20:10:52356 if (has_escaped) {
357 // Unescape before converting to UTF-16 for IDN. We write this into the
358 // output because it most likely does not require IDNization, and we can
359 // save another huge stack buffer. It will be replaced below if it requires
360 // IDN. This will also update our non-ASCII flag so we know whether the
361 // unescaped input requires IDN.
362 if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
363 // Error with some escape sequence. We'll call the current output
Nico Weber465eee62021-03-18 07:33:52364 // complete. DoSimpleHost will have written some "reasonable" output
365 // for the invalid escapes, but the output could be non-ASCII and
366 // needs to go through re-encoding below.
367 are_all_escaped_valid = false;
brettw@chromium.orge7bba5f2013-04-10 20:10:52368 }
369
370 // Unescaping may have left us with ASCII input, in which case the
371 // unescaped version we wrote to output is complete.
372 if (!has_non_ascii) {
Nico Weber465eee62021-03-18 07:33:52373 return are_all_escaped_valid;
brettw@chromium.orge7bba5f2013-04-10 20:10:52374 }
375
376 // Save the pointer into the data was just converted (it may be appended to
377 // other data in the output buffer).
378 utf8_source = &output->data()[begin_length];
Peter Kastingcfdf32c2022-08-17 20:21:02379 utf8_source_len = output->length() - begin_length;
brettw@chromium.orge7bba5f2013-04-10 20:10:52380 } else {
381 // We don't need to unescape, use input for IDNization later. (We know the
382 // input has non-ASCII, or the simple version would have been called
383 // instead of us.)
384 utf8_source = host;
385 utf8_source_len = host_len;
386 }
387
388 // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
389 // Above, we may have used the output to write the unescaped values to, so
390 // we have to rewind it to where we started after we convert it to UTF-16.
391 StackBufferW utf16;
392 if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
393 // In this error case, the input may or may not be the output.
394 StackBuffer utf8;
Peter Kasting8bb45c22022-06-16 19:39:27395 for (size_t i = 0; i < utf8_source_len; i++)
brettw@chromium.orge7bba5f2013-04-10 20:10:52396 utf8.push_back(utf8_source[i]);
397 output->set_length(begin_length);
Peter Kastingcfdf32c2022-08-17 20:21:02398 AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
brettw@chromium.orge7bba5f2013-04-10 20:10:52399 return false;
400 }
401 output->set_length(begin_length);
402
403 // This will call DoSimpleHost which will do normal ASCII canonicalization
404 // and also check for IP addresses in the outpt.
Peter Kastingcfdf32c2022-08-17 20:21:02405 return DoIDNHost(utf16.data(), utf16.length(), output) &&
Nico Weber465eee62021-03-18 07:33:52406 are_all_escaped_valid;
brettw@chromium.orge7bba5f2013-04-10 20:10:52407}
408
409// UTF-16 convert host to its ASCII version. The set up is already ready for
410// the backend, so we just pass through. The has_escaped flag should be set if
411// the input string requires unescaping.
Jan Wilken Dörrie5aad5c22021-03-08 21:44:12412bool DoComplexHost(const char16_t* host,
Peter Kasting8bb45c22022-06-16 19:39:27413 size_t host_len,
Jan Wilken Dörrie5aad5c22021-03-08 21:44:12414 bool has_non_ascii,
415 bool has_escaped,
416 CanonOutput* output) {
brettw@chromium.orge7bba5f2013-04-10 20:10:52417 if (has_escaped) {
418 // Yikes, we have escaped characters with wide input. The escaped
419 // characters should be interpreted as UTF-8. To solve this problem,
420 // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
421 //
422 // We don't bother to optimize the conversion in the ASCII case (which
423 // *could* just be a copy) and use the UTF-8 path, because it should be
424 // very rare that host names have escaped characters, and it is relatively
425 // fast to do the conversion anyway.
426 StackBuffer utf8;
427 if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
428 AppendInvalidNarrowString(host, 0, host_len, output);
429 return false;
430 }
431
432 // Once we convert to UTF-8, we can use the 8-bit version of the complex
433 // host handling code above.
Peter Kastingcfdf32c2022-08-17 20:21:02434 return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii, has_escaped,
435 output);
brettw@chromium.orge7bba5f2013-04-10 20:10:52436 }
437
438 // No unescaping necessary, we can safely pass the input to ICU. This
439 // function will only get called if we either have escaped or non-ascii
440 // input, so it's safe to just use ICU now. Even if the input is ASCII,
441 // this function will do the right thing (just slower than we could).
442 return DoIDNHost(host, host_len, output);
443}
444
brettw5a36380ef2016-10-27 19:51:56445template <typename CHAR, typename UCHAR>
446bool DoHostSubstring(const CHAR* spec,
447 const Component& host,
448 CanonOutput* output) {
Peter Kasting8bb45c22022-06-16 19:39:27449 DCHECK(host.is_valid());
450
brettw5a36380ef2016-10-27 19:51:56451 bool has_non_ascii, has_escaped;
452 ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
453
454 if (has_non_ascii || has_escaped) {
Peter Kasting8bb45c22022-06-16 19:39:27455 return DoComplexHost(&spec[host.begin], static_cast<size_t>(host.len),
456 has_non_ascii, has_escaped, output);
brettw5a36380ef2016-10-27 19:51:56457 }
458
Peter Kasting8bb45c22022-06-16 19:39:27459 const bool success = DoSimpleHost(
460 &spec[host.begin], static_cast<size_t>(host.len), output, &has_non_ascii);
brettw5a36380ef2016-10-27 19:51:56461 DCHECK(!has_non_ascii);
462 return success;
463}
464
Hayato Ito203c3022023-12-11 04:58:57465template <typename CharT>
466bool DoOpaqueHost(const std::basic_string_view<CharT> host,
467 CanonOutput& output) {
468 // URL Standard: https://url.spec.whatwg.org/#concept-opaque-host-parser
469
470 size_t host_len = host.size();
471
472 for (size_t i = 0; i < host_len; ++i) {
473 char16_t ch = host[i];
474 // The characters '[', ':', and ']', are checked later in
475 // `CanonicalizeIPv6Address` function.
476 if (ch != '[' && ch != ']' && ch != ':' && IsForbiddenHostCodePoint(ch)) {
477 return false;
478 }
479
480 // Implementation note:
481 //
482 // URL Standard: Step 3 in
483 // https://url.spec.whatwg.org/#concept-opaque-host-parser
484 //
485 // > 3. If input contains a U+0025 (%) and the two code points following
486 // > it are not ASCII hex digits, invalid-URL-unit validation error.
487 //
488 // `invalid-URL-unit` is NOT marked as failure. We don't need to consider
489 // step 3 here.
490
491 // URL Standard: Step 4 in
492 // https://url.spec.whatwg.org/#concept-opaque-host-parser
493 //
494 // > 4. Return the result of running UTF-8 percent-encode on input using
495 // > the C0 control percent-encode set.
496 if (IsInC0ControlPercentEncodeSet(ch)) {
497 AppendUTF8EscapedChar(host.data(), &i, host_len, &output);
498 } else {
499 output.push_back(ch);
500 }
501 }
502 return true;
503}
504
505template <typename CHAR, typename UCHAR, CanonMode canon_mode>
brettw@chromium.orge7bba5f2013-04-10 20:10:52506void DoHost(const CHAR* spec,
vitalybuka@chromium.org0318f922014-04-22 00:09:23507 const Component& host,
Hayato Ito203c3022023-12-11 04:58:57508 CanonOutput& output,
509 CanonHostInfo& host_info) {
510 // URL Standard: https://url.spec.whatwg.org/#host-parsing
511
Hayato Itobbdc90b2024-01-31 02:57:28512 // Keep track of output's initial length, so we can rewind later.
513 const int output_begin = output.length();
514
Tom Sepez40fbf43e2022-11-15 00:11:03515 if (host.is_empty()) {
brettw@chromium.orge7bba5f2013-04-10 20:10:52516 // Empty hosts don't need anything.
Hayato Ito203c3022023-12-11 04:58:57517 host_info.family = CanonHostInfo::NEUTRAL;
Hayato Itobbdc90b2024-01-31 02:57:28518 // Carry over the valid empty host for non-special URLs.
519 //
520 // Component(0, 0) should be considered invalid here for historical reasons.
521 //
522 // TODO(crbug.com/1416006): Update the callers so that they don't pass
523 // Component(0, 0) as an invalid `host`.
524 if (host.begin != 0 && host.len == 0) {
525 host_info.out_host = Component(output_begin, 0);
526 } else {
527 host_info.out_host = Component();
528 }
brettw@chromium.orge7bba5f2013-04-10 20:10:52529 return;
530 }
531
Hayato Ito203c3022023-12-11 04:58:57532 bool success;
533 if constexpr (canon_mode == CanonMode::kSpecialURL) {
534 success = DoHostSubstring<CHAR, UCHAR>(spec, host, &output);
535 } else {
536 // URL Standard: https://url.spec.whatwg.org/#concept-opaque-host-parser
537 success = DoOpaqueHost(host.as_string_view_on(spec), output);
538 }
539
540 if (success) {
brettw@chromium.orge7bba5f2013-04-10 20:10:52541 // After all the other canonicalization, check if we ended up with an IP
qyearsley2bc727d2015-08-14 20:17:15542 // address. IP addresses are small, so writing into this temporary buffer
brettw@chromium.orge7bba5f2013-04-10 20:10:52543 // should not cause an allocation.
544 RawCanonOutput<64> canon_ip;
Hayato Ito203c3022023-12-11 04:58:57545
546 if constexpr (canon_mode == CanonMode::kSpecialURL) {
547 CanonicalizeIPAddress(output.data(),
548 MakeRange(output_begin, output.length()), &canon_ip,
549 &host_info);
550 } else {
551 // Non-special URLs support only IPv6.
552 CanonicalizeIPv6Address(output.data(),
553 MakeRange(output_begin, output.length()),
554 canon_ip, host_info);
555 }
brettw@chromium.orge7bba5f2013-04-10 20:10:52556
557 // If we got an IPv4/IPv6 address, copy the canonical form back to the
qyearsley2bc727d2015-08-14 20:17:15558 // real buffer. Otherwise, it's a hostname or broken IP, in which case
brettw@chromium.orge7bba5f2013-04-10 20:10:52559 // we just leave it in place.
Hayato Ito203c3022023-12-11 04:58:57560 if (host_info.IsIPAddress()) {
561 output.set_length(output_begin);
562 output.Append(canon_ip.view());
brettw@chromium.orge7bba5f2013-04-10 20:10:52563 }
brettw5a36380ef2016-10-27 19:51:56564 } else {
565 // Canonicalization failed. Set BROKEN to notify the caller.
Hayato Ito203c3022023-12-11 04:58:57566 host_info.family = CanonHostInfo::BROKEN;
brettw@chromium.orge7bba5f2013-04-10 20:10:52567 }
Hayato Ito203c3022023-12-11 04:58:57568 host_info.out_host = MakeRange(output_begin, output.length());
brettw@chromium.orge7bba5f2013-04-10 20:10:52569}
570
571} // namespace
572
573bool CanonicalizeHost(const char* spec,
vitalybuka@chromium.org0318f922014-04-22 00:09:23574 const Component& host,
brettw@chromium.orge7bba5f2013-04-10 20:10:52575 CanonOutput* output,
vitalybuka@chromium.org0318f922014-04-22 00:09:23576 Component* out_host) {
Hayato Ito203c3022023-12-11 04:58:57577 DCHECK(output);
578 DCHECK(out_host);
579 return CanonicalizeSpecialHost(spec, host, *output, *out_host);
brettw@chromium.orge7bba5f2013-04-10 20:10:52580}
581
Jan Wilken Dörrie5aad5c22021-03-08 21:44:12582bool CanonicalizeHost(const char16_t* spec,
vitalybuka@chromium.org0318f922014-04-22 00:09:23583 const Component& host,
brettw@chromium.orge7bba5f2013-04-10 20:10:52584 CanonOutput* output,
vitalybuka@chromium.org0318f922014-04-22 00:09:23585 Component* out_host) {
Hayato Ito203c3022023-12-11 04:58:57586 DCHECK(output);
587 DCHECK(out_host);
588 return CanonicalizeSpecialHost(spec, host, *output, *out_host);
589}
590
591bool CanonicalizeSpecialHost(const char* spec,
592 const Component& host,
593 CanonOutput& output,
594 Component& out_host) {
brettw@chromium.orge7bba5f2013-04-10 20:10:52595 CanonHostInfo host_info;
Hayato Ito203c3022023-12-11 04:58:57596 DoHost<char, unsigned char, CanonMode::kSpecialURL>(spec, host, output,
597 host_info);
598 out_host = host_info.out_host;
599 return (host_info.family != CanonHostInfo::BROKEN);
600}
601
602bool CanonicalizeSpecialHost(const char16_t* spec,
603 const Component& host,
604 CanonOutput& output,
605 Component& out_host) {
606 CanonHostInfo host_info;
607 DoHost<char16_t, char16_t, CanonMode::kSpecialURL>(spec, host, output,
608 host_info);
609 out_host = host_info.out_host;
610 return (host_info.family != CanonHostInfo::BROKEN);
611}
612
613bool CanonicalizeNonSpecialHost(const char* spec,
614 const Component& host,
615 CanonOutput& output,
616 Component& out_host) {
617 CanonHostInfo host_info;
618 DoHost<char, unsigned char, CanonMode::kNonSpecialURL>(spec, host, output,
619 host_info);
620 out_host = host_info.out_host;
621 return (host_info.family != CanonHostInfo::BROKEN);
622}
623
624bool CanonicalizeNonSpecialHost(const char16_t* spec,
625 const Component& host,
626 CanonOutput& output,
627 Component& out_host) {
628 CanonHostInfo host_info;
629 DoHost<char16_t, char16_t, CanonMode::kNonSpecialURL>(spec, host, output,
630 host_info);
631 out_host = host_info.out_host;
brettw@chromium.orge7bba5f2013-04-10 20:10:52632 return (host_info.family != CanonHostInfo::BROKEN);
633}
634
635void CanonicalizeHostVerbose(const char* spec,
vitalybuka@chromium.org0318f922014-04-22 00:09:23636 const Component& host,
brettw@chromium.orge7bba5f2013-04-10 20:10:52637 CanonOutput* output,
vitalybuka@chromium.org0318f922014-04-22 00:09:23638 CanonHostInfo* host_info) {
Hayato Ito203c3022023-12-11 04:58:57639 DCHECK(output);
640 DCHECK(host_info);
641 CanonicalizeSpecialHostVerbose(spec, host, *output, *host_info);
brettw@chromium.orge7bba5f2013-04-10 20:10:52642}
643
Jan Wilken Dörrie5aad5c22021-03-08 21:44:12644void CanonicalizeHostVerbose(const char16_t* spec,
vitalybuka@chromium.org0318f922014-04-22 00:09:23645 const Component& host,
brettw@chromium.orge7bba5f2013-04-10 20:10:52646 CanonOutput* output,
vitalybuka@chromium.org0318f922014-04-22 00:09:23647 CanonHostInfo* host_info) {
Hayato Ito203c3022023-12-11 04:58:57648 DCHECK(output);
649 DCHECK(host_info);
650 CanonicalizeSpecialHostVerbose(spec, host, *output, *host_info);
651}
652
653void CanonicalizeSpecialHostVerbose(const char* spec,
654 const Component& host,
655 CanonOutput& output,
656 CanonHostInfo& host_info) {
657 DoHost<char, unsigned char, CanonMode::kSpecialURL>(spec, host, output,
658 host_info);
659}
660
661void CanonicalizeSpecialHostVerbose(const char16_t* spec,
662 const Component& host,
663 CanonOutput& output,
664 CanonHostInfo& host_info) {
665 DoHost<char16_t, char16_t, CanonMode::kSpecialURL>(spec, host, output,
666 host_info);
brettw@chromium.orge7bba5f2013-04-10 20:10:52667}
668
brettw5a36380ef2016-10-27 19:51:56669bool CanonicalizeHostSubstring(const char* spec,
670 const Component& host,
671 CanonOutput* output) {
672 return DoHostSubstring<char, unsigned char>(spec, host, output);
673}
674
Jan Wilken Dörrie5aad5c22021-03-08 21:44:12675bool CanonicalizeHostSubstring(const char16_t* spec,
brettw5a36380ef2016-10-27 19:51:56676 const Component& host,
677 CanonOutput* output) {
Jan Wilken Dörrie5aad5c22021-03-08 21:44:12678 return DoHostSubstring<char16_t, char16_t>(spec, host, output);
brettw5a36380ef2016-10-27 19:51:56679}
680
Hayato Ito203c3022023-12-11 04:58:57681void CanonicalizeNonSpecialHostVerbose(const char* spec,
682 const Component& host,
683 CanonOutput& output,
684 CanonHostInfo& host_info) {
685 DoHost<char, unsigned char, CanonMode::kNonSpecialURL>(spec, host, output,
686 host_info);
687}
688
689void CanonicalizeNonSpecialHostVerbose(const char16_t* spec,
690 const Component& host,
691 CanonOutput& output,
692 CanonHostInfo& host_info) {
693 DoHost<char16_t, char16_t, CanonMode::kNonSpecialURL>(spec, host, output,
694 host_info);
695}
696
vitalybuka@chromium.org0318f922014-04-22 00:09:23697} // namespace url