| // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // This command-line program converts an effective-TLD data file in UTF-8 from |
| // the format provided by Mozilla to the format expected by Chrome. Any errors |
| // or warnings are recorded in tld_cleanup.log. |
| // |
| // In particular, it |
| // * Strips blank lines and comments, as well as notes for individual rules. |
| // * Changes all line endings to LF. |
| // * Strips a single leading and/or trailing dot from each rule, if present. |
| // * Logs a warning if a rule contains '!' or '*.' other than at the beginning |
| // of the rule. (This also catches multiple ! or *. at the start of a rule.) |
| // * Logs a warning if GURL reports a rule as invalid, but keeps the rule. |
| // * Canonicalizes each rule's domain by converting it to a GURL and back. |
| // * Adds explicit rules for true TLDs found in any rule. |
| |
| #include <set> |
| #include <string> |
| |
| #include "base/at_exit.h" |
| #include "base/file_util.h" |
| #include "base/i18n/icu_util.h" |
| #include "base/logging.h" |
| #include "base/file_path.h" |
| #include "base/file_util.h" |
| #include "base/path_service.h" |
| #include "base/process_util.h" |
| #include "base/string_util.h" |
| #include "googleurl/src/gurl.h" |
| #include "googleurl/src/url_parse.h" |
| |
// Ordered collection of unique rule strings. Rules are accumulated in one of
// these while the input is parsed and are later written out in set order.
typedef std::set<std::string> StringSet;
| |
| // Writes the list of domain rules contained in the 'rules' set to the |
| // 'outfile', with each rule terminated by a LF. The file must already have |
| // been created with write access. |
| bool WriteRules(const StringSet& rules, FilePath outfile) { |
| std::string data; |
| data.append( |
| "// Copyright (c) 2009 The Chromium Authors. All rights reserved.\n" |
| "// Use of this source code is governed by a BSD-style license that\n" |
| "// can be found in the LICENSE file.\n\n" |
| "// This file is generated by net/tools/tld_cleanup/.\n" |
| "// DO NOT MANUALLY EDIT!\n" |
| "#include \"net/base/registry_controlled_domain.h\"\n\n" |
| "const char net::RegistryControlledDomainService::kDomainData[] =\n" |
| ); |
| |
| for (StringSet::const_iterator iter = rules.begin(); |
| iter != rules.end(); |
| ++iter) { |
| data.append(" \""); |
| data.append(*iter); |
| data.append("\\n\"\n"); |
| } |
| |
| data.append(";\n"); |
| |
| int written = file_util::WriteFile(outfile.ToWStringHack(), data.data(), |
| data.size()); |
| |
| return written == static_cast<int>(data.size()); |
| } |
| |
// Outcome of normalizing a rule or a whole file. The enumerators must stay in
// increasing order of severity: callers compare values to keep the worst one.
enum NormalizeResult {
  kSuccess,
  kWarning,
  kError
};
| |
| // Adjusts the rule to a standard form: removes single extraneous dots and |
| // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as |
| // valid; logs a warning and returns kWarning if it is probably invalid; and |
| // logs an error and returns kError if the rule is (almost) certainly invalid. |
| NormalizeResult NormalizeRule(std::string* rule) { |
| NormalizeResult result = kSuccess; |
| |
| // Strip single leading and trailing dots. |
| if (rule->at(0) == '.') |
| rule->erase(0, 1); |
| if (rule->size() == 0) { |
| LOG(WARNING) << "Ignoring empty rule"; |
| return kWarning; |
| } |
| if (rule->at(rule->size() - 1) == '.') |
| rule->erase(rule->size() - 1, 1); |
| if (rule->size() == 0) { |
| LOG(WARNING) << "Ignoring empty rule"; |
| return kWarning; |
| } |
| |
| // Allow single leading '*.' or '!', saved here so it's not canonicalized. |
| bool wildcard = false; |
| bool exception = false; |
| size_t start_offset = 0; |
| if (rule->at(0) == '!') { |
| rule->erase(0, 1); |
| exception = true; |
| } else if (rule->find("*.") == 0) { |
| rule->erase(0, 2); |
| wildcard = true; |
| } |
| if (rule->size() == 0) { |
| LOG(WARNING) << "Ignoring empty rule"; |
| return kWarning; |
| } |
| |
| // Warn about additional '*.' or '!'. |
| if (rule->find("*.", start_offset) != std::string::npos || |
| rule->find('!', start_offset) != std::string::npos) { |
| LOG(WARNING) << "Keeping probably invalid rule: " << *rule; |
| result = kWarning; |
| } |
| |
| // Make a GURL and normalize it, then get the host back out. |
| std::string url = "http://"; |
| url.append(*rule); |
| GURL gurl(url); |
| const std::string& spec = gurl.possibly_invalid_spec(); |
| url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host; |
| if (host.len < 0) { |
| LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *rule; |
| return kError; |
| } |
| if (!gurl.is_valid()) { |
| LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *rule; |
| result = kWarning; |
| } |
| rule->assign(spec.substr(host.begin, host.len)); |
| |
| // Restore wildcard or exception marker. |
| if (exception) |
| rule->insert(0, 1, '!'); |
| else if (wildcard) |
| rule->insert(0, "*."); |
| |
| return result; |
| } |
| |
| // Loads the file described by 'in_filename', converts it to the desired format |
| // (see the file comments above), and saves it into 'out_filename'. Returns |
| // the most severe of the result codes encountered when normalizing the rules. |
| NormalizeResult NormalizeFile(const FilePath& in_filename, |
| const FilePath& out_filename) { |
| std::string data; |
| if (!file_util::ReadFileToString(in_filename, &data)) { |
| LOG(ERROR) << "Unable to read file"; |
| // We return success since we've already reported the error. |
| return kSuccess; |
| } |
| |
| // We do a lot of string assignment during parsing, but simplicity is more |
| // important than performance here. |
| std::string rule; |
| NormalizeResult result = kSuccess; |
| size_t line_start = 0; |
| size_t line_end = 0; |
| StringSet rules; |
| while (line_start < data.size()) { |
| // Skip comments. |
| if (line_start + 1 < data.size() && |
| data[line_start] == '/' && |
| data[line_start + 1] == '/') { |
| line_end = data.find_first_of("\r\n", line_start); |
| if (line_end == std::string::npos) |
| line_end = data.size(); |
| } else { |
| // Truncate at first whitespace. |
| line_end = data.find_first_of("\r\n \t", line_start); |
| if (line_end == std::string::npos) |
| line_end = data.size(); |
| rule.assign(data.data(), line_start, line_end - line_start); |
| |
| NormalizeResult new_result = NormalizeRule(&rule); |
| if (new_result != kError) { |
| rules.insert(rule); |
| // Add true TLD for multi-level rules. |
| size_t tld_start = rule.find_last_of('.'); |
| if (tld_start != std::string::npos && tld_start + 1 < rule.size()) |
| rules.insert(rule.substr(tld_start + 1)); |
| } |
| result = std::max(result, new_result); |
| } |
| |
| // Find beginning of next non-empty line. |
| line_start = data.find_first_of("\r\n", line_end); |
| if (line_start == std::string::npos) |
| line_start = data.size(); |
| line_start = data.find_first_not_of("\r\n", line_start); |
| if (line_start == std::string::npos) |
| line_start = data.size(); |
| } |
| |
| if (!WriteRules(rules, out_filename)) { |
| LOG(ERROR) << "Error(s) writing output file"; |
| result = kError; |
| } |
| |
| return result; |
| } |
| |
| int main(int argc, const char* argv[]) { |
| base::EnableTerminationOnHeapCorruption(); |
| if (argc != 1) { |
| fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n"); |
| fprintf(stderr, "Usage: %s\n", argv[0]); |
| return 1; |
| } |
| |
| // Manages the destruction of singletons. |
| base::AtExitManager exit_manager; |
| |
| // Only use OutputDebugString in debug mode. |
| #ifdef NDEBUG |
| logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE; |
| #else |
| logging::LoggingDestination destination = |
| logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG; |
| #endif |
| |
| CommandLine::Init(argc, argv); |
| |
| FilePath log_filename; |
| PathService::Get(base::DIR_EXE, &log_filename); |
| log_filename = log_filename.AppendASCII("tld_cleanup.log"); |
| logging::InitLogging(log_filename.value().c_str(), |
| destination, |
| logging::LOCK_LOG_FILE, |
| logging::DELETE_OLD_LOG_FILE); |
| |
| icu_util::Initialize(); |
| |
| FilePath input_file; |
| PathService::Get(base::DIR_SOURCE_ROOT, &input_file); |
| input_file = input_file.Append(FILE_PATH_LITERAL("net")) |
| .Append(FILE_PATH_LITERAL("base")) |
| .Append(FILE_PATH_LITERAL("effective_tld_names.dat")); |
| FilePath output_file; |
| PathService::Get(base::DIR_SOURCE_ROOT, &output_file); |
| output_file = output_file.Append(FILE_PATH_LITERAL("net")) |
| .Append(FILE_PATH_LITERAL("base")) |
| .Append(FILE_PATH_LITERAL("effective_tld_names.cc")); |
| NormalizeResult result = NormalizeFile(input_file, output_file); |
| if (result != kSuccess) { |
| fprintf(stderr, |
| "Errors or warnings processing file. See log in tld_cleanup.log."); |
| } |
| |
| if (result == kError) |
| return 1; |
| return 0; |
| } |