[go: nahoru, domu]

blob: d915c4d8076cb05249c77ee854934e5b35b9003e [file] [log] [blame]
// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This command-line program converts an effective-TLD data file in UTF-8 from
// the format provided by Mozilla to the format expected by Chrome. Any errors
// or warnings are recorded in tld_cleanup.log.
//
// In particular, it
// * Strips blank lines and comments, as well as notes for individual rules.
// * Changes all line endings to LF.
// * Strips a single leading and/or trailing dot from each rule, if present.
// * Logs a warning if a rule contains '!' or '*.' other than at the beginning
// of the rule. (This also catches multiple ! or *. at the start of a rule.)
// * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
// * Canonicalizes each rule's domain by converting it to a GURL and back.
// * Adds explicit rules for true TLDs found in any rule.
#include <set>
#include <string>
#include "base/at_exit.h"
#include "base/file_util.h"
#include "base/i18n/icu_util.h"
#include "base/logging.h"
#include "base/file_path.h"
#include "base/file_util.h"
#include "base/path_service.h"
#include "base/process_util.h"
#include "base/string_util.h"
#include "googleurl/src/gurl.h"
#include "googleurl/src/url_parse.h"
// Sorted, de-duplicated collection of normalized TLD rules.
typedef std::set<std::string> StringSet;
// Writes the list of domain rules contained in the 'rules' set to the
// 'outfile', with each rule terminated by a LF. The file must already have
// been created with write access.
bool WriteRules(const StringSet& rules, FilePath outfile) {
std::string data;
data.append(
"// Copyright (c) 2009 The Chromium Authors. All rights reserved.\n"
"// Use of this source code is governed by a BSD-style license that\n"
"// can be found in the LICENSE file.\n\n"
"// This file is generated by net/tools/tld_cleanup/.\n"
"// DO NOT MANUALLY EDIT!\n"
"#include \"net/base/registry_controlled_domain.h\"\n\n"
"const char net::RegistryControlledDomainService::kDomainData[] =\n"
);
for (StringSet::const_iterator iter = rules.begin();
iter != rules.end();
++iter) {
data.append(" \"");
data.append(*iter);
data.append("\\n\"\n");
}
data.append(";\n");
int written = file_util::WriteFile(outfile.ToWStringHack(), data.data(),
data.size());
return written == static_cast<int>(data.size());
}
// Outcome of normalizing a single rule (or a whole file). The enumerators
// are deliberately ordered so std::max() picks the most severe result.
enum NormalizeResult {
  kSuccess,  // Rule accepted as-is.
  kWarning,  // Rule looked suspicious; logged, and possibly dropped.
  kError,    // Rule could not be normalized; logged and dropped.
};
// Adjusts the rule to a standard form: removes single extraneous dots and
// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
// valid; logs a warning and returns kWarning if it is probably invalid; and
// logs an error and returns kError if the rule is (almost) certainly invalid.
//
// 'rule' is modified in place; on kError its contents are unspecified and the
// caller should discard it.
NormalizeResult NormalizeRule(std::string* rule) {
  NormalizeResult result = kSuccess;

  // Strip single leading and trailing dots. Each erase can empty the string
  // (e.g. the rule "." or ".."), so re-check emptiness after each one.
  if (rule->at(0) == '.')
    rule->erase(0, 1);
  if (rule->size() == 0) {
    LOG(WARNING) << "Ignoring empty rule";
    return kWarning;
  }
  if (rule->at(rule->size() - 1) == '.')
    rule->erase(rule->size() - 1, 1);
  if (rule->size() == 0) {
    LOG(WARNING) << "Ignoring empty rule";
    return kWarning;
  }

  // Allow single leading '*.' or '!', saved here so it's not canonicalized.
  bool wildcard = false;
  bool exception = false;
  // NOTE(review): start_offset stays 0 throughout; that is correct because a
  // leading '!'/'*.' marker has already been erased from the string, so any
  // '!' or '*.' found by the search below is an extra (invalid) one.
  size_t start_offset = 0;
  if (rule->at(0) == '!') {
    rule->erase(0, 1);
    exception = true;
  } else if (rule->find("*.") == 0) {
    rule->erase(0, 2);
    wildcard = true;
  }
  // The marker may have been the whole rule (e.g. "!" or "*.").
  if (rule->size() == 0) {
    LOG(WARNING) << "Ignoring empty rule";
    return kWarning;
  }

  // Warn about additional '*.' or '!'. The rule is still kept.
  if (rule->find("*.", start_offset) != std::string::npos ||
      rule->find('!', start_offset) != std::string::npos) {
    LOG(WARNING) << "Keeping probably invalid rule: " << *rule;
    result = kWarning;
  }

  // Make a GURL and normalize it, then get the host back out. Prefixing
  // "http://" makes the rule parse as the host component of a URL.
  std::string url = "http://";
  url.append(*rule);
  GURL gurl(url);
  const std::string& spec = gurl.possibly_invalid_spec();
  url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host;
  if (host.len < 0) {
    // No host component could be parsed at all; the rule is unusable.
    LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *rule;
    return kError;
  }
  if (!gurl.is_valid()) {
    // A host was parsed, but GURL still considers the URL invalid; keep the
    // rule anyway and record a warning (see file comment).
    LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *rule;
    result = kWarning;
  }
  // Replace the rule with its canonicalized host.
  rule->assign(spec.substr(host.begin, host.len));

  // Restore wildcard or exception marker.
  if (exception)
    rule->insert(0, 1, '!');
  else if (wildcard)
    rule->insert(0, "*.");
  return result;
}
// Loads the file described by 'in_filename', converts it to the desired format
// (see the file comments above), and saves it into 'out_filename'. Returns
// the most severe of the result codes encountered when normalizing the rules,
// or kError if the input could not be read or the output could not be
// written.
NormalizeResult NormalizeFile(const FilePath& in_filename,
                              const FilePath& out_filename) {
  std::string data;
  if (!file_util::ReadFileToString(in_filename, &data)) {
    LOG(ERROR) << "Unable to read file";
    // An unreadable input is fatal. Returning kError (rather than kSuccess,
    // as before) lets main() report the failure on stderr and exit non-zero
    // instead of silently "succeeding".
    return kError;
  }

  // We do a lot of string assignment during parsing, but simplicity is more
  // important than performance here.
  std::string rule;
  NormalizeResult result = kSuccess;
  size_t line_start = 0;
  size_t line_end = 0;
  StringSet rules;
  while (line_start < data.size()) {
    if (line_start + 1 < data.size() &&
        data[line_start] == '/' &&
        data[line_start + 1] == '/') {
      // Skip comments.
      line_end = data.find_first_of("\r\n", line_start);
      if (line_end == std::string::npos)
        line_end = data.size();
    } else {
      // Truncate at first whitespace.
      line_end = data.find_first_of("\r\n \t", line_start);
      if (line_end == std::string::npos)
        line_end = data.size();
      // Pass 'data' itself so the (string, pos, n) assign overload is used
      // directly. The previous data.data() form went through an implicit
      // const char* -> std::string conversion that copied the ENTIRE buffer
      // into a temporary on every line, making parsing O(n^2).
      rule.assign(data, line_start, line_end - line_start);
      NormalizeResult new_result = NormalizeRule(&rule);
      if (new_result != kError) {
        rules.insert(rule);
        // Add true TLD for multi-level rules.
        size_t tld_start = rule.find_last_of('.');
        if (tld_start != std::string::npos && tld_start + 1 < rule.size())
          rules.insert(rule.substr(tld_start + 1));
      }
      result = std::max(result, new_result);
    }
    // Find beginning of next non-empty line.
    line_start = data.find_first_of("\r\n", line_end);
    if (line_start == std::string::npos)
      line_start = data.size();
    line_start = data.find_first_not_of("\r\n", line_start);
    if (line_start == std::string::npos)
      line_start = data.size();
  }

  if (!WriteRules(rules, out_filename)) {
    LOG(ERROR) << "Error(s) writing output file";
    result = kError;
  }
  return result;
}
int main(int argc, const char* argv[]) {
base::EnableTerminationOnHeapCorruption();
if (argc != 1) {
fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");
fprintf(stderr, "Usage: %s\n", argv[0]);
return 1;
}
// Manages the destruction of singletons.
base::AtExitManager exit_manager;
// Only use OutputDebugString in debug mode.
#ifdef NDEBUG
logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE;
#else
logging::LoggingDestination destination =
logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG;
#endif
CommandLine::Init(argc, argv);
FilePath log_filename;
PathService::Get(base::DIR_EXE, &log_filename);
log_filename = log_filename.AppendASCII("tld_cleanup.log");
logging::InitLogging(log_filename.value().c_str(),
destination,
logging::LOCK_LOG_FILE,
logging::DELETE_OLD_LOG_FILE);
icu_util::Initialize();
FilePath input_file;
PathService::Get(base::DIR_SOURCE_ROOT, &input_file);
input_file = input_file.Append(FILE_PATH_LITERAL("net"))
.Append(FILE_PATH_LITERAL("base"))
.Append(FILE_PATH_LITERAL("effective_tld_names.dat"));
FilePath output_file;
PathService::Get(base::DIR_SOURCE_ROOT, &output_file);
output_file = output_file.Append(FILE_PATH_LITERAL("net"))
.Append(FILE_PATH_LITERAL("base"))
.Append(FILE_PATH_LITERAL("effective_tld_names.cc"));
NormalizeResult result = NormalizeFile(input_file, output_file);
if (result != kSuccess) {
fprintf(stderr,
"Errors or warnings processing file. See log in tld_cleanup.log.");
}
if (result == kError)
return 1;
return 0;
}