// Copyright 2015 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "components/feedback/redaction_tool.h" #include #include #include #include "base/files/file_path.h" #include "base/strings/strcat.h" #include "base/strings/string_number_conversions.h" #include "base/strings/string_util.h" #include "base/strings/stringprintf.h" #include "base/threading/thread_restrictions.h" #include "build/chromeos_buildflags.h" #include "components/feedback/pii_types.h" #include "net/base/ip_address.h" #include "third_party/re2/src/re2/re2.h" using re2::RE2; namespace feedback { namespace { // The |kCustomPatternsWithContext| array defines patterns to match and // redact. Each pattern needs to define three capturing parentheses groups: // // - a group for the pattern before the identifier to be redacted; // - a group for the identifier to be redacted; // - a group for the pattern after the identifier to be redacted. // // The first and the last capture group are the origin of the "WithContext" // suffix in the name of this constant. // // Every matched identifier (in the context of the whole pattern) is redacted // by replacing it with an incremental instance identifier. Every different // pattern defines a separate instance identifier space. See the unit test for // RedactionToolTest::RedactCustomPatterns for pattern redaction examples. // // Useful regular expression syntax: // // +? is a non-greedy (lazy) +. // \b matches a word boundary. // (?i) turns on case insensitivity for the remainder of the regex. // (?-s) turns off "dot matches newline" for the remainder of the regex. // (?:regex) denotes non-capturing parentheses group. CustomPatternWithAlias kCustomPatternsWithContext[] = { // ModemManager {"CellID", "(\\bCell ID: ')([0-9a-fA-F]+)(')", PIIType::kCellID}, {"LocAC", "(\\bLocation area code: ')([0-9a-fA-F]+)(')", PIIType::kLocationAreaCode}, // wpa_supplicant {"SSID", "(?i-s)(\\bssid[= ]')(.+)(')", PIIType::kSSID}, {"SSID", "(?i-s)(\\bssid[= ]\")(.+)(\")", PIIType::kSSID}, {"SSID", "(\\* SSID=)(.+)($)", PIIType::kSSID}, {"SSIDHex", "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", PIIType::kSSID}, // shill {"SSID", "(?-s)(\\[SSID=)(.+?)(\\])", PIIType::kSSID}, // Serial numbers. The actual serial number itself can include any alphanum // char as well as dashes, periods, colons, slashes and unprintable ASCII // chars (except newline). The second one is for a special case in // edid-decode, where if we genericized it further then we would catch too // many other cases that we don't want to redact. {"Serial", "(?i-s)(\\bserial\\s*_?(?:number)?['\"]?\\s*[:=|]\\s*['\"]?)" "([0-9a-zA-Z\\-.:\\/\\\\\\x00-\\x09\\x0B-\\x1F]+)(\\b)", PIIType::kSerial}, {"Serial", "( Serial Number )(\\d+)(\\b)", PIIType::kSerial}, // The attested device id, a serial number, that comes from vpd_2.0.txt. // The pattern was recently clarified as being a case insensitive string of // ASCII letters and digits, plus the dash/hyphen character. The dash cannot // appear first or last {"Serial", "(\"attested_device_id\"=\")([^-][0-9a-zA-Z-]+[^-])(\")", PIIType::kSerial}, // GAIA IDs {"GAIA", R"xxx((\"?\bgaia_id\"?[=:]['\"])(\d+)(\b['\"]))xxx", PIIType::kGaiaID}, {"GAIA", R"xxx((\{id: )(\d+)(, email:))xxx", PIIType::kGaiaID}, // UUIDs given by the 'blkid' tool. These don't necessarily look like // standard UUIDs, so treat them specially. {"UUID", R"xxx((UUID=")([0-9a-zA-Z-]+)("))xxx", PIIType::kUUID}, // Also cover UUIDs given by the 'lvs' and 'pvs' tools, which similarly // don't necessarily look like standard UUIDs. {"UUID", R"xxx(("[lp]v_uuid":")([0-9a-zA-Z-]+)("))xxx", PIIType::kUUID}, // Volume labels presented in the 'blkid' tool, and as part of removable // media paths shown in various logs such as cros-disks (in syslog). // There isn't a well-defined format for these. For labels in blkid, // capture everything between the open and closing quote. {"Volume Label", R"xxx((LABEL=")([^"]+)("))xxx", PIIType::kVolumeLabel}, // For paths, this is harder. The only restricted characters are '/' and // NUL, so use a simple heuristic. cros-disks generally quotes paths using // single-quotes, so capture everything until a quote character. For lsblk, // capture everything until the end of the line, since the mount path is the // last field. {"Volume Label", R"xxx((/media/removable/)(.+?)(['"/\n]|$))xxx", PIIType::kVolumeLabel}, // IPP (Internet Printing Protocol) Addresses {"IPP Address", R"xxx((ipp:\/\/)(.+?)(\/ipp))xxx", PIIType::kIPPAddress}, }; bool MaybeUnmapAddress(net::IPAddress* addr) { if (!addr->IsIPv4MappedIPv6()) return false; *addr = net::ConvertIPv4MappedIPv6ToIPv4(*addr); return true; } bool MaybeUntranslateAddress(net::IPAddress* addr) { if (!addr->IsIPv6()) return false; static const net::IPAddress kTranslated6To4(0, 0x64, 0xff, 0x9b, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); if (!IPAddressMatchesPrefix(*addr, kTranslated6To4, 96)) return false; const auto bytes = addr->bytes(); *addr = net::IPAddress(bytes[12], bytes[13], bytes[14], bytes[15]); return true; } // If |addr| points to a valid IPv6 address, this function truncates it at /32. bool MaybeTruncateIPv6(net::IPAddress* addr) { if (!addr->IsIPv6()) return false; const auto bytes = addr->bytes(); *addr = net::IPAddress(bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); return true; } // Returns an appropriately scrubbed version of |addr| if applicable. std::string MaybeScrubIPAddress(const std::string& addr) { struct { net::IPAddress ip_addr; int prefix_length; bool scrub; } static const kNonIdentifyingIPRanges[] = { // Private. {net::IPAddress(10, 0, 0, 0), 8, true}, {net::IPAddress(172, 16, 0, 0), 12, true}, {net::IPAddress(192, 168, 0, 0), 16, true}, // Chrome OS containers and VMs. {net::IPAddress(100, 115, 92, 0), 24, false}, // Loopback. {net::IPAddress(127, 0, 0, 0), 8, true}, // Any. {net::IPAddress(0, 0, 0, 0), 8, true}, // DNS. {net::IPAddress(8, 8, 8, 8), 32, false}, {net::IPAddress(8, 8, 4, 4), 32, false}, {net::IPAddress(1, 1, 1, 1), 32, false}, // Multicast. {net::IPAddress(224, 0, 0, 0), 4, true}, // Link local. {net::IPAddress(169, 254, 0, 0), 16, true}, {net::IPAddress(0xfe, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 10, true}, // Broadcast. {net::IPAddress(255, 255, 255, 255), 32, false}, // IPv6 loopback, unspecified and non-address strings. {net::IPAddress::IPv6AllZeros(), 112, false}, // IPv6 multicast all nodes and routers. {net::IPAddress(0xff, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), 128, false}, {net::IPAddress(0xff, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2), 128, false}, {net::IPAddress(0xff, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), 128, false}, {net::IPAddress(0xff, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2), 128, false}, // IPv6 other multicast (link and interface local). {net::IPAddress(0xff, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 16, true}, {net::IPAddress(0xff, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 16, true}, }; net::IPAddress input_addr; if (input_addr.AssignFromIPLiteral(addr) && input_addr.IsValid()) { bool mapped = MaybeUnmapAddress(&input_addr); bool translated = !mapped ? MaybeUntranslateAddress(&input_addr) : false; for (const auto& range : kNonIdentifyingIPRanges) { if (IPAddressMatchesPrefix(input_addr, range.ip_addr, range.prefix_length)) { std::string prefix; std::string out_addr = addr; if (mapped) { prefix = "M "; out_addr = input_addr.ToString(); } else if (translated) { prefix = "T "; out_addr = input_addr.ToString(); } if (range.scrub) { out_addr = base::StringPrintf( "%s/%d", range.ip_addr.ToString().c_str(), range.prefix_length); } return base::StrCat({prefix, out_addr}); } } // |addr| may have been over-aggressively matched as an IPv6 address when // it's really just an arbitrary part of a sentence. If the string is the // same as the coarsely truncated address then keep it because even if // it happens to be a real address, there is no leak. if (MaybeTruncateIPv6(&input_addr) && input_addr.ToString() == addr) return addr; } return ""; } // Helper macro: Non capturing group #define NCG(x) "(?:" x ")" // Helper macro: Optional non capturing group #define OPT_NCG(x) NCG(x) "?" ////////////////////////////////////////////////////////////////////////// // Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial // limitation on the scheme to increase precision. Otherwise anything // like "ID:" would be considered an IRI. #define UNRESERVED "[-a-z0-9._~]" #define RESERVED NGC(GEN_DELIMS "|" SUB_DELIMS) #define SUB_DELIMS "[!$&'()*+,;=]" #define GEN_DELIMS "[:/?#[\\]@]" #define DIGIT "[0-9]" #define HEXDIG "[0-9a-f]" #define PCT_ENCODED "%" HEXDIG HEXDIG #define DEC_OCTET NCG("1[0-9][0-9]|2[0-4][0-9]|25[0-5]|[1-9][0-9]|[0-9]") #define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET #define H16 NCG(HEXDIG) "{1,4}" #define LS32 NCG(H16 ":" H16 "|" IPV4ADDRESS) #define WB "\\b" // clang-format off #define IPV6ADDRESS NCG( \ WB NCG(H16 ":") "{6}" LS32 WB "|" \ "::" NCG(H16 ":") "{5}" LS32 WB "|" \ OPT_NCG( WB H16) "::" NCG(H16 ":") "{4}" LS32 WB "|" \ OPT_NCG( WB NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 WB "|" \ OPT_NCG( WB NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 WB "|" \ OPT_NCG( WB NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":") LS32 WB "|" \ OPT_NCG( WB NCG(H16 ":") "{0,4}" H16) "::" LS32 WB "|" \ OPT_NCG( WB NCG(H16 ":") "{0,5}" H16) "::" H16 WB "|" \ OPT_NCG( WB NCG(H16 ":") "{0,6}" H16) "::") // clang-format on #define IPVFUTURE \ "v" HEXDIG \ "+" \ "\\." NCG(UNRESERVED "|" SUB_DELIMS \ "|" \ ":") "+" #define IP_LITERAL "\\[" NCG(IPV6ADDRESS "|" IPVFUTURE) "\\]" #define PORT DIGIT "*" // This is a diversion of RFC 3987 #define SCHEME NCG("http|https|ftp|chrome|chrome-extension|android|rtsp|file") #define IPRIVATE \ "[" \ "\\x{E000}-\\x{F8FF}" \ "\\x{F0000}-\\x{FFFFD}" \ "\\x{100000}-\\x{10FFFD}" \ "]" #define UCSCHAR \ "[" \ "\\x{A0}-\\x{D7FF}" \ "\\x{F900}-\\x{FDCF}" \ "\\x{FDF0}-\\x{FFEF}" \ "\\x{10000}-\\x{1FFFD}" \ "\\x{20000}-\\x{2FFFD}" \ "\\x{30000}-\\x{3FFFD}" \ "\\x{40000}-\\x{4FFFD}" \ "\\x{50000}-\\x{5FFFD}" \ "\\x{60000}-\\x{6FFFD}" \ "\\x{70000}-\\x{7FFFD}" \ "\\x{80000}-\\x{8FFFD}" \ "\\x{90000}-\\x{9FFFD}" \ "\\x{A0000}-\\x{AFFFD}" \ "\\x{B0000}-\\x{BFFFD}" \ "\\x{C0000}-\\x{CFFFD}" \ "\\x{D0000}-\\x{DFFFD}" \ "\\x{E1000}-\\x{EFFFD}" \ "]" #define IUNRESERVED \ NCG("[-a-z0-9._~]" \ "|" UCSCHAR) #define IPCHAR \ NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \ "|" \ "[:@]") #define IFRAGMENT \ NCG(IPCHAR \ "|" \ "[/?]") \ "*" #define IQUERY \ NCG(IPCHAR "|" IPRIVATE \ "|" \ "[/?]") \ "*" #define ISEGMENT IPCHAR "*" #define ISEGMENT_NZ IPCHAR "+" #define ISEGMENT_NZ_NC \ NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \ "|" \ "@") \ "+" #define IPATH_EMPTY "" #define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*" #define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*" #define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*") #define IPATH_ABEMPTY NCG("/" ISEGMENT) "*" #define IPATH \ NCG(IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME "|" IPATH_ROOTLESS \ "|" IPATH_EMPTY) #define IREG_NAME NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS) "*" #define IHOST NCG(IP_LITERAL "|" IPV4ADDRESS "|" IREG_NAME) #define IUSERINFO \ NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \ "|" \ ":") \ "*" #define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT) #define IRELATIVE_PART \ NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME \ "|" IPATH_EMPTY) #define IRELATIVE_REF IRELATIVE_PART OPT_NCG("?" IQUERY) OPT_NCG("#" IFRAGMENT) // RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements // that end with "Android:" for example are not considered a URL. #define IHIER_PART \ NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_ROOTLESS) #define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("?" IQUERY) #define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT) #define IRI_REFERENCE NCG(IRI "|" IRELATIVE_REF) // TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email // addresses. Capture names as well ("First Lastname" ). // The |kCustomPatternWithoutContext| array defines further patterns to match // and redact. Each pattern consists of a single capturing group. CustomPatternWithAlias kCustomPatternsWithoutContext[] = { {"URL", "(?i)(" IRI ")", PIIType::kURL}, // Email Addresses need to come after URLs because they can be part // of a query parameter. {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})", PIIType::kEmail}, // IP filter rules need to come after URLs so that they don't disturb the // URL pattern in case the IP address is part of a URL. {"IPv4", "(?i)(" IPV4ADDRESS ")", PIIType::kIPAddress}, {"IPv6", "(?i)(" IPV6ADDRESS ")", PIIType::kIPAddress}, // Universal Unique Identifiers (UUIDs). {"UUID", "(?i)([0-9a-zA-Z]{8}-[0-9a-zA-Z]{4}-[0-9a-zA-Z]{4}-[0-9a-zA-Z]{4}-" "[0-9a-zA-Z]{12})", PIIType::kUUID}, }; // Like RE2's FindAndConsume, searches for the first occurrence of |pattern| in // |input| and consumes the bytes until the end of the pattern matching. Unlike // FindAndConsume, the bytes skipped before the match of |pattern| are stored // in |skipped_input|. |args| needs to contain at least one element. // Returns whether a match was found. // // Example: input = "aaabbbc", pattern = "(b+)" leads to skipped_input = "aaa", // args[0] = "bbb", and the beginning input is moved to the right so that it // only contains "c". // Example: input = "aaabbbc", pattern = "(z+)" leads to input = "aaabbbc", // the args values are not modified and skipped_input is not modified. bool FindAndConsumeAndGetSkippedN(re2::StringPiece* input, const re2::RE2& pattern, re2::StringPiece* skipped_input, re2::StringPiece* args[], int argc) { re2::StringPiece old_input = *input; CHECK_GE(argc, 1); re2::RE2::Arg a0(argc > 0 ? args[0] : nullptr); re2::RE2::Arg a1(argc > 1 ? args[1] : nullptr); re2::RE2::Arg a2(argc > 2 ? args[2] : nullptr); re2::RE2::Arg a3(argc > 3 ? args[3] : nullptr); const re2::RE2::Arg* const wrapped_args[] = {&a0, &a1, &a2, &a3}; CHECK_LE(argc, 4); bool result = re2::RE2::FindAndConsumeN(input, pattern, wrapped_args, argc); if (skipped_input && result) { size_t bytes_skipped = args[0]->data() - old_input.data(); *skipped_input = re2::StringPiece(old_input.data(), bytes_skipped); } return result; } // All |match_groups| need to be of type re2::StringPiece*. template bool FindAndConsumeAndGetSkipped(re2::StringPiece* input, const re2::RE2& pattern, re2::StringPiece* skipped_input, Arg*... match_groups) { re2::StringPiece* args[] = {match_groups...}; return FindAndConsumeAndGetSkippedN(input, pattern, skipped_input, args, std::size(args)); } // The following MAC addresses will not be redacted as they are not specific // to a device but have general meanings. const char* const kUnredactedMacAddresses[] = { "00:00:00:00:00:00", // ARP failure result MAC. "ff:ff:ff:ff:ff:ff", // Broadcast MAC. }; constexpr size_t kNumUnredactedMacs = std::size(kUnredactedMacAddresses); } // namespace RedactionTool::RedactionTool(const char* const* first_party_extension_ids) : first_party_extension_ids_(first_party_extension_ids) { DETACH_FROM_SEQUENCE(sequence_checker_); // Identity-map these, so we don't mangle them. for (const char* mac : kUnredactedMacAddresses) mac_addresses_[mac] = mac; } RedactionTool::~RedactionTool() { DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); } std::map> RedactionTool::Detect( const std::string& input) { DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); base::AssertLongCPUWorkAllowed(); std::map> detected; RedactMACAddresses(input, &detected); // This function will add to |detected| only on Chrome OS as Android app // storage paths are only detected for Chrome OS. RedactAndroidAppStoragePaths(input, &detected); DetectWithCustomPatterns(input, &detected); // Do hashes last since they may appear in URLs and they also prevent us from // properly recognizing the Android storage paths. RedactHashes(input, &detected); return detected; } std::string RedactionTool::Redact(const std::string& input) { DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); return RedactAndKeepSelected(input, /*pii_types_to_keep=*/{}); } std::string RedactionTool::RedactAndKeepSelected( const std::string& input, const std::set& pii_types_to_keep) { DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); base::AssertLongCPUWorkAllowed(); // Copy |input| so we can modify it. std::string redacted = input; if (pii_types_to_keep.find(PIIType::kMACAddress) == pii_types_to_keep.end()) { redacted = RedactMACAddresses(std::move(redacted), nullptr); } if (pii_types_to_keep.find(PIIType::kAndroidAppStoragePath) == pii_types_to_keep.end()) { redacted = RedactAndroidAppStoragePaths(std::move(redacted), nullptr); } redacted = RedactAndKeepSelectedCustomPatterns(std::move(redacted), pii_types_to_keep); // Do hashes last since they may appear in URLs and they also prevent us // from properly recognizing the Android storage paths. if (pii_types_to_keep.find(PIIType::kHash) == pii_types_to_keep.end()) { // URLs and Android storage paths will be partially redacted (only hashes) // if |pii_types_to_keep| contains PIIType::kURL or // PIIType::kAndroidAppStoragePath and not PIIType::kHash. redacted = RedactHashes(std::move(redacted), nullptr); } return redacted; } RE2* RedactionTool::GetRegExp(const std::string& pattern) { if (regexp_cache_.find(pattern) == regexp_cache_.end()) { RE2::Options options; // set_multiline of pcre is not supported by RE2, yet. options.set_dot_nl(true); // Dot matches a new line. std::unique_ptr re = std::make_unique(pattern, options); DCHECK_EQ(re2::RE2::NoError, re->error_code()) << "Failed to parse:\n" << pattern << "\n" << re->error(); regexp_cache_[pattern] = std::move(re); } return regexp_cache_[pattern].get(); } std::string RedactionTool::RedactMACAddresses( const std::string& input, std::map>* detected) { // This regular expression finds the next MAC address. It splits the data into // an OUI (Organizationally Unique Identifier) part and a NIC (Network // Interface Controller) specific part. We also match on dash and underscore // because we have seen instances of both of those occurring. RE2* mac_re = GetRegExp( "([0-9a-fA-F][0-9a-fA-F][:\\-_]" "[0-9a-fA-F][0-9a-fA-F][:\\-_]" "[0-9a-fA-F][0-9a-fA-F])[:\\-_](" "[0-9a-fA-F][0-9a-fA-F][:\\-_]" "[0-9a-fA-F][0-9a-fA-F][:\\-_]" "[0-9a-fA-F][0-9a-fA-F])"); std::string result; result.reserve(input.size()); // Keep consuming, building up a result string as we go. re2::StringPiece text(input); re2::StringPiece skipped, oui, nic; static const char kMacSeparatorChars[] = "-_"; while (FindAndConsumeAndGetSkipped(&text, *mac_re, &skipped, &oui, &nic)) { // Look up the MAC address in the hash. Force the separator to be a colon // so that the same MAC with a different format will match in all cases. std::string oui_string = base::ToLowerASCII(std::string(oui)); base::ReplaceChars(oui_string, kMacSeparatorChars, ":", &oui_string); std::string nic_string = base::ToLowerASCII(std::string(nic)); base::ReplaceChars(nic_string, kMacSeparatorChars, ":", &nic_string); std::string mac = oui_string + ":" + nic_string; std::string replacement_mac = mac_addresses_[mac]; if (replacement_mac.empty()) { // If not found, build up a replacement MAC address by generating a new // NIC part. int mac_id = mac_addresses_.size() - kNumUnredactedMacs; replacement_mac = base::StringPrintf("[MAC OUI=%s IFACE=%d]", oui_string.c_str(), mac_id); mac_addresses_[mac] = replacement_mac; } if (detected != nullptr) { (*detected)[PIIType::kMACAddress].insert(mac); } skipped.AppendToString(&result); result += replacement_mac; } text.AppendToString(&result); return result; } std::string RedactionTool::RedactHashes( const std::string& input, std::map>* detected) { // This will match hexadecimal strings from length 32 to 64 that have a word // boundary at each end. We then check to make sure they are one of our valid // hash lengths before replacing. // NOTE: There are some occurrences in the dump data (specifically modetest) // where relevant data is formatted with 32 hex chars on a line. In this case, // it is preceded by at least 3 whitespace chars, so check for that and in // that case do not redact. RE2* hash_re = GetRegExp(R"((\s*)\b([0-9a-fA-F]{4})([0-9a-fA-F]{28,60})\b)"); std::string result; result.reserve(input.size()); // Keep consuming, building up a result string as we go. re2::StringPiece text(input); re2::StringPiece skipped, pre_whitespace, hash_prefix, hash_suffix; while (FindAndConsumeAndGetSkipped(&text, *hash_re, &skipped, &pre_whitespace, &hash_prefix, &hash_suffix)) { skipped.AppendToString(&result); pre_whitespace.AppendToString(&result); // Check if it's a valid length for our hashes or if we need to skip due to // the whitespace check. size_t hash_length = 4 + hash_suffix.length(); if ((hash_length != 32 && hash_length != 40 && hash_length != 64) || (hash_length == 32 && pre_whitespace.length() >= 3)) { // This is not a hash string, skip it. hash_prefix.AppendToString(&result); hash_suffix.AppendToString(&result); continue; } // Look up the hash value address in the map of replacements. std::string hash_prefix_string = base::ToLowerASCII(std::string(hash_prefix)); std::string hash = hash_prefix_string + base::ToLowerASCII(std::string(hash_suffix)); std::string replacement_hash = hashes_[hash]; if (replacement_hash.empty()) { // If not found, build up a replacement value. replacement_hash = base::StringPrintf( "", hash_prefix_string.c_str(), hashes_.size()); hashes_[hash] = replacement_hash; } if (detected != nullptr) { (*detected)[PIIType::kHash].insert(hash); } result += replacement_hash; } text.AppendToString(&result); return result; } std::string RedactionTool::RedactAndroidAppStoragePaths( const std::string& input, std::map>* detected) { // We only use this on Chrome OS and there's differences in the API for // FilePath on Windows which prevents this from compiling, so only enable this // code for Chrome OS. #if BUILDFLAG(IS_CHROMEOS_ASH) std::string result; result.reserve(input.size()); // This is for redacting Android data paths included in 'android_app_storage' // and 'audit_log' output. in the following data paths // will be redacted. // - /data/data// // - /data/app// // - /data/user_de/// // These data paths are preceded by "/home/root//android-data" in // 'android_app_storage' output, and preceded by "path=" or "exe=" in // 'audit_log' output. RE2* path_re = GetRegExp( R"((?m)((path=|exe=|/home/root/[\da-f]+/android-data))" R"(/data/(data|app|user_de/\d+)/[^/\n]+)(/[^\n\s]+))"); // Keep consuming, building up a result string as we go. re2::StringPiece text(input); re2::StringPiece skipped; re2::StringPiece path_prefix; // path before app_specific; re2::StringPiece pre_data; // (path=|exe=|/home/root//android-data) re2::StringPiece post_data; // (data|app|user_de/\d+) re2::StringPiece app_specific; // (/[^\n\s]+) while (FindAndConsumeAndGetSkipped(&text, *path_re, &skipped, &path_prefix, &pre_data, &post_data, &app_specific)) { // We can record these parts as-is. skipped.AppendToString(&result); path_prefix.AppendToString(&result); // |app_specific| has to be redacted. First, convert it into components, // and then redact each component as follows: // - If the component has a non-ASCII character, change it to '*'. // - Otherwise, remove all the characters in the component but the first // one. // - If the original component has 2 or more bytes, add '_'. const base::FilePath path(app_specific.as_string()); std::vector components = path.GetComponents(); DCHECK(!components.empty()); auto it = components.begin() + 1; // ignore the leading slash for (; it != components.end(); ++it) { const auto& component = *it; DCHECK(!component.empty()); result += '/'; result += (base::IsStringASCII(component) ? component[0] : '*'); if (component.length() > 1) result += '_'; } if (detected != nullptr) { (*detected)[PIIType::kAndroidAppStoragePath].insert( app_specific.as_string()); } } text.AppendToString(&result); return result; #else return input; #endif // BUILDFLAG(IS_CHROMEOS_ASH) } std::string RedactionTool::RedactAndKeepSelectedCustomPatterns( std::string input, const std::set& pii_types_to_keep) { for (const auto& pattern : kCustomPatternsWithContext) { if (pii_types_to_keep.find(pattern.pii_type) == pii_types_to_keep.end()) { input = RedactCustomPatternWithContext(input, pattern, nullptr); } } for (const auto& pattern : kCustomPatternsWithoutContext) { if (pii_types_to_keep.find(pattern.pii_type) == pii_types_to_keep.end()) { input = RedactCustomPatternWithoutContext(input, pattern, nullptr); } } return input; } void RedactionTool::DetectWithCustomPatterns( std::string input, std::map>* detected) { for (const auto& pattern : kCustomPatternsWithContext) { RedactCustomPatternWithContext(input, pattern, detected); } for (const auto& pattern : kCustomPatternsWithoutContext) { RedactCustomPatternWithoutContext(input, pattern, detected); } } std::string RedactionTool::RedactCustomPatternWithContext( const std::string& input, const CustomPatternWithAlias& pattern, std::map>* detected) { RE2* re = GetRegExp(pattern.pattern); DCHECK_EQ(3, re->NumberOfCapturingGroups()); std::map* identifier_space = &custom_patterns_with_context_[pattern.alias]; std::string result; result.reserve(input.size()); // Keep consuming, building up a result string as we go. re2::StringPiece text(input); re2::StringPiece skipped; re2::StringPiece pre_match, pre_matched_id, matched_id, post_matched_id; while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &pre_matched_id, &matched_id, &post_matched_id)) { std::string matched_id_as_string(matched_id); std::string replacement_id; if (identifier_space->count(matched_id_as_string) == 0) { // The weird NumberToString trick is because Windows does not like // to deal with %zu and a size_t in printf, nor does it support %llu. replacement_id = base::StringPrintf( "<%s: %s>", pattern.alias, base::NumberToString(identifier_space->size() + 1).c_str()); (*identifier_space)[matched_id_as_string] = replacement_id; } else { replacement_id = (*identifier_space)[matched_id_as_string]; } if (detected != nullptr) { (*detected)[pattern.pii_type].insert(matched_id_as_string); } skipped.AppendToString(&result); pre_matched_id.AppendToString(&result); result += replacement_id; post_matched_id.AppendToString(&result); } text.AppendToString(&result); return result; } // This takes a |url| argument and returns true if the URL is exempt from // redaction, returns false otherwise. bool IsUrlExempt(re2::StringPiece url, const char* const* first_party_extension_ids) { // We do not exempt anything with a query parameter. if (url.contains("?")) return false; // Last part of an SELinux context is misdetected as a URL. // e.g. "u:object_r:system_data_file:s0:c512,c768" if (url.starts_with("file:s0")) return true; // Check for chrome:// URLs that are exempt. if (url.starts_with("chrome://")) { // We allow everything in chrome://resources/. if (url.starts_with("chrome://resources/")) return true; // We allow chrome://*/crisper.js. if (url.ends_with("/crisper.js")) return true; return false; } if (!first_party_extension_ids) return false; // Exempt URLs of the format chrome-extension:///*.js if (!url.starts_with("chrome-extension://")) return false; // These must end with a .js extension. if (!url.ends_with(".js")) return false; int i = 0; const char* test_id = first_party_extension_ids[i]; const re2::StringPiece url_sub = url.substr(sizeof("chrome-extension://") - 1); while (test_id) { if (url_sub.starts_with(test_id)) return true; test_id = first_party_extension_ids[++i]; } return false; } std::string RedactionTool::RedactCustomPatternWithoutContext( const std::string& input, const CustomPatternWithAlias& pattern, std::map>* detected) { RE2* re = GetRegExp(pattern.pattern); DCHECK_EQ(1, re->NumberOfCapturingGroups()); std::map* identifier_space = &custom_patterns_without_context_[pattern.alias]; std::string result; result.reserve(input.size()); // Keep consuming, building up a result string as we go. re2::StringPiece text(input); re2::StringPiece skipped; re2::StringPiece matched_id; const re2::StringPiece dash("-"); while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &matched_id)) { if (IsUrlExempt(matched_id, first_party_extension_ids_)) { skipped.AppendToString(&result); matched_id.AppendToString(&result); continue; } std::string matched_id_as_string(matched_id); std::string replacement_id; if (identifier_space->count(matched_id_as_string) == 0) { replacement_id = MaybeScrubIPAddress(matched_id_as_string); if (replacement_id != matched_id_as_string) { // USB paths can be confused with IPv4 Addresses because they can look // similar: n-n.n.n.n . Ignore replacement if previous char is `-` if (skipped.ends_with(dash) && strcmp("IPv4", pattern.alias) == 0) { skipped.AppendToString(&result); matched_id.AppendToString(&result); continue; } // The weird NumberToString trick is because Windows does not like // to deal with %zu and a size_t in printf, nor does it support %llu. replacement_id = base::StringPrintf( "<%s: %s>", replacement_id.empty() ? pattern.alias : replacement_id.c_str(), base::NumberToString(identifier_space->size() + 1).c_str()); (*identifier_space)[matched_id_as_string] = replacement_id; if (detected != nullptr) { (*detected)[pattern.pii_type].insert(matched_id_as_string); } } } else { replacement_id = (*identifier_space)[matched_id_as_string]; if (detected != nullptr) { (*detected)[pattern.pii_type].insert(matched_id_as_string); } } skipped.AppendToString(&result); result += replacement_id; } text.AppendToString(&result); return result; } RedactionToolContainer::RedactionToolContainer( scoped_refptr task_runner, const char* const* first_party_extension_ids) : redactor_(new RedactionTool(first_party_extension_ids)), task_runner_(task_runner) {} RedactionToolContainer::~RedactionToolContainer() { task_runner_->DeleteSoon(FROM_HERE, std::move(redactor_)); } RedactionTool* RedactionToolContainer::Get() { DCHECK(task_runner_->RunsTasksInCurrentSequence()); return redactor_.get(); } } // namespace feedback