diff --git a/pw_tokenizer/detokenize.cc b/pw_tokenizer/detokenize.cc
index ecec29a49..8f816d311 100644
--- a/pw_tokenizer/detokenize.cc
+++ b/pw_tokenizer/detokenize.cc
@@ -15,7 +15,10 @@
 #include "pw_tokenizer/detokenize.h"
 
 #include <algorithm>
+#include <cctype>
 #include <cstring>
+#include <string_view>
+#include <vector>
 
 #include "pw_bytes/bit.h"
 #include "pw_bytes/endian.h"
@@ -144,6 +147,22 @@ bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
   return lhs.second > rhs.second;
 }
 
+// Returns true if all characters in data are printable, space, or if the string
+// is empty.
+constexpr bool IsPrintableAscii(std::string_view data) {
+  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
+  //
+  //   if ''.join(text.split()).isprintable():
+  //     return text
+  //
+  for (int letter : data) {
+    if (std::isprint(letter) == 0 && std::isspace(letter) == 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
 }  // namespace
 
 DetokenizedString::DetokenizedString(
@@ -261,4 +280,52 @@ std::string Detokenizer::DetokenizeText(std::string_view text,
   return result;
 }
 
+std::string Detokenizer::DecodeOptionallyTokenizedData(
+    const ConstByteSpan& optionally_tokenized_data) {
+  // Try detokenizing as binary using the best result if available, else use
+  // the input data as a string.
+  const auto result = Detokenize(optionally_tokenized_data);
+  const bool found_matches = !result.matches().empty();
+  // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
+  // process does not encode and decode UTF8 format, it is sufficient to check
+  // if the data is printable ASCII.
+  const std::string data =
+      found_matches
+          ? result.BestString()
+          : std::string(
+                reinterpret_cast<const char*>(optionally_tokenized_data.data()),
+                optionally_tokenized_data.size());
+
+  const bool is_data_printable = IsPrintableAscii(data);
+  if (!found_matches && !is_data_printable) {
+    // Assume the token is unknown or the data is corrupt.
+    std::vector<char> base64_encoding_buffer(
+        Base64EncodedBufferSize(optionally_tokenized_data.size()));
+    const size_t encoded_length = PrefixedBase64Encode(
+        optionally_tokenized_data, span(base64_encoding_buffer));
+    return std::string{base64_encoding_buffer.data(), encoded_length};
+  }
+
+  // Successfully detokenized, check if the field has more prefixed
+  // base64-encoded tokens.
+  const std::string field = DetokenizeText(data);
+  // If anything detokenized successfully, use that.
+  if (field != data) {
+    return field;
+  }
+
+  // Attempt to determine whether this is an unknown token or plain text.
+  // Any string with only printable or whitespace characters is plain text.
+  if (found_matches || is_data_printable) {
+    return data;
+  }
+
+  // Assume this field is tokenized data that could not be decoded.
+  std::vector<char> base64_encoding_buffer(
+      Base64EncodedBufferSize(optionally_tokenized_data.size()));
+  const size_t encoded_length = PrefixedBase64Encode(
+      optionally_tokenized_data, span(base64_encoding_buffer));
+  return std::string{base64_encoding_buffer.data(), encoded_length};
+}
+
 }  // namespace pw::tokenizer
diff --git a/pw_tokenizer/detokenize_test.cc b/pw_tokenizer/detokenize_test.cc
index db116a88c..02ac2e2f4 100644
--- a/pw_tokenizer/detokenize_test.cc
+++ b/pw_tokenizer/detokenize_test.cc
@@ -34,7 +34,7 @@ auto TestCases(Args... args) {
   return std::array{args...};
 }
 
-// Database with the following entries:
+// Database with the following entries and arbitrary token values:
 // {
 //   0x00000001: "One",
 //   0x00000005: "TWO",
@@ -44,18 +44,20 @@
 // }
 constexpr char kTestDatabase[] =
     "TOKENS\0\0"
-    "\x05\x00\x00\x00"
+    "\x06\x00\x00\x00"  // Number of tokens in this database.
     "\0\0\0\0"
     "\x01\x00\x00\x00----"
     "\x05\x00\x00\x00----"
     "\xFF\x00\x00\x00----"
     "\xFF\xEE\xEE\xDD----"
     "\xEE\xEE\xEE\xEE----"
+    "\x9D\xA7\x97\xF8----"
     "One\0"
     "TWO\0"
     "333\0"
     "FOUR\0"
-    "$AQAAAA==";
+    "$AQAAAA==\0"
+    "■msg♦This is $AQAAAA== message■module♦■file♦file.txt";
 
 class Detokenize : public ::testing::Test {
  protected:
@@ -166,6 +168,33 @@ TEST_F(Detokenize, Base64_NoArguments) {
   }
 }
 
+TEST_F(Detokenize, OptionallyTokenizedData) {
+  for (auto [data, expected] : TestCases(
+           Case{ONE, "One"},
+           Case{"\1\0\0\0", "One"},
+           Case{TWO, "TWO"},
+           Case{THREE, "333"},
+           Case{FOUR, "FOUR"},
+           Case{FOUR ONE ONE, "FOUROneOne"},
+           Case{ONE TWO THREE FOUR, "OneTWO333FOUR"},
+           Case{ONE "\r\n" TWO "\r\n" THREE "\r\n" FOUR "\r\n",
+                "One\r\nTWO\r\n333\r\nFOUR\r\n"},
+           Case{"123" FOUR, "123FOUR"},
+           Case{"123" FOUR ", 56", "123FOUR, 56"},
+           Case{"12" THREE FOUR ", 56", "12333FOUR, 56"},
+           Case{"$0" ONE, "$0One"},
+           Case{"$/+7u3Q=", "$/+7u3Q="},  // incomplete message (missing "=")
+           Case{"$123456==" FOUR, "$123456==FOUR"},
+           Case{NEST_ONE, "One"},
+           Case{NEST_ONE NEST_ONE NEST_ONE, "OneOneOne"},
+           Case{FOUR "$" ONE NEST_ONE "?", "FOUR$OneOne?"},
+           Case{"$naeX+A==",
+                "■msg♦This is One message■module♦■file♦file.txt"})) {
+    EXPECT_EQ(detok_.DecodeOptionallyTokenizedData(as_bytes(span(data))),
+              std::string(expected));
+  }
+}
+
 constexpr char kDataWithArguments[] =
     "TOKENS\0\0"
     "\x09\x00\x00\x00"
diff --git a/pw_tokenizer/public/pw_tokenizer/detokenize.h b/pw_tokenizer/public/pw_tokenizer/detokenize.h
index ac936fa08..3938d8986 100644
--- a/pw_tokenizer/public/pw_tokenizer/detokenize.h
+++ b/pw_tokenizer/public/pw_tokenizer/detokenize.h
@@ -147,6 +147,22 @@ class Detokenizer {
     return DetokenizeText(text, 1);
   }
 
+  /// Decodes data that may or may not be tokenized, such as proto fields marked
+  /// as optionally tokenized.
+  ///
+  /// This function currently only supports Base64 nested tokenized messages.
+  /// Support for hexadecimal-encoded string literals will be added.
+  ///
+  /// This function currently assumes when data is not tokenized it is printable
+  /// ASCII. Otherwise, the returned string will be base64-encoded.
+  ///
+  /// @param[in] optionally_tokenized_data Data optionally tokenized.
+  ///
+  /// @returns The decoded text if successfully detokenized or if the data is
+  /// printable, otherwise returns the data base64-encoded.
+  std::string DecodeOptionallyTokenizedData(
+      const span<const std::byte>& optionally_tokenized_data);
+
  private:
   std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_;
 };