pw_tokenizer: Add DecodeOptionallyTokenizedData

Add a new API to decode data that may or may not be tokenized.

Change-Id: I00289a0b8041dcbd2ad86489366b741f3cd4091d
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/206070
Reviewed-by: Wyatt Hepler <hepler@google.com>
Commit-Queue: Carlos Chinchilla <cachinchilla@google.com>
Pigweed-Auto-Submit: Carlos Chinchilla <cachinchilla@google.com>
This commit is contained in:
Carlos Chinchilla 2024-04-24 21:03:05 +00:00 committed by CQ Bot Account
parent 82bbfff7ff
commit f79f7c42e7
3 changed files with 115 additions and 3 deletions

View File

@ -15,7 +15,10 @@
#include "pw_tokenizer/detokenize.h"
#include <algorithm>
#include <cctype>
#include <cstring>
#include <string_view>
#include <vector>
#include "pw_bytes/bit.h"
#include "pw_bytes/endian.h"
@ -144,6 +147,22 @@ bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
return lhs.second > rhs.second;
}
// Returns true if all characters in data are printable or whitespace, or if
// the string is empty.
//
// This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
//
//   if ''.join(text.split()).isprintable():
//     return text
//
constexpr bool IsPrintableAscii(std::string_view data) {
  for (const char letter : data) {
    // std::isprint/std::isspace have undefined behavior when passed a value
    // that is not representable as unsigned char (e.g. a negative char for
    // bytes >= 0x80 on platforms where char is signed), so cast first.
    const int ch = static_cast<unsigned char>(letter);
    if (std::isprint(ch) == 0 && std::isspace(ch) == 0) {
      return false;
    }
  }
  return true;
}
} // namespace
DetokenizedString::DetokenizedString(
@ -261,4 +280,52 @@ std::string Detokenizer::DetokenizeText(std::string_view text,
return result;
}
// Decodes data that may or may not be tokenized. Returns the detokenized or
// plain text when possible; otherwise returns the input prefixed-Base64
// encoded so no information is lost.
std::string Detokenizer::DecodeOptionallyTokenizedData(
    const ConstByteSpan& optionally_tokenized_data) {
  // Encodes the raw input as a prefixed Base64 string. Factored into a lambda
  // since it is needed by two failure paths below.
  const auto to_prefixed_base64 = [&optionally_tokenized_data]() {
    std::vector<char> base64_encoding_buffer(
        Base64EncodedBufferSize(optionally_tokenized_data.size()));
    const size_t encoded_length = PrefixedBase64Encode(
        optionally_tokenized_data, span(base64_encoding_buffer));
    return std::string{base64_encoding_buffer.data(), encoded_length};
  };

  // Try detokenizing as binary using the best result if available, else use
  // the input data as a string.
  const auto result = Detokenize(optionally_tokenized_data);
  const bool found_matches = !result.matches().empty();
  // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
  // process does not encode and decode UTF8 format, it is sufficient to check
  // if the data is printable ASCII.
  const std::string data =
      found_matches
          ? result.BestString()
          : std::string(
                reinterpret_cast<const char*>(optionally_tokenized_data.data()),
                optionally_tokenized_data.size());

  const bool is_data_printable = IsPrintableAscii(data);
  if (!found_matches && !is_data_printable) {
    // Assume the token is unknown or the data is corrupt.
    return to_prefixed_base64();
  }

  // Successfully detokenized, check if the field has more prefixed
  // base64-encoded tokens.
  const std::string field = DetokenizeText(data);
  // If anything detokenized successfully, use that.
  if (field != data) {
    return field;
  }

  // Attempt to determine whether this is an unknown token or plain text.
  // Any string with only printable or whitespace characters is plain text.
  if (found_matches || is_data_printable) {
    return data;
  }

  // NOTE(review): this fallback is currently unreachable -- the early return
  // above fires exactly when both found_matches and is_data_printable are
  // false. Kept as a defensive fallback for tokenized data that could not be
  // decoded.
  return to_prefixed_base64();
}
} // namespace pw::tokenizer

View File

@ -34,7 +34,7 @@ auto TestCases(Args... args) {
return std::array<Case, sizeof...(Args)>{args...};
}
// Database with the following entries:
// Database with the following entries and arbitrary token values:
// {
// 0x00000001: "One",
// 0x00000005: "TWO",
@ -44,18 +44,20 @@ auto TestCases(Args... args) {
// }
// Binary token database blob used by the tests in this file. Layout: "TOKENS"
// magic, token count, reserved word, then fixed-size token entries (4-byte
// token + 4-byte padding "----"), followed by the null-terminated strings in
// the same order.
//
// NOTE(review): this view is a rendered diff that merged old and new lines --
// both token-count lines ("\x05..." old, "\x06..." new) and both final
// entries ("$AQAAAA==" old, the two new lines after it) are present. Only the
// new lines belong in the actual source; confirm against the real file.
constexpr char kTestDatabase[] =
"TOKENS\0\0"
"\x05\x00\x00\x00"
"\x06\x00\x00\x00" // Number of tokens in this database.
"\0\0\0\0"
"\x01\x00\x00\x00----"  // Token for "One"
"\x05\x00\x00\x00----"  // Token for "TWO"
"\xFF\x00\x00\x00----"  // Token for "333"
"\xFF\xEE\xEE\xDD----"  // Token for "FOUR"
"\xEE\xEE\xEE\xEE----"  // Token for the nested "$AQAAAA==" entry
"\x9D\xA7\x97\xF8----"  // Token for the "■msg♦..." entry
"One\0"
"TWO\0"
"333\0"
"FOUR\0"
"$AQAAAA==";
"$AQAAAA==\0"
"■msg♦This is $AQAAAA== message■module♦■file♦file.txt";
class Detokenize : public ::testing::Test {
protected:
@ -166,6 +168,33 @@ TEST_F(Detokenize, Base64_NoArguments) {
}
}
// Exercises Detokenizer::DecodeOptionallyTokenizedData with tokenized binary
// input, plain text, mixes of the two, and nested prefixed-Base64 tokens.
// ONE/TWO/THREE/FOUR/NEST_ONE are string-literal macros for the binary
// encodings of the corresponding database tokens (defined earlier in this
// file; not visible in this chunk).
TEST_F(Detokenize, OptionallyTokenizedData) {
for (auto [data, expected] : TestCases(
Case{ONE, "One"},
Case{"\1\0\0\0", "One"},  // Raw little-endian token bytes for 0x00000001.
Case{TWO, "TWO"},
Case{THREE, "333"},
Case{FOUR, "FOUR"},
Case{FOUR ONE ONE, "FOUROneOne"},
Case{ONE TWO THREE FOUR, "OneTWO333FOUR"},
Case{ONE "\r\n" TWO "\r\n" THREE "\r\n" FOUR "\r\n",
"One\r\nTWO\r\n333\r\nFOUR\r\n"},
Case{"123" FOUR, "123FOUR"},
Case{"123" FOUR ", 56", "123FOUR, 56"},
Case{"12" THREE FOUR ", 56", "12333FOUR, 56"},
Case{"$0" ONE, "$0One"},  // "$0" is not valid Base64; left as-is.
Case{"$/+7u3Q=", "$/+7u3Q="}, // incomplete message (missing "=")
Case{"$123456==" FOUR, "$123456==FOUR"},
Case{NEST_ONE, "One"},
Case{NEST_ONE NEST_ONE NEST_ONE, "OneOneOne"},
Case{FOUR "$" ONE NEST_ONE "?", "FOUR$OneOne?"},
// Token 0xF897A79D maps to a string that itself contains a nested
// Base64 token ($AQAAAA== -> "One"), which must also be decoded.
Case{"$naeX+A==",
"■msg♦This is One message■module♦■file♦file.txt"})) {
EXPECT_EQ(detok_.DecodeOptionallyTokenizedData(as_bytes(span(data))),
std::string(expected));
}
}
constexpr char kDataWithArguments[] =
"TOKENS\0\0"
"\x09\x00\x00\x00"

View File

@ -147,6 +147,22 @@ class Detokenizer {
return DetokenizeText(text, 1);
}
/// Decodes data that may or may not be tokenized, such as proto fields marked
/// as optionally tokenized.
///
/// This function currently only supports Base64 nested tokenized messages.
/// Support for hexadecimal-encoded string literals will be added.
///
/// This function currently assumes that when the data is not tokenized, it is
/// printable ASCII. Otherwise, the returned string will be Base64-encoded.
///
/// @param[in] optionally_tokenized_data Data that may or may not be tokenized.
///
/// @returns The decoded text if successfully detokenized or if the data is
/// printable, otherwise returns the data base64-encoded.
std::string DecodeOptionallyTokenizedData(
const span<const std::byte>& optionally_tokenized_data);
private:
std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_;
};