pw_tokenizer: Add DecodeOptionallyTokenizedData

Add a new API to decode data that may or may not be tokenized.

Change-Id: I00289a0b8041dcbd2ad86489366b741f3cd4091d
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/206070
Reviewed-by: Wyatt Hepler <hepler@google.com>
Commit-Queue: Carlos Chinchilla <cachinchilla@google.com>
Pigweed-Auto-Submit: Carlos Chinchilla <cachinchilla@google.com>
This commit is contained in:
Carlos Chinchilla 2024-04-24 21:03:05 +00:00 committed by CQ Bot Account
parent 82bbfff7ff
commit f79f7c42e7
3 changed files with 115 additions and 3 deletions

View File

@ -15,7 +15,10 @@
#include "pw_tokenizer/detokenize.h"
#include <algorithm>
#include <cctype>
#include <cstring>
#include <string_view>
#include <vector>
#include "pw_bytes/bit.h"
#include "pw_bytes/endian.h"
@ -144,6 +147,22 @@ bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
return lhs.second > rhs.second;
}
// Returns true if all characters in data are printable or whitespace, or if
// the string is empty.
//
// This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
//
//   if ''.join(text.split()).isprintable():
//     return text
//
constexpr bool IsPrintableAscii(std::string_view data) {
  for (const char letter : data) {
    // std::isprint/std::isspace have undefined behavior when passed a value
    // that is not representable as unsigned char (e.g. a negative char for
    // bytes >= 0x80 on platforms where char is signed), so cast first.
    const int ch = static_cast<unsigned char>(letter);
    if (std::isprint(ch) == 0 && std::isspace(ch) == 0) {
      return false;
    }
  }
  return true;
}
} // namespace
DetokenizedString::DetokenizedString(
@ -261,4 +280,52 @@ std::string Detokenizer::DetokenizeText(std::string_view text,
return result;
}
// Decodes data that may or may not be tokenized. Returns the detokenized or
// plain text when possible; otherwise returns the input prefixed-Base64
// encoded so no information is lost.
std::string Detokenizer::DecodeOptionallyTokenizedData(
    const ConstByteSpan& optionally_tokenized_data) {
  // Encodes the raw input as a prefixed Base64 string. Factored into a lambda
  // since it is needed by two failure paths below.
  const auto to_prefixed_base64 = [&optionally_tokenized_data]() {
    std::vector<char> base64_encoding_buffer(
        Base64EncodedBufferSize(optionally_tokenized_data.size()));
    const size_t encoded_length = PrefixedBase64Encode(
        optionally_tokenized_data, span(base64_encoding_buffer));
    return std::string{base64_encoding_buffer.data(), encoded_length};
  };

  // Try detokenizing as binary using the best result if available, else use
  // the input data as a string.
  const auto result = Detokenize(optionally_tokenized_data);
  const bool found_matches = !result.matches().empty();
  // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
  // process does not encode and decode UTF8 format, it is sufficient to check
  // if the data is printable ASCII.
  const std::string data =
      found_matches
          ? result.BestString()
          : std::string(
                reinterpret_cast<const char*>(optionally_tokenized_data.data()),
                optionally_tokenized_data.size());

  const bool is_data_printable = IsPrintableAscii(data);
  if (!found_matches && !is_data_printable) {
    // Assume the token is unknown or the data is corrupt.
    return to_prefixed_base64();
  }

  // Successfully detokenized, check if the field has more prefixed
  // base64-encoded tokens.
  const std::string field = DetokenizeText(data);
  // If anything detokenized successfully, use that.
  if (field != data) {
    return field;
  }

  // Attempt to determine whether this is an unknown token or plain text.
  // Any string with only printable or whitespace characters is plain text.
  if (found_matches || is_data_printable) {
    return data;
  }

  // NOTE(review): this fallback is currently unreachable -- the early return
  // above fires exactly when both found_matches and is_data_printable are
  // false. Kept as a defensive fallback for tokenized data that could not be
  // decoded.
  return to_prefixed_base64();
}
} // namespace pw::tokenizer

View File

@ -34,7 +34,7 @@ auto TestCases(Args... args) {
return std::array<Case, sizeof...(Args)>{args...};
}
// Database with the following entries:
// Database with the following entries and arbitrary token values:
// {
// 0x00000001: "One",
// 0x00000005: "TWO",
@ -44,18 +44,20 @@ auto TestCases(Args... args) {
// }
// Binary token database blob used by the tests in this file. Layout: "TOKENS"
// magic, token count, reserved word, then fixed-size token entries (4-byte
// token + 4-byte padding "----"), followed by the null-terminated strings in
// the same order.
//
// NOTE(review): this view is a rendered diff that merged old and new lines --
// both token-count lines ("\x05..." old, "\x06..." new) and both final
// entries ("$AQAAAA==" old, the two new lines after it) are present. Only the
// new lines belong in the actual source; confirm against the real file.
constexpr char kTestDatabase[] =
"TOKENS\0\0"
"\x05\x00\x00\x00"
"\x06\x00\x00\x00" // Number of tokens in this database.
"\0\0\0\0"
"\x01\x00\x00\x00----"  // Token for "One"
"\x05\x00\x00\x00----"  // Token for "TWO"
"\xFF\x00\x00\x00----"  // Token for "333"
"\xFF\xEE\xEE\xDD----"  // Token for "FOUR"
"\xEE\xEE\xEE\xEE----"  // Token for the nested "$AQAAAA==" entry
"\x9D\xA7\x97\xF8----"  // Token for the "■msg♦..." entry
"One\0"
"TWO\0"
"333\0"
"FOUR\0"
"$AQAAAA==";
"$AQAAAA==\0"
"■msg♦This is $AQAAAA== message■module♦■file♦file.txt";
class Detokenize : public ::testing::Test {
protected:
@ -166,6 +168,33 @@ TEST_F(Detokenize, Base64_NoArguments) {
}
}
// Exercises Detokenizer::DecodeOptionallyTokenizedData with tokenized binary
// input, plain text, mixes of the two, and nested prefixed-Base64 tokens.
// ONE/TWO/THREE/FOUR/NEST_ONE are string-literal macros for the binary
// encodings of the corresponding database tokens (defined earlier in this
// file; not visible in this chunk).
TEST_F(Detokenize, OptionallyTokenizedData) {
for (auto [data, expected] : TestCases(
Case{ONE, "One"},
Case{"\1\0\0\0", "One"},  // Raw little-endian token bytes for 0x00000001.
Case{TWO, "TWO"},
Case{THREE, "333"},
Case{FOUR, "FOUR"},
Case{FOUR ONE ONE, "FOUROneOne"},
Case{ONE TWO THREE FOUR, "OneTWO333FOUR"},
Case{ONE "\r\n" TWO "\r\n" THREE "\r\n" FOUR "\r\n",
"One\r\nTWO\r\n333\r\nFOUR\r\n"},
Case{"123" FOUR, "123FOUR"},
Case{"123" FOUR ", 56", "123FOUR, 56"},
Case{"12" THREE FOUR ", 56", "12333FOUR, 56"},
Case{"$0" ONE, "$0One"},  // "$0" is not valid Base64; left as-is.
Case{"$/+7u3Q=", "$/+7u3Q="}, // incomplete message (missing "=")
Case{"$123456==" FOUR, "$123456==FOUR"},
Case{NEST_ONE, "One"},
Case{NEST_ONE NEST_ONE NEST_ONE, "OneOneOne"},
Case{FOUR "$" ONE NEST_ONE "?", "FOUR$OneOne?"},
// Token 0xF897A79D maps to a string that itself contains a nested
// Base64 token ($AQAAAA== -> "One"), which must also be decoded.
Case{"$naeX+A==",
"■msg♦This is One message■module♦■file♦file.txt"})) {
EXPECT_EQ(detok_.DecodeOptionallyTokenizedData(as_bytes(span(data))),
std::string(expected));
}
}
constexpr char kDataWithArguments[] =
"TOKENS\0\0"
"\x09\x00\x00\x00"

View File

@ -147,6 +147,22 @@ class Detokenizer {
return DetokenizeText(text, 1);
}
/// Decodes data that may or may not be tokenized, such as proto fields marked
/// as optionally tokenized.
///
/// This function currently only supports Base64 nested tokenized messages.
/// Support for hexadecimal-encoded string literals will be added.
///
/// This function currently assumes that when the data is not tokenized, it is
/// printable ASCII. Otherwise, the returned string will be Base64-encoded.
///
/// @param[in] optionally_tokenized_data Data that may or may not be tokenized.
///
/// @returns The decoded text if successfully detokenized or if the data is
/// printable, otherwise returns the data base64-encoded.
std::string DecodeOptionallyTokenizedData(
const span<const std::byte>& optionally_tokenized_data);
private:
std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_;
};