mirror of
https://fuchsia.googlesource.com/third_party/pigweed.googlesource.com/pigweed/pigweed
synced 2024-08-02 06:46:04 +00:00
pw_tokenizer: Add DecodeOptionallyTokenizedData
Add new API to decode data that may or may not be tokenized. Change-Id: I00289a0b8041dcbd2ad86489366b741f3cd4091d Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/206070 Reviewed-by: Wyatt Hepler <hepler@google.com> Commit-Queue: Carlos Chinchilla <cachinchilla@google.com> Pigweed-Auto-Submit: Carlos Chinchilla <cachinchilla@google.com>
This commit is contained in:
parent
82bbfff7ff
commit
f79f7c42e7
|
@ -15,7 +15,10 @@
|
|||
#include "pw_tokenizer/detokenize.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <cstring>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "pw_bytes/bit.h"
|
||||
#include "pw_bytes/endian.h"
|
||||
|
@ -144,6 +147,22 @@ bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
|
|||
return lhs.second > rhs.second;
|
||||
}
|
||||
|
||||
// Returns true if all characters in data are printable or whitespace, or if
// the string is empty.
constexpr bool IsPrintableAscii(std::string_view data) {
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
  //
  // if ''.join(text.split()).isprintable():
  //     return text
  //
  for (char letter : data) {
    // Convert through unsigned char before calling the <cctype> classifiers:
    // passing a negative value (any byte >= 0x80 on platforms where char is
    // signed) to std::isprint/std::isspace is undefined behavior.
    const int ch = static_cast<unsigned char>(letter);
    if (std::isprint(ch) == 0 && std::isspace(ch) == 0) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
DetokenizedString::DetokenizedString(
|
||||
|
@ -261,4 +280,52 @@ std::string Detokenizer::DetokenizeText(std::string_view text,
|
|||
return result;
|
||||
}
|
||||
|
||||
std::string Detokenizer::DecodeOptionallyTokenizedData(
|
||||
const ConstByteSpan& optionally_tokenized_data) {
|
||||
// Try detokenizing as binary using the best result if available, else use
|
||||
// the input data as a string.
|
||||
const auto result = Detokenize(optionally_tokenized_data);
|
||||
const bool found_matches = !result.matches().empty();
|
||||
// Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
|
||||
// process does not encode and decode UTF8 format, it is sufficient to check
|
||||
// if the data is printable ASCII.
|
||||
const std::string data =
|
||||
found_matches
|
||||
? result.BestString()
|
||||
: std::string(
|
||||
reinterpret_cast<const char*>(optionally_tokenized_data.data()),
|
||||
optionally_tokenized_data.size());
|
||||
|
||||
const bool is_data_printable = IsPrintableAscii(data);
|
||||
if (!found_matches && !is_data_printable) {
|
||||
// Assume the token is unknown or the data is corrupt.
|
||||
std::vector<char> base64_encoding_buffer(
|
||||
Base64EncodedBufferSize(optionally_tokenized_data.size()));
|
||||
const size_t encoded_length = PrefixedBase64Encode(
|
||||
optionally_tokenized_data, span(base64_encoding_buffer));
|
||||
return std::string{base64_encoding_buffer.data(), encoded_length};
|
||||
}
|
||||
|
||||
// Successfully detokenized, check if the field has more prefixed
|
||||
// base64-encoded tokens.
|
||||
const std::string field = DetokenizeText(data);
|
||||
// If anything detokenized successfully, use that.
|
||||
if (field != data) {
|
||||
return field;
|
||||
}
|
||||
|
||||
// Attempt to determine whether this is an unknown token or plain text.
|
||||
// Any string with only printable or whitespace characters is plain text.
|
||||
if (found_matches || is_data_printable) {
|
||||
return data;
|
||||
}
|
||||
|
||||
// Assume this field is tokenized data that could not be decoded.
|
||||
std::vector<char> base64_encoding_buffer(
|
||||
Base64EncodedBufferSize(optionally_tokenized_data.size()));
|
||||
const size_t encoded_length = PrefixedBase64Encode(
|
||||
optionally_tokenized_data, span(base64_encoding_buffer));
|
||||
return std::string{base64_encoding_buffer.data(), encoded_length};
|
||||
}
|
||||
|
||||
} // namespace pw::tokenizer
|
||||
|
|
|
@ -34,7 +34,7 @@ auto TestCases(Args... args) {
|
|||
return std::array<Case, sizeof...(Args)>{args...};
|
||||
}
|
||||
|
||||
// Database with the following entries:
|
||||
// Database with the following entries and arbitrary token values:
|
||||
// {
|
||||
// 0x00000001: "One",
|
||||
// 0x00000005: "TWO",
|
||||
|
@ -44,18 +44,20 @@ auto TestCases(Args... args) {
|
|||
// }
|
||||
// Reconstructed post-commit version of the diffed constant (added-side lines
// kept; removed-side lines dropped).
constexpr char kTestDatabase[] =
    "TOKENS\0\0"
    "\x06\x00\x00\x00"  // Number of tokens in this database.
    "\0\0\0\0"
    // Little-endian token values, each followed by "----" (presumably a 4-byte
    // placeholder field per entry — TODO confirm against the token database
    // format).
    "\x01\x00\x00\x00----"
    "\x05\x00\x00\x00----"
    "\xFF\x00\x00\x00----"
    "\xFF\xEE\xEE\xDD----"
    "\xEE\xEE\xEE\xEE----"
    "\x9D\xA7\x97\xF8----"
    // Null-terminated strings for the tokens above, in the same order.
    "One\0"
    "TWO\0"
    "333\0"
    "FOUR\0"
    "$AQAAAA==\0"
    "■msg♦This is $AQAAAA== message■module♦■file♦file.txt";
|
||||
|
||||
class Detokenize : public ::testing::Test {
|
||||
protected:
|
||||
|
@ -166,6 +168,33 @@ TEST_F(Detokenize, Base64_NoArguments) {
|
|||
}
|
||||
}
|
||||
|
||||
// Exercises DecodeOptionallyTokenizedData across binary tokens, plain text,
// prefixed Base64, and mixes of all three. ONE/TWO/THREE/FOUR/NEST_ONE are
// file-local macros (definitions not visible here) — presumably they expand
// to the binary/Base64 encodings of the database entries; confirm against the
// macro definitions earlier in this file.
TEST_F(Detokenize, OptionallyTokenizedData) {
  for (auto [data, expected] : TestCases(
           Case{ONE, "One"},
           Case{"\1\0\0\0", "One"},  // Same token spelled as a string literal.
           Case{TWO, "TWO"},
           Case{THREE, "333"},
           Case{FOUR, "FOUR"},
           // Multiple tokens back to back, with and without separators.
           Case{FOUR ONE ONE, "FOUROneOne"},
           Case{ONE TWO THREE FOUR, "OneTWO333FOUR"},
           Case{ONE "\r\n" TWO "\r\n" THREE "\r\n" FOUR "\r\n",
                "One\r\nTWO\r\n333\r\nFOUR\r\n"},
           // Plain text mixed with tokens.
           Case{"123" FOUR, "123FOUR"},
           Case{"123" FOUR ", 56", "123FOUR, 56"},
           Case{"12" THREE FOUR ", 56", "12333FOUR, 56"},
           Case{"$0" ONE, "$0One"},
           Case{"$/+7u3Q=", "$/+7u3Q="},  // incomplete message (missing "=")
           Case{"$123456==" FOUR, "$123456==FOUR"},
           // Nested (Base64-encoded) tokens.
           Case{NEST_ONE, "One"},
           Case{NEST_ONE NEST_ONE NEST_ONE, "OneOneOne"},
           Case{FOUR "$" ONE NEST_ONE "?", "FOUR$OneOne?"},
           // Base64 token whose detokenized string itself embeds a token.
           Case{"$naeX+A==",
                "■msg♦This is One message■module♦■file♦file.txt"})) {
    EXPECT_EQ(detok_.DecodeOptionallyTokenizedData(as_bytes(span(data))),
              std::string(expected));
  }
}
|
||||
|
||||
constexpr char kDataWithArguments[] =
|
||||
"TOKENS\0\0"
|
||||
"\x09\x00\x00\x00"
|
||||
|
|
|
@ -147,6 +147,22 @@ class Detokenizer {
|
|||
return DetokenizeText(text, 1);
|
||||
}
|
||||
|
||||
/// Decodes data that may or may not be tokenized, such as proto fields marked
/// as optionally tokenized.
///
/// This function currently only supports Base64 nested tokenized messages.
/// Support for hexadecimal-encoded string literals will be added.
///
/// This function currently assumes when data is not tokenized it is printable
/// ASCII. Otherwise, the returned string will be base64-encoded.
///
/// @param[in] optionally_tokenized_data Data optionally tokenized.
///
/// @returns The decoded text if successfully detokenized or if the data is
/// printable, otherwise returns the data base64-encoded (prefixed with `$`).
std::string DecodeOptionallyTokenizedData(
    const span<const std::byte>& optionally_tokenized_data);
|
||||
|
||||
private:
|
||||
std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_;
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue
Block a user