Separate Encode/Decode States
It is no secret that encoding and decoding may carry with them separate states. While converting from a legacy encoding to Unicode may require maintenance of a shift state or code unit modifier, the opposite direction may not need any at all. Therefore, as an optimization, an encoding object can define both an encode_state
and a decode_state
, separate from each other. As an example, here is a (simplified) version of how ztd::text::execution, the encoding for the Locale-based Runtime Execution Encoding, has two separate states that need to be initialized in different manners:
1class runtime_locale {
2public:
3 struct decode_state {
4 std::mbstate_t c_stdlib_state;
5
6 decode_state() noexcept : c_stdlib_state() {
7 // properly set for mbrtoc32 state
8 code_point ghost_ouput[2] {};
9 UCHAR_ACCESS mbrtoc32(
10 ghost_ouput, "\0", 1, &c_stdlib_state);
11 }
12 };
13
14 struct encode_state {
15 std::mbstate_t c_stdlib_state;
16
17 encode_state() noexcept : c_stdlib_state() {
18 // properly set for c32rtomb state
19 code_unit ghost_ouput[MB_LEN_MAX] {};
20 UCHAR_ACCESS c32rtomb(ghost_ouput, U'\0', &c_stdlib_state);
21 }
22 };
23 (void)argc;
This is the proper way to initialize a std::mbstate_t
from the C standard library. Then, you can use it! Here’s a complete implementation using the new encode_state
and decode_state
types:
1class runtime_locale {
2 using rtl_decode_result
3 = ztd::text::decode_result<ztd::span<const code_unit>,
4 ztd::span<code_point>, decode_state>;
5 using rtl_encode_result
6 = ztd::text::encode_result<ztd::span<const code_point>,
7 ztd::span<code_unit>, encode_state>;
8 using rtl_decode_error_handler = std::function<rtl_decode_result(
9 const runtime_locale&, rtl_decode_result, ztd::span<const char>,
10 ztd::span<const char32_t>)>;
11 using rtl_encode_error_handler = std::function<rtl_encode_result(
12 const runtime_locale&, rtl_encode_result,
13 ztd::span<const char32_t>, ztd::span<const char>)>;
14
15 using empty_code_unit_span = ztd::span<const code_unit, 0>;
16 using empty_code_point_span = ztd::span<const code_point, 0>;
17
18public:
19 rtl_decode_result decode_one(
20 ztd::span<const code_unit> input, ztd::span<code_point> output,
21 rtl_decode_error_handler error_handler,
22 decode_state& current // decode-based state
23 ) const {
24 if (output.size() < 1) {
25 return error_handler(*this,
26 rtl_decode_result(input, output, current,
27 ztd::text::encoding_error::
28 insufficient_output_space),
29 empty_code_unit_span(), empty_code_point_span());
30 }
31 std::size_t result = UCHAR_ACCESS mbrtoc32(output.data(),
32 input.data(), input.size(), ¤t.c_stdlib_state);
33 switch (result) {
34 case (std::size_t)0:
35 // '\0' was encountered in the input
36 // current.c_stdlib_state was "cleared"
37 // '\0' character was written to output
38 return rtl_decode_result(
39 input.subspan(1), output.subspan(1), current);
40 break;
41 case (std::size_t)-3:
42 // no input read, pre-stored character
43 // was written out
44 return rtl_decode_result(input, output.subspan(1), current);
45 case (std::size_t)-2:
46 // input was an incomplete sequence
47 return error_handler(*this,
48 rtl_decode_result(input, output, current,
49 ztd::text::encoding_error::incomplete_sequence),
50 empty_code_unit_span(), empty_code_point_span());
51 break;
52 case (std::size_t)-1:
53 // invalid sequence!
54 return error_handler(*this,
55 rtl_decode_result(input, output, current,
56 ztd::text::encoding_error::invalid_sequence),
57 empty_code_unit_span(), empty_code_point_span());
58 }
59 // everything as fine, then
60 return rtl_decode_result(
61 input.subspan(result), output.subspan(1), current);
62 }
63
64 rtl_encode_result encode_one(
65 ztd::span<const code_point> input, ztd::span<code_unit> output,
66 rtl_encode_error_handler error_handler,
67 encode_state& current // encode-based state
68 ) const {
69 // saved, in case we need to go
70 // around mulitple times to get
71 // an output character
72 ztd::span<const code_point> original_input = input;
73 // The C standard library assumes
74 // it can write out MB_CUR_MAX characters to the buffer:
75 // we have no guarantee our output buffer is that big, so it
76 // needs to go into an intermediate buffer instead
77 code_unit intermediate_buffer[MB_LEN_MAX];
78
79 for (int times_around = 0;; ++times_around) {
80 if (input.size() < 1) {
81 // no more input: everything is fine
82 return rtl_encode_result(input, output, current);
83 }
84 std::size_t result
85 = UCHAR_ACCESS c32rtomb(intermediate_buffer,
86 *input.data(), ¤t.c_stdlib_state);
87 if (result == (std::size_t)-1) {
88 // invalid sequence!
89 return error_handler(*this,
90 rtl_encode_result(original_input, output, current,
91 ztd::text::encoding_error::invalid_sequence),
92 empty_code_point_span(), empty_code_unit_span());
93 }
94 else if (result == (std::size_t)0) {
95 // this means nothing was output
96 // we should probably go-around again,
97 // after modifying input
98 input = input.subspan(1);
99 continue;
100 }
101 // otherwise, we got something written out!
102 if (output.size() < result) {
103 // can't fit!!
104 return error_handler(*this,
105 rtl_encode_result(original_input, output, current,
106 ztd::text::encoding_error::
107 insufficient_output_space),
108 empty_code_point_span(), empty_code_unit_span());
109 }
110 ::std::memcpy(output.data(), intermediate_buffer,
111 sizeof(*intermediate_buffer) * result);
112 input = input.subspan(1);
113 output = output.subspan(result);
114 break;
115 }
116 return rtl_encode_result(input, output, current);
117 }
118};
119
120int main(int argc, char* argv[]) {
121 (void)argc;
This allows you to maintain 2 different states, initialized in 2 different ways, one for each of the encode_one
and decode_one
paths.