Separate Encode/Decode States

It is no secret that encoding and decoding may carry with them separate states. While converting from a legacy encoding to Unicode may require maintenance of a shift state or code unit modifier, the opposite direction may not need any at all. Therefore, as an optimization, an encoding object can define both an encode_state and a decode_state, separate from each other. As an example, here is a (simplified) version of how ztd::text::execution, the encoding for the Locale-based Runtime Execution Encoding, has two separate states that need to be initialized in different manners:

 1class runtime_locale {
 2public:
 3	struct decode_state { // state threaded through the decode path
 4		std::mbstate_t c_stdlib_state;
 5
 6		decode_state() noexcept : c_stdlib_state() {
 7			// properly set for mbrtoc32 state by decoding one '\0'; the output is discarded
 8			code_point ghost_ouput[2] {};
 9			UCHAR_ACCESS mbrtoc32(
10			     ghost_ouput, "\0", 1, &c_stdlib_state);
11		}
12	};
13
14	struct encode_state { // state threaded through the encode path
15		std::mbstate_t c_stdlib_state;
16
17		encode_state() noexcept : c_stdlib_state() {
18			// properly set for c32rtomb state by encoding one U'\0'; the output is discarded
19			code_unit ghost_ouput[MB_LEN_MAX] {};
20			UCHAR_ACCESS c32rtomb(ghost_ouput, U'\0', &c_stdlib_state);
21		}
22	};
23	(void)argc;

This is the proper way to initialize a std::mbstate_t from the C standard library. Then, you can use it! Here’s a complete implementation using the new encode_state and decode_state types:

  1class runtime_locale { // (simplified) locale-based runtime execution encoding
  2	using rtl_decode_result
  3	     = ztd::text::decode_result<ztd::span<const code_unit>,
  4	          ztd::span<code_point>, decode_state>;
  5	using rtl_encode_result
  6	     = ztd::text::encode_result<ztd::span<const code_point>,
  7	          ztd::span<code_unit>, encode_state>;
  8	using rtl_decode_error_handler = std::function<rtl_decode_result(
  9	     const runtime_locale&, rtl_decode_result, ztd::span<const char>,
 10	     ztd::span<const char32_t>)>;
 11	using rtl_encode_error_handler = std::function<rtl_encode_result(
 12	     const runtime_locale&, rtl_encode_result,
 13	     ztd::span<const char32_t>, ztd::span<const char>)>;
 14
 15	using empty_code_unit_span  = ztd::span<const code_unit, 0>; // passed to the error handlers
 16	using empty_code_point_span = ztd::span<const code_point, 0>; // passed to the error handlers
 17
 18public:
 19	rtl_decode_result decode_one( // decodes a single code point with mbrtoc32
 20	     ztd::span<const code_unit> input, ztd::span<code_point> output,
 21	     rtl_decode_error_handler error_handler,
 22	     decode_state& current // decode-based state
 23	) const {
 24		if (output.size() < 1) { // mbrtoc32 writes one code point into output
 25			return error_handler(*this,
 26			     rtl_decode_result(input, output, current,
 27			          ztd::text::encoding_error::
 28			               insufficient_output_space),
 29			     empty_code_unit_span(), empty_code_point_span());
 30		}
 31		std::size_t result = UCHAR_ACCESS mbrtoc32(output.data(),
 32		     input.data(), input.size(), &current.c_stdlib_state);
 33		switch (result) {
 34		case (std::size_t)0:
 35			// '\0' was encountered in the input
 36			// current.c_stdlib_state was "cleared"
 37			// '\0' character was written to output
 38			return rtl_decode_result(
 39			     input.subspan(1), output.subspan(1), current);
 40			break; // not reached: the return above already left the function
 41		case (std::size_t)-3:
 42			// no input read, pre-stored character
 43			// was written out
 44			return rtl_decode_result(input, output.subspan(1), current);
 45		case (std::size_t)-2:
 46			// input was an incomplete sequence
 47			return error_handler(*this,
 48			     rtl_decode_result(input, output, current,
 49			          ztd::text::encoding_error::incomplete_sequence),
 50			     empty_code_unit_span(), empty_code_point_span());
 51			break; // not reached: the return above already left the function
 52		case (std::size_t)-1:
 53			// invalid sequence!
 54			return error_handler(*this,
 55			     rtl_decode_result(input, output, current,
 56			          ztd::text::encoding_error::invalid_sequence),
 57			     empty_code_unit_span(), empty_code_point_span());
 58		}
 59		// everything is fine, then: `result` code units were read, one code point written
 60		return rtl_decode_result(
 61		     input.subspan(result), output.subspan(1), current);
 62	}
 63
 64	rtl_encode_result encode_one( // feeds code points to c32rtomb until output is produced
 65	     ztd::span<const code_point> input, ztd::span<code_unit> output,
 66	     rtl_encode_error_handler error_handler,
 67	     encode_state& current // encode-based state
 68	) const {
 69		// saved so errors can report the original
 70		// input, in case we need to go around multiple
 71		// times to get an output character
 72		ztd::span<const code_point> original_input = input;
 73		// The C standard library assumes
 74		// it can write out MB_CUR_MAX characters to the buffer:
 75		// we have no guarantee our output buffer is that big, so it
 76		// needs to go into an intermediate buffer instead
 77		code_unit intermediate_buffer[MB_LEN_MAX];
 78
 79		for (int times_around = 0;; ++times_around) { // exits via return or the break below
 80			if (input.size() < 1) {
 81				// no more input: everything is fine
 82				return rtl_encode_result(input, output, current);
 83			}
 84			std::size_t result
 85			     = UCHAR_ACCESS c32rtomb(intermediate_buffer,
 86			          *input.data(), &current.c_stdlib_state);
 87			if (result == (std::size_t)-1) {
 88				// invalid sequence!
 89				return error_handler(*this,
 90				     rtl_encode_result(original_input, output, current,
 91				          ztd::text::encoding_error::invalid_sequence),
 92				     empty_code_point_span(), empty_code_unit_span());
 93			}
 94			else if (result == (std::size_t)0) {
 95				// this means nothing was output
 96				// we should probably go-around again,
 97				// after modifying input
 98				input = input.subspan(1);
 99				continue;
100			}
101			// otherwise, we got something written out!
102			if (output.size() < result) {
103				// can't fit!!
104				return error_handler(*this,
105				     rtl_encode_result(original_input, output, current,
106				          ztd::text::encoding_error::
107				               insufficient_output_space),
108				     empty_code_point_span(), empty_code_unit_span());
109			}
110			::std::memcpy(output.data(), intermediate_buffer,
111			     sizeof(*intermediate_buffer) * result); // copy only the `result` units produced
112			input  = input.subspan(1);
113			output = output.subspan(result);
114			break;
115		}
116		return rtl_encode_result(input, output, current);
117	}
118};
119
120int main(int argc, char* argv[]) {
121	(void)argc;

This allows you to maintain 2 different states, initialized in 2 different ways, one for each of the encode_one and decode_one paths.