Separate Encode/Decode States

It is no secret that encoding and decoding may carry separate states with them. While converting from a legacy encoding to Unicode may require maintenance of a shift state or code unit modifier, the opposite direction may not need any at all. Therefore, as an optimization, an encoding object can define both an encode_state and a decode_state, separate from each other. As an example, here is a (simplified) version of how ztd::text::execution, the encoding for the Locale-based Runtime Execution Encoding, has two separate states that need to be initialized in different manners:

class runtime_locale {
public:
	// State object handed to the decode direction; it wraps the
	// std::mbstate_t that mbrtoc32 reads and updates.
	struct decode_state {
		// value-initialized first, then primed by the constructor body
		std::mbstate_t c_stdlib_state {};

		decode_state() noexcept {
			// Prime the state for mbrtoc32 by running a single null
			// byte through it. The converted output lands in a
			// throwaway array and the return value is ignored.
			code_point scratch_output[2] {};
			UCHAR_ACCESS mbrtoc32(
			     scratch_output, "\0", 1, &c_stdlib_state);
		}
	};

	// State object handed to the encode direction; it wraps the
	// std::mbstate_t that c32rtomb reads and updates.
	struct encode_state {
		// value-initialized first, then primed by the constructor body
		std::mbstate_t c_stdlib_state {};

		encode_state() noexcept {
			// Prime the state for c32rtomb by running a single null
			// code point through it. The bytes it writes go into a
			// throwaway MB_LEN_MAX-sized array and the return value
			// is ignored.
			code_unit scratch_output[MB_LEN_MAX] {};
			UCHAR_ACCESS c32rtomb(scratch_output, U'\0', &c_stdlib_state);
		}
	};
	(void)argc;

This is the proper way to initialize a std::mbstate_t from the C standard library. Then, you can use it! Here’s a complete implementation using the new encode_state and decode_state types:

// Example encoding object that converts between the C library's locale
// dependent multibyte encoding and UTF-32 one character at a time, using
// mbrtoc32 for decoding and c32rtomb for encoding. Each direction gets its
// own state type (decode_state / encode_state).
class runtime_locale {
	using rtl_decode_result
	     = ztd::text::decode_result<ztd::span<const code_unit>,
	          ztd::span<code_point>, decode_state>;
	using rtl_encode_result
	     = ztd::text::encode_result<ztd::span<const code_point>,
	          ztd::span<code_unit>, encode_state>;
	using rtl_decode_error_handler = std::function<rtl_decode_result(
	     const runtime_locale&, rtl_decode_result, ztd::span<const char>,
	     ztd::span<const char32_t>)>;
	using rtl_encode_error_handler = std::function<rtl_encode_result(
	     const runtime_locale&, rtl_encode_result,
	     ztd::span<const char32_t>, ztd::span<const char>)>;

	// zero-length spans passed to the error handlers as the
	// "progress" arguments
	using empty_code_unit_span  = ztd::span<const code_unit, 0>;
	using empty_code_point_span = ztd::span<const code_point, 0>;

public:
	// Decodes one code point from `input` into `output` with mbrtoc32.
	//
	// input:         code units to read from
	// output:        destination; needs room for at least one code point
	// error_handler: invoked with the failing result when there is no
	//                output space, or when mbrtoc32 reports an incomplete
	//                or invalid sequence; its return value is returned
	// current:       decode-direction state, updated by mbrtoc32
	//
	// Returns a result holding the remaining input and output. Whenever an
	// error result reports `input` as untouched, `current` is put back to
	// its value on entry so that the two stay in agreement.
	rtl_decode_result decode_one(
	     ztd::span<const code_unit> input, ztd::span<code_point> output,
	     rtl_decode_error_handler error_handler,
	     decode_state& current // decode-based state
	) const {
		if (output.size() < 1) {
			return error_handler(*this,
			     rtl_decode_result(input, output, current,
			          ztd::text::encoding_error::
			               insufficient_output_space),
			     empty_code_unit_span(), empty_code_point_span());
		}
		// remembered so the error paths below can undo whatever
		// mbrtoc32 did to the state
		const std::mbstate_t entry_state = current.c_stdlib_state;
		const std::size_t result = UCHAR_ACCESS mbrtoc32(output.data(),
		     input.data(), input.size(), &current.c_stdlib_state);
		switch (result) {
		case static_cast<std::size_t>(0):
			// '\0' was encountered in the input
			// current.c_stdlib_state was "cleared"
			// '\0' character was written to output
			return rtl_decode_result(
			     input.subspan(1), output.subspan(1), current);
		case static_cast<std::size_t>(-3):
			// no input read, pre-stored character
			// was written out
			return rtl_decode_result(input, output.subspan(1), current);
		case static_cast<std::size_t>(-2):
			// input was an incomplete sequence: the result reports
			// the input as unconsumed, so the state must not keep
			// the partial sequence either
			current.c_stdlib_state = entry_state;
			return error_handler(*this,
			     rtl_decode_result(input, output, current,
			          ztd::text::encoding_error::incomplete_sequence),
			     empty_code_unit_span(), empty_code_point_span());
		case static_cast<std::size_t>(-1):
			// invalid sequence! the C library leaves the state
			// unspecified after this, so go back to a known one
			current.c_stdlib_state = entry_state;
			return error_handler(*this,
			     rtl_decode_result(input, output, current,
			          ztd::text::encoding_error::invalid_sequence),
			     empty_code_unit_span(), empty_code_point_span());
		default:
			// any other value is the count of code units consumed
			break;
		}
		// everything was fine, then
		return rtl_decode_result(
		     input.subspan(result), output.subspan(1), current);
	}

	// Encodes code points from `input` with c32rtomb until one of them
	// produces output, then copies those code units into `output`.
	//
	// input:         code points to read from
	// output:        destination for the produced code units
	// error_handler: invoked with the failing result when c32rtomb
	//                rejects a code point or the produced code units do
	//                not fit in `output`; its return value is returned
	// current:       encode-direction state, updated by c32rtomb
	//
	// Returns a result holding the remaining input and output. An empty
	// (or fully consumed) input is not an error. Error results report the
	// input as it was on entry, and `current` is restored to its value on
	// entry to match.
	rtl_encode_result encode_one(
	     ztd::span<const code_point> input, ztd::span<code_unit> output,
	     rtl_encode_error_handler error_handler,
	     encode_state& current // encode-based state
	) const {
		// saved, in case we need to go
		// around multiple times to get
		// an output character
		const ztd::span<const code_point> original_input = input;
		// saved alongside the input: errors hand back original_input,
		// so they must hand back the matching state as well
		const std::mbstate_t original_state = current.c_stdlib_state;
		// The C standard library assumes
		// it can write out MB_CUR_MAX characters to the buffer:
		// we have no guarantee our output buffer is that big, so it
		// needs to go into an intermediate buffer instead
		code_unit intermediate_buffer[MB_LEN_MAX];

		for (;;) {
			if (input.size() < 1) {
				// no more input: everything is fine
				return rtl_encode_result(input, output, current);
			}
			const std::size_t result
			     = UCHAR_ACCESS c32rtomb(intermediate_buffer,
			          *input.data(), &current.c_stdlib_state);
			if (result == static_cast<std::size_t>(-1)) {
				// invalid sequence!
				current.c_stdlib_state = original_state;
				return error_handler(*this,
				     rtl_encode_result(original_input, output, current,
				          ztd::text::encoding_error::invalid_sequence),
				     empty_code_point_span(), empty_code_unit_span());
			}
			if (result == static_cast<std::size_t>(0)) {
				// this means nothing was output
				// we should probably go-around again,
				// after modifying input
				input = input.subspan(1);
				continue;
			}
			// otherwise, we got something written out!
			if (output.size() < result) {
				// can't fit!! c32rtomb already advanced the state
				// for code units we are about to throw away
				current.c_stdlib_state = original_state;
				return error_handler(*this,
				     rtl_encode_result(original_input, output, current,
				          ztd::text::encoding_error::
				               insufficient_output_space),
				     empty_code_point_span(), empty_code_unit_span());
			}
			::std::memcpy(output.data(), intermediate_buffer,
			     sizeof(*intermediate_buffer) * result);
			return rtl_encode_result(
			     input.subspan(1), output.subspan(result), current);
		}
	}
};

int main(int argc, char* argv[]) {
	(void)argc;

This allows you to maintain 2 different states, initialized in 2 different ways, one for each of the encode_one and decode_one paths.