Skip Input on Error

Some encodings know how to find the next valid sequence after they encounter an error. Many of these encodings are self-synchronizing codes, while others may hold enough information in their (encode/decode) state to skip additional invalid input. This is where the function ztd::text::skip_input_error comes in, along with the Lucky 7 Extension hook that allows an encoding to define it. There are 2 optional versions of this function that an encoding author can write:

decode_result<…> skip_input_error(decode_result<…> result), which is meant to skip over bad input from a failed decode operation;
and, encode_result<…> skip_input_error(encode_result<…> result), which is meant to skip over bad input from a failed encode operation.

They can appear in a normal encoding like so:

#include <ztd/text.hpp>

// Skeleton of an encoding that provides the optional `skip_input_error`
// members in addition to the regular Lucky 7 members.
class my_encoding {
public:
	// Stateless: nothing is carried between conversion calls.
	struct empty { };
	// the regular Lucky 7 members
	static inline constexpr std::size_t max_code_points = 1;
	static inline constexpr std::size_t max_code_units  = 1;
	using state                                         = empty;
	using code_point                                    = char32_t;
	using code_unit                                     = char;

	// Placeholder for the decode step. As written it performs no work and
	// reports `encoding_error::ok`, handing `input`, `output` and
	// `current_state` straight back in a decode_result.
	template <typename Input, typename Output, typename ErrorHandler>
	static constexpr auto decode_one(Input&& input, Output&& output,
	     state& current_state, ErrorHandler&& error_handler) noexcept {
		// decoding implementation here !
		return ztd::text::decode_result<ztd::remove_cvref_t<Input>,
		     ztd::remove_cvref_t<Output>, state>(
		     input, output, current_state, ztd::text::encoding_error::ok);
	}

	// Placeholder for the encode step. As written it performs no work and
	// reports `encoding_error::ok`, handing `input`, `output` and
	// `current_state` straight back in an encode_result.
	template <typename Input, typename Output, typename ErrorHandler>
	static constexpr auto encode_one(Input&& input, Output&& output,
	     state& current_state, ErrorHandler&& error_handler) noexcept {
		// encoding implementation here !
		return ztd::text::encode_result<ztd::remove_cvref_t<Input>,
		     ztd::remove_cvref_t<Output>, state>(
		     input, output, current_state, ztd::text::encoding_error::ok);
	}

	// ❗ Special input skip member here
	// Decode flavor: receives the result of a failed decode and returns the
	// result to continue from. This skeleton returns it unchanged.
	template <typename Input, typename Output, typename State>
	static constexpr auto skip_input_error(
	     ztd::text::decode_result<Input, Output, State> result) noexcept {
		// manipulate "result" here,
		// for any failures in the decode routine.
		return result;
	}

	// Encode flavor: receives the result of a failed encode and returns the
	// result to continue from. This skeleton returns it unchanged.
	template <typename Input, typename Output, typename State>
	static constexpr auto skip_input_error(
	     ztd::text::encode_result<Input, Output, State> result) noexcept {
		// manipulate "result" here,
		// for any failures in the encode routine.
		return result;
	}
};

The ztd::text::decode_result and ztd::text::encode_result contain all of the information that could be needed to perform the skip (the input, output, error_code, state, and error_count fields). For example, on a hypothetical UTF-16 encoding, one could define a skip_input_error function that behaves like so:

#include <ztd/text.hpp>

#include <utility>

// A UTF-16 encoding that reuses ztd::text::utf16_t for the actual
// conversion work and adds a custom `skip_input_error` on top of it.
class my_utf16 : private ztd::text::utf16_t {
private:
	using base_t = ztd::text::utf16_t;

public:
	// Lucky 7 Members
	static inline constexpr std::size_t max_code_points = 1;
	static inline constexpr std::size_t max_code_units  = 2;
	using state      = ztd::text::decode_state_t<ztd::text::utf16_t>;
	using code_point = char32_t;
	using code_unit  = char16_t;
	// Extension definitions
	using is_unicode_encoding = std::true_type; // UTF-16 is Unicode
	using is_injective        = std::true_type; // conversion is not lossy

	// Import base implementation here,
	// to save on the implementation work!
	using base_t::decode_one;
	using base_t::encode_one;

	// Import additional methods
	using base_t::replacement_code_points;
	using base_t::replacement_code_units;

	// ❗ Special input skip member!!
	// If this function is present and callable, it will
	// allow us to skip over bad input.
	//
	// `result` is the outcome of the failed decode. `input_already_read`
	// holds whatever input was consumed before the error was reported;
	// when it is empty, one code unit is dropped unconditionally so that
	// progress is always made. `output_already_read` is not used.
	//
	// Returns a decode_result whose input starts at the first code unit
	// that is not a trailing surrogate (or at the end of the input). The
	// output, state, error code and error count are passed through.
	template <typename Input, typename Output, typename State, typename InputRead,
	     typename OutputWritten>
	constexpr auto skip_input_error(
	     ztd::text::decode_result<Input, Output, State> result,
	     const InputRead& input_already_read,
	     [[maybe_unused]] const OutputWritten& output_already_read) const noexcept {
		// If we are decoding a UTF-16 sequence,
		// we can have 1 or 2 UTF-16 code units.
		// they are identifiable as leading and trailing surrogates.
		// A trailing surrogate can never start a sequence, so those are
		// the code units that are safe to throw away.
		constexpr char16_t first_utf16_trail_surrogate = 0xDC00;
		constexpr char16_t last_utf16_trail_surrogate  = 0xDFFF;
		auto it   = ztd::ranges::begin(result.input);
		auto last = ztd::ranges::end(result.input);
		if (it != last) {
			if (ztd::ranges::empty(input_already_read)) {
				// if no input was already read (e.g. partial read from a
				// `std::istreambuf_iterator<…>`), then we should
				// increment the iterator at **least** once! this will prevent us
				// from constantly erroring over the same stuff.
				++it;
			}

			for (; it != last; ++it) {
				// We can skip all trailing surrogates, until we find a
				// leading one or any ordinary (non-surrogate) code unit.
				const auto unit = *it;
				const bool is_trail_surrogate
				     = unit >= first_utf16_trail_surrogate
				     && unit <= last_utf16_trail_surrogate;
				const bool is_good_utf16_stop_point = !is_trail_surrogate;
				if (is_good_utf16_stop_point) {
					// we found a good place to stop: get out of here!
					break;
				}
				// if we do not break, we go around the
				// for loop again, increment the iterator
			}
		}
		// put input range back together, return in constructed result object
		using SubInput = ztd::ranges::subrange_for_t<Input>;
		using Result   = ztd::text::decode_result<SubInput, Output, State>;
		return Result(
		     // subrange of input
		     SubInput(std::move(it), std::move(last)),
		     // move the output
		     std::move(result.output),
		     // pass state along
		     result.state,
		     // existing error code
		     result.error_code,
		     // existing error count
		     result.error_count);
	}
};

Assuming, briefly, that this was all put into a file called my_utf16.hpp, it can be used like this:

#include "my_utf16.hpp"

#include <ztd/text.hpp>

#include <iostream>

int main(int, char*[]) {
	// The input holds three lone surrogate code units (0xD801, 0xD802,
	// 0xD803) between rainbow emoji; it is transcoded from my_utf16 to UTF-8.
	std::string utf8_output = ztd::text::transcode(
	     u"🌈\xD801\xD802\xD803🌈🌈", my_utf16 {}, ztd::text::compat_utf8);

	// Emit the raw UTF-8 bytes, then a newline and a flush.
	const char* const utf8_bytes     = utf8_output.data();
	const std::streamsize byte_count = static_cast<std::streamsize>(utf8_output.size());
	std::cout.write(utf8_bytes, byte_count) << std::endl;
	return 0;
}

This will come in handy when defining other Unicode variants that may need to skip multiple bits of bad input rather than just passing over 1 code unit. ztd::text::replacement_handler and ztd::text::skip_handler will both try to use these extension points to skip over bad input.