Skip Input on Error

Some encodings know how to find the next valid sequence after they encounter an error. Many of these encodings are self-synchronizing codes, while others may carry enough information in their (encode/decode) state to skip additional invalid input. This is where the function ztd::text::skip_input_error comes in, along with the Lucky 7 Extension hook that allows an encoding to define it. There are 2 versions of this function; both are optional and can be written by an encoding author:

  • decode_result<…> skip_input_error(decode_result<…> result), which is meant to skip over bad input from a failed decode operation;

  • and, encode_result<…> skip_input_error(encode_result<…> result), which is meant to skip over bad input from a failed encode operation.

They can appear in a normal encoding like so:

 1
 2#include <ztd/text.hpp>
 3
 4class my_encoding {
 5public:
 6	struct empty { };
 7	// the regular Lucky 7 members
 8	static inline constexpr std::size_t max_code_points = 1;
 9	static inline constexpr std::size_t max_code_units  = 1;
10	using state                                         = empty;
11	using code_point                                    = char32_t;
12	using code_unit                                     = char;
13
14	template <typename Input, typename Output, typename ErrorHandler>
15	constexpr static auto decode_one(Input&& input, Output&& output,
16	     state& current_state, ErrorHandler&& error_handler) noexcept {
17		// decoding implementation here !
18		return decode_result<ztd::remove_cvref_t<Input>,
19		     ztd::remove_cvref_t<Output>, state>(
20		     input, output, current_state, ztd::text::encoding_error::ok);
21	}
22
23	template <typename Input, typename Output, typename ErrorHandler>
24	static constexpr auto decode_one(Input&& input, Output&& output,
25	     state& current_state, ErrorHandler&& error_handler) noexcept {
26		// encoding implementation here !
27		return encode_result<ztd::remove_cvref_t<Input>,
28		     ztd::remove_cvref_t<Output>, state>(
29		     input, output, current_state, ztd::text::encoding_error::ok);
30	}
31
32	// ❗ Special input skip member here
33	template <typename Input, typename Output, typename State>
34	static constexpr auto skip_input_error(
35	     decode_result<Input, Output, State> result) noexcept {
36		// manipulate "result" here,
37		// for any failures in the decode routine.
38		return result;
39	}
40
41	template <typename Input, typename Output, typename State>
42	static constexpr auto skip_input_error(
43	     encode_result<Input, Output, State> result) noexcept {
44		// manipulate "result" here,
45		// for any failures in the encode routine.
46		return result;
47	}
48};

The ztd::text::decode_result and ztd::text::encode_result objects contain all of the information that could be needed to perform the skip (the input, output, error_code, state, and error_count fields). For example, on a hypothetical UTF-16 encoding, one could define a skip_input_error function that behaves like so:

 1#include <ztd/text.hpp>
 2
 3#include <utility>
 4
 5class my_utf16 : private ztd::text::utf16_t {
 6private:
 7	using base_t = ztd::text::utf16_t;
 8
 9public:
10	// Lucky 7 Members
11	static inline constexpr std::size_t max_code_points = 1;
12	static inline constexpr std::size_t max_code_units  = 2;
13	using state      = ztd::text::decode_state_t<ztd::text::utf16_t>;
14	using code_point = char32_t;
15	using code_unit  = char16_t;
16	// Extension definitions
17	using is_unicode_encoding = std::true_type; // UTF-16 is Unicode
18	using is_injective        = std::true_type; // conversion is not lossy
19
20	// Import base implementation here,
21	// to save on the implementation work!
22	using base_t::decode_one;
23	using base_t::encode_one;
24
25	// Import additional methods
26	using base_t::replacement_code_points;
27	using base_t::replacement_code_units;
28
29	// ❗ Special input skip member!!
30	// If this function is present and callable, it will
31	// allow us to skip over bad input.
32	template <typename Input, typename Output, typename State, typename InputRead,
33	     typename OutputWritten>
34	constexpr auto skip_input_error(
35	     ztd::text::decode_result<Input, Output, State> result,
36	     const InputRead& input_already_read,
37	     [[maybe_unused]] const OutputWritten& output_already_read) const noexcept {
38		// If we are decoding a UTF-16 sequence,
39		// we can have 1 or 2 UTF-16 code units.
40		// they are identifiable as leading and trailing surrogates
41		constexpr char16_t last_utf16_lead_surrogate = 0xDBFF;
42		auto it   = ztd::ranges::begin(result.input);
43		auto last = ztd::ranges::end(result.input);
44		if (it != last) {
45			if (ztd::ranges::empty(input_already_read)) {
46				// if no input was already read (e.g. partial read from a
47				// `std::istreambuf_iterator<…>`), then we should
48				// increment the iterator at **least** once! this will prevent us
49				// from constantly erroring over the same stuff.
50				++it;
51			}
52
53			for (; it != last; ++it) {
54				// We can skip all trailing surrogates, until we find a leading
55				// one.
56				const bool is_good_utf16_stop_point
57				     = *it > last_utf16_lead_surrogate;
58				if (is_good_utf16_stop_point) {
59					// we found a good place to stop: get out of here!
60					break;
61				}
62				// if we do not break, we go around the
63				// for loop again, increment the iterator
64			}
65		}
66		// put input range back together, return in constructed result object
67		using SubInput = ztd::ranges::subrange_for_t<Input>;
68		using Result   = ztd::text::decode_result<SubInput, Output, State>;
69		return Result(
70		     // subrange of input
71		     SubInput(std::move(it), std::move(last)),
72		     // move the output
73		     std::move(result.output),
74		     // pass state along
75		     result.state,
76		     // existing error code
77		     result.error_code,
78		     // existing error count
79		     result.error_count);
80	}
81};

Assuming, briefly, that this was all put into a file called my_utf16.hpp, it can be used like this:

 1#include "my_utf16.hpp"
 2
 3#include <ztd/text.hpp>
 4
 5#include <iostream>
 6
 7int main(int, char*[]) {
 8
 9	std::string utf8_string = ztd::text::transcode(
10	     u"🌈\xD801\xD802\xD803🌈🌈", my_utf16 {}, ztd::text::compat_utf8);
11
12	std::cout.write(utf8_string.data(), utf8_string.size());
13	std::cout << std::endl;
14	return 0;
15}

This will come in handy when defining other Unicode variants that may need to skip multiple bits of bad input rather than just passing over 1 code unit. ztd::text::replacement_handler and ztd::text::skip_handler will both try to use these extension points to skip over bad input.