From 8f8919fd08c29397b2e0b5f86ae7314bd8baec85 Mon Sep 17 00:00:00 2001 From: Allen Webster Date: Sat, 25 Mar 2017 14:04:27 -0400 Subject: [PATCH] bullet prooffed the utf8 translation --- 4coder_lib/4coder_utf8.h | 45 +++++++++++++++++++++++++++++++++++++--- 4ed_translation.cpp | 25 +++++++++++++++------- 2 files changed, 59 insertions(+), 11 deletions(-) diff --git a/4coder_lib/4coder_utf8.h b/4coder_lib/4coder_utf8.h index 22454d63..bff8d7d7 100644 --- a/4coder_lib/4coder_utf8.h +++ b/4coder_lib/4coder_utf8.h @@ -40,6 +40,20 @@ typedef int32_t b32_4tech; #endif // standard preamble end +static u32_4tech cp_min_by_utf8_length[] = { + 0x0, + 0x0, + 0x80, + 0x800, + 0x10000, +}; + +static u32_4tech surrogate_min = 0xD800; +static u32_4tech surrogate_max = 0xDFFF; + +static u32_4tech nonchar_min = 0xFDD0; +static u32_4tech nonchar_max = 0xFDEF; + static b32_4tech codepoint_is_whitespace(u32_4tech codepoint){ b32_4tech result = false; @@ -76,6 +90,11 @@ utf8_to_u32_length_unchecked(u8_4tech *buffer, u32_4tech *length_out){ *length_out = 4; } + if (result < cp_min_by_utf8_length[*length_out] || (result >= surrogate_min && result <= surrogate_max) || (result >= nonchar_min && result <= nonchar_max) || ((result & 0xFFFF) >= 0xFFFE)){ /* reject overlong encodings, surrogates, and noncharacters (U+FDD0..U+FDEF, U+nFFFE, U+nFFFF) */ + result = 0; + *length_out = 0; + } + return(result); } @@ -104,9 +123,12 @@ utf8_to_u32(u8_4tech **buffer_ptr, u8_4tech *end){ else if (buffer[0] < 0xF0){ length = 3; } - else{ + else if (buffer[0] < 0xF8){ length = 4; } + else{ + length = 0; + } for (u32_4tech i = 1; i < length; ++i){ if ((buffer[i] & 0xC0) != 0x80){ @@ -145,6 +167,11 @@ utf8_to_u32(u8_4tech **buffer_ptr, u8_4tech *end){ }break; } + if (result < cp_min_by_utf8_length[length] || (result >= surrogate_min && result <= surrogate_max) || (result >= nonchar_min && result <= nonchar_max) || ((result & 0xFFFF) >= 0xFFFE)){ /* reject overlong encodings, surrogates, and noncharacters (U+FDD0..U+FDEF, U+nFFFE, U+nFFFF) */ + result = 0; + length = 0; + } + *buffer_ptr = buffer + length; } else{ @@ -171,7 +198,7 @@ u32_to_utf8_unchecked(u32_4tech codepoint, u8_4tech *buffer, u32_4tech 
*length_o buffer[2] = (u8_4tech)(0x80 | (codepoint & 0x3F)); *length_out = 3; } - else{ + else if (codepoint <= 0x10FFFF){ codepoint &= 0x001FFFFF; buffer[0] = (u8_4tech)(0xF0 | (codepoint >> 18)); buffer[1] = (u8_4tech)(0x80 | ((codepoint >> 12) & 0x3F)); @@ -179,6 +206,9 @@ u32_to_utf8_unchecked(u32_4tech codepoint, u8_4tech *buffer, u32_4tech *length_o buffer[3] = (u8_4tech)(0x80 | (codepoint & 0x3F)); *length_out = 4; } + else{ + *length_out = 0; + } } static umem_4tech @@ -223,7 +253,7 @@ utf8_to_utf16_minimal_checking(u16_4tech *dst, umem_4tech max_wchars, u8_4tech * codepoint |= ((u32_4tech)((s[2])&0x3F)); utf8_size = 3; } - else{ + else if (s[0] < 0xF8){ if (limit > 3){ *error = true; break; @@ -235,6 +265,15 @@ utf8_to_utf16_minimal_checking(u16_4tech *dst, umem_4tech max_wchars, u8_4tech * codepoint |= ((u32_4tech)((s[3])&0x3F)); utf8_size = 4; } + else{ + *error = true; + break; + } + + if (codepoint < cp_min_by_utf8_length[utf8_size]){ + *error = true; + break; + } s += utf8_size; limit -= utf8_size; diff --git a/4ed_translation.cpp b/4ed_translation.cpp index 259240c0..c56b11aa 100644 --- a/4ed_translation.cpp +++ b/4ed_translation.cpp @@ -44,16 +44,17 @@ struct Translation_Emits{ u32 step_count; }; -#define SINGLE_BYTE_ERROR_CLASS max_u8 +#define ERROR_BYTE (max_u8-1) +#define CONTINUATION_BYTE max_u8 internal void translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Translation_Byte_Description *desc_out){ desc_out->byte_class = 0; - if ((ch >= ' ' && ch < 0x7F) || ch == '\t' || ch == '\n' || ch == '\r'){ + if (ch < 0x80){ desc_out->byte_class = 1; } else if (ch < 0xC0){ - desc_out->byte_class = SINGLE_BYTE_ERROR_CLASS; + desc_out->byte_class = CONTINUATION_BYTE; } else if (ch < 0xE0){ desc_out->byte_class = 2; @@ -61,9 +62,12 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl else if (ch < 0xF0){ desc_out->byte_class = 3; } - else{ + else if (ch < 0xF8){ desc_out->byte_class = 4; } + else{ + 
desc_out->byte_class = ERROR_BYTE; + } desc_out->prelim_emit_type = BufferModelUnit_None; desc_out->last_byte_handler = TranLBH_None; @@ -75,7 +79,7 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl if (desc_out->byte_class == 1){ desc_out->prelim_emit_type = BufferModelUnit_Codepoint; } - else if (desc_out->byte_class == 0 || desc_out->byte_class == SINGLE_BYTE_ERROR_CLASS){ + else if (desc_out->byte_class == 0 || desc_out->byte_class == CONTINUATION_BYTE || desc_out->byte_class == ERROR_BYTE){ desc_out->prelim_emit_type = BufferModelUnit_Numbers; } else{ @@ -83,7 +87,7 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl } } else{ - if (desc_out->byte_class == SINGLE_BYTE_ERROR_CLASS){ + if (desc_out->byte_class == CONTINUATION_BYTE){ tran->fill_buffer[tran->fill_i] = ch; ++tran->fill_i; @@ -138,8 +142,13 @@ translating_select_emit_rule_with_font(System_Functions *system, Render_Font *fo type_out->codepoint_length = 0; if (desc.prelim_emit_type == BufferModelUnit_Codepoint){ u32 cp = utf8_to_u32_length_unchecked(tran->fill_buffer, &type_out->codepoint_length); - type_out->codepoint = cp; - if (!font_can_render(system, font, cp)){ + if (type_out->codepoint_length != 0){ + type_out->codepoint = cp; + if (!font_can_render(system, font, cp)){ + type_out->emit_type = BufferModelUnit_Numbers; + } + } + else{ type_out->emit_type = BufferModelUnit_Numbers; } }