Bulletproofed the UTF-8 translation

master
Allen Webster 2017-03-25 14:04:27 -04:00
parent 3c3f4bd80b
commit 8f8919fd08
2 changed files with 59 additions and 11 deletions

View File

@ -40,6 +40,20 @@ typedef int32_t b32_4tech;
#endif #endif
// standard preamble end // standard preamble end
// Lower bound on the codepoint value for each UTF-8 sequence length; the
// array is indexed by the sequence length in bytes. The decoders shown in
// this change compare the decoded value against the entry for its length
// and reject anything smaller (an overlong encoding).
static u32_4tech cp_min_by_utf8_length[] = {
0x0, // length 0
0x0, // length 1
0x80, // length 2
0x800, // length 3
0x10000, // length 4
};
// Inclusive range of UTF-16 surrogate values; decoded values inside this
// range are rejected by the decoders.
static u32_4tech surrogate_min = 0xD800;
static u32_4tech surrogate_max = 0xDFFF;
// Inclusive range U+FDD0..U+FDEF; decoded values inside this range are
// rejected by the decoders.
// NOTE(review): the decoders additionally test ((result & 0xFFFF) >= 0xFE).
// If that test is meant to catch the U+nFFFE / U+nFFFF noncharacters, the
// constant would need to be 0xFFFE; as written it also rejects values such
// as U+00FF - confirm intent.
static u32_4tech nonchar_min = 0xFDD0;
static u32_4tech nonchar_max = 0xFDEF;
static b32_4tech static b32_4tech
codepoint_is_whitespace(u32_4tech codepoint){ codepoint_is_whitespace(u32_4tech codepoint){
b32_4tech result = false; b32_4tech result = false;
@ -76,6 +90,11 @@ utf8_to_u32_length_unchecked(u8_4tech *buffer, u32_4tech *length_out){
*length_out = 4; *length_out = 4;
} }
if (result < cp_min_by_utf8_length[*length_out] || (result >= surrogate_min && result <= surrogate_max) || (result >= nonchar_min && result <= nonchar_max) || ((result & 0xFFFF) >= 0xFE)){
result = 0;
*length_out = 0;
}
return(result); return(result);
} }
@ -104,9 +123,12 @@ utf8_to_u32(u8_4tech **buffer_ptr, u8_4tech *end){
else if (buffer[0] < 0xF0){ else if (buffer[0] < 0xF0){
length = 3; length = 3;
} }
else{ else if (buffer[0] < 0xF8){
length = 4; length = 4;
} }
else{
length = 0;
}
for (u32_4tech i = 1; i < length; ++i){ for (u32_4tech i = 1; i < length; ++i){
if ((buffer[i] & 0xC0) != 0x80){ if ((buffer[i] & 0xC0) != 0x80){
@ -145,6 +167,11 @@ utf8_to_u32(u8_4tech **buffer_ptr, u8_4tech *end){
}break; }break;
} }
if (result < cp_min_by_utf8_length[length] || (result >= surrogate_min && result <= surrogate_max) || (result >= nonchar_min && result <= nonchar_max) || ((result & 0xFFFF) >= 0xFE)){
result = 0;
length = 0;
}
*buffer_ptr = buffer + length; *buffer_ptr = buffer + length;
} }
else{ else{
@ -171,7 +198,7 @@ u32_to_utf8_unchecked(u32_4tech codepoint, u8_4tech *buffer, u32_4tech *length_o
buffer[2] = (u8_4tech)(0x80 | (codepoint & 0x3F)); buffer[2] = (u8_4tech)(0x80 | (codepoint & 0x3F));
*length_out = 3; *length_out = 3;
} }
else{ else if (codepoint <= 0x10FFFF){
codepoint &= 0x001FFFFF; codepoint &= 0x001FFFFF;
buffer[0] = (u8_4tech)(0xF0 | (codepoint >> 18)); buffer[0] = (u8_4tech)(0xF0 | (codepoint >> 18));
buffer[1] = (u8_4tech)(0x80 | ((codepoint >> 12) & 0x3F)); buffer[1] = (u8_4tech)(0x80 | ((codepoint >> 12) & 0x3F));
@ -179,6 +206,9 @@ u32_to_utf8_unchecked(u32_4tech codepoint, u8_4tech *buffer, u32_4tech *length_o
buffer[3] = (u8_4tech)(0x80 | (codepoint & 0x3F)); buffer[3] = (u8_4tech)(0x80 | (codepoint & 0x3F));
*length_out = 4; *length_out = 4;
} }
else{
*length_out = 0;
}
} }
static umem_4tech static umem_4tech
@ -223,7 +253,7 @@ utf8_to_utf16_minimal_checking(u16_4tech *dst, umem_4tech max_wchars, u8_4tech *
codepoint |= ((u32_4tech)((s[2])&0x3F)); codepoint |= ((u32_4tech)((s[2])&0x3F));
utf8_size = 3; utf8_size = 3;
} }
else{ else if (s[0] < 0xF8){
if (limit > 3){ if (limit > 3){
*error = true; *error = true;
break; break;
@ -235,6 +265,15 @@ utf8_to_utf16_minimal_checking(u16_4tech *dst, umem_4tech max_wchars, u8_4tech *
codepoint |= ((u32_4tech)((s[3])&0x3F)); codepoint |= ((u32_4tech)((s[3])&0x3F));
utf8_size = 4; utf8_size = 4;
} }
else{
*error = true;
break;
}
if (codepoint < cp_min_by_utf8_length[utf8_size]){
*error = true;
break;
}
s += utf8_size; s += utf8_size;
limit -= utf8_size; limit -= utf8_size;

View File

@ -44,16 +44,17 @@ struct Translation_Emits{
u32 step_count; u32 step_count;
}; };
#define SINGLE_BYTE_ERROR_CLASS max_u8 #define ERROR_BYTE (max_u8-1)
#define CONTINUATION_BYTE max_u8
internal void internal void
translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Translation_Byte_Description *desc_out){ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Translation_Byte_Description *desc_out){
desc_out->byte_class = 0; desc_out->byte_class = 0;
if ((ch >= ' ' && ch < 0x7F) || ch == '\t' || ch == '\n' || ch == '\r'){ if (ch < 0x80){
desc_out->byte_class = 1; desc_out->byte_class = 1;
} }
else if (ch < 0xC0){ else if (ch < 0xC0){
desc_out->byte_class = SINGLE_BYTE_ERROR_CLASS; desc_out->byte_class = CONTINUATION_BYTE;
} }
else if (ch < 0xE0){ else if (ch < 0xE0){
desc_out->byte_class = 2; desc_out->byte_class = 2;
@ -61,9 +62,12 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl
else if (ch < 0xF0){ else if (ch < 0xF0){
desc_out->byte_class = 3; desc_out->byte_class = 3;
} }
else{ else if (ch < 0xF8){
desc_out->byte_class = 4; desc_out->byte_class = 4;
} }
else{
desc_out->byte_class = ERROR_BYTE;
}
desc_out->prelim_emit_type = BufferModelUnit_None; desc_out->prelim_emit_type = BufferModelUnit_None;
desc_out->last_byte_handler = TranLBH_None; desc_out->last_byte_handler = TranLBH_None;
@ -75,7 +79,7 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl
if (desc_out->byte_class == 1){ if (desc_out->byte_class == 1){
desc_out->prelim_emit_type = BufferModelUnit_Codepoint; desc_out->prelim_emit_type = BufferModelUnit_Codepoint;
} }
else if (desc_out->byte_class == 0 || desc_out->byte_class == SINGLE_BYTE_ERROR_CLASS){ else if (desc_out->byte_class == 0 || desc_out->byte_class == CONTINUATION_BYTE || desc_out->byte_class == ERROR_BYTE){
desc_out->prelim_emit_type = BufferModelUnit_Numbers; desc_out->prelim_emit_type = BufferModelUnit_Numbers;
} }
else{ else{
@ -83,7 +87,7 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl
} }
} }
else{ else{
if (desc_out->byte_class == SINGLE_BYTE_ERROR_CLASS){ if (desc_out->byte_class == CONTINUATION_BYTE){
tran->fill_buffer[tran->fill_i] = ch; tran->fill_buffer[tran->fill_i] = ch;
++tran->fill_i; ++tran->fill_i;
@ -138,8 +142,13 @@ translating_select_emit_rule_with_font(System_Functions *system, Render_Font *fo
type_out->codepoint_length = 0; type_out->codepoint_length = 0;
if (desc.prelim_emit_type == BufferModelUnit_Codepoint){ if (desc.prelim_emit_type == BufferModelUnit_Codepoint){
u32 cp = utf8_to_u32_length_unchecked(tran->fill_buffer, &type_out->codepoint_length); u32 cp = utf8_to_u32_length_unchecked(tran->fill_buffer, &type_out->codepoint_length);
type_out->codepoint = cp; if (type_out->codepoint_length != 0){
if (!font_can_render(system, font, cp)){ type_out->codepoint = cp;
if (!font_can_render(system, font, cp)){
type_out->emit_type = BufferModelUnit_Numbers;
}
}
else{
type_out->emit_type = BufferModelUnit_Numbers; type_out->emit_type = BufferModelUnit_Numbers;
} }
} }