From 3c3f4bd80bad5068eb40b7339423ae2f287ce240 Mon Sep 17 00:00:00 2001 From: Allen Webster Date: Sat, 25 Mar 2017 13:05:38 -0400 Subject: [PATCH] Revert "experimenting with extended utf8" This reverts commit a80ff44e04ce51afff50c80941f6024f488afb4a. --- 4coder_lib/4coder_utf8.h | 95 +++++++--------------------------------- 4ed_translation.cpp | 24 ++++------ 2 files changed, 26 insertions(+), 93 deletions(-) diff --git a/4coder_lib/4coder_utf8.h b/4coder_lib/4coder_utf8.h index b4db6262..22454d63 100644 --- a/4coder_lib/4coder_utf8.h +++ b/4coder_lib/4coder_utf8.h @@ -58,40 +58,23 @@ utf8_to_u32_length_unchecked(u8_4tech *buffer, u32_4tech *length_out){ *length_out = 1; } else if (buffer[0] < 0xE0){ - result = ((u32_4tech)((buffer[0])&0x1F)) << 6; + result = ((u32_4tech)((buffer[0])&0x1F)) << 6; result |= ((u32_4tech)((buffer[1])&0x3F)); *length_out = 2; } else if (buffer[0] < 0xF0){ - result = ((u32_4tech)((buffer[0])&0x0F)) << 12; + result = ((u32_4tech)((buffer[0])&0x0F)) << 12; result |= ((u32_4tech)((buffer[1])&0x3F)) << 6; result |= ((u32_4tech)((buffer[2])&0x3F)); *length_out = 3; } - else if (buffer[0] < 0xF8){ - result = ((u32_4tech)((buffer[0])&0x07)) << 18; + else{ + result = ((u32_4tech)((buffer[0])&0x07)) << 18; result |= ((u32_4tech)((buffer[1])&0x3F)) << 12; result |= ((u32_4tech)((buffer[2])&0x3F)) << 6; result |= ((u32_4tech)((buffer[3])&0x3F)); *length_out = 4; } - else if (buffer[0] < 0xFC){ - result = ((u32_4tech)((buffer[0])&0x03)) << 24; - result |= ((u32_4tech)((buffer[1])&0x3F)) << 18; - result |= ((u32_4tech)((buffer[2])&0x3F)) << 12; - result |= ((u32_4tech)((buffer[3])&0x3F)) << 6; - result |= ((u32_4tech)((buffer[4])&0x3F)); - *length_out = 5; - } - else{ - result = ((u32_4tech)((buffer[0])&0x01)) << 30; - result |= ((u32_4tech)((buffer[1])&0x03)) << 24; - result |= ((u32_4tech)((buffer[2])&0x3F)) << 18; - result |= ((u32_4tech)((buffer[3])&0x3F)) << 12; - result |= ((u32_4tech)((buffer[4])&0x3F)) << 6; - result |= ((u32_4tech)((buffer[5])&0x3F)); - *length_out = 6; - } return(result); } @@ -225,7 +208,7 @@ utf8_to_utf16_minimal_checking(u16_4tech *dst, umem_4tech max_wchars, u8_4tech * break; } - codepoint = ((u32_4tech)((s[0])&0x1F)) << 6; + codepoint = ((u32_4tech)((s[0])&0x1F)) << 6; codepoint |= ((u32_4tech)((s[1])&0x3F)); utf8_size = 2; } @@ -235,50 +218,23 @@ utf8_to_utf16_minimal_checking(u16_4tech *dst, umem_4tech max_wchars, u8_4tech * break; } - codepoint = ((u32_4tech)((s[0])&0x0F)) << 12; + codepoint = ((u32_4tech)((s[0])&0x0F)) << 12; codepoint |= ((u32_4tech)((s[1])&0x3F)) << 6; codepoint |= ((u32_4tech)((s[2])&0x3F)); utf8_size = 3; } - else if (s[0] < 0xF8){ - if (limit <= 3){ + else{ + if (limit > 3){ *error = true; break; } - codepoint = ((u32_4tech)((s[0])&0x07)) << 18; + codepoint = ((u32_4tech)((s[0])&0x07)) << 18; codepoint |= ((u32_4tech)((s[1])&0x3F)) << 12; codepoint |= ((u32_4tech)((s[2])&0x3F)) << 6; codepoint |= ((u32_4tech)((s[3])&0x3F)); utf8_size = 4; } - else if (s[0] < 0xFC){ - if (limit <= 4){ - *error = true; - break; - } - - codepoint = ((u32_4tech)((s[0])&0x03)) << 24; - codepoint |= ((u32_4tech)((s[1])&0x3F)) << 18; - codepoint |= ((u32_4tech)((s[2])&0x3F)) << 12; - codepoint |= ((u32_4tech)((s[3])&0x3F)) << 6; - codepoint |= ((u32_4tech)((s[4])&0x3F)); - utf8_size = 5; - } - else if (s[0] < 0xFE){ - if (limit <= 5){ - *error = true; - break; - } - - codepoint = ((u32_4tech)((s[0])&0x01)) << 30; - codepoint |= ((u32_4tech)((s[1])&0x3F)) << 24; - codepoint |= ((u32_4tech)((s[2])&0x3F)) << 18; - codepoint |= ((u32_4tech)((s[3])&0x3F)) << 12; - codepoint |= ((u32_4tech)((s[4])&0x3F)) << 6; - codepoint |= ((u32_4tech)((s[5])&0x3F)); - utf8_size = 6; - } s += utf8_size; limit -= utf8_size; @@ -364,7 +320,7 @@ utf16_to_utf8_minimal_checking(u8_4tech *dst, umem_4tech max_chars, u16_4tech *s s += utf16_size; limit -= utf16_size; - u8_4tech d_fill[6]; + u8_4tech d_fill[4]; u32_4tech d_fill_count = 0; if (codepoint <= 0x7F){ @@ -377,35 +333,18 @@ utf16_to_utf8_minimal_checking(u8_4tech *dst, umem_4tech max_chars, u16_4tech *s d_fill_count = 2; } else if (codepoint <= 0xFFFF){ - d_fill[0] = (u8_4tech)(0xE0 | ( codepoint >> 12)); - d_fill[1] = (u8_4tech)(0x80 | ((codepoint >> 6 ) & 0x3F)); - d_fill[2] = (u8_4tech)(0x80 | ((codepoint) & 0x3F)); + d_fill[0] = (u8_4tech)(0xE0 | (codepoint >> 12)); + d_fill[1] = (u8_4tech)(0x80 | ((codepoint >> 6) & 0x3F)); + d_fill[2] = (u8_4tech)(0x80 | (codepoint & 0x3F)); d_fill_count = 3; } - else if (codepoint <= 0x1FFFFF){ - d_fill[0] = (u8_4tech)(0xF0 | ( codepoint >> 18)); + else if (codepoint <= 0x10FFFF){ + d_fill[0] = (u8_4tech)(0xF0 | (codepoint >> 18)); d_fill[1] = (u8_4tech)(0x80 | ((codepoint >> 12) & 0x3F)); - d_fill[2] = (u8_4tech)(0x80 | ((codepoint >> 6 ) & 0x3F)); - d_fill[3] = (u8_4tech)(0x80 | ((codepoint) & 0x3F)); + d_fill[2] = (u8_4tech)(0x80 | ((codepoint >> 6) & 0x3F)); + d_fill[3] = (u8_4tech)(0x80 | (codepoint & 0x3F)); d_fill_count = 4; } - else if (codepoint <= 0x3FFFFFF){ - d_fill[0] = (u8_4tech)(0xF8 | ( codepoint >> 24)); - d_fill[1] = (u8_4tech)(0x80 | ((codepoint >> 18) & 0x3F)); - d_fill[2] = (u8_4tech)(0x80 | ((codepoint >> 12) & 0x3F)); - d_fill[3] = (u8_4tech)(0x80 | ((codepoint >> 6 ) & 0x3F)); - d_fill[4] = (u8_4tech)(0x80 | ((codepoint) & 0x3F)); - d_fill_count = 5; - } - else if (codepoint <= 0x7FFFFFFF){ - d_fill[0] = (u8_4tech)(0xFC | ( codepoint >> 30)); - d_fill[1] = (u8_4tech)(0x80 | ((codepoint >> 24) & 0x3F)); - d_fill[2] = (u8_4tech)(0x80 | ((codepoint >> 18) & 0x3F)); - d_fill[3] = (u8_4tech)(0x80 | ((codepoint >> 12) & 0x3F)); - d_fill[4] = (u8_4tech)(0x80 | ((codepoint >> 6 ) & 0x3F)); - d_fill[5] = (u8_4tech)(0x80 | ((codepoint) & 0x3F)); - d_fill_count = 6; - } else{ *error = true; break; diff --git a/4ed_translation.cpp b/4ed_translation.cpp index 0e5c4853..259240c0 100644 --- a/4ed_translation.cpp +++ b/4ed_translation.cpp @@ -12,7 +12,7 @@ #include "4ed_buffer_model.h" struct Translation_State{ - u8 fill_buffer[6]; + u8 fill_buffer[4]; u32 fill_start_i; u8 fill_i; u8 fill_expected; @@ -40,20 +40,20 @@ struct Translation_Emit_Rule{ }; struct Translation_Emits{ - Buffer_Model_Step steps[7]; + Buffer_Model_Step steps[5]; u32 step_count; }; -#define CONTINUATION_BYTE max_u8 +#define SINGLE_BYTE_ERROR_CLASS max_u8 internal void translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Translation_Byte_Description *desc_out){ desc_out->byte_class = 0; - if (ch < 0x80){ + if ((ch >= ' ' && ch < 0x7F) || ch == '\t' || ch == '\n' || ch == '\r'){ desc_out->byte_class = 1; } else if (ch < 0xC0){ - desc_out->byte_class = CONTINUATION_BYTE; + desc_out->byte_class = SINGLE_BYTE_ERROR_CLASS; } else if (ch < 0xE0){ desc_out->byte_class = 2; @@ -61,14 +61,8 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl else if (ch < 0xF0){ desc_out->byte_class = 3; } - else if (ch < 0xF8){ - desc_out->byte_class = 4; - } - else if (ch < 0xFC){ - desc_out->byte_class = 5; - } else{ - desc_out->byte_class = 6; + desc_out->byte_class = 4; } desc_out->prelim_emit_type = BufferModelUnit_None; @@ -81,7 +75,7 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl if (desc_out->byte_class == 1){ desc_out->prelim_emit_type = BufferModelUnit_Codepoint; } - else if (desc_out->byte_class == 0 || desc_out->byte_class == CONTINUATION_BYTE){ + else if (desc_out->byte_class == 0 || desc_out->byte_class == SINGLE_BYTE_ERROR_CLASS){ desc_out->prelim_emit_type = BufferModelUnit_Numbers; } else{ @@ -89,7 +83,7 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl } } else{ - if (desc_out->byte_class == CONTINUATION_BYTE){ + if (desc_out->byte_class == SINGLE_BYTE_ERROR_CLASS){ tran->fill_buffer[tran->fill_i] = ch; ++tran->fill_i; @@ -98,7 +92,7 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl } } else{ - if (desc_out->byte_class >= 2 && desc_out->byte_class <= 6){ + if (desc_out->byte_class >= 2 && desc_out->byte_class <= 4){ desc_out->last_byte_handler = TranLBH_Rebuffer; } else if (desc_out->byte_class == 1){