experimenting with extended utf8
parent
7521c2f436
commit
a80ff44e04
|
@ -68,13 +68,30 @@ utf8_to_u32_length_unchecked(u8_4tech *buffer, u32_4tech *length_out){
|
||||||
result |= ((u32_4tech)((buffer[2])&0x3F));
|
result |= ((u32_4tech)((buffer[2])&0x3F));
|
||||||
*length_out = 3;
|
*length_out = 3;
|
||||||
}
|
}
|
||||||
else{
|
else if (buffer[0] < 0xF8){
|
||||||
result = ((u32_4tech)((buffer[0])&0x07)) << 18;
|
result = ((u32_4tech)((buffer[0])&0x07)) << 18;
|
||||||
result |= ((u32_4tech)((buffer[1])&0x3F)) << 12;
|
result |= ((u32_4tech)((buffer[1])&0x3F)) << 12;
|
||||||
result |= ((u32_4tech)((buffer[2])&0x3F)) << 6;
|
result |= ((u32_4tech)((buffer[2])&0x3F)) << 6;
|
||||||
result |= ((u32_4tech)((buffer[3])&0x3F));
|
result |= ((u32_4tech)((buffer[3])&0x3F));
|
||||||
*length_out = 4;
|
*length_out = 4;
|
||||||
}
|
}
|
||||||
|
else if (buffer[0] < 0xFC){
|
||||||
|
result = ((u32_4tech)((buffer[0])&0x03)) << 24;
|
||||||
|
result |= ((u32_4tech)((buffer[1])&0x3F)) << 18;
|
||||||
|
result |= ((u32_4tech)((buffer[2])&0x3F)) << 12;
|
||||||
|
result |= ((u32_4tech)((buffer[3])&0x3F)) << 6;
|
||||||
|
result |= ((u32_4tech)((buffer[4])&0x3F));
|
||||||
|
*length_out = 5;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
result = ((u32_4tech)((buffer[0])&0x01)) << 30;
|
||||||
|
result |= ((u32_4tech)((buffer[1])&0x3F)) << 24; // continuation byte: keep all 6 payload bits (0x03 dropped bits 26-29)
|
||||||
|
result |= ((u32_4tech)((buffer[2])&0x3F)) << 18;
|
||||||
|
result |= ((u32_4tech)((buffer[3])&0x3F)) << 12;
|
||||||
|
result |= ((u32_4tech)((buffer[4])&0x3F)) << 6;
|
||||||
|
result |= ((u32_4tech)((buffer[5])&0x3F));
|
||||||
|
*length_out = 6;
|
||||||
|
}
|
||||||
|
|
||||||
return(result);
|
return(result);
|
||||||
}
|
}
|
||||||
|
@ -223,8 +240,8 @@ utf8_to_utf16_minimal_checking(u16_4tech *dst, umem_4tech max_wchars, u8_4tech *
|
||||||
codepoint |= ((u32_4tech)((s[2])&0x3F));
|
codepoint |= ((u32_4tech)((s[2])&0x3F));
|
||||||
utf8_size = 3;
|
utf8_size = 3;
|
||||||
}
|
}
|
||||||
else{
|
else if (s[0] < 0xF8){
|
||||||
if (limit > 3){
|
if (limit <= 3){
|
||||||
*error = true;
|
*error = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -235,6 +252,33 @@ utf8_to_utf16_minimal_checking(u16_4tech *dst, umem_4tech max_wchars, u8_4tech *
|
||||||
codepoint |= ((u32_4tech)((s[3])&0x3F));
|
codepoint |= ((u32_4tech)((s[3])&0x3F));
|
||||||
utf8_size = 4;
|
utf8_size = 4;
|
||||||
}
|
}
|
||||||
|
else if (s[0] < 0xFC){
|
||||||
|
if (limit <= 4){
|
||||||
|
*error = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
codepoint = ((u32_4tech)((s[0])&0x03)) << 24;
|
||||||
|
codepoint |= ((u32_4tech)((s[1])&0x3F)) << 18;
|
||||||
|
codepoint |= ((u32_4tech)((s[2])&0x3F)) << 12;
|
||||||
|
codepoint |= ((u32_4tech)((s[3])&0x3F)) << 6;
|
||||||
|
codepoint |= ((u32_4tech)((s[4])&0x3F));
|
||||||
|
utf8_size = 5;
|
||||||
|
}
|
||||||
|
else if (s[0] < 0xFE){
|
||||||
|
if (limit <= 5){
|
||||||
|
*error = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
codepoint = ((u32_4tech)((s[0])&0x01)) << 30;
|
||||||
|
codepoint |= ((u32_4tech)((s[1])&0x3F)) << 24;
|
||||||
|
codepoint |= ((u32_4tech)((s[2])&0x3F)) << 18;
|
||||||
|
codepoint |= ((u32_4tech)((s[3])&0x3F)) << 12;
|
||||||
|
codepoint |= ((u32_4tech)((s[4])&0x3F)) << 6;
|
||||||
|
codepoint |= ((u32_4tech)((s[5])&0x3F));
|
||||||
|
utf8_size = 6;
|
||||||
|
}
|
||||||
|
|
||||||
s += utf8_size;
|
s += utf8_size;
|
||||||
limit -= utf8_size;
|
limit -= utf8_size;
|
||||||
|
@ -320,7 +364,7 @@ utf16_to_utf8_minimal_checking(u8_4tech *dst, umem_4tech max_chars, u16_4tech *s
|
||||||
s += utf16_size;
|
s += utf16_size;
|
||||||
limit -= utf16_size;
|
limit -= utf16_size;
|
||||||
|
|
||||||
u8_4tech d_fill[4];
|
u8_4tech d_fill[6];
|
||||||
u32_4tech d_fill_count = 0;
|
u32_4tech d_fill_count = 0;
|
||||||
|
|
||||||
if (codepoint <= 0x7F){
|
if (codepoint <= 0x7F){
|
||||||
|
@ -335,16 +379,33 @@ utf16_to_utf8_minimal_checking(u8_4tech *dst, umem_4tech max_chars, u16_4tech *s
|
||||||
else if (codepoint <= 0xFFFF){
|
else if (codepoint <= 0xFFFF){
|
||||||
d_fill[0] = (u8_4tech)(0xE0 | ( codepoint >> 12));
|
d_fill[0] = (u8_4tech)(0xE0 | ( codepoint >> 12));
|
||||||
d_fill[1] = (u8_4tech)(0x80 | ((codepoint >> 6 ) & 0x3F));
|
d_fill[1] = (u8_4tech)(0x80 | ((codepoint >> 6 ) & 0x3F));
|
||||||
d_fill[2] = (u8_4tech)(0x80 | (codepoint & 0x3F));
|
d_fill[2] = (u8_4tech)(0x80 | ((codepoint) & 0x3F));
|
||||||
d_fill_count = 3;
|
d_fill_count = 3;
|
||||||
}
|
}
|
||||||
else if (codepoint <= 0x10FFFF){
|
else if (codepoint <= 0x1FFFFF){
|
||||||
d_fill[0] = (u8_4tech)(0xF0 | ( codepoint >> 18));
|
d_fill[0] = (u8_4tech)(0xF0 | ( codepoint >> 18));
|
||||||
d_fill[1] = (u8_4tech)(0x80 | ((codepoint >> 12) & 0x3F));
|
d_fill[1] = (u8_4tech)(0x80 | ((codepoint >> 12) & 0x3F));
|
||||||
d_fill[2] = (u8_4tech)(0x80 | ((codepoint >> 6 ) & 0x3F));
|
d_fill[2] = (u8_4tech)(0x80 | ((codepoint >> 6 ) & 0x3F));
|
||||||
d_fill[3] = (u8_4tech)(0x80 | (codepoint & 0x3F));
|
d_fill[3] = (u8_4tech)(0x80 | ((codepoint) & 0x3F));
|
||||||
d_fill_count = 4;
|
d_fill_count = 4;
|
||||||
}
|
}
|
||||||
|
else if (codepoint <= 0x3FFFFFF){
|
||||||
|
d_fill[0] = (u8_4tech)(0xF8 | ( codepoint >> 24));
|
||||||
|
d_fill[1] = (u8_4tech)(0x80 | ((codepoint >> 18) & 0x3F));
|
||||||
|
d_fill[2] = (u8_4tech)(0x80 | ((codepoint >> 12) & 0x3F));
|
||||||
|
d_fill[3] = (u8_4tech)(0x80 | ((codepoint >> 6 ) & 0x3F));
|
||||||
|
d_fill[4] = (u8_4tech)(0x80 | ((codepoint) & 0x3F));
|
||||||
|
d_fill_count = 5;
|
||||||
|
}
|
||||||
|
else if (codepoint <= 0x7FFFFFFF){
|
||||||
|
d_fill[0] = (u8_4tech)(0xFC | ( codepoint >> 30));
|
||||||
|
d_fill[1] = (u8_4tech)(0x80 | ((codepoint >> 24) & 0x3F));
|
||||||
|
d_fill[2] = (u8_4tech)(0x80 | ((codepoint >> 18) & 0x3F));
|
||||||
|
d_fill[3] = (u8_4tech)(0x80 | ((codepoint >> 12) & 0x3F));
|
||||||
|
d_fill[4] = (u8_4tech)(0x80 | ((codepoint >> 6 ) & 0x3F));
|
||||||
|
d_fill[5] = (u8_4tech)(0x80 | ((codepoint) & 0x3F));
|
||||||
|
d_fill_count = 6;
|
||||||
|
}
|
||||||
else{
|
else{
|
||||||
*error = true;
|
*error = true;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -12,7 +12,7 @@
|
||||||
#include "4ed_buffer_model.h"
|
#include "4ed_buffer_model.h"
|
||||||
|
|
||||||
struct Translation_State{
|
struct Translation_State{
|
||||||
u8 fill_buffer[4];
|
u8 fill_buffer[6];
|
||||||
u32 fill_start_i;
|
u32 fill_start_i;
|
||||||
u8 fill_i;
|
u8 fill_i;
|
||||||
u8 fill_expected;
|
u8 fill_expected;
|
||||||
|
@ -40,20 +40,20 @@ struct Translation_Emit_Rule{
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Translation_Emits{
|
struct Translation_Emits{
|
||||||
Buffer_Model_Step steps[5];
|
Buffer_Model_Step steps[7];
|
||||||
u32 step_count;
|
u32 step_count;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define SINGLE_BYTE_ERROR_CLASS max_u8
|
#define CONTINUATION_BYTE max_u8
|
||||||
|
|
||||||
internal void
|
internal void
|
||||||
translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Translation_Byte_Description *desc_out){
|
translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Translation_Byte_Description *desc_out){
|
||||||
desc_out->byte_class = 0;
|
desc_out->byte_class = 0;
|
||||||
if ((ch >= ' ' && ch < 0x7F) || ch == '\t' || ch == '\n' || ch == '\r'){
|
if (ch < 0x80){
|
||||||
desc_out->byte_class = 1;
|
desc_out->byte_class = 1;
|
||||||
}
|
}
|
||||||
else if (ch < 0xC0){
|
else if (ch < 0xC0){
|
||||||
desc_out->byte_class = SINGLE_BYTE_ERROR_CLASS;
|
desc_out->byte_class = CONTINUATION_BYTE;
|
||||||
}
|
}
|
||||||
else if (ch < 0xE0){
|
else if (ch < 0xE0){
|
||||||
desc_out->byte_class = 2;
|
desc_out->byte_class = 2;
|
||||||
|
@ -61,9 +61,15 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl
|
||||||
else if (ch < 0xF0){
|
else if (ch < 0xF0){
|
||||||
desc_out->byte_class = 3;
|
desc_out->byte_class = 3;
|
||||||
}
|
}
|
||||||
else{
|
else if (ch < 0xF8){
|
||||||
desc_out->byte_class = 4;
|
desc_out->byte_class = 4;
|
||||||
}
|
}
|
||||||
|
else if (ch < 0xFC){
|
||||||
|
desc_out->byte_class = 5;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
desc_out->byte_class = 6;
|
||||||
|
}
|
||||||
|
|
||||||
desc_out->prelim_emit_type = BufferModelUnit_None;
|
desc_out->prelim_emit_type = BufferModelUnit_None;
|
||||||
desc_out->last_byte_handler = TranLBH_None;
|
desc_out->last_byte_handler = TranLBH_None;
|
||||||
|
@ -75,7 +81,7 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl
|
||||||
if (desc_out->byte_class == 1){
|
if (desc_out->byte_class == 1){
|
||||||
desc_out->prelim_emit_type = BufferModelUnit_Codepoint;
|
desc_out->prelim_emit_type = BufferModelUnit_Codepoint;
|
||||||
}
|
}
|
||||||
else if (desc_out->byte_class == 0 || desc_out->byte_class == SINGLE_BYTE_ERROR_CLASS){
|
else if (desc_out->byte_class == 0 || desc_out->byte_class == CONTINUATION_BYTE){
|
||||||
desc_out->prelim_emit_type = BufferModelUnit_Numbers;
|
desc_out->prelim_emit_type = BufferModelUnit_Numbers;
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
|
@ -83,7 +89,7 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
if (desc_out->byte_class == SINGLE_BYTE_ERROR_CLASS){
|
if (desc_out->byte_class == CONTINUATION_BYTE){
|
||||||
tran->fill_buffer[tran->fill_i] = ch;
|
tran->fill_buffer[tran->fill_i] = ch;
|
||||||
++tran->fill_i;
|
++tran->fill_i;
|
||||||
|
|
||||||
|
@ -92,7 +98,7 @@ translating_consume_byte(Translation_State *tran, u8 ch, u32 i, u32 size, Transl
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
if (desc_out->byte_class >= 2 && desc_out->byte_class <= 4){
|
if (desc_out->byte_class >= 2 && desc_out->byte_class <= 6){
|
||||||
desc_out->last_byte_handler = TranLBH_Rebuffer;
|
desc_out->last_byte_handler = TranLBH_Rebuffer;
|
||||||
}
|
}
|
||||||
else if (desc_out->byte_class == 1){
|
else if (desc_out->byte_class == 1){
|
||||||
|
|
Loading…
Reference in New Issue