396 lines
9.6 KiB
C
396 lines
9.6 KiB
C
/*
|
|
* Mr. 4th Dimention - Allen Webster
|
|
*
|
|
* 17.02.2017
|
|
*
|
|
* Code for converting between utf8 and the ANSI and utf16 text encodings.
|
|
*
|
|
*/
|
|
|
|
// TOP
|
|
|
|
#if !defined(FED_UTF8_CONVERSION_H)
|
|
#define FED_UTF8_CONVERSION_H
|
|
|
|
// Smallest codepoint accepted for a utf8 sequence of a given byte length,
// indexed by that length.  The decoders in this file treat a decoded value
// below the entry for its length as invalid (an overlong encoding).
static u32 cp_min_by_utf8_length[] = {
    0x0,     // [0] placeholder so the table can be indexed directly by length
    0x0,     // [1] one byte sequences
    0x80,    // [2] two byte sequences
    0x800,   // [3] three byte sequences
    0x10000, // [4] four byte sequences
};
|
|
|
|
// Inclusive bounds of the UTF-16 surrogate range.  The utf8 decoders in this
// file reject any decoded codepoint that lands inside it.
static u32 surrogate_min = 0xD800;
static u32 surrogate_max = 0xDFFF;

// Bounds of the U+FDD0..U+FDEF block of Unicode noncharacters.
// NOTE(review): not referenced by the code visible here - presumably used by
// callers elsewhere; confirm before changing.
static u32 nonchar_min = 0xFDD0;
static u32 nonchar_max = 0xFDEF;
|
|
|
|
static b32
|
|
codepoint_is_whitespace(u32 codepoint){
|
|
b32 result = false;
|
|
if (codepoint == ' ' || codepoint == '\r' || codepoint == '\n' || codepoint == '\t'){
|
|
result = true;
|
|
}
|
|
return(result);
|
|
}
|
|
|
|
static u32
|
|
utf8_to_u32_length_unchecked(u8 *buffer, u32 *length_out){
|
|
u32 result = 0;
|
|
|
|
if (buffer[0] < 0x80){
|
|
result = (u32)buffer[0];
|
|
*length_out = 1;
|
|
}
|
|
else if (buffer[0] < 0xE0){
|
|
result = ((u32)((buffer[0])&0x1F)) << 6;
|
|
result |= ((u32)((buffer[1])&0x3F));
|
|
*length_out = 2;
|
|
}
|
|
else if (buffer[0] < 0xF0){
|
|
result = ((u32)((buffer[0])&0x0F)) << 12;
|
|
result |= ((u32)((buffer[1])&0x3F)) << 6;
|
|
result |= ((u32)((buffer[2])&0x3F));
|
|
*length_out = 3;
|
|
}
|
|
else{
|
|
result = ((u32)((buffer[0])&0x07)) << 18;
|
|
result |= ((u32)((buffer[1])&0x3F)) << 12;
|
|
result |= ((u32)((buffer[2])&0x3F)) << 6;
|
|
result |= ((u32)((buffer[3])&0x3F));
|
|
*length_out = 4;
|
|
}
|
|
|
|
if (result < cp_min_by_utf8_length[*length_out] || (result >= surrogate_min && result <= surrogate_max)){
|
|
result = 0;
|
|
*length_out = 0;
|
|
}
|
|
|
|
return(result);
|
|
}
|
|
|
|
static u32
|
|
utf8_to_u32_unchecked(u8 *buffer){
|
|
u32 ignore;
|
|
u32 result = utf8_to_u32_length_unchecked(buffer, &ignore);
|
|
return(result);
|
|
}
|
|
|
|
static u32
|
|
utf8_to_u32(u8 **buffer_ptr, u8 *end){
|
|
u8 *buffer = *buffer_ptr;
|
|
u32 limit = (u32)(end - buffer);
|
|
|
|
u32 length = 0;
|
|
if (buffer[0] < 0x80){
|
|
length = 1;
|
|
}
|
|
else if (buffer[0] < 0xC0){
|
|
length = 0;
|
|
}
|
|
else if (buffer[0] < 0xE0){
|
|
length = 2;
|
|
}
|
|
else if (buffer[0] < 0xF0){
|
|
length = 3;
|
|
}
|
|
else if (buffer[0] < 0xF8){
|
|
length = 4;
|
|
}
|
|
else{
|
|
length = 0;
|
|
}
|
|
|
|
for (u32 i = 1; i < length; ++i){
|
|
if ((buffer[i] & 0xC0) != 0x80){
|
|
length = 0;
|
|
break;
|
|
}
|
|
}
|
|
|
|
u32 result = 0;
|
|
if (length != 0 && length <= limit){
|
|
switch (length){
|
|
case 1:
|
|
{
|
|
result = (u32)buffer[0];
|
|
}break;
|
|
|
|
case 2:
|
|
{
|
|
result = ((u32)((buffer[0])&0x1F)) << 6;
|
|
result |= ((u32)((buffer[1])&0x3F));
|
|
}break;
|
|
|
|
case 3:
|
|
{
|
|
result = ((u32)((buffer[0])&0x0F)) << 12;
|
|
result |= ((u32)((buffer[1])&0x3F)) << 6;
|
|
result |= ((u32)((buffer[2])&0x3F));
|
|
}break;
|
|
|
|
case 4:
|
|
{
|
|
result = ((u32)((buffer[0])&0x07)) << 18;
|
|
result |= ((u32)((buffer[1])&0x3F)) << 12;
|
|
result |= ((u32)((buffer[2])&0x3F)) << 6;
|
|
result |= ((u32)((buffer[3])&0x3F));
|
|
}break;
|
|
}
|
|
|
|
if (result < cp_min_by_utf8_length[length] || (result >= surrogate_min && result <= surrogate_max)){
|
|
result = 0;
|
|
length = 0;
|
|
}
|
|
|
|
*buffer_ptr = buffer + length;
|
|
}
|
|
else{
|
|
*buffer_ptr = end;
|
|
}
|
|
|
|
return(result);
|
|
}
|
|
|
|
static void
|
|
u32_to_utf8_unchecked(u32 codepoint, u8 *buffer, u32 *length_out){
|
|
if (codepoint <= 0x7F){
|
|
buffer[0] = (u8)codepoint;
|
|
*length_out = 1;
|
|
}
|
|
else if (codepoint <= 0x7FF){
|
|
buffer[0] = (u8)(0xC0 | (codepoint >> 6));
|
|
buffer[1] = (u8)(0x80 | (codepoint & 0x3F));
|
|
*length_out = 2;
|
|
}
|
|
else if (codepoint <= 0xFFFF){
|
|
buffer[0] = (u8)(0xE0 | (codepoint >> 12));
|
|
buffer[1] = (u8)(0x80 | ((codepoint >> 6) & 0x3F));
|
|
buffer[2] = (u8)(0x80 | (codepoint & 0x3F));
|
|
*length_out = 3;
|
|
}
|
|
else if (codepoint <= 0x10FFFF){
|
|
codepoint &= 0x001FFFFF;
|
|
buffer[0] = (u8)(0xF0 | (codepoint >> 18));
|
|
buffer[1] = (u8)(0x80 | ((codepoint >> 12) & 0x3F));
|
|
buffer[2] = (u8)(0x80 | ((codepoint >> 6) & 0x3F));
|
|
buffer[3] = (u8)(0x80 | (codepoint & 0x3F));
|
|
*length_out = 4;
|
|
}
|
|
else{
|
|
*length_out = 0;
|
|
}
|
|
}
|
|
|
|
static umem
|
|
utf8_to_utf16_minimal_checking(u16 *dst, umem max_wchars, u8 *src, umem length, b32 *error){
|
|
u8 *s = src;
|
|
u8 *s_end = s + length;
|
|
|
|
u16 *d = dst;
|
|
u16 *d_end = d + max_wchars;
|
|
umem limit = length;
|
|
|
|
umem needed_max = 0;
|
|
u32 advance = 1;
|
|
|
|
*error = false;
|
|
for(; s < s_end;){
|
|
u32 codepoint = 0;
|
|
u32 utf8_size = 0;
|
|
|
|
if (s[0] < 0x80){
|
|
codepoint = (u32)s[0];
|
|
utf8_size = 1;
|
|
}
|
|
else if (s[0] < 0xE0){
|
|
if (limit <= 1){
|
|
*error = true;
|
|
break;
|
|
}
|
|
|
|
codepoint = ((u32)((s[0])&0x1F)) << 6;
|
|
codepoint |= ((u32)((s[1])&0x3F));
|
|
utf8_size = 2;
|
|
}
|
|
else if (s[0] < 0xF0){
|
|
if (limit <= 2){
|
|
*error = true;
|
|
break;
|
|
}
|
|
|
|
codepoint = ((u32)((s[0])&0x0F)) << 12;
|
|
codepoint |= ((u32)((s[1])&0x3F)) << 6;
|
|
codepoint |= ((u32)((s[2])&0x3F));
|
|
utf8_size = 3;
|
|
}
|
|
else if (s[0] < 0xF8){
|
|
if (limit > 3){
|
|
*error = true;
|
|
break;
|
|
}
|
|
|
|
codepoint = ((u32)((s[0])&0x07)) << 18;
|
|
codepoint |= ((u32)((s[1])&0x3F)) << 12;
|
|
codepoint |= ((u32)((s[2])&0x3F)) << 6;
|
|
codepoint |= ((u32)((s[3])&0x3F));
|
|
utf8_size = 4;
|
|
}
|
|
else{
|
|
*error = true;
|
|
break;
|
|
}
|
|
|
|
if (codepoint < cp_min_by_utf8_length[utf8_size]){
|
|
*error = true;
|
|
break;
|
|
}
|
|
|
|
s += utf8_size;
|
|
limit -= utf8_size;
|
|
|
|
if (codepoint <= 0xD7FF || (codepoint >= 0xE000 && codepoint <= 0xFFFF)){
|
|
*d = (u16)(codepoint);
|
|
d += advance;
|
|
needed_max += 1;
|
|
}
|
|
else if (codepoint >= 0x10000 && codepoint <= 0x10FFFF){
|
|
codepoint -= 0x10000;
|
|
|
|
u32 high = (codepoint >> 10) & 0x03FF;
|
|
u32 low = (codepoint) & 0x03FF;
|
|
|
|
high += 0xD800;
|
|
low += 0xDC00;
|
|
|
|
if (d + advance < d_end){
|
|
*d = (u16)high;
|
|
d += advance;
|
|
*d = (u16)low;
|
|
d += advance;
|
|
}
|
|
else{
|
|
advance = 0;
|
|
}
|
|
|
|
needed_max += 2;
|
|
}
|
|
else{
|
|
*error = true;
|
|
break;
|
|
}
|
|
|
|
if (d >= d_end){
|
|
advance = 0;
|
|
}
|
|
}
|
|
|
|
return(needed_max);
|
|
}
|
|
|
|
static umem
|
|
utf16_to_utf8_minimal_checking(u8 *dst, umem max_chars, u16 *src, umem length, b32 *error){
|
|
u16 *s = src;
|
|
u16 *s_end = s + length;
|
|
|
|
u8 *d = dst;
|
|
u8 *d_end = d + max_chars;
|
|
umem limit = length;
|
|
|
|
umem needed_max = 0;
|
|
|
|
*error = false;
|
|
|
|
for (; s < s_end;){
|
|
u32 codepoint = 0;
|
|
u32 utf16_size = 0;
|
|
|
|
if (s[0] <= 0xD7FF || (s[0] >= 0xE000 && s[0] <= 0xFFFF)){
|
|
codepoint = s[0];
|
|
utf16_size = 1;
|
|
}
|
|
else{
|
|
if (s[0] >= 0xD800 && s[0] <= 0xDBFF){
|
|
if (limit <= 1){
|
|
*error = true;
|
|
break;
|
|
}
|
|
|
|
u32 high = s[0] - 0xD800;
|
|
u32 low = s[1] - 0xDC00;
|
|
codepoint = ((high << 10) | (low)) + 0x10000;
|
|
utf16_size = 2;
|
|
}
|
|
else{
|
|
*error = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
s += utf16_size;
|
|
limit -= utf16_size;
|
|
|
|
u8 d_fill[4];
|
|
u32 d_fill_count = 0;
|
|
|
|
if (codepoint <= 0x7F){
|
|
d_fill[0] = (u8)codepoint;
|
|
d_fill_count = 1;
|
|
}
|
|
else if (codepoint <= 0x7FF){
|
|
d_fill[0] = (u8)(0xC0 | (codepoint >> 6));
|
|
d_fill[1] = (u8)(0x80 | (codepoint & 0x3F));
|
|
d_fill_count = 2;
|
|
}
|
|
else if (codepoint <= 0xFFFF){
|
|
d_fill[0] = (u8)(0xE0 | (codepoint >> 12));
|
|
d_fill[1] = (u8)(0x80 | ((codepoint >> 6) & 0x3F));
|
|
d_fill[2] = (u8)(0x80 | (codepoint & 0x3F));
|
|
d_fill_count = 3;
|
|
}
|
|
else if (codepoint <= 0x10FFFF){
|
|
d_fill[0] = (u8)(0xF0 | (codepoint >> 18));
|
|
d_fill[1] = (u8)(0x80 | ((codepoint >> 12) & 0x3F));
|
|
d_fill[2] = (u8)(0x80 | ((codepoint >> 6) & 0x3F));
|
|
d_fill[3] = (u8)(0x80 | (codepoint & 0x3F));
|
|
d_fill_count = 4;
|
|
}
|
|
else{
|
|
*error = true;
|
|
break;
|
|
}
|
|
|
|
if (d + d_fill_count <= d_end){
|
|
for (u32 i = 0; i < d_fill_count; ++i){
|
|
*d = d_fill[i];
|
|
++d;
|
|
}
|
|
}
|
|
needed_max += d_fill_count;
|
|
}
|
|
|
|
return(needed_max);
|
|
}
|
|
|
|
static void
|
|
byte_to_ascii(u8 n, u8 *out){
|
|
u8 C = '0' + (n / 0x10);
|
|
if ((n / 0x10) > 0x9){
|
|
C = ('A' - 0xA) + (n / 0x10);
|
|
}
|
|
out[0] = C;
|
|
|
|
n = (n % 0x10);
|
|
C = '0' + n;
|
|
if (n > 0x9){
|
|
C = ('A' - 0xA) + n;
|
|
}
|
|
out[1] = C;
|
|
}
|
|
|
|
#endif
|
|
|
|
// BOTTOM
|
|
|