4coder/4coder_lib/4coder_utf8.h

396 lines
9.6 KiB
C

/*
* Mr. 4th Dimention - Allen Webster
*
* 17.02.2017
*
* Code for converting between UTF-8 and the ANSI and UTF-16 text encodings.
*
*/
// TOP
#if !defined(FED_UTF8_CONVERSION_H)
#define FED_UTF8_CONVERSION_H
// Smallest codepoint that needs a UTF-8 sequence of a given byte length,
// indexed by that length (1 through 4). The decoders below compare a decoded
// value against the entry for its sequence length and reject it when it is
// smaller, i.e. when the value was encoded with more bytes than necessary.
// Entry 0 only pads the table so it can be indexed by length directly; the
// decoders visible here never look it up.
static u32 cp_min_by_utf8_length[] = {
0x0,
0x0,
0x80,
0x800,
0x10000,
};
// Inclusive bounds of the UTF-16 surrogate range. The UTF-8 decoders below
// reject any decoded value that falls inside it.
static u32 surrogate_min = 0xD800;
static u32 surrogate_max = 0xDFFF;
// Inclusive bounds of a second codepoint range; nothing in the visible part
// of this file reads them.
// NOTE(review): 0xFDD0-0xFDEF matches the Unicode noncharacter block, which
// is presumably what these are for - confirm against the callers.
static u32 nonchar_min = 0xFDD0;
static u32 nonchar_max = 0xFDEF;
// Tells whether codepoint is one of the four whitespace characters handled
// here: space, carriage return, line feed, or horizontal tab.
// Returns true for those four values and false for everything else.
static b32
codepoint_is_whitespace(u32 codepoint){
    switch (codepoint){
        case ' ':
        case '\r':
        case '\n':
        case '\t':
        {
            return(true);
        }
        
        default:
        {
            return(false);
        }
    }
}
// Decodes the UTF-8 sequence that starts at buffer[0] without any bounds
// checking: the caller must guarantee that every byte of the sequence is
// readable. The sequence length is chosen from the lead byte alone
// (< 0x80: 1 byte, < 0xE0: 2 bytes, < 0xF0: 3 bytes, otherwise 4 bytes) and
// continuation bytes are not validated.
//
// buffer:     first byte of the sequence.
// length_out: receives the sequence length in bytes, or 0 on rejection.
//
// Returns the decoded codepoint. If the value is smaller than the minimum for
// its sequence length, or lies in the surrogate range, the function returns 0
// and stores 0 in *length_out.
static u32
utf8_to_u32_length_unchecked(u8 *buffer, u32 *length_out){
    u8 lead = buffer[0];
    u32 size = 0;
    u32 codepoint = 0;
    
    // The lead byte fixes the sequence length and carries the top payload bits.
    if (lead < 0x80){
        size = 1;
        codepoint = (u32)lead;
    }
    else if (lead < 0xE0){
        size = 2;
        codepoint = (u32)(lead & 0x1F);
    }
    else if (lead < 0xF0){
        size = 3;
        codepoint = (u32)(lead & 0x0F);
    }
    else{
        size = 4;
        codepoint = (u32)(lead & 0x07);
    }
    
    // Each following byte contributes its low six bits.
    for (u32 i = 1; i < size; ++i){
        codepoint = (codepoint << 6) | (u32)(buffer[i] & 0x3F);
    }
    
    b32 too_small = (codepoint < cp_min_by_utf8_length[size]);
    b32 is_surrogate = (codepoint >= surrogate_min && codepoint <= surrogate_max);
    if (too_small || is_surrogate){
        codepoint = 0;
        size = 0;
    }
    
    *length_out = size;
    return(codepoint);
}
// Convenience wrapper around utf8_to_u32_length_unchecked for callers that
// only want the codepoint: decodes the sequence at buffer and throws the
// reported length away. Performs no bounds checking of its own.
static u32
utf8_to_u32_unchecked(u8 *buffer){
    u32 discarded_length;
    return(utf8_to_u32_length_unchecked(buffer, &discarded_length));
}
// Decodes one UTF-8 sequence from the byte range [*buffer_ptr, end) and moves
// the cursor *buffer_ptr.
//
// buffer_ptr: in/out cursor into the input.
// end:        one past the last readable input byte.
//
// Returns the decoded codepoint, or 0 when no valid codepoint was decoded
// (a well-formed zero byte also decodes to 0; it advances the cursor by one).
//
// Cursor movement:
//  - valid sequence: the cursor is advanced past it.
//  - empty input, invalid lead byte (0x80-0xBF or >= 0xF8), invalid
//    continuation byte, or a sequence cut off by end: the cursor is set to end.
//  - overlong encoding or surrogate codepoint: the cursor is left unchanged,
//    so a caller that loops on this function must step past the offending
//    bytes itself.
//
// The length implied by the lead byte is compared against the remaining input
// before any continuation byte is inspected, so no byte at or beyond end is
// ever read.
static u32
utf8_to_u32(u8 **buffer_ptr, u8 *end){
    u8 *buffer = *buffer_ptr;
    u32 limit = (u32)(end - buffer);
    u32 length = 0;
    u32 result = 0;
    
    // Classify the lead byte, but only when there is a byte to look at.
    if (limit > 0){
        u8 lead = buffer[0];
        if (lead < 0x80){
            length = 1;
        }
        else if (lead < 0xC0){
            // Continuation byte in lead position.
            length = 0;
        }
        else if (lead < 0xE0){
            length = 2;
        }
        else if (lead < 0xF0){
            length = 3;
        }
        else if (lead < 0xF8){
            length = 4;
        }
        else{
            length = 0;
        }
    }
    
    // A sequence that runs past end is invalid; decide that before touching
    // any of its continuation bytes.
    if (length > limit){
        length = 0;
    }
    
    for (u32 i = 1; i < length; ++i){
        if ((buffer[i] & 0xC0) != 0x80){
            length = 0;
            break;
        }
    }
    
    if (length != 0){
        // Payload bits carried by the lead byte, indexed by sequence length.
        static const u8 lead_mask[5] = {0x00, 0x7F, 0x1F, 0x0F, 0x07};
        
        result = (u32)(buffer[0] & lead_mask[length]);
        for (u32 i = 1; i < length; ++i){
            result = (result << 6) | (u32)(buffer[i] & 0x3F);
        }
        
        if (result < cp_min_by_utf8_length[length] ||
            (result >= surrogate_min && result <= surrogate_max)){
            result = 0;
            length = 0;
        }
        
        *buffer_ptr = buffer + length;
    }
    else{
        *buffer_ptr = end;
    }
    
    return(result);
}
// Encodes codepoint as UTF-8 into buffer. "Unchecked" means the buffer is
// assumed to have room (up to 4 bytes) and the codepoint is only tested
// against the upper limit 0x10FFFF; surrogate values are encoded like any
// other three-byte value.
//
// codepoint:  value to encode.
// buffer:     receives the encoded bytes; left untouched when nothing is
//             written.
// length_out: receives the number of bytes written (1-4), or 0 when
//             codepoint is above 0x10FFFF.
static void
u32_to_utf8_unchecked(u32 codepoint, u8 *buffer, u32 *length_out){
    u32 count = 0;
    u8 lead_bits = 0x00;
    
    // Choose the sequence length and the marker bits of the lead byte.
    if (codepoint <= 0x7F){
        count = 1;
        lead_bits = 0x00;
    }
    else if (codepoint <= 0x7FF){
        count = 2;
        lead_bits = 0xC0;
    }
    else if (codepoint <= 0xFFFF){
        count = 3;
        lead_bits = 0xE0;
    }
    else if (codepoint <= 0x10FFFF){
        count = 4;
        lead_bits = 0xF0;
    }
    
    if (count != 0){
        // Fill continuation bytes from the back, six payload bits at a time;
        // whatever is left afterwards belongs in the lead byte.
        u32 bits = codepoint;
        for (u32 i = count - 1; i > 0; --i){
            buffer[i] = (u8)(0x80 | (bits & 0x3F));
            bits >>= 6;
        }
        buffer[0] = (u8)(lead_bits | bits);
    }
    
    *length_out = count;
}
// Converts UTF-8 text to UTF-16 with only light validation.
//
// dst, max_wchars: output buffer and its capacity in u16 units.
// src, length:     input bytes and their count.
// error:           set to false on entry and to true when conversion stops
//                  early on bad input.
//
// Returns the number of u16 units needed for everything converted before the
// function stopped. The count keeps growing after dst is full, so a return
// value greater than max_wchars means the output was truncated. Once a unit
// (or surrogate pair) does not fit, nothing further is written, so dst always
// holds a prefix of the full conversion and dst[max_wchars] is never touched.
//
// Input is rejected (*error = true) when a sequence runs past the end of src,
// the lead byte is >= 0xF8, the decoded value is smaller than the minimum for
// its sequence length, or the value is a surrogate or above 0x10FFFF.
// Continuation bytes are not validated, and lead bytes in 0x80-0xBF are
// decoded as two-byte leads.
static umem
utf8_to_utf16_minimal_checking(u16 *dst, umem max_wchars, u8 *src, umem length, b32 *error){
    u8 *s = src;
    u8 *s_end = s + length;
    
    u16 *d = dst;
    u16 *d_end = d + max_wchars;
    
    umem needed_max = 0;
    
    *error = false;
    
    for (; s < s_end;){
        umem remaining = (umem)(s_end - s);
        u32 codepoint = 0;
        u32 utf8_size = 0;
        
        if (s[0] < 0x80){
            codepoint = (u32)s[0];
            utf8_size = 1;
        }
        else if (s[0] < 0xE0){
            if (remaining < 2){
                *error = true;
                break;
            }
            
            codepoint = ((u32)((s[0])&0x1F)) << 6;
            codepoint |= ((u32)((s[1])&0x3F));
            utf8_size = 2;
        }
        else if (s[0] < 0xF0){
            if (remaining < 3){
                *error = true;
                break;
            }
            
            codepoint = ((u32)((s[0])&0x0F)) << 12;
            codepoint |= ((u32)((s[1])&0x3F)) << 6;
            codepoint |= ((u32)((s[2])&0x3F));
            utf8_size = 3;
        }
        else if (s[0] < 0xF8){
            // A four-byte sequence needs four readable bytes, same rule as
            // the shorter cases above.
            if (remaining < 4){
                *error = true;
                break;
            }
            
            codepoint = ((u32)((s[0])&0x07)) << 18;
            codepoint |= ((u32)((s[1])&0x3F)) << 12;
            codepoint |= ((u32)((s[2])&0x3F)) << 6;
            codepoint |= ((u32)((s[3])&0x3F));
            utf8_size = 4;
        }
        else{
            *error = true;
            break;
        }
        
        if (codepoint < cp_min_by_utf8_length[utf8_size]){
            *error = true;
            break;
        }
        
        s += utf8_size;
        
        // Work out the UTF-16 form first, then store it only if it fits.
        u16 units[2] = {0, 0};
        u32 unit_count = 0;
        
        if (codepoint <= 0xD7FF || (codepoint >= 0xE000 && codepoint <= 0xFFFF)){
            units[0] = (u16)codepoint;
            unit_count = 1;
        }
        else if (codepoint >= 0x10000 && codepoint <= 0x10FFFF){
            u32 offset = codepoint - 0x10000;
            units[0] = (u16)(0xD800 + ((offset >> 10) & 0x03FF));
            units[1] = (u16)(0xDC00 + (offset & 0x03FF));
            unit_count = 2;
        }
        else{
            // Surrogate value or beyond the Unicode range.
            *error = true;
            break;
        }
        
        if ((umem)(d_end - d) >= unit_count){
            for (u32 i = 0; i < unit_count; ++i){
                *d = units[i];
                ++d;
            }
        }
        else{
            // Out of room: close the output for good so later, shorter
            // codepoints cannot land after a gap.
            d_end = d;
        }
        
        needed_max += unit_count;
    }
    
    return(needed_max);
}
// Converts UTF-16 text to UTF-8 with only light validation.
//
// dst, max_chars: output buffer and its capacity in bytes.
// src, length:    input u16 units and their count.
// error:          set to false on entry and to true when conversion stops
//                 early on bad input.
//
// Returns the number of bytes needed for everything converted before the
// function stopped. The count keeps growing after dst is full, so a return
// value greater than max_chars means the output was truncated. Once an
// encoded codepoint does not fit, nothing further is written, so dst always
// holds a prefix of the full conversion.
//
// Input is rejected (*error = true) when a low surrogate appears without a
// preceding high surrogate, when a high surrogate is the last unit of src, or
// when the unit after a high surrogate is not a low surrogate.
static umem
utf16_to_utf8_minimal_checking(u8 *dst, umem max_chars, u16 *src, umem length, b32 *error){
    u16 *s = src;
    u16 *s_end = s + length;
    
    u8 *d = dst;
    u8 *d_end = d + max_chars;
    
    umem needed_max = 0;
    
    *error = false;
    
    for (; s < s_end;){
        umem remaining = (umem)(s_end - s);
        u32 codepoint = 0;
        u32 utf16_size = 0;
        
        if (s[0] <= 0xD7FF || s[0] >= 0xE000){
            codepoint = s[0];
            utf16_size = 1;
        }
        else if (s[0] <= 0xDBFF){
            // High surrogate: it must be followed by a low surrogate.
            if (remaining < 2){
                *error = true;
                break;
            }
            if (s[1] < 0xDC00 || s[1] > 0xDFFF){
                // Without this check a following unit >= 0xE000 would be
                // folded into a wrong codepoint instead of being reported.
                *error = true;
                break;
            }
            
            u32 high = (u32)s[0] - 0xD800;
            u32 low = (u32)s[1] - 0xDC00;
            codepoint = ((high << 10) | low) + 0x10000;
            utf16_size = 2;
        }
        else{
            // Low surrogate with no high surrogate before it.
            *error = true;
            break;
        }
        
        s += utf16_size;
        
        // Encode into a scratch buffer first, then copy only if it fits.
        // codepoint cannot exceed 0x10FFFF here: a validated surrogate pair
        // tops out at exactly that value.
        u8 d_fill[4];
        u32 d_fill_count = 0;
        
        if (codepoint <= 0x7F){
            d_fill[0] = (u8)codepoint;
            d_fill_count = 1;
        }
        else if (codepoint <= 0x7FF){
            d_fill[0] = (u8)(0xC0 | (codepoint >> 6));
            d_fill[1] = (u8)(0x80 | (codepoint & 0x3F));
            d_fill_count = 2;
        }
        else if (codepoint <= 0xFFFF){
            d_fill[0] = (u8)(0xE0 | (codepoint >> 12));
            d_fill[1] = (u8)(0x80 | ((codepoint >> 6) & 0x3F));
            d_fill[2] = (u8)(0x80 | (codepoint & 0x3F));
            d_fill_count = 3;
        }
        else{
            d_fill[0] = (u8)(0xF0 | (codepoint >> 18));
            d_fill[1] = (u8)(0x80 | ((codepoint >> 12) & 0x3F));
            d_fill[2] = (u8)(0x80 | ((codepoint >> 6) & 0x3F));
            d_fill[3] = (u8)(0x80 | (codepoint & 0x3F));
            d_fill_count = 4;
        }
        
        if ((umem)(d_end - d) >= d_fill_count){
            for (u32 i = 0; i < d_fill_count; ++i){
                *d = d_fill[i];
                ++d;
            }
        }
        else{
            // Out of room: close the output for good so a later, shorter
            // encoding cannot be written after a gap.
            d_end = d;
        }
        
        needed_max += d_fill_count;
    }
    
    return(needed_max);
}
// Writes the value of n as two uppercase hexadecimal digits: out[0] gets the
// digit for the high four bits, out[1] the digit for the low four bits.
// Exactly two bytes are written; no terminator is appended.
static void
byte_to_ascii(u8 n, u8 *out){
    u8 nibbles[2];
    nibbles[0] = (u8)(n / 0x10);
    nibbles[1] = (u8)(n % 0x10);
    
    for (u32 i = 0; i < 2; ++i){
        u8 value = nibbles[i];
        // 0-9 map onto '0'-'9', 10-15 onto 'A'-'F'.
        out[i] = (value > 0x9) ? (u8)(('A' - 0xA) + value) : (u8)('0' + value);
    }
}
#endif
// BOTTOM