4coder/4coder_lib/4coder_utf8.h

396 lines
9.6 KiB
C
Raw Normal View History

2017-02-18 01:04:41 +00:00
/*
* Mr. 4th Dimention - Allen Webster
*
* 17.02.2017
*
* Code for converting between utf8 and the ANSI and utf16 text encodings.
*
*/
// TOP
#if !defined(FED_UTF8_CONVERSION_H)
#define FED_UTF8_CONVERSION_H
2019-06-01 23:58:28 +00:00
// Smallest codepoint that may be encoded with a given number of UTF-8 bytes,
// indexed directly by the byte length. The decoders in this file treat a
// decoded value below its entry as an invalid (overlong) sequence.
static u32 cp_min_by_utf8_length[] = {
    0x00000, // [0] placeholder so the table can be indexed by length
    0x00000, // [1] one byte
    0x00080, // [2] two bytes
    0x00800, // [3] three bytes
    0x10000, // [4] four bytes
};
2019-06-01 23:58:28 +00:00
// Inclusive bounds of the UTF-16 surrogate range. utf8_to_u32_length_unchecked
// and utf8_to_u32 return 0 for a decoded value that falls inside this range.
static u32 surrogate_min = 0xD800;
static u32 surrogate_max = 0xDFFF;
2017-03-25 18:04:27 +00:00
2019-06-01 23:58:28 +00:00
// Inclusive bounds 0xFDD0..0xFDEF, the contiguous block of Unicode noncharacters.
// NOTE(review): nothing visible in this header reads these two values;
// presumably they are used by code that includes this file - confirm.
static u32 nonchar_min = 0xFDD0;
static u32 nonchar_max = 0xFDEF;
2017-03-25 18:04:27 +00:00
2019-06-01 23:58:28 +00:00
// Returns true when the codepoint is a space, tab, line feed or carriage
// return; every other value (including other Unicode spaces) yields false.
static b32
codepoint_is_whitespace(u32 codepoint){
    switch (codepoint){
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        {
            return(true);
        }
    }
    return(false);
}
2019-06-01 23:58:28 +00:00
// Decodes the UTF-8 sequence starting at buffer[0] without any bounds
// checking: the caller must guarantee that every byte implied by the lead
// byte is readable. Continuation bytes are not validated.
//
// buffer     - first byte of the sequence
// length_out - receives the number of bytes consumed (1..4), or 0 when the
//              decoded value is overlong for its length or is a surrogate
//
// Returns the decoded codepoint, or 0 when the sequence was rejected.
static u32
utf8_to_u32_length_unchecked(u8 *buffer, u32 *length_out){
    u8 lead = buffer[0];
    
    // Pick the sequence length and the payload bits of the lead byte.
    // Anything at or above 0xF0 is handled as a four byte sequence.
    u32 byte_count = 4;
    u32 lead_mask = 0x07;
    if (lead < 0x80){
        byte_count = 1;
        lead_mask = 0x7F;
    }
    else if (lead < 0xE0){
        byte_count = 2;
        lead_mask = 0x1F;
    }
    else if (lead < 0xF0){
        byte_count = 3;
        lead_mask = 0x0F;
    }
    
    // Fold in six payload bits from each trailing byte.
    u32 codepoint = ((u32)lead) & lead_mask;
    for (u32 i = 1; i < byte_count; ++i){
        codepoint = (codepoint << 6) | ((u32)(buffer[i] & 0x3F));
    }
    
    b32 overlong = (codepoint < cp_min_by_utf8_length[byte_count]);
    b32 surrogate = (codepoint >= surrogate_min && codepoint <= surrogate_max);
    if (overlong || surrogate){
        codepoint = 0;
        byte_count = 0;
    }
    
    *length_out = byte_count;
    return(codepoint);
}
2019-06-01 23:58:28 +00:00
// Convenience wrapper around utf8_to_u32_length_unchecked for callers that
// do not need the consumed length. Returns whatever that function returns.
static u32
utf8_to_u32_unchecked(u8 *buffer){
    u32 discarded_length = 0;
    return(utf8_to_u32_length_unchecked(buffer, &discarded_length));
}
2019-06-01 23:58:28 +00:00
// Decodes one UTF-8 sequence from the range [*buffer_ptr, end) and moves
// *buffer_ptr according to the outcome.
//
// buffer_ptr - in: start of the sequence; out: see below
// end        - one past the last readable byte
//
// Outcomes:
//  - valid sequence: returns the codepoint, *buffer_ptr advances past it.
//  - empty range, invalid lead byte, bad continuation byte, or a sequence
//    that would extend past 'end': returns 0 and sets *buffer_ptr = end.
//  - overlong encoding or surrogate value: returns 0 and leaves *buffer_ptr
//    where it was (callers that loop must handle the lack of progress).
//
// No byte at or beyond 'end' is ever read.
static u32
utf8_to_u32(u8 **buffer_ptr, u8 *end){
    u8 *buffer = *buffer_ptr;
    u32 limit = (u32)(end - buffer);
    
    // Sequence length implied by the lead byte; 0 marks an invalid lead.
    // The lead byte itself is only inspected when the range is non-empty.
    u32 length = 0;
    if (limit > 0){
        u8 lead = buffer[0];
        if (lead < 0x80){
            length = 1;
        }
        else if (lead < 0xC0){
            // A continuation byte cannot start a sequence.
            length = 0;
        }
        else if (lead < 0xE0){
            length = 2;
        }
        else if (lead < 0xF0){
            length = 3;
        }
        else if (lead < 0xF8){
            length = 4;
        }
        else{
            length = 0;
        }
    }
    
    // Reject a truncated sequence before touching its continuation bytes,
    // otherwise the validation loop below would read past 'end'.
    if (length > limit){
        length = 0;
    }
    
    for (u32 i = 1; i < length; ++i){
        if ((buffer[i] & 0xC0) != 0x80){
            length = 0;
            break;
        }
    }
    
    u32 result = 0;
    if (length != 0){
        switch (length){
            case 1:
            {
                result = (u32)buffer[0];
            }break;
            
            case 2:
            {
                result  = ((u32)((buffer[0])&0x1F)) << 6;
                result |= ((u32)((buffer[1])&0x3F));
            }break;
            
            case 3:
            {
                result  = ((u32)((buffer[0])&0x0F)) << 12;
                result |= ((u32)((buffer[1])&0x3F)) << 6;
                result |= ((u32)((buffer[2])&0x3F));
            }break;
            
            case 4:
            {
                result  = ((u32)((buffer[0])&0x07)) << 18;
                result |= ((u32)((buffer[1])&0x3F)) << 12;
                result |= ((u32)((buffer[2])&0x3F)) << 6;
                result |= ((u32)((buffer[3])&0x3F));
            }break;
        }
        
        // Overlong encodings and surrogates are rejected without advancing.
        if (result < cp_min_by_utf8_length[length] || (result >= surrogate_min && result <= surrogate_max)){
            result = 0;
            length = 0;
        }
        
        *buffer_ptr = buffer + length;
    }
    else{
        *buffer_ptr = end;
    }
    
    return(result);
}
// Encodes a codepoint as UTF-8 into buffer, which must have room for up to
// four bytes. No check is made for surrogates or other invalid scalar values.
//
// codepoint  - value to encode
// buffer     - destination for the encoded bytes
// length_out - receives the number of bytes written (1..4), or 0 when the
//              codepoint is above 0x10FFFF (buffer is left untouched then)
static void
u32_to_utf8_unchecked(u32 codepoint, u8 *buffer, u32 *length_out){
    // Choose the encoded size and the marker bits of the lead byte.
    u32 byte_count = 0;
    u32 lead_prefix = 0;
    if (codepoint <= 0x7F){
        byte_count = 1;
        lead_prefix = 0x00;
    }
    else if (codepoint <= 0x7FF){
        byte_count = 2;
        lead_prefix = 0xC0;
    }
    else if (codepoint <= 0xFFFF){
        byte_count = 3;
        lead_prefix = 0xE0;
    }
    else if (codepoint <= 0x10FFFF){
        byte_count = 4;
        lead_prefix = 0xF0;
    }
    
    if (byte_count > 0){
        // Emit continuation bytes from last to first, six bits at a time;
        // whatever bits remain belong in the lead byte.
        u32 bits = codepoint;
        for (u32 i = byte_count - 1; i > 0; --i){
            buffer[i] = (u8)(0x80 | (bits & 0x3F));
            bits >>= 6;
        }
        buffer[0] = (u8)(lead_prefix | bits);
    }
    
    *length_out = byte_count;
}
2019-06-01 23:58:28 +00:00
// Converts UTF-8 text to UTF-16 code units.
//
// dst        - output buffer for UTF-16 code units
// max_wchars - capacity of dst in code units
// src        - UTF-8 input
// length     - number of bytes in src
// error      - set to false on entry; set to true when conversion stops early
//              because of a truncated sequence, an invalid lead byte
//              (>= 0xF8), an overlong encoding, a surrogate value, or a value
//              above 0x10FFFF
//
// Returns the number of code units the converted text needs, counted up to
// the point where conversion finished or stopped on an error. Code units are
// written only while they fit entirely inside dst; once one does not fit,
// nothing more is written but counting continues, so a result larger than
// max_wchars tells the caller how big a buffer to retry with.
//
// "Minimal checking": continuation bytes are not validated.
static umem
utf8_to_utf16_minimal_checking(u16 *dst, umem max_wchars, u8 *src, umem length, b32 *error){
    u8 *s = src;
    u8 *s_end = s + length;
    
    umem written = 0;
    umem needed_max = 0;
    b32 dst_full = false;
    
    *error = false;
    
    for (; s < s_end;){
        umem remaining = (umem)(s_end - s);
        
        // Sequence length implied by the lead byte.
        u32 utf8_size = 0;
        if (s[0] < 0x80){
            utf8_size = 1;
        }
        else if (s[0] < 0xE0){
            utf8_size = 2;
        }
        else if (s[0] < 0xF0){
            utf8_size = 3;
        }
        else if (s[0] < 0xF8){
            utf8_size = 4;
        }
        else{
            *error = true;
            break;
        }
        
        // Every byte of the sequence must lie inside the input before any
        // of them is read.
        if (remaining < utf8_size){
            *error = true;
            break;
        }
        
        u32 codepoint = 0;
        switch (utf8_size){
            case 1:
            {
                codepoint = (u32)s[0];
            }break;
            
            case 2:
            {
                codepoint  = ((u32)((s[0])&0x1F)) << 6;
                codepoint |= ((u32)((s[1])&0x3F));
            }break;
            
            case 3:
            {
                codepoint  = ((u32)((s[0])&0x0F)) << 12;
                codepoint |= ((u32)((s[1])&0x3F)) << 6;
                codepoint |= ((u32)((s[2])&0x3F));
            }break;
            
            case 4:
            {
                codepoint  = ((u32)((s[0])&0x07)) << 18;
                codepoint |= ((u32)((s[1])&0x3F)) << 12;
                codepoint |= ((u32)((s[2])&0x3F)) << 6;
                codepoint |= ((u32)((s[3])&0x3F));
            }break;
        }
        
        // Reject overlong encodings.
        if (codepoint < cp_min_by_utf8_length[utf8_size]){
            *error = true;
            break;
        }
        
        s += utf8_size;
        
        // Translate the codepoint into one or two UTF-16 code units.
        u16 units[2];
        u32 unit_count = 0;
        if (codepoint <= 0xD7FF || (codepoint >= 0xE000 && codepoint <= 0xFFFF)){
            units[0] = (u16)codepoint;
            unit_count = 1;
        }
        else if (codepoint >= 0x10000 && codepoint <= 0x10FFFF){
            u32 offset = codepoint - 0x10000;
            units[0] = (u16)(0xD800 + ((offset >> 10) & 0x03FF));
            units[1] = (u16)(0xDC00 + (offset & 0x03FF));
            unit_count = 2;
        }
        else{
            // Surrogate value or beyond the Unicode range.
            *error = true;
            break;
        }
        
        // Store only while the whole codepoint fits; after the first miss
        // the output stays frozen so dst always holds a clean prefix.
        if (!dst_full && (max_wchars - written) >= unit_count){
            for (u32 i = 0; i < unit_count; ++i){
                dst[written] = units[i];
                written += 1;
            }
        }
        else{
            dst_full = true;
        }
        
        needed_max += unit_count;
    }
    
    return(needed_max);
}
2019-06-01 23:58:28 +00:00
// Converts UTF-16 code units to UTF-8 bytes.
//
// dst       - output buffer for UTF-8 bytes
// max_chars - capacity of dst in bytes
// src       - UTF-16 input
// length    - number of code units in src
// error     - set to false on entry; set to true when conversion stops early
//             because of a high surrogate that is the last unit, a high
//             surrogate not followed by a low surrogate, or a low surrogate
//             with no preceding high surrogate
//
// Returns the number of bytes the converted text needs, counted up to the
// point where conversion finished or stopped on an error. An encoded
// codepoint is written only when all of its bytes fit in the space left in
// dst; one that does not fit is skipped but still counted, so dst is only a
// faithful conversion when the result is <= max_chars.
static umem
utf16_to_utf8_minimal_checking(u8 *dst, umem max_chars, u16 *src, umem length, b32 *error){
    u16 *s = src;
    u16 *s_end = s + length;
    
    umem written = 0;
    umem needed_max = 0;
    
    *error = false;
    
    for (; s < s_end;){
        u32 codepoint = 0;
        u32 utf16_size = 0;
        u32 unit = s[0];
        
        if (unit <= 0xD7FF || unit >= 0xE000){
            // Plain BMP code unit.
            codepoint = unit;
            utf16_size = 1;
        }
        else if (unit <= 0xDBFF){
            // High surrogate: a second unit must exist...
            if ((umem)(s_end - s) < 2){
                *error = true;
                break;
            }
            
            // ...and it must be a low surrogate. Without this check the
            // subtraction below wraps and yields an unrelated codepoint
            // while silently consuming the following unit.
            u32 low_unit = s[1];
            if (low_unit < 0xDC00 || low_unit > 0xDFFF){
                *error = true;
                break;
            }
            
            u32 high = unit - 0xD800;
            u32 low = low_unit - 0xDC00;
            codepoint = ((high << 10) | (low)) + 0x10000;
            utf16_size = 2;
        }
        else{
            // Low surrogate with no preceding high surrogate.
            *error = true;
            break;
        }
        
        s += utf16_size;
        
        // Encode into a scratch buffer first so a partial codepoint is
        // never written to dst. After the validation above the codepoint
        // cannot exceed 0x10FFFF, so four bytes always suffice.
        u8 d_fill[4];
        u32 d_fill_count = 0;
        
        if (codepoint <= 0x7F){
            d_fill[0] = (u8)codepoint;
            d_fill_count = 1;
        }
        else if (codepoint <= 0x7FF){
            d_fill[0] = (u8)(0xC0 | (codepoint >> 6));
            d_fill[1] = (u8)(0x80 | (codepoint & 0x3F));
            d_fill_count = 2;
        }
        else if (codepoint <= 0xFFFF){
            d_fill[0] = (u8)(0xE0 | (codepoint >> 12));
            d_fill[1] = (u8)(0x80 | ((codepoint >> 6) & 0x3F));
            d_fill[2] = (u8)(0x80 | (codepoint & 0x3F));
            d_fill_count = 3;
        }
        else{
            d_fill[0] = (u8)(0xF0 | (codepoint >> 18));
            d_fill[1] = (u8)(0x80 | ((codepoint >> 12) & 0x3F));
            d_fill[2] = (u8)(0x80 | ((codepoint >> 6) & 0x3F));
            d_fill[3] = (u8)(0x80 | (codepoint & 0x3F));
            d_fill_count = 4;
        }
        
        // Compare against the remaining capacity instead of forming a
        // pointer that may lie beyond the end of dst.
        if ((max_chars - written) >= d_fill_count){
            for (u32 i = 0; i < d_fill_count; ++i){
                dst[written] = d_fill[i];
                written += 1;
            }
        }
        
        needed_max += d_fill_count;
    }
    
    return(needed_max);
}
// Writes the two-character uppercase hexadecimal form of n into out[0]
// (high nibble) and out[1] (low nibble). No terminator is written.
static void
byte_to_ascii(u8 n, u8 *out){
    static const char hex_digits[] = "0123456789ABCDEF";
    u8 high_nibble = (u8)(n >> 4);
    u8 low_nibble = (u8)(n & 0x0F);
    out[0] = (u8)hex_digits[high_nibble];
    out[1] = (u8)hex_digits[low_nibble];
}
2017-02-18 01:04:41 +00:00
#endif
// BOTTOM