334 lines
11 KiB
C++
334 lines
11 KiB
C++
/*
|
|
* Mr. 4th Dimention - Allen Webster
|
|
*
|
|
* 16.06.2019
|
|
*
|
|
* Routines for string matching within chunked streams.
|
|
*
|
|
*/
|
|
|
|
// TOP
|
|
|
|
internal u64_Array
|
|
string_compute_prefix_table(Arena *arena, String_Const_u8 string, Scan_Direction direction){
|
|
u64_Array array = {};
|
|
array.count = (i32)(string.size);
|
|
array.vals = push_array(arena, u64, array.count);
|
|
|
|
u8 *str = string.str;
|
|
if (direction == Scan_Backward){
|
|
str = string.str + string.size - 1;
|
|
}
|
|
|
|
array.vals[0] = 0;
|
|
for (u64 i = 1; i < string.size; i += 1){
|
|
u64 previous_longest_prefix = array.vals[i - 1];
|
|
for (;;){
|
|
u8 *a = str + previous_longest_prefix;
|
|
u8 *b = str + i;
|
|
if (direction == Scan_Backward){
|
|
a = str - previous_longest_prefix;
|
|
b = str - i;
|
|
}
|
|
if (character_to_upper(*a) == character_to_upper(*b)){
|
|
array.vals[i] = previous_longest_prefix + 1;
|
|
break;
|
|
}
|
|
if (previous_longest_prefix == 0){
|
|
array.vals[i] = 0;
|
|
break;
|
|
}
|
|
previous_longest_prefix = array.vals[previous_longest_prefix - 1];
|
|
}
|
|
}
|
|
|
|
return(array);
|
|
}
|
|
|
|
internal u64_Array
|
|
string_compute_needle_jump_table(Arena *arena, u64_Array longest_prefixes){
|
|
u64_Array array = {};
|
|
array.count = longest_prefixes.count + 1;
|
|
array.vals = push_array(arena, u64, array.count);
|
|
array.vals[0] = 0;
|
|
for (u64 i = 1; i < array.count; i += 1){
|
|
array.vals[i] = i - longest_prefixes.vals[i - 1];
|
|
}
|
|
return(array);
|
|
}
|
|
|
|
internal u64_Array
|
|
string_compute_needle_jump_table(Arena *arena, String_Const_u8 needle, Scan_Direction direction){
|
|
u64_Array prefix_table = string_compute_prefix_table(arena, needle, direction);
|
|
return(string_compute_needle_jump_table(arena, prefix_table));
|
|
}
|
|
|
|
#define character_predicate_check_character(p, c) (((p).b[(c)/8] & (1 << ((c)%8))) != 0)
|
|
|
|
internal String_Match_List
|
|
find_all_matches_forward(Arena *arena, i32 maximum_output_count,
|
|
List_String_Const_u8 chunks, String_Const_u8 needle,
|
|
u64_Array jump_table, Character_Predicate *predicate,
|
|
u64 base_index, Buffer_ID buffer, i32 string_id){
|
|
String_Match_List list = {};
|
|
|
|
if (chunks.node_count > 0){
|
|
u64 i = 0;
|
|
u64 j = 0;
|
|
b8 current_l = false;
|
|
i64 last_insensitive = -1;
|
|
i64 last_boundary = -1;
|
|
|
|
Node_String_Const_u8 *node = chunks.first;
|
|
i64 chunk_pos = 0;
|
|
|
|
i32 jump_back_code = 0;
|
|
|
|
u8 c = 0;
|
|
u64 n = 0;
|
|
u8 needle_c = 0;
|
|
u64 jump = 0;
|
|
|
|
if (false){
|
|
iterate_forward:
|
|
i += 1;
|
|
chunk_pos += 1;
|
|
if (chunk_pos >= (i64)node->string.size){
|
|
last_boundary = i;
|
|
chunk_pos = 0;
|
|
node = node->next;
|
|
}
|
|
|
|
switch (jump_back_code){
|
|
case 0:
|
|
{
|
|
goto jump_back_0;
|
|
}break;
|
|
case 1:
|
|
{
|
|
goto jump_back_1;
|
|
}break;
|
|
}
|
|
}
|
|
|
|
for (;node != 0;){
|
|
c = node->string.str[chunk_pos];
|
|
n = i - j;
|
|
needle_c = needle.str[n];
|
|
if (character_to_upper(c) == character_to_upper(needle_c)){
|
|
if (c != needle_c){
|
|
last_insensitive = i;
|
|
}
|
|
|
|
jump_back_code = 0;
|
|
goto iterate_forward;
|
|
jump_back_0:
|
|
|
|
if (n + 1 == needle.size){
|
|
String_Match_Flag flags = 0;
|
|
if (!(last_insensitive >= 0 &&
|
|
j <= (u64)last_insensitive &&
|
|
(u64)last_insensitive < j + needle.size)){
|
|
AddFlag(flags, StringMatch_CaseSensitive);
|
|
}
|
|
if (!(last_boundary >= 0 &&
|
|
j <= (u64)last_boundary &&
|
|
(u64)last_boundary < j + needle.size)){
|
|
AddFlag(flags, StringMatch_Straddled);
|
|
}
|
|
if (node != 0){
|
|
u8 next_c = node->string.str[chunk_pos];
|
|
if (character_predicate_check_character(*predicate, next_c)){
|
|
AddFlag(flags, StringMatch_RightSideSloppy);
|
|
}
|
|
}
|
|
if (current_l){
|
|
AddFlag(flags, StringMatch_LeftSideSloppy);
|
|
}
|
|
string_match_list_push(arena, &list, buffer, string_id, flags,
|
|
base_index + j, needle.size);
|
|
if (list.count >= maximum_output_count){
|
|
break;
|
|
}
|
|
jump = jump_table.vals[n + 1];
|
|
current_l = character_predicate_check_character(*predicate, needle.str[jump - 1]);
|
|
j += jump;
|
|
}
|
|
|
|
}
|
|
else{
|
|
jump = jump_table.vals[n];
|
|
if (jump == 0){
|
|
current_l = character_predicate_check_character(*predicate, c);
|
|
|
|
jump_back_code = 1;
|
|
goto iterate_forward;
|
|
jump_back_1:
|
|
|
|
j += 1;
|
|
}
|
|
else{
|
|
u8 prev_c = needle.str[jump - 1];
|
|
current_l = character_predicate_check_character(*predicate, prev_c);
|
|
j += jump;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return(list);
|
|
}
|
|
|
|
internal String_Match_List
|
|
find_all_matches_backward(Arena *arena, i32 maximum_output_count,
|
|
List_String_Const_u8 chunks, String_Const_u8 needle,
|
|
u64_Array jump_table, Character_Predicate *predicate,
|
|
u64 base_index, Buffer_ID buffer, i32 string_id){
|
|
String_Match_List list = {};
|
|
|
|
string_list_reverse(&chunks);
|
|
|
|
if (chunks.node_count > 0){
|
|
i64 size = (i64)chunks.total_size;
|
|
|
|
i64 i = size - 1;
|
|
i64 j = size - 1;
|
|
b8 current_r = false;
|
|
i64 last_insensitive = size;
|
|
i64 last_boundary = size;
|
|
|
|
Node_String_Const_u8 *node = chunks.first;
|
|
i64 chunk_pos = node->string.size - 1;
|
|
|
|
i32 jump_back_code = 0;
|
|
|
|
u8 c = 0;
|
|
u64 n = 0;
|
|
u8 needle_c = 0;
|
|
u64 jump = 0;
|
|
|
|
if (false){
|
|
iterate_backward:
|
|
i -= 1;
|
|
chunk_pos -= 1;
|
|
if (chunk_pos < 0){
|
|
last_boundary = i;
|
|
node = node->next;
|
|
if (node != 0){
|
|
chunk_pos = node->string.size - 1;
|
|
}
|
|
}
|
|
|
|
switch (jump_back_code){
|
|
case 0:
|
|
{
|
|
goto jump_back_0;
|
|
}break;
|
|
case 1:
|
|
{
|
|
goto jump_back_1;
|
|
}break;
|
|
}
|
|
}
|
|
|
|
for (;node != 0;){
|
|
c = node->string.str[chunk_pos];
|
|
n = j - i;
|
|
needle_c = needle.str[needle.size - 1 - n];
|
|
if (character_to_upper(c) == character_to_upper(needle_c)){
|
|
if (c != needle_c){
|
|
last_insensitive = i;
|
|
}
|
|
|
|
jump_back_code = 0;
|
|
goto iterate_backward;
|
|
jump_back_0:
|
|
|
|
if (n + 1 == needle.size){
|
|
String_Match_Flag flags = 0;
|
|
if (!(last_insensitive < size &&
|
|
j >= last_insensitive &&
|
|
last_insensitive > j - (i64)needle.size)){
|
|
AddFlag(flags, StringMatch_CaseSensitive);
|
|
}
|
|
if (!(last_boundary < size &&
|
|
j >= last_boundary &&
|
|
last_boundary > j - (i64)needle.size)){
|
|
AddFlag(flags, StringMatch_Straddled);
|
|
}
|
|
if (node != 0){
|
|
u8 next_c = node->string.str[chunk_pos];
|
|
if (character_predicate_check_character(*predicate, next_c)){
|
|
AddFlag(flags, StringMatch_LeftSideSloppy);
|
|
}
|
|
}
|
|
if (current_r){
|
|
AddFlag(flags, StringMatch_RightSideSloppy);
|
|
}
|
|
string_match_list_push(arena, &list, buffer, string_id, flags,
|
|
base_index + (j - (needle.size - 1)), needle.size);
|
|
if (list.count >= maximum_output_count){
|
|
break;
|
|
}
|
|
jump = jump_table.vals[n + 1];
|
|
u64 m = needle.size - jump;
|
|
u8 needle_m = needle.str[m];
|
|
current_r = character_predicate_check_character(*predicate, needle_m);
|
|
j -= jump;
|
|
}
|
|
|
|
}
|
|
else{
|
|
jump = jump_table.vals[n];
|
|
if (jump == 0){
|
|
current_r = character_predicate_check_character(*predicate, c);
|
|
|
|
jump_back_code = 1;
|
|
goto iterate_backward;
|
|
jump_back_1:
|
|
|
|
j -= 1;
|
|
}
|
|
else{
|
|
u64 m = needle.size - jump;
|
|
u8 needle_m = needle.str[m];
|
|
current_r = character_predicate_check_character(*predicate, needle_m);
|
|
j -= jump;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
string_list_reverse(&chunks);
|
|
|
|
return(list);
|
|
}
|
|
|
|
internal String_Match_List
|
|
find_all_matches(Arena *arena, i32 maximum_output_count,
|
|
List_String_Const_u8 chunks, String_Const_u8 needle,
|
|
u64_Array jump_table, Character_Predicate *predicate,
|
|
Scan_Direction direction,
|
|
u64 base_index, Buffer_ID buffer, i32 string_id){
|
|
String_Match_List list = {};
|
|
switch (direction){
|
|
case Scan_Forward:
|
|
{
|
|
list = find_all_matches_forward(arena, maximum_output_count,
|
|
chunks, needle, jump_table, predicate,
|
|
base_index, buffer, string_id);
|
|
}break;
|
|
|
|
case Scan_Backward:
|
|
{
|
|
list = find_all_matches_backward(arena, maximum_output_count,
|
|
chunks, needle, jump_table, predicate,
|
|
base_index, buffer, string_id);
|
|
}break;
|
|
}
|
|
return(list);
|
|
}
|
|
|
|
// BOTTOM
|
|
|