mr4th/src/euler_asm.asm

1839 lines
27 KiB
NASM

.code
; rcx - n
; n*(n + 1)/2
triangle_number PROC
mov rax, rcx ; rax = n
add rax, 1 ; rax = n + 1
mul rcx ; rax = (n + 1)*n
shr rax, 1 ; rax = (n + 1)*n/2
ret
triangle_number ENDP
; rcx - n
; n*(2n + 1)*(n + 1)/6
sum_of_squares PROC
; rdx = 2n + 1
lea rdx, [1 + rcx*2]
; r8 = n + 1
lea r8, [1 + rcx]
; rax = n*(2n + 1)*(n + 1)
mov rax, rcx
imul rax, rdx
imul rax, r8
; rax = n*(2n + 1)*(n + 1)/6
xor rdx,rdx
mov rcx,6
idiv rcx
ret
sum_of_squares ENDP
; rcx - dst, rdx - src
fibonacci_stepper_mul PROC
; rax = n1*k1
mov rax, [rcx]
imul rax, [rdx]
; r8 = n2*k2
mov r8, [rcx + 8]
imul r8, [rdx + 8]
; r9 = n1*k2
mov r9, [rcx]
imul r9, [rdx + 8]
; p1 = n1*k1 + n2*k2 = (rax + r8)
mov [rcx], rax
add [rcx], r8
; rax = n2*(k1 + k2)
mov rax, [rdx]
add rax, [rdx + 8]
imul rax, [rcx + 8]
; p2 = n1*k2 + n2*(k1 + k2) = (r9 + rax)
mov [rcx + 8], r9
add [rcx + 8], rax
ret
fibonacci_stepper_mul ENDP
; rcx - dst
fibonacci_stepper_sqr PROC
mov rax, [rcx] ; rax = n1
mov r8, [rcx + 8] ; r8 = n2
mov r9, rax
imul r9, r8 ; r9 = n1*n2
imul rax, rax ; rax = n1*n1
imul r8, r8 ; r8 = n2*n2
; p1 = n1*n1 + n2*n2
add rax, r8 ; rax = n1*n1 + n2*n2
mov [rcx], rax
; p2 = 2*n1*n2 + n2*n2
lea rax, [r8 + r9*2] ; rax = 2*n1*n2 + n2*n2
mov [rcx + 8], rax
ret
fibonacci_stepper_sqr ENDP
; rcx - n
fibonacci_number PROC FRAME
mov qword ptr [rsp + 8],rsi
.savereg rsi, 8
sub rsp, 64
.allocstack 64
.endprolog
mov qword ptr [rsp + 32], 1 ; a = [1, 0]
mov qword ptr [rsp + 40], 0
mov qword ptr [rsp + 48], 0 ; s = [0, 1]
mov qword ptr [rsp + 56], 1
; relocate n
mov rsi, rcx
; if n == 0 goto done;
test rsi, rsi
jz done
loop0:
; if (n&1) == 0 goto skip_accumulator;
test rsi, 1
jz skip_accumulator
; mul(a,s)
lea rcx, [rsp + 32]
lea rdx, [rsp + 48]
call fibonacci_stepper_mul
skip_accumulator:
; n = n >> 1
shr rsi, 1
; if n == 0 goto done;
jz done
; sqr(s)
lea rcx, [rsp + 48]
call fibonacci_stepper_sqr
; goto loop0;
jmp loop0
done:
mov rax, [rsp + 40]
add rsp, 64
mov rsi, qword ptr [rsp + 8]
ret
fibonacci_number ENDP
; rcx - n
; fib(n + 2) - 1
fibonacci_sigma PROC FRAME
sub rsp,32
.allocstack 32
.endprolog
add rcx, 2
call fibonacci_number
dec rax
add rsp,32
ret
fibonacci_sigma ENDP
; rcx - buffer (U64*)
; rdx - n (U64)
; r8 - m (U64)
; assumptions:
; buffer points to at least n + 1 U64s worth of free memory
; m <= n
choose__asm PROC
; m' = n - m
mov r9,rdx
sub r9,r8
; r = min(m, m'), c = max(m, m')
; r8 contains, r9 contains c
mov rax,r8
cmp r8,r9
cmova r8,r9
cmova r9,rax
; if (r == 0) goto return_1;
test r8,r8
jz return_1
; if (r == 1) goto return_n;
cmp r8,1
je return_n
; if (r & 1) goto r_odd_case;
test r8,1
jnz r_odd_case
;; EVEN CASE ;;
; i = 2
mov rax,2
; k = 6
mov r11,6
loop0_even:
; buffer[i] = k
mov qword ptr [rcx + rax*8],r11
; i += 1
inc rax
; k += i + 1
inc r11
add r11,rax
; if (i <= c) goto loop0_even;
cmp rax,r9
jbe loop0_even
; row = 4
mov rax,4
; goto loop1
jmp loop1
;; ODD CASE ;;
r_odd_case:
; i = 3
mov rax,3
; minor_slot = 10
mov rdx,10
; k = 20
mov r11,20
loop0_odd:
; buffer[i] = k
mov qword ptr [rcx + rax*8],r11
; i += 1
inc rax
; minor_slot += i + 1
inc rdx
add rdx,rax
; k += minor_slot
add r11,rdx
; if (i <= c) goto loop0_odd;
cmp rax,r9
jbe loop0_odd
; row = 5
mov rax,5
loop1:
; if (row > r) goto return_buffer_c
cmp rax,r8
ja return_buffer_c
; col = row
mov r10,rax
;; initialize the loop
mov r11,qword ptr [rcx + r10*8 - 8]
shl r11,1
add r11,qword ptr [rcx + r10*8]
mov rdx,r11
shl rdx,1
; buffer[col] = major_slot
mov qword ptr [rcx + r10*8],rdx
loop2:
; col += 1
inc r10
; if (col > c) goto loop2_done;
cmp r10,r9
ja loop2_done
; minor_slot += buffer[col]
add r11,qword ptr [rcx + r10*8]
; major_slot += minor_slot
add rdx,r11
; buffer[col] = major_slot
mov qword ptr [rcx + r10*8],rdx
; goto loop2
jmp loop2
loop2_done:
; row += 2
add rax,2
; goto loop1
jmp loop1
return_buffer_c:
mov rax,qword ptr [rcx + r9*8]
ret
return_1:
mov rax,1
ret
return_n:
mov rax,rdx
ret
choose__asm ENDP
; rcx - table (U64*)
; rdx - count (U64)
fill_factorial_table PROC
; i = 0
xor rax,rax
; n = 1
mov r8,1
;
loop0:
; if (i >= count) goto done
cmp rax,rdx
jae done
; table[i] = n
mov [rcx + rax*8],r8
; i += 1
; n *= i + 1
add rax,2
imul r8,rax
dec rax
; goto loop0
jmp loop0
done:
ret
fill_factorial_table ENDP
; rcx - table_memory (B8*)
; rdx - primes_list (U32*)
; r8d - first (U32)
; r9d - opl (U32)
; [rsp + 0x28] - node (ListNode_U32*)
prime_sieve__asm PROC FRAME
push r15
.pushreg r15
push r14
.pushreg r14
push r13
.pushreg r13
push r12
.pushreg r12
sub rsp,16
.allocstack 16
.endprolog
; extend first and opl to 64-bits
mov r8d,r8d
mov r9d,r9d
; save primes list at [rsp]
mov [rsp],rdx
; save counter at [rsp + 1]
mov qword ptr [rsp+8],0
; move node to r15
mov r15, [rsp + 58h]
; if (node != 0) goto skip_emit_2;
test r15,r15
jnz skip_emit_2;
; emit 2
mov dword ptr [rdx],2
mov qword ptr [rsp+8],1
skip_emit_2:
; M' = (first even)?(first + 1):(first);
or r8d,1
; move opl into r12
mov r12,r9
; ibound = (opl - M' + 1)/2
sub r9,r8
inc r9
shr r9,1
; table_memory_bound = table_memory + ibound
mov r13,rcx
add r13,r9
mixing0:
; if (node == 0) goto done_mixing;
test r15,r15
jz done_mixing
; ptr = node->array.v
mov r14, [r15 + 8]
; ptr_opl = node->array.v + node->array.count
mov r10, [r15 + 16]
lea r10, [r14 + r10*4]
mixing1:
; if (ptr >= ptr_opl) goto mixing0_next;
cmp r14,r10
jae mixing0_next
; p = *ptr
mov r11d, dword ptr [r14]
; if (p == 2) goto mixing1_next;
cmp r11,2
je mixing1_next;
; if (p*p >= opl) goto done_mixing;
mov rax,r11
imul rax,rax
cmp rax,r12
jae done_mixing
; k = floor((M' + p - 1)/p)
mov rax,r8
add rax,r11
dec rax
xor rdx,rdx
idiv r11
; if (k < p) k = p;
cmp rax,r11
cmovl rax,r11
; if (k even) k += 1;
or rax,1
; x = k*p
imul rax,r11
; j = (x - M')/2
sub rax,r8
shr rax,1
; ptr = table_memory + j
add rax, rcx
mixing2:
; if (ptr >= table_memory_bound) goto mixing1_next;
cmp rax,r13
jae mixing1_next;
; mark x in the table
mov byte ptr [rax],1
; j += p, goto mixing2
add rax,r11
jmp mixing2
mixing1_next:
; increment r14
add r14,4
jmp mixing1
mixing0_next:
; node = node->next
mov r15,[r15]
jmp mixing0
done_mixing:
; retrieve base of primes_list
mov rdx,[rsp]
; retrieve counter
mov rax,[rsp+8]
; ptr = table_memory
mov r12, rcx
scanning0:
; mark = *ptr
movzx r15, byte ptr [r12]
; if (mark != 0) goto scanning0_next;
test r15,r15
jnz scanning0_next;
; j = ptr - table_memory
mov r14, r12
sub r14, rcx
; p = M' + 2*j
lea r14, [r8 + r14*2]
; emit p
mov dword ptr [rdx + rax*4], r14d
inc rax
; j' = (p*p - M')/2
mov r10,r14
imul r10,r10
sub r10,r8
shr r10,1
; ptr' = table_memory + j'
add r10, rcx
scanning1:
; if (ptr' >= table_memory_bound) goto scanning0_next;
cmp r10,r13
jae scanning0_next;
; *(ptr') = 1
mov byte ptr [r10], 1
; ptr' += p
add r10,r14
jmp scanning1
scanning0_next:
; ptr += 1
inc r12
; if (ptr < table_memory_bound) goto scanning0;
cmp r12,r13
jb scanning0
add rsp,16
pop r12
pop r13
pop r14
pop r15
ret
prime_sieve__asm ENDP
; rcx - table_memory (U32*)
; rdx - count (U32)
; assumptions:
; rcx is a pointer to 4*edx bytes, cleared to zero
factorization_table__asm PROC
; zero extend count to 64-bit
mov edx,edx
; i = 0
xor rax,rax
loop0:
; if (i >= count) goto done;
cmp rax,rdx
jae done
; f = table_memory[i]
mov r9d, dword ptr [rcx + rax*4]
; if (f != 0) goto loop0_next
test r9,r9
jnz loop0_next
; n = 3 + 2*i
lea r8, [3 + rax*2]
; table_memory[i] = n
mov dword ptr [rcx + rax*4], r8d
; r9 = n*n
mov r9,r8
imul r9,r9
; j = (n*n - 3)/2
sub r9,3
shr r9,1
loop1:
; if (j >= count) goto loop0_next;
cmp r9,rdx
jae loop0_next
; table_memory[j] = n
mov dword ptr [rcx + r9*4], r8d
; increment
add r9,r8
jmp loop1
loop0_next:
; increment
inc rax
jmp loop0
done:
ret
factorization_table__asm ENDP
; rcx - fact_tbl (U32*)
; edx - count (U32)
; r8d - n (U32)
get_prime_factor PROC
; if (n even) goto return_2;
test r8d,1
jz return_2
; if (n == 1) goto return_1;
cmp r8d,1
je return_1
; j = (n - 3)/2
sub r8d,3
shr r8d,1
; if (j >= count) goto return_0;
cmp r8d,edx
jae return_0
; f = fact_tbl[j]
mov eax, dword ptr [rcx + r8*4]
ret
return_2:
mov rax,2
ret
return_1:
mov rax,1
ret
return_0:
xor rax,rax
ret
get_prime_factor ENDP
; ecx - n
; edx - f
max_divide PROC
; move f into r8
mov r8d,edx
; r = 0
xor r9d,r9d
loop0:
; compute n/f and n%f
xor edx,edx
mov eax,ecx
idiv r8d
; if ((remainder) != 0) goto done;
test edx,edx
jnz done
; r += 1
inc r9d
; n = (quotient)
mov ecx,eax
jmp loop0
done:
mov eax,ecx
shl rax,32
or rax,r9
ret
max_divide ENDP
; rcx - fact_tbl (U32*)
; edx - count (U32)
; r8d - n (U32)
divisor_count PROC FRAME
push rsi
.pushreg rsi
push rdi
.pushreg rdi
push r12
.pushreg r12
push r13
.pushreg r13
sub rsp,32
.allocstack 32
.endprolog
; move fact_tbl to rsi
mov rsi, rcx
; move count to edi
mov edi, edx
; move n to r12d
mov r12d, r8d
; accum = 1
mov r13d,1
loop0:
; if (n <= 1) goto done;
cmp r12d,1
jbe done
; call get_prime_factor(fact_tbl, count, n)
mov rcx, rsi
mov edx, edi
mov r8d, r12d
call get_prime_factor
; f is in rax
; call max_divide(n, f)
mov ecx, r12d
mov edx, eax
call max_divide
; {r,n'} are in rax
; accum *= (r + 1)
mov ecx,eax
inc ecx
imul r13d,ecx
; n = n'
shr rax,32
mov r12d,eax
jmp loop0
done:
mov eax,r13d
add rsp,32
pop r13
pop r12
pop rdi
pop rsi
ret
divisor_count ENDP
; rcx - fact_tbl (U32*)
; edx - count (U32)
; r8d - n (U32)
divisor_sum PROC FRAME
push rsi
.pushreg rsi
sub rsp,32
.allocstack 32
.endprolog
; save params
mov qword ptr [rsp + 48],rcx
mov dword ptr [rsp + 56],edx
mov dword ptr [rsp + 64],r8d
; accum = 1
mov esi,1
loop0:
; recover n from stack
mov r8d, dword ptr [rsp + 64]
; if (n <= 1) goto done0
cmp r8d,1
jbe done0
; call get_prime_factor(fact_tbl, count, n)
mov rcx, qword ptr [rsp + 48]
mov edx, dword ptr [rsp + 56]
; r8d is already set to n
call get_prime_factor
; move p to rcx
mov rcx,rax
; zero rdx
xor rdx,rdx
; move n to rax
mov eax, dword ptr [rsp + 64]
; n,r = n div p
div rcx
; p_power = p
mov r8,rcx
; p_sum = 1 + p
lea r9,[1 + rcx]
loop1:
; good_n = n
mov r10d,eax
; n,r = n div p
div rcx
; if (r != 0) goto done1
test rdx,rdx
jnz done1
; p_power *= p
imul r8,rcx
; p_sum += p_power
add r9,r8
jmp loop1
done1:
; move n back to stack memory
mov dword ptr [rsp + 64],r10d
; accum *= p_sum
imul rsi,r9
jmp loop0
done0:
mov rax,rsi
add rsp,32
pop rsi
ret
divisor_sum ENDP
; rcx - return pointer (Pair_U64*)
; rdx - min (U64)
; r8 - max (U64)
; r9 - target_product (U64)
; assumptions:
; rdx < r8
; rdx*rdx <= target_product <= max*max
find_bounded_factors PROC
; clear result to zero
mov qword ptr [rcx], 0
mov qword ptr [rcx+8], 0
; move min to r11
mov r11, rdx
; product = min*max
mov r10, r11
imul r10, r8
; if (product == target_product) goto return_min_max;
; if (product < target_product) goto too_small;
; goto too_big
cmp r10, r9
jz return_min_max
jb too_small
jmp too_big
too_small:
; increase min so that:
; min*max >= target_product & (min - 1)*max < target_product
; min >= target_product/max & min < target_product/max + 1
; target_product/max <= min < target_product/max + 1
; min <- ceil(target_product/max)
; q,r = target_product div max
xor rdx,rdx
mov rax, r9
idiv r8
; if (r == 0) goto too_small0;
; goto too_small1;
test rdx,rdx
jz too_small0
too_small1:
; min = floor((0:target_product)/max)
mov r11, rax
inc r11
; if (min > max) goto return;
cmp r11, r8
ja return
jmp too_big
too_small0:
; min = floor((0:target_product)/max)
mov r11, rax
; if (min > max) goto return;
cmp r11, r8
ja return
jmp return_min_max
too_big:
; decrease max so that:
; min*max <= target_product & min*(max + 1) > target_product
; max <= target_product/min & max > (target_product/min) - 1
; (target_product/min) - 1 < max <= target_product/min
; max <- floor(target_product/min)
; max = floor((0:target_product)/min)
xor rdx,rdx
mov rax, r9
idiv r11
mov r8, rax
; if (min > max) goto return;
cmp r11, r8
ja return
; if ((0:target_product) == 0) goto return_min_max;
; goto too_small;
test rdx, rdx
jz return_min_max
jmp too_small
return_min_max:
mov [rcx], r11
mov [rcx+8], r8
return:
mov rax, rcx
ret
find_bounded_factors ENDP
; rcx - A (U64)
; rdx - B (U64)
gcd_euclidean PROC
; transfer B into r8
mov r8, rdx
; if (A <= B) goto loop0;
cmp rcx,r8
jbe loop0
; swap
mov r8, rcx
mov rcx, rdx
loop0:
; if (A == 0) goto return
test rcx,rcx
jz return
; euclidean_step
mov rax, r8
xor rdx, rdx
idiv rcx
mov r8, rcx
mov rcx, rdx
jmp loop0
return:
mov rax, r8
ret
gcd_euclidean ENDP
; rcx - A (U64)
; rdx - B (U64)
lcm_euclidean PROC FRAME
mov qword ptr [rsp + 8],rsi
.savereg rsi,8
sub rsp,32
.allocstack 32
.endprolog
; rsi = A*B
mov rsi,rcx
imul rsi,rdx
; call gcd(...)
call gcd_euclidean
; rcx = gcd(...)
mov rcx,rax
; rax = A*B/gcd(...)
mov rax, rsi
xor rdx, rdx
idiv rcx
add rsp,32
mov rsi,qword ptr [rsp + 8]
ret
lcm_euclidean ENDP
; rcx - d (U64)
dec_repeating_cycle_score PROC
; if (d == 0) goto return_0;
test rcx,rcx
jz return_0
;; divide out all factors of 2
loop2:
; if (d & 1) goto loop2_done;
test rcx,1
jnz loop2_done
; d /= 2
shr rcx,1
jmp loop2
loop2_done:
;; divide out all factors of 5
xor rdx,rdx
mov rax,rcx
loop5:
mov rcx,rax
; d /= 5
mov r8,5
div r8
; if (remainder != 0) goto loop5_done
test rdx,rdx
jnz loop5_done
jmp loop5
loop5_done:
; if (d == 1) goto return_0;
cmp rcx,1
je return_0
; s = 1
mov r8,1
; n = 10
mov r9,10
loop9:
; m = n - 1
mov rax,r9
dec rax
; m div d
xor rdx,rdx
div rcx
; if (remainder == 0) goto loop9_done
test rdx,rdx
jz loop9_done
; s += 1
inc r8
; if (s >= 18) assert
cmp r8,18
jb normal
int 3
normal:
; n *= 10
imul r9,10
jmp loop9
loop9_done:
mov rax,r8
ret
return_0:
xor rax,rax
ret
dec_repeating_cycle_score ENDP
; rcx - return pointer (EulerData*)
; rdx - text (String8*)
; r8 - out (U8*)
euler_data_from_text__asm PROC
; store og return pointer for later
mov [rsp + 8],rcx
; result.v = out
mov [rcx],r8
; ptr = text.str
mov rax,[rdx]
; opl = ptr + text.size
mov rdx,[rdx + 8]
lea rdx,[rax + rdx]
; line_count = 0
xor r10,r10
loop0:
; if (ptr >= opl) goto done0;
cmp rax,rdx
jae done0
; c = *ptr
movzx rcx, byte ptr [rax]
; if (c == '\n') goto inc_line_counter;
cmp rcx,0ah
je inc_line_counter
; d = c - '0'
sub rcx,'0'
; if (d <= 9) goto process_digit;
cmp rcx,9
jbe process_digit
jmp loop0_next
inc_line_counter:
; line_count += 1
inc r10
jmp loop0_next
process_digit:
; *out_ptr = d, out_ptr += 1
mov [r8],rcx
inc r8
loop0_next:
; increment (ptr += 1, goto loop0)
inc rax
jmp loop0
done0:
mov rax,[rsp + 8]
; result.count = out_ptr - out;
mov rdx,[rax]
sub r8,rdx
mov [rax + 8],r8
; result.line_count = line_count;
mov [rax + 16],r10
ret
euler_data_from_text__asm ENDP
; rcx - return pointer (EulerData*)
; rdx - text (String8*)
; r8 - out (U8*)
euler_data_from_text_2dig__asm PROC
; store og return pointer for later
mov [rsp + 8],rcx
; result.v = out
mov [rcx],r8
; ptr = text.str
mov rax,[rdx]
; opl = ptr + text.size
mov rdx,[rdx + 8]
lea rdx,[rax + rdx]
; line_count = 0
xor r10,r10
; dig_count = 0
xor r9,r9
; accum = 0
xor r11,r11
loop0:
; if (ptr >= opl) goto done0;
cmp rax,rdx
jae done0
; c = *ptr
movzx rcx, byte ptr [rax]
; if (c == '\n') goto inc_line_counter;
cmp rcx,0ah
je inc_line_counter
; d = c - '0'
sub rcx,'0'
; if (d <= 9) goto process_digit;
cmp rcx,9
jbe process_digit
jmp loop0_next
inc_line_counter:
; line_count += 1
inc r10
jmp loop0_next
process_digit:
; accum *= 10
imul r11,10
; accum += d
add r11,rcx
; dig_count += 1
inc r9
; if (dig_count != 2) goto loop0_next
cmp r9,2
jne loop0_next
; *out_ptr = accum, out_ptr += 1
mov [r8],r11
inc r8
; accum = 0, dig_count = 0
xor r11,r11
xor r9,r9
loop0_next:
; increment (ptr += 1, goto loop0)
inc rax
jmp loop0
done0:
mov rax,[rsp + 8]
; result.count = out_ptr - out;
mov rdx,[rax]
sub r8,rdx
mov [rax + 8],r8
; result.line_count = line_count;
mov [rax + 16],r10
ret
euler_data_from_text_2dig__asm ENDP
; rcx - result pointer (U64x3*)
; rdx - a (U64x3*)
; r8 - b (U64x3*)
add_u64x3 PROC
; c0,r0 = a0 + b0
xor r9,r9
mov rax, qword ptr [rdx]
add rax, qword ptr [r8]
setc r9b
; result[0] = r0
mov qword ptr [rcx], rax
; /c1,/r1 = a1 + b1
xor r10,r10
mov rax, qword ptr [rdx + 8]
add rax, qword ptr [r8 + 8]
setc r10b
; ^c1,r1 = /r1 + c0
xor r11,r11
add rax, r9
setc r11b
; result[1] = r1
mov qword ptr [rcx + 8], rax
; c1 = ^c1 | /c1
or r10,r11
; /r2 = a2 + b2
; r2 = /r2 + c1
mov rax, qword ptr [rdx + 16]
add rax, qword ptr [r8 + 16]
add rax, r10
; result[2] = r2
mov qword ptr [rcx + 16], rax
; return the result pointer
mov rax, rcx
ret
add_u64x3 ENDP
; rcx - result pointer (U64x3*)
; rdx - a (U64x3*)
; r8 - b (U64)
add_small_u64x3 PROC
; c0,r0 = a0 + b
xor r9,r9
mov rax, qword ptr [rdx]
add rax, r8
setc r9b
; result[0] = r0
mov qword ptr [rcx], rax
; /r1 = a1
mov rax, qword ptr [rdx + 8]
; c1,r1 = /r1 + c0
xor r10,r10
add rax, r9
setc r10b
; result[1] = r1
mov qword ptr [rcx + 8], rax
; /r2 = a2
mov rax, qword ptr [rdx + 16]
; r2 = /r2 + c1
add rax, r10
; result[2] = r2
mov qword ptr [rcx + 16], rax
; return the result pointer
mov rax, rcx
ret
add_small_u64x3 ENDP
; rcx - result pointer (U64x3*)
; rdx - a (U64x3*)
; r8 - b (U64)
mul_small_u64x3 PROC
; move a to r9
mov r9,rdx
; c0,r0 = a0*b (c0:rdx, r0:rax)
mov rax, qword ptr [r9]
mul r8
; result[0] = r0
mov qword ptr [rcx], rax
; move c0 to r10
mov r10, rdx
; /c1,/r1 = a1*b (/c1:rdx, /r1:rax)
mov rax, qword ptr [r9 + 8]
mul r8
; ^c1,r1 = /r1 + c0
xor r11,r11
add rax,r10
setc r11b
; result[1] = r1
mov qword ptr [rcx + 8], rax
; c1 = /c1 + ^c1
add r11, rdx
; /r2 = a2*b
mov rax, qword ptr [r9 + 16]
mul r8
; r2 = /r2 + c1
add rax,r11
; result[2] = r2
mov qword ptr [rcx + 16], rax
; return the result pointer
mov rax, rcx
ret
mul_small_u64x3 ENDP
; rcx - result pointer (U64x3_DivR*)
; rdx - n (U64x3*)
; r8 - d (U64)
div_small_u64x3 PROC
; move n into r9
mov r9,rdx
; (q2,r2) = n2 div d
mov rax, qword ptr [r9 + 16]
xor rdx,rdx
div r8
; result[2] = q2
mov qword ptr [rcx + 16], rax
; (q1,r1) = r2:n1 div d
mov rax, qword ptr [r9 + 8]
;; note: move r2 into high bits is a noop (rdx->rdx)
div r8
; result[1] = q1
mov qword ptr [rcx + 8], rax
; (q0,r0) = r1:n0 div d
mov rax, qword ptr [r9]
;; note: move r2 into high bits is a noop (rdx->rdx)
div r8
; result[0] = q0
mov qword ptr [rcx], rax
; result.r = r0
mov qword ptr [rcx + 24], rdx
; return the result pointer
mov rax,rcx
ret
div_small_u64x3 ENDP
; rcx - result pointer (U64x3*)
; rdx - digits (U8*)
; r8 - count (U64)
u64x3_from_dec PROC FRAME
push r12
.pushreg r12
push r13
.pushreg r13
push r14
.pushreg r14
push rbx
.pushreg rbx
sub rsp,32
.allocstack 32
.endprolog
; clear result to zer0
mov qword ptr [rcx],0
mov qword ptr [rcx + 8],0
mov qword ptr [rcx + 16],0
; move result pointer to rbx
mov rbx,rcx
; ptr = digits
mov r12, rdx
; opl = ptr + count
mov r13,r12
add r13,r8
loop0_begin:
; if (ptr >= opl) goto done
cmp r12,r13
jae done
; x = 0
xor r14,r14
; m = 1
mov r8,1
loop0:
; x *= 10, x += *ptr
imul r14,10
movzx rax, byte ptr [r12]
add r14, rax
; m *= 10
imul r8,10
; ptr += 1
inc r12
; if (m >= 1000000000000000000) goto loop1
mov rdx,1000000000000000000
cmp r8,rdx
jae loop1
; if (ptr >= opl) goto loop1
cmp r12,r13
jae loop1
; goto loop0
jmp loop0
loop1:
; call mul_small_u64x3(result pointer, result pointer, m)
mov rcx,rbx
mov rdx,rbx
;mov r8,r8
call mul_small_u64x3
; call add_small_u64x3(result pointer, result pointer, x)
mov rcx,rbx
mov rdx,rbx
mov r8,r14
call add_small_u64x3
jmp loop0_begin
done:
; return result pointer
mov rax, rbx
add rsp,32
pop rbx
pop r14
pop r13
pop r12
ret
u64x3_from_dec ENDP
; rcx - buffer (U8*)
; rdx - x (U64x3*)
dec_from_u64x3__asm PROC FRAME
push r12
.pushreg r12
push r13
.pushreg r13
push r14
.pushreg r14
sub rsp,64
.allocstack 64
.endprolog
; move buffer into r12
mov r12,rcx
; ptr = buffer
mov r13,r12
; move x to rsp + 32 "by value"
mov rax,[rdx]
mov [rsp + 32],rax
mov rax,[rdx + 8]
mov [rsp + 40],rax
mov rax,[rdx + 16]
mov [rsp + 48],rax
loop0:
; if (x[0] == 0 && x[1] == 0 && x[2] == 0) goto finish;
mov rax,[rsp + 32]
or rax,[rsp + 40]
or rax,[rsp + 48]
test rax,rax
jz finish
; call div_small_u64x3(div-result, x, 10)
lea rcx,[rsp + 32]
lea rdx,[rsp + 32]
mov r8,10
call div_small_u64x3
; d = div_result.r
mov rcx,[rsp + 56]
; *ptr = d
mov byte ptr [r13], cl
; ptr += 1
inc r13
; goto loop0
jmp loop0
finish:
; set return value to (ptr - buffer)
mov rax,r13
sub rax,r12
; ptra = ptr
; ptrb = buffer
; (ptra: r12, ptrb: r13)
; ptrb -= 1
dec r13
reverse_loop:
; if (ptra >= ptrb) goto done
cmp r12, r13
jae done
; swap
mov dl, byte ptr [r12]
mov cl, byte ptr [r13]
mov byte ptr [r12], cl
mov byte ptr [r13], dl
; ptra += 1, ptrb -= 1
inc r12
dec r13
; goto reverse_loop
jmp reverse_loop
done:
add rsp,64
pop r14
pop r13
pop r12
ret
dec_from_u64x3__asm ENDP
; rcx - a (U64xN*)
; rdx - b (U64)
; assumption: a has an extra slot of memory available
add_small_in_place_u64xn PROC
; size = a.size
mov r11, [rcx]
; if (size == 0) goto simple_case;
test r11,r11
jz simple_case
; c,r0 = a[0] + b
xor r9,r9
mov rax, qword ptr [rcx + 8]
add rax, rdx
setc r9b
; a[0] = r0
mov qword ptr [rcx + 8], rax
; ptr = &a[1]
lea rdx, [rcx + 16]
; opl = &a[size]
lea r8, [8 + rcx + r11*8]
; *opl = 0
mov qword ptr [r8],0
loop0:
; if (c == 0) goto loop0_done
test r9b,r9b
jz loop0_done
; c',r = *ptr + c
xor r10,r10
mov rax, qword ptr [rdx]
add rax, r9
setc r10b
; *ptr = r
mov qword ptr [rdx], rax
; ptr += 1
add rdx,8
; c = c'
mov r9,r10
jmp loop0
loop0_done:
; if (*opl == 0) goto skip_inc_size;
mov rax, [r8]
test rax,rax
jz skip_inc_size
inc r11
mov [rcx],r11
skip_inc_size:
ret
simple_case:
mov qword ptr [rcx], 1
mov [rcx + 8], rdx
ret
add_small_in_place_u64xn ENDP
; rcx - a (U64xN*)
; rdx - b (U64)
; assumption: a has an extra slot of memory available
mul_small_in_place_u64xn PROC FRAME
mov [rsp + 8],r12
.savereg r12, 8
.endprolog
; move b into r9
mov r9,rdx
; f = 0
xor r8,r8
; i = 0
xor r10,r10
; save a.size in r12
mov r12,[rcx]
loop0:
; if (i >= a.size) goto done
cmp r10,r12
jae done
; l,h = a[i]*b
mov rax, qword ptr [rcx + r10*8 + 8]
mul r9
; c,r = l + f
xor r11,r11
add rax,r8
setc r11b
; a[i] = r
mov qword ptr [rcx + r10*8 + 8], rax
; f = h + c
mov r8,r11
add r8,rdx
; i += 1
inc r10
; goto loop0
jmp loop0
done:
; if (f == 0) goto return;
test r8,r8
jz return
; a[a.size] = f
mov qword ptr [rcx + r12*8 + 8], r8
; a.size += 1
inc r12
mov qword ptr [rcx],r12
return:
mov r12,[rsp + 8]
ret
mul_small_in_place_u64xn ENDP
; rcx - n (U64xN*)
; rdx - d (U64)
div_small_in_place_u64xn PROC
; move d into r8
mov r8,rdx
; r = 0
xor rdx,rdx
; i = n.size
mov r9, qword ptr [rcx]
loop0:
; if (i == 0) goto done;
test r9,r9
jz done
; i -= 1
dec r9
; q,r = r:n[i] div d
mov rax, qword ptr [rcx + r9*8 + 8]
div r8
; n[i] = q
mov qword ptr [rcx + r9*8 + 8], rax
; goto loop0
jmp loop0
done:
; if (n.size > 0 && n[n.size - 1] == 0) n.size -= 1
mov r9, qword ptr [rcx]
test r9,r9
jz return
mov r8, qword ptr [rcx + r9*8]
test r8,r8
jnz return
dec r9
mov qword ptr [rcx], r9
return:
; set result value to last remainder
mov rax,rdx
ret
div_small_in_place_u64xn ENDP
; rcx - buffer (U8*)
; rdx - x (U64xN*) (we're allowed to modify this in place)
dec_from_u64xn__asm PROC FRAME
push r12
.pushreg r12
push r13
.pushreg r13
push r14
.pushreg r14
sub rsp,32
.allocstack 32
.endprolog
; move buffer into r12
mov r12,rcx
; ptr = buffer
mov r13,r12
; move x to r14
mov r14,rdx
loop0:
; if (*x == 0) goto finish;
mov rax, qword ptr [r14]
test rax,rax
jz finish
; call div_small_in_place_u64xn(x, 10)
mov rcx,r14
mov rdx,10
call div_small_in_place_u64xn
; *ptr = remainder
mov byte ptr [r13], al
; ptr += 1
inc r13
; goto loop0
jmp loop0
finish:
; set return value to (ptr - buffer)
mov rax,r13
sub rax,r12
; ptra = ptr
; ptrb = buffer
; (ptra: r12, ptrb: r13)
; ptrb -= 1
dec r13
reverse_loop:
; if (ptra >= ptrb) goto done
cmp r12, r13
jae done
; swap
mov dl, byte ptr [r12]
mov cl, byte ptr [r13]
mov byte ptr [r12], cl
mov byte ptr [r13], dl
; ptra += 1, ptrb -= 1
inc r12
dec r13
; goto reverse_loop
jmp reverse_loop
done:
add rsp,32
pop r14
pop r13
pop r12
ret
dec_from_u64xn__asm ENDP
END