From 93fed26a19968a0c3b5f1f0719f6ddc94ff8622a Mon Sep 17 00:00:00 2001 From: zerico <71151164+ZERICO2005@users.noreply.github.com> Date: Mon, 16 Mar 2026 16:03:00 -0600 Subject: [PATCH 1/6] converted division by a power of two into a bitshift --- src/fatdrvce/fatdrvce.asm | 49 ++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/src/fatdrvce/fatdrvce.asm b/src/fatdrvce/fatdrvce.asm index a22afb01e..59455be87 100644 --- a/src/fatdrvce/fatdrvce.asm +++ b/src/fatdrvce/fatdrvce.asm @@ -2495,32 +2495,49 @@ util_get_cluster_offset: ;------------------------------------------------------------------------------- util_cluster_entry_to_block: - ld e,a - xor a,a - ld bc,128 - call ti._ldivu - ld bc,(yfat.fat_pos) + ; input: + ; - A:UHL + ; output: + ; - A:UHL = floor(A:UHL / 128) + (yfat.fat_pos) + ; destroys: + ; - BC, flags + push hl + pop bc + ld l,7 + call ti._lshru + ld hl,(yfat.fat_pos) add hl,bc - adc a,e + adc a,0 ret ;------------------------------------------------------------------------------- util_ceil_byte_size_to_block_size: + ; input: + ; - A:UHL + ; output: + ; - A:UHL = ceil(A:UHL / 512) + ; - A = 0 + ; destroys: + ; - BC, flags compare_auhl_zero ret z - ld e,a - push hl,de - xor a,a - ld bc,512 - push bc - call ti._lremu - compare_hl_zero - pop bc,de,hl + ; test if the low 9 bits are non-zero + inc l + dec l + jr nz,.round_up + bit 0,h +.round_up: push af - xor a,a - call ti._ldivu + push hl + pop bc + ld l,9 + call ti._lshru + push bc + pop hl pop af + ld a,0 ret z + ; round up inc hl ret From 65392f1276a19754035d34d7374536b4a54b9c94 Mon Sep 17 00:00:00 2001 From: zerico <71151164+ZERICO2005@users.noreply.github.com> Date: Sat, 18 Apr 2026 21:23:28 -0600 Subject: [PATCH 2/6] optimize ceil(A:UHL / 512) (runner112) Co-authored-by: Zachary Wassall --- src/fatdrvce/fatdrvce.asm | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/fatdrvce/fatdrvce.asm b/src/fatdrvce/fatdrvce.asm index 59455be87..54704ee15 100644 --- a/src/fatdrvce/fatdrvce.asm +++ b/src/fatdrvce/fatdrvce.asm @@ -2519,25 +2519,26 @@ util_ceil_byte_size_to_block_size: ; - A = 0 ; destroys: ; - BC, flags - compare_auhl_zero + + ; if A:UHL == 0, abort + compare_hl_zero + jr nz, .dec_hl + or a, a ret z - ; test if the low 9 bits are non-zero - inc l - dec l - jr nz,.round_up - bit 0,h -.round_up: - push af + ; A:UHL-- + dec a +.dec_hl: + dec hl + + ; A:UHL /= 512 push hl pop bc ld l,9 call ti._lshru push bc pop hl - pop af - ld a,0 - ret z - ; round up + + ; A:UHL++ inc hl ret From 57d23556d429e873cb484d4c04dfcdabb55aae30 Mon Sep 17 00:00:00 2001 From: Brendan Fletcher Date: Sun, 19 Apr 2026 16:05:03 -0400 Subject: [PATCH 3/6] inline constant shifts, convert more divisions by blocks_per_cluster into shifts --- src/fatdrvce/fatdrvce.asm | 119 +++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 54 deletions(-) diff --git a/src/fatdrvce/fatdrvce.asm b/src/fatdrvce/fatdrvce.asm index 54704ee15..9f1d448e0 100644 --- a/src/fatdrvce/fatdrvce.asm +++ b/src/fatdrvce/fatdrvce.asm @@ -818,7 +818,7 @@ fat_SetFileSize: ret .writegood: ld a,hl,(yfat.working_size) - call util_ceil_byte_size_to_blocks_per_cluster + call util_ceil_byte_size_to_cluster_size pop iy ret @@ -911,20 +911,21 @@ fat_SetFileBlockOffset: ld hl,(yfatFile.block_index) .followchain: ld c,(yfatFile.blocks_per_cluster) - xor a,a - ld b,24 + ld a,c + dec a + and a,l + ld (.cluster_block),a + ld a,c .divloop: add hl,hl rla - cp a,c - jr c,.divskip - sub a,c - inc l -.divskip: - djnz .divloop - ld (.cluster_block),a + jr nc,.divloop + push af + inc sp push hl + inc sp pop bc + inc sp ld a,hl,(yfatFile.current_cluster) jr .entergetpos .getclusterpos: @@ -2501,13 +2502,22 @@ util_cluster_entry_to_block: ; - A:UHL = floor(A:UHL / 128) + (yfat.fat_pos) ; destroys: ; - BC, flags + + ; shift A:UHL left by 1 and shift the 33-bit result right by 8 + add hl,hl + rla + push af + inc sp push hl - pop bc - ld l,7 - call ti._lshru - ld hl,(yfat.fat_pos) + inc sp + pop hl + inc sp + ccf + sbc a,a + ; the increment to set A to 0 or 1 is deferred until the adc below + ld bc,(yfat.fat_pos) add hl,bc - adc a,0 + adc a,1 ret ;------------------------------------------------------------------------------- @@ -2516,53 +2526,54 @@ util_ceil_byte_size_to_block_size: ; - A:UHL ; output: ; - A:UHL = ceil(A:UHL / 512) - ; - A = 0 + ; - A = BCU = B = 0 ; destroys: ; - BC, flags - ; if A:UHL == 0, abort - compare_hl_zero - jr nz, .dec_hl - or a, a - ret z - ; A:UHL-- - dec a -.dec_hl: - dec hl - - ; A:UHL /= 512 + ; add 511 to A:UHL and shift the 33-bit result right by 9 + ld bc,511 + add hl,bc + dec b + adc a,b + rra + push af + inc sp push hl - pop bc - ld l,9 - call ti._lshru - push bc + inc sp pop hl - - ; A:UHL++ - inc hl + inc sp + rr h + rr l + xor a,a ret ;------------------------------------------------------------------------------- -util_ceil_byte_size_to_blocks_per_cluster: - compare_auhl_zero - ret z - push af,hl - xor a,a - sbc hl,hl - ld h,(yfat.blocks_per_cluster) +util_ceil_byte_size_to_cluster_size: + ; input: + ; - A:UHL + ; - IY = fat struct + ; output: + ; - UHL = ceil(A:UHL / (512 * blocks_per_cluster)) + ; destroys: + ; - A, BC, flags + + ; First, get ceiling block size + call util_ceil_byte_size_to_block_size + + ; Now, get ceiling cluster size by adding (blocks_per_cluster-1) and dividing by blocks_per_cluster + ld c,(yfat.blocks_per_cluster) + ld a,c + dec c + ; Note: ceiling block size is at most $800000 so this addition cannot overflow + add hl,bc +.loop: add hl,hl - push hl - pop bc - pop hl,de - ld e,d - push hl,de,bc - call ti._lremu - compare_hl_zero - pop bc,de,hl + rla + jr nc,.loop push af - xor a,a - call ti._ldivu - pop af - ret z - inc hl + inc sp + push hl + inc sp + pop hl + inc sp ret From f75a43f660716c195eb3e7b66c386f0762eb018c Mon Sep 17 00:00:00 2001 From: Brendan Fletcher Date: Sun, 19 Apr 2026 16:06:18 -0400 Subject: [PATCH 4/6] miscellaneous optimizations for blocks_per_cluster modulo operations --- src/fatdrvce/fatdrvce.asm | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/src/fatdrvce/fatdrvce.asm b/src/fatdrvce/fatdrvce.asm index 9f1d448e0..ce41cc83f 100644 --- a/src/fatdrvce/fatdrvce.asm +++ b/src/fatdrvce/fatdrvce.asm @@ -858,7 +858,7 @@ fat_SetFileBlockOffset: add hl,de jq nc,.invalid ld de,(yfatFile.block_index) - or a,a + xor a,a sbc hl,de ret z ; if at the same block position, done (return 0) add hl,de ; hl = new block_index, de = current block_index @@ -866,9 +866,7 @@ fat_SetFileBlockOffset: ; check if block is within current cluster ; we can optimize by just changing the block position ld (yfatFile.block_index),hl - ld a,(yfatFile.blocks_per_cluster) - dec a - cpl + sub a,(yfatFile.blocks_per_cluster) ld c,a ; mask for number of blocks and a,l ld l,a @@ -2298,16 +2296,14 @@ util_fat_read_write: compare_hl_zero ; no more blocks left, exit jq z,.return - ld d,(xfatFile.blocks_per_cluster) - ld a,d - dec a - and a,(xfatFile.block_index) ; mask off the number of blocks remaining in this cluster - ld e,a - ld a,d - sub a,e - ld de,0 - ld e,a - ex de,hl ; de = total remaining, hl = remaining in cluster + ex de,hl ; de = total remaining + xor a,a + sub a,(xfatFile.blocks_per_cluster) ; mask for number of blocks + or a,(xfatFile.block_index) ; mask the negative number of blocks remaining in this cluster + cpl + inc a + sbc hl,hl + ld l,a ; hl = remaining in cluster compare_hl_de jr nc,.singleclusterread .multicluster: @@ -2343,11 +2339,9 @@ util_fat_read_write: ld hl,(xfatFile.block_index) add hl,de ld (xfatFile.block_index),hl - ld h,(xfatFile.blocks_per_cluster) - ld a,h + ld a,(xfatFile.blocks_per_cluster) dec a - and a,l - or a,a ; if at end of cluster, get the next one + and a,l ; if at end of cluster, get the next one call z,.getnextcluster .return: ld hl,(xfatFile.block_index) From 27953b98b44f7afed5d156c26ad55ae67c6e2bf1 Mon Sep 17 00:00:00 2001 From: Brendan Fletcher Date: Sun, 19 Apr 2026 19:23:11 -0400 Subject: [PATCH 5/6] optimize util_block_to_cluster and util_ceil_byte_size_to_cluster_size --- src/fatdrvce/fatdrvce.asm | 42 +++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/src/fatdrvce/fatdrvce.asm b/src/fatdrvce/fatdrvce.asm index ce41cc83f..d0ab73c1c 100644 --- a/src/fatdrvce/fatdrvce.asm +++ b/src/fatdrvce/fatdrvce.asm @@ -2198,17 +2198,16 @@ util_block_to_cluster: or a,a sbc hl,bc sbc a,(yfat.data_region + 3) - ld de,(yfat.blocks_per_cluster - 2) - ld d,0 - ld e,a + ; AC:UHL = A:UHL << (8 - log2(blocks_per_cluster)) + ld c,a + ld a,(yfat.blocks_per_cluster) .loop: add hl,hl - ex de,hl - adc hl,hl - ex de,hl + rl c + rla jr nc,.loop - ld a,d - push de + ; A:UHL = (AC:UHL >> 8) + 2 + push bc push hl inc sp pop hl @@ -2551,23 +2550,28 @@ util_ceil_byte_size_to_cluster_size: ; destroys: ; - A, BC, flags - ; First, get ceiling block size - call util_ceil_byte_size_to_block_size - - ; Now, get ceiling cluster size by adding (blocks_per_cluster-1) and dividing by blocks_per_cluster - ld c,(yfat.blocks_per_cluster) - ld a,c - dec c - ; Note: ceiling block size is at most $800000 so this addition cannot overflow - add hl,bc + ; BC:UHL = A:UHL << (7 - log2(blocks_per_cluster)) + ld b,(yfat.blocks_per_cluster) + or a,a + jr .enter .loop: add hl,hl rla +.enter: + rl b jr nc,.loop - push af - inc sp + ld c,a + ; check if HL != 0 + ld a,h + or a,l + ; UHL = BC:UHL >> 16 + push bc push hl inc sp + inc sp pop hl inc sp + ; round up if any shifted-out bits were non-zero + ret z + inc hl ret From 90fc5bc9d68214e52adb2828d32913a3ccfba0f5 Mon Sep 17 00:00:00 2001 From: Brendan Fletcher Date: Sun, 19 Apr 2026 19:44:18 -0400 Subject: [PATCH 6/6] save 1 more byte in util_ceil_byte_size_to_block_size --- src/fatdrvce/fatdrvce.asm | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/fatdrvce/fatdrvce.asm b/src/fatdrvce/fatdrvce.asm index d0ab73c1c..6183d11fd 100644 --- a/src/fatdrvce/fatdrvce.asm +++ b/src/fatdrvce/fatdrvce.asm @@ -2519,16 +2519,13 @@ util_ceil_byte_size_to_block_size: ; - A:UHL ; output: ; - A:UHL = ceil(A:UHL / 512) - ; - A = BCU = B = 0 + ; - A = 0 ; destroys: - ; - BC, flags + ; - C, flags - ; add 511 to A:UHL and shift the 33-bit result right by 9 - ld bc,511 - add hl,bc - dec b - adc a,b - rra + ; UHL = A:UHL >> 9 + ld c,l + srl a push af inc sp push hl @@ -2537,6 +2534,11 @@ util_ceil_byte_size_to_block_size: inc sp rr h rr l + ; round up if any shifted-out bits were non-zero + sbc a,a + or a,c + ret z + inc hl xor a,a ret