Source: Shell Sort (16-bit)

Shell Sort (for 16-Bit Elements) by Fredrik Ramsberg
Here's my implementation of Shell Sort, a sort algorithm with rather amazing properties. All mathematicians that have tried to analyze this algorithm have failed, but there's lots of empirical data that suggests it's roughly O(n log n). I wish I could say I invented this algorithm, but I didn't.
This implementation can sort more than 32,000 16-bit values (The only reason it can't sort 32,767 values is that there still has to be room for the routine and a few bytes for temporary storage). There's presently no other sorting routine in the repository that can handle more than 256 values.
; This is source code for what is meant to be an efficient implementation
; of ShellSort in 6502 assembler. The source code is in a format suitable for 
; the excellent and free ACME cross-assembler by Marco Baye, but should be
; easy to convert for other assemblers.
;
; While ShellSort is very good for entirely unsorted arrays, it is also
; reasonably good for almost sorted arrays. However, if you happen to know
; that very few values are out of place OR that the values that are out
; of place are not very far from their right position, InsertionSort is a
; better choice. InsertionSort is also provided in this file, since 
; ShellSort is really just a clever extension of InsertionSort. 
; Here are some examples of sort times @ 1MHz (10000 values):
;
;                                                    InsertionSort  ShellSort
; Array is entirely sorted from the start:                    1,9s      14,7s
; 1 value is at the wrong end of the array:                   2,9s      15,7s
; 10 values are at the wrong end of the array:               11,8s      16,9s
; 50 values are at the wrong end of the array:               51,3s      17,6s
; Array is entirely unsorted:                              2464,2s      30,5s



  !to "shellsrt.o"                    ; An assembler directive to set out-file
  !sl "shelllbl.a"                    ; Tells the assembler to write all label
                                      ; values to a file

  *=$1000                             ; Start address. Can safely be set to 
                                      ; anything from $0100 to $fe00

  j=$fb                               ; Uses two bytes. Has to be on zero-page
  j_plus_h=$fd                        ; Uses two bytes. Has to be on zero-page
  arr_length = j_plus_h               ; Can safely use the same location as
                                      ; j_plus_h, but doesn't have to be on ZP

; To increase speed and reduce code size, you can optionally place one or more
; of these 2-byte fields on zero-page (the suggested values work on a C64):
; v_plus_1 = $5 
; h = $7
; arr_start = $A
; Some simple tests using an array of 10000 completely unsorted values showed 
; a 5.6% shorter execution time if all three fields were placed on ZP, with 
; v_plus_1 being a little more important than the others.
;
; To go even further, placing these 2-byte fields on zero-page will provide a
; small improvement:
; v
; i
; arr_end



; This routine mimics this ShellSort implementation (Language=JavaScript):

; function ShellSort(arr,length) {
;   var j, i, v, h, k;
;   for (h=1; h < length; h=3*h+1);
;   while (h=(h-1)/3)
;     for (i=h, j=i, v=arr[i]; i<=length; arr[j+h]=v, i++, j=i, v=arr[i])
;       while((j-=h) >= 0 && arr[j] > v)
;         arr[j+h]=arr[j];
; }


; To call the routine, create a word-array at address nnnn in memory. The
; first word should contain the number of bytes to be sorted (= 2 * the number
; of elements). Then come all those elements. Then sort the elements using
; ShellShort like this: 
; lda #<nnnn, ldx #>nnnn, jsr shell_sort 
; or to perform an InsertionSort:
; lda #<nnnn, ldx #>nnnn, jsr insertion_sort 
; ( < means lowbyte and > means highbyte )

shell_sort      ldy #h_high - h_low - 1
                bne sort_main         ; Always branch
insertion_sort  ldy #0

sort_main       sty h_start_index
                cld
                sta j
                sta in_address
                
                clc
                adc #2
                sta arr_start
                
                stx j + 1
                stx in_address + 1
                
                txa
                adc #0
                sta arr_start + 1
                
                ldy #0
                lda (j),y
                sta arr_length
                
                clc
                adc arr_start
                sta arr_end
                
                iny
                lda (j),y
                sta arr_length + 1

                adc arr_start + 1
                sta arr_end + 1

;   for (h=1; h < length; h=3*h+1);
                
                ldx h_start_index     ; Start with highest value of h
chk_prev_h      lda h_low,x
                cmp arr_length
                lda h_high,x
                sbc arr_length + 1
                bcc end_of_init       ; If h < array_length, we've found the right h
                dex
                bpl chk_prev_h
                rts                   ; array length is 0 or 1. No sorting needed.

end_of_init     inx
                stx h_index

;   while (h=(h-1)/3)

h_loop          dec h_index
                bpl get_h
                rts                   ; All done!
                
get_h           ldy h_index
                lda h_low,y
                sta h
                clc
                adc in_address        ; ( in_address is arr_start - 2)
                sta i
                lda h_high,y
                sta h + 1
                adc in_address + 1
                sta i + 1
                
; for (i=h, j=i, v=arr[i]; i<=length; arr[j+h]=v, i++, j=i, v=arr[i])

i_loop          lda i
                clc
                adc #2
                sta i
                sta j
                lda i + 1
                adc #0
                sta i + 1
                sta j + 1

                ldx i
                cpx arr_end
                lda i + 1
                sbc arr_end + 1
                bcs h_loop

                ldy #0
                lda (j),y
                sta v
                clc
                adc #1
                sta v_plus_1
                iny
                lda (j),y
                sta v + 1
                adc #0
                bcs i_loop            ; v=$ffff, so no j-loop necessary
                sta v_plus_1 + 1
                
                dey                   ; Set y=0

;         while((j-=h) >= 0 && arr[j] > v)

j_loop          lda j
                sta j_plus_h
                sec
                sbc h
                sta j
                tax
                lda j + 1
                sta j_plus_h + 1
                sbc h + 1
                sta j + 1

; Check if we've reached the bottom of the array

                bcc exit_j_loop
                cpx arr_start
                sbc arr_start + 1
                bcc exit_j_loop
                
; Do the actual comparison:  arr[j] > v

                lda (j),y
                tax
                iny                   ; Set y=1
                lda (j),y
                cpx v_plus_1
                sbc v_plus_1 + 1
                bcc exit_j_loop

;           arr[j+h]=arr[j];

                lda (j),y
                sta (j_plus_h),y
                dey                   ; Set y=0
                txa
                sta (j_plus_h),y
                bcs j_loop            ; Always branch

;       for (i=h, j=i, v=arr[i]; i<length; arr[j+h]=v, i++, j=i, v=arr[i])  ***  arr[j+h]=v part

exit_j_loop     lda v
                ldy #0
                sta (j_plus_h),y
                iny
                lda v + 1
                sta (j_plus_h),y
                jmp i_loop


; This describes the sequence h(0)=1; h(n)=k*h(n-1)+1 for k=3 (1,4,13,40...)
; All word-values are muliplied by 2, since we are sorting 2-byte values

h_low           !byte <2, <8, <26, <80, <242, <728, <2186, <6560, <19682
h_high          !byte >2, >8, >26, >80, >242, >728, >2186, >6560, >19682
h_start_index   !byte 0
h_index         !byte 0
h               !word 0
in_address      !word 0
arr_start       !word 0
arr_end         !word 0
i               !word 0
v               !word 0
v_plus_1        !word 0
Last page update: Jan 14, 2005.