Diffstat (limited to 'libgcc/config/avr/lib1funcs.S')
 libgcc/config/avr/lib1funcs.S | 469 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 469 insertions(+), 0 deletions(-)
diff --git a/libgcc/config/avr/lib1funcs.S b/libgcc/config/avr/lib1funcs.S
index f7a8f6335c4..c592c4caa5d 100644
--- a/libgcc/config/avr/lib1funcs.S
+++ b/libgcc/config/avr/lib1funcs.S
@@ -61,6 +61,15 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#endif
.endm
+.macro wmov r_dest, r_src
+#if defined (__AVR_HAVE_MOVW__)
+ movw \r_dest, \r_src
+#else
+ mov \r_dest, \r_src
+ mov \r_dest+1, \r_src+1
+#endif
+.endm
+
#if defined (__AVR_HAVE_JMP_CALL__)
#define XCALL call
#define XJMP jmp
@@ -846,6 +855,352 @@ __divmodsi4_exit:
ENDF __divmodsi4
#endif /* defined (L_divmodsi4) */
+
+/*******************************************************
+ Division 64 / 64
+ Modulo 64 % 64
+*******************************************************/
+
+;; Use the Speed-optimized Version on "big" Devices, i.e. Devices with
+;; at least 16k of Program Memory. For smaller Devices, the Choice
+;; depends on the Availability of MOVW.
+
+#if defined (__AVR_HAVE_JMP_CALL__)
+# define SPEED_DIV 8
+#elif defined (__AVR_HAVE_MOVW__)
+# define SPEED_DIV 16
+#else
+# define SPEED_DIV 0
+#endif
+
+;; A[0..7]: In: Dividend;
+;; Out: Quotient (T = 0)
+;; Out: Remainder (T = 1)
+#define A0 18
+#define A1 A0+1
+#define A2 A0+2
+#define A3 A0+3
+#define A4 A0+4
+#define A5 A0+5
+#define A6 A0+6
+#define A7 A0+7
+
+;; B[0..7]: In: Divisor; Out: Clobber
+#define B0 10
+#define B1 B0+1
+#define B2 B0+2
+#define B3 B0+3
+#define B4 B0+4
+#define B5 B0+5
+#define B6 B0+6
+#define B7 B0+7
+
+;; C[0..7]: Holds the expanding Remainder; Out: Remainder (unused)
+#define C0 8
+#define C1 C0+1
+#define C2 30
+#define C3 C2+1
+#define C4 28
+#define C5 C4+1
+#define C6 26
+#define C7 C6+1
+
+;; Holds Signs during Division Routine
+#define SS __tmp_reg__
+
+;; Bit-Counter in Division Routine
+#define R_cnt __zero_reg__
+
+;; Scratch Register for Negation
+#define NN r31
+
+#if defined (L_udivdi3)
+
+;; R25:R18 = R25:R18 umod R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __umoddi3
+ set
+ rjmp __udivdi3_umoddi3
+ENDF __umoddi3
+
+;; R25:R18 = R25:R18 udiv R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __udivdi3
+ clt
+ENDF __udivdi3
+
+DEFUN __udivdi3_umoddi3
+ push C0
+ push C1
+ push C4
+ push C5
+ XCALL __udivmod64
+ pop C5
+ pop C4
+ pop C1
+ pop C0
+ ret
+ENDF __udivdi3_umoddi3
+#endif /* L_udivdi3 */
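
The two entry points above share a single worker: __umoddi3 sets the T flag, __udivdi3 clears it, and __udivmod64 uses T to decide whether the quotient or the remainder ends up in the result registers. A minimal C model of that dispatch, with illustrative names (udivmod64_model stands in for the worker; these are not the libgcc symbols):

    #include <stdint.h>

    /* Stand-in for __udivmod64: returns the result selected by the flag,
       the way the T-flag selects in the assembly.  Division by zero is
       left undefined here, as it effectively is in the routine.  */
    static uint64_t udivmod64_model (uint64_t a, uint64_t b, int t_flag)
    {
      return t_flag ? a % b : a / b;
    }

    uint64_t umoddi3_model (uint64_t a, uint64_t b)
    {
      return udivmod64_model (a, b, 1);  /* "set": request the remainder */
    }

    uint64_t udivdi3_model (uint64_t a, uint64_t b)
    {
      return udivmod64_model (a, b, 0);  /* "clt": request the quotient */
    }
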
+
+#if defined (L_udivmod64)
+
+;; Worker Routine for 64-Bit unsigned Quotient and Remainder Computation
+;; No Registers saved/restored; the Callers will take Care.
+;; Preserves B[] and T-flag
+;; T = 0: Compute Quotient in A[]
+;; T = 1: Compute Remainder in A[] and shift SS one Bit left
+
+DEFUN __udivmod64
+
+ ;; Clear Remainder (C6, C7 will follow)
+ clr C0
+ clr C1
+ wmov C2, C0
+ wmov C4, C0
+ ldi C7, 64
+
+#if SPEED_DIV == 0 || SPEED_DIV == 16
+ ;; Initialize Loop-Counter
+ mov R_cnt, C7
+ wmov C6, C0
+#endif /* SPEED_DIV */
+
+#if SPEED_DIV == 8
+
+ push A7
+ clr C6
+
+1: ;; Compare shifted Dividend against Divisor
+ ;; If -- even after Shifting -- it is smaller...
+ CP A7,B0 $ cpc C0,B1 $ cpc C1,B2 $ cpc C2,B3
+ cpc C3,B4 $ cpc C4,B5 $ cpc C5,B6 $ cpc C6,B7
+ brcc 2f
+
+ ;; ...then the Divisor does not fit yet, and it is legal to shift left
+ $ mov C6,C5 $ mov C5,C4 $ mov C4,C3
+ mov C3,C2 $ mov C2,C1 $ mov C1,C0 $ mov C0,A7
+ mov A7,A6 $ mov A6,A5 $ mov A5,A4 $ mov A4,A3
+ mov A3,A2 $ mov A2,A1 $ mov A1,A0 $ clr A0
+
+ ;; 8 Bits are done
+ subi C7, 8
+ brne 1b
+
+ ;; Shifted 64 Bits: A7 has traveled to C7
+ pop C7
+ ;; Divisor is greater than Dividend. We have:
+ ;; A[] % B[] = A[]
+ ;; A[] / B[] = 0
+ ;; Thus, we can return immediately
+ rjmp 5f
+
+2: ;; Initialize the Bit-Counter with the Number of Bits still to be processed
+ mov R_cnt, C7
+
+ ;; The pushed A7 is not needed anymore: pop it and keep C7 = 0
+ pop C7
+ clr C7
+
+#elif SPEED_DIV == 16
+
+ ;; Compare shifted Dividend against Divisor
+ cp A7, B3
+ cpc C0, B4
+ cpc C1, B5
+ cpc C2, B6
+ cpc C3, B7
+ brcc 2f
+
+ ;; Divisor is greater than the shifted Dividend: We can shift the
+ ;; Dividend and it is still smaller than the Divisor --> Shift one 32-Bit Chunk
+ wmov C2,A6 $ wmov C0,A4
+ wmov A6,A2 $ wmov A4,A0
+ wmov A2,C6 $ wmov A0,C4
+
+ ;; Set Bit Counter to 32
+ lsr R_cnt
+2:
+#elif SPEED_DIV
+#error SPEED_DIV = ?
+#endif /* SPEED_DIV */
+
+;; The actual Division + Remainder Loop
+
+3: ;; Left-shift Dividend...
+ lsl A0 $ rol A1 $ rol A2 $ rol A3
+ rol A4 $ rol A5 $ rol A6 $ rol A7
+
+ ;; ...into Remainder
+ rol C0 $ rol C1 $ rol C2 $ rol C3
+ rol C4 $ rol C5 $ rol C6 $ rol C7
+
+ ;; Compare Remainder and Divisor
+ CP C0,B0 $ cpc C1,B1 $ cpc C2,B2 $ cpc C3,B3
+ cpc C4,B4 $ cpc C5,B5 $ cpc C6,B6 $ cpc C7,B7
+
+ brcs 4f
+
+ ;; Divisor fits into Remainder: Subtract it from Remainder...
+ SUB C0,B0 $ sbc C1,B1 $ sbc C2,B2 $ sbc C3,B3
+ sbc C4,B4 $ sbc C5,B5 $ sbc C6,B6 $ sbc C7,B7
+
+ ;; ...and set the corresponding Bit in the upcoming Quotient
+ ;; The Bit will travel to its final Position
+ ori A0, 1
+
+4: ;; This Bit is done
+ dec R_cnt
+ brne 3b
+ ;; __zero_reg__ is 0 again
+
+ ;; T = 0: We are fine with the Quotient in A[]
+ ;; T = 1: Copy Remainder to A[]
+5: brtc 6f
+ wmov A0, C0
+ wmov A2, C2
+ wmov A4, C4
+ wmov A6, C6
+ ;; Move the Remainder's Sign from SS.6 to SS.7
+ lsl SS
+
+6: ret
+
+ENDF __udivmod64
+#endif /* L_udivmod64 */
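
The worker is a classic restoring shift-and-subtract divider: each iteration shifts the dividend left into the remainder; whenever the divisor fits into the remainder it is subtracted and a 1-bit enters the quotient at A0, travelling left to its final position as the loop continues. The SPEED_DIV == 8 and == 16 variants only pre-skip whole bytes, respectively one 32-bit chunk, while the shifted dividend is still smaller than the divisor, reducing the bit count accordingly. A C sketch of the plain (SPEED_DIV == 0) path, not a register-level transcription:

    #include <stdint.h>

    /* Restoring division, modeling the loop at label 3 above.
       c plays the role of C0..C7, cnt of R_cnt.  */
    static void udivmod64_loop (uint64_t a, uint64_t b,
                                uint64_t *quo, uint64_t *rem)
    {
      uint64_t c = 0;                 /* remainder, cleared like C[] */

      for (int cnt = 64; cnt > 0; cnt--)
        {
          /* lsl/rol chain: dividend shifts left into the remainder.  */
          c = (c << 1) | (a >> 63);
          a <<= 1;

          if (c >= b)                 /* cp/cpc chain */
            {
              c -= b;                 /* sub/sbc chain */
              a |= 1;                 /* ori A0,1: quotient bit enters low */
            }
        }

      *quo = a;                       /* T = 0: quotient stays in A[] */
      *rem = c;                       /* T = 1: remainder is copied to A[] */
    }
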
+
+
+#if defined (L_divdi3)
+
+;; R25:R18 = R25:R18 mod R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __moddi3
+ set
+ rjmp __divdi3_moddi3
+ENDF __moddi3
+
+;; R25:R18 = R25:R18 div R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __divdi3
+ clt
+ENDF __divdi3
+
+DEFUN __divdi3_moddi3
+#if SPEED_DIV
+ mov r31, A7
+ or r31, B7
+ brmi 0f
+ ;; Both Signs are 0: the following Complexity is not needed
+ XJMP __udivdi3_umoddi3
+#endif /* SPEED_DIV */
+
+0: ;; The Prologue
+ ;; Save Z = 12 Registers: Y, 17...8
+ ;; No Frame needed (X = 0)
+ clr r26
+ clr r27
+ ldi r30, lo8(gs(1f))
+ ldi r31, hi8(gs(1f))
+ XJMP __prologue_saves__ + ((18 - 12) * 2)
+
+1: ;; SS.7 will contain the Sign of the Quotient (A.sign * B.sign)
+ ;; SS.6 will contain the Sign of the Remainder (A.sign)
+ mov SS, A7
+ asr SS
+ ;; Adjust Dividend's Sign as needed
+#if SPEED_DIV
+ ;; When compiling for Speed, we know that at least one Operand is
+ ;; negative. Thus, if A[] >= 0 then B[] < 0
+ brpl 22f
+#else
+ brpl 21f
+#endif /* SPEED_DIV */
+
+ XCALL __negdi2
+
+ ;; Adjust Divisor's Sign and SS.7 as needed
+21: tst B7
+ brpl 3f
+22: ldi NN, 1 << 7
+ eor SS, NN
+
+ ldi NN, -1
+ com B4 $ com B5 $ com B6 $ com B7
+ $ com B1 $ com B2 $ com B3
+ NEG B0
+ $ sbc B1,NN $ sbc B2,NN $ sbc B3,NN
+ sbc B4,NN $ sbc B5,NN $ sbc B6,NN $ sbc B7,NN
+
+3: ;; Do the unsigned 64-Bit Division/Modulo (depending on T-flag)
+ XCALL __udivmod64
+
+ ;; Adjust Result's Sign
+#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
+ tst SS
+ brpl 4f
+#else
+ sbrc SS, 7
+#endif /* __AVR_ERRATA_SKIP_JMP_CALL__ */
+ XCALL __negdi2
+
+4: ;; Epilogue: Restore the Z = 12 Registers and return
+ in r28, __SP_L__
+ in r29, __SP_H__
+ ldi r30, 12
+ XJMP __epilogue_restores__ + ((18 - 12) * 2)
+
+ENDF __divdi3_moddi3
+
+#undef R_cnt
+#undef SS
+#undef NN
+
+#endif /* L_divdi3 */
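
__divdi3_moddi3 reduces signed division to the unsigned worker: negative operands are negated up front, the signs are remembered in SS (bit 7 = quotient sign = sign(A) xor sign(B), bit 6 = remainder sign = sign(A)), and the result is negated afterwards if the relevant bit is set. Note that the worker's "lsl SS" on the T = 1 path moves bit 6 into bit 7, so the single "sbrc SS, 7" test serves both operations. A C model of this bookkeeping, with illustrative names and the unsigned worker replaced by the / and % operators:

    #include <stdint.h>

    /* Sign handling of __divdi3_moddi3: operate on magnitudes, then fix up
       the sign.  want_remainder mirrors the T flag.  */
    static int64_t divmod64_model (int64_t a, int64_t b, int want_remainder)
    {
      int sa = a < 0, sb = b < 0;
      uint64_t ua = sa ? 0 - (uint64_t) a : (uint64_t) a;  /* __negdi2 on A[] */
      uint64_t ub = sb ? 0 - (uint64_t) b : (uint64_t) b;  /* inline negation of B[] */
      uint64_t r  = want_remainder ? ua % ub : ua / ub;

      /* SS.7 = sa ^ sb (quotient), SS.6 = sa (remainder); after the
         worker's "lsl SS" the caller only ever tests bit 7.  */
      int negate = want_remainder ? sa : (sa ^ sb);
      return (int64_t) (negate ? 0 - r : r);   /* wraps mod 2^64 like the asm */
    }
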
+
+#if defined (L_negdi2)
+DEFUN __negdi2
+
+ com A4 $ com A5 $ com A6 $ com A7
+ $ com A1 $ com A2 $ com A3
+ NEG A0
+ $ sbci A1,-1 $ sbci A2,-1 $ sbci A3,-1
+ sbci A4,-1 $ sbci A5,-1 $ sbci A6,-1 $ sbci A7,-1
+ ret
+
+ENDF __negdi2
+#endif /* L_negdi2 */
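
__negdi2 computes -X as ~X + 1: every byte is complemented, "neg" on the low byte yields the low result byte together with the initial borrow, and the sbci/sbc chain propagates it upward. A byte-wise C model of that carry chain:

    #include <stdint.h>

    /* Two's-complement negation done byte by byte, the way __negdi2 does:
       complement all bytes, then propagate the "+1" as a carry chain.  */
    static uint64_t negdi2_model (uint64_t x)
    {
      uint64_t r = 0;
      unsigned carry = 1;              /* the "+1" of -x = ~x + 1 */

      for (int i = 0; i < 8; i++)      /* low byte first, like neg/sbci */
        {
          unsigned v = (uint8_t) ~(x >> (8 * i)) + carry;
          r |= (uint64_t) (uint8_t) v << (8 * i);
          carry = v >> 8;              /* carry out feeds the next byte */
        }
      return r;                        /* equals 0 - x modulo 2^64 */
    }
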
+
+#undef C7
+#undef C6
+#undef C5
+#undef C4
+#undef C3
+#undef C2
+#undef C1
+#undef C0
+
+#undef B7
+#undef B6
+#undef B5
+#undef B4
+#undef B3
+#undef B2
+#undef B1
+#undef B0
+
+#undef A7
+#undef A6
+#undef A5
+#undef A4
+#undef A3
+#undef A2
+#undef A1
+#undef A0
+
.section .text.libgcc.prologue, "ax", @progbits
@@ -854,6 +1209,7 @@ ENDF __divmodsi4
**********************************/
#if defined (L_prologue)
+;; This Function does not clobber the T-Flag; the 64-Bit Division relies on that
DEFUN __prologue_saves__
push r2
push r3
@@ -1181,6 +1537,119 @@ DEFUN __tablejump_elpm__
ENDF __tablejump_elpm__
#endif /* defined (L_tablejump_elpm) */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Loading n bytes from Flash; n = 3,4
+;; R22... = Flash[Z]
+;; Clobbers: __tmp_reg__
+
+#if (defined (L_load_3) \
+ || defined (L_load_4)) \
+ && !defined (__AVR_HAVE_LPMX__)
+
+;; Destination
+#define D0 22
+#define D1 D0+1
+#define D2 D0+2
+#define D3 D0+3
+
+.macro .load dest, n
+ lpm
+ mov \dest, r0
+.if \dest != D0+\n-1
+ adiw r30, 1
+.else
+ sbiw r30, \n-1
+.endif
+.endm
+
+#if defined (L_load_3)
+DEFUN __load_3
+ push D3
+ XCALL __load_4
+ pop D3
+ ret
+ENDF __load_3
+#endif /* L_load_3 */
+
+#if defined (L_load_4)
+DEFUN __load_4
+ .load D0, 4
+ .load D1, 4
+ .load D2, 4
+ .load D3, 4
+ ret
+ENDF __load_4
+#endif /* L_load_4 */
+
+#endif /* L_load_3 || L_load_4 */
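
__load_3 avoids a separate 3-byte loader: it saves the register that the fourth byte would clobber (push D3), runs __load_4, and restores it. The .load macro advances Z after each byte except the last, where "sbiw r30, n-1" rewinds Z to its entry value. A rough C model of the wrapper trick (the flash access is simulated by a byte array; in the asm, reading one byte past the requested three is harmless on flash):

    #include <stdint.h>
    #include <string.h>

    /* Stand-in for __load_4: fetch four bytes from "flash" (here just a
       readable byte array) without moving the caller's pointer.  */
    static uint32_t load_4_model (const uint8_t *flash)
    {
      uint32_t v;
      memcpy (&v, flash, 4);      /* four lpm/mov steps; Z rewound after */
      return v;                   /* little-endian layout, as on AVR */
    }

    /* __load_3 in C: run the 4-byte loader and drop the top byte.  The
       asm instead preserves the caller's D3 register around the call.  */
    static uint32_t load_3_model (const uint8_t *flash)
    {
      return load_4_model (flash) & 0xffffffu;
    }
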
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Loading n bytes from Flash; n = 2,3,4
+;; R22... = Flash[R21:Z]
+;; Clobbers: __tmp_reg__, R21, R30, R31
+
+#if (defined (L_xload_2) \
+ || defined (L_xload_3) \
+ || defined (L_xload_4)) \
+ && defined (__AVR_HAVE_ELPM__) \
+ && !defined (__AVR_HAVE_ELPMX__)
+
+#if !defined (__AVR_HAVE_RAMPZ__)
+#error Need RAMPZ
+#endif /* have RAMPZ */
+
+;; Destination
+#define D0 22
+#define D1 D0+1
+#define D2 D0+2
+#define D3 D0+3
+
+;; Register containing bits 16+ of the address
+
+#define HHI8 21
+
+.macro .xload dest, n
+ elpm
+ mov \dest, r0
+.if \dest != D0+\n-1
+ adiw r30, 1
+ adc HHI8, __zero_reg__
+ out __RAMPZ__, HHI8
+.endif
+.endm
+
+#if defined (L_xload_2)
+DEFUN __xload_2
+ out __RAMPZ__, HHI8
+ .xload D0, 2
+ .xload D1, 2
+ ret
+ENDF __xload_2
+#endif /* L_xload_2 */
+
+#if defined (L_xload_3)
+DEFUN __xload_3
+ out __RAMPZ__, HHI8
+ .xload D0, 3
+ .xload D1, 3
+ .xload D2, 3
+ ret
+ENDF __xload_3
+#endif /* L_xload_3 */
+
+#if defined (L_xload_4)
+DEFUN __xload_4
+ out __RAMPZ__, HHI8
+ .xload D0, 4
+ .xload D1, 4
+ .xload D2, 4
+ .xload D3, 4
+ ret
+ENDF __xload_4
+#endif /* L_xload_4 */
+
+#endif /* L_xload_{2|3|4} && ELPM */
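
On devices with more than 64 KiB of flash, elpm addresses memory through RAMPZ:Z, a 24-bit pointer. The .xload macro increments Z with adiw, folds the carry into HHI8 (r21) with adc, and refreshes RAMPZ before the next access. A small C model of that pointer arithmetic (type and function names are illustrative):

    #include <stdint.h>

    /* RAMPZ:Z as a 24-bit flash pointer: rampz holds bits 16+, z the low
       16 bits, matching HHI8 and r31:r30 in the macro above.  */
    typedef struct { uint8_t rampz; uint16_t z; } flash_ptr;

    /* One .xload step after a byte is fetched: adiw r30,1 plus
       adc HHI8,__zero_reg__; out __RAMPZ__,HHI8 would then refresh
       the SFR for the next elpm.  */
    static void xload_advance (flash_ptr *p)
    {
      p->z++;                     /* adiw r30, 1 */
      if (p->z == 0)              /* carry out of the 16-bit Z pair */
        p->rampz++;               /* adc HHI8, __zero_reg__ */
    }

    static uint32_t flash_addr (const flash_ptr *p)
    {
      return ((uint32_t) p->rampz << 16) | p->z;   /* full 24-bit address */
    }
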
+
.section .text.libgcc.builtins, "ax", @progbits