#include "setarch.h"

#include "defines.h"

#ifdef __H8300SX__

	.global _memcpy
_memcpy:
	stm.l	er4-er6,@-er7
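	; movmd clobbers r4, er5 and er6, and the caller expects er4-er6
	; to survive, so save them here; every exit restores them via rts/l.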

	; Set up source and destination pointers for movmd.
	mov.l	er0,er6
	mov.l	er1,er5
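	; (movmd.b/movmd.w/movmd.l copy r4 elements from @er5+ to @er6+,
	; leaving r4 at zero afterwards.)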

	; See whether the copy is long enough to use the movmd.l code.
	; Although the code can handle anything longer than 6 bytes,
	; it can be more expensive than movmd.b for small moves.
	; It's better to use a higher threshold to account for this.
	;
	; Note that the exact overhead of the movmd.l checks depends on
	; the alignments of the length and pointers.  They are faster when
	; er0 & 3 == er1 & 3 == er2 & 3, faster still when these values
	; are 0.  This threshold is a compromise between the various cases.
	cmp	#16,LEN(r2)
	blo	simple

	; movmd.l only works for even addresses.  If one of the addresses
	; is odd and the other is not, fall back on a simple move.
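	; (bld loads bit 0 of r5l into the carry flag; bxor then XORs the
	; carry with bit 0 of r6l, so carry ends up set iff exactly one
	; address is odd.)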
	bld	#0,r5l
	bxor	#0,r6l
	bcs	simple

	; Make the addresses even.
	bld	#0,r5l
	bcc	word_aligned
	mov.b	@er5+,@er6+
	sub	#1,LEN(r2)

word_aligned:
	; See if copying one word would make the first operand longword
	; aligned.  Although this is only really worthwhile if it aligns
	; the second operand as well, it's no worse if it doesn't, so it
	; hardly seems worth the overhead of a "band" check.
	bld	#1,r6l
	bcc	fast_copy
	mov.w	@er5+,@er6+
	sub	#2,LEN(r2)

fast_copy:
	; Set (e)r4 to the number of longwords to copy.
	mov	LEN(r2),LEN(r4)
	shlr	#2,LEN(r4)

#ifdef __NORMAL_MODE__
	; 16-bit pointers and a 16-bit size_t: one movmd.l is enough.
	; This code is never reached with r4 == 0: the length started at
	; 16 or more and the alignment steps above consumed at most 3 bytes.
	movmd.l
	and.w	#3,r2
	; "simple" serves both as the short-copy fallback and as the tail
	; of the movmd.l path above.
simple:
	mov.w	r2,r4		; byte count for movmd.b
	beq	quit		; nothing left to copy
	movmd.b
quit:
	rts/l	er4-er6
#else
	; Skip the first iteration if the number of longwords is divisible
	; by 0x10000.
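	; (movmd.l treats r4 == 0 as a request for 65536 longwords, so a
	; zero low word must skip straight to the counter decrement;
	; mov.w r4,r4 is just a cheap way of setting the flags on it.)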
	mov.w	r4,r4
	beq	fast_loop_next

	; This loop copies r4 (!= 0) longwords the first time round and 65536
	; longwords on each iteration after that.
fast_loop:
	movmd.l
fast_loop_next:
	sub.w	#1,e4		; e4 counts the remaining 64k-longword blocks
	bhs	fast_loop

	; Mop up any left-over bytes.  We could just fall through to the
	; simple code after the "and" but the version below is quicker
	; and only takes 10 more bytes.
	and.w	#3,r2
	beq	quit
	mov.w	r2,r4
	movmd.b
quit:
	rts/l	er4-er6

simple:
	; Simple bytewise copy.  We need to handle all lengths, including zero.
	mov.w	r2,r4
	beq	simple_loop_next	; low word zero: skip the first copy
simple_loop:
	movmd.b
simple_loop_next:
	sub.w	#1,e2		; e2 counts the remaining 64k-byte blocks
	bhs	simple_loop
	rts/l	er4-er6
#endif

#else

	.global _memcpy
_memcpy:
;	MOVP	@(2/4,r7),A0P	; dst
;	MOVP	@(4/8,r7),A1P	; src
;	MOVP	@(6/12,r7),A2P	; len
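	; The arguments are already live in A0P (dst), A1P (src) and
	; A2P (len); the commented-out loads above show where they would
	; sit on the stack.  The copy runs backwards from the end of both
	; buffers, so the saved copy of dst doubles as the loop terminator.
	; Roughly, in C:
	;
	;	char *d = (char *) dst + len;
	;	const char *s = (const char *) src + len;
	;	while (d != (char *) dst)
	;		*--d = *--s;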

	MOVP	A0P,A3P	; keep copy of final dst
	ADDP	A2P,A0P	; point to end of dst
	CMPP	A0P,A3P	; see if anything to do
	beq	quit

	ADDP	A2P,A1P	; point to end of src

	; Let's see if we can do this in words.  A2L still holds the low
	; byte of the length, so or the addresses into it as well: bit 0
	; of the result is clear only if the length and every pointer
	; involved are even.
	or	A0L,A2L	; or in the end of dst
	or	A3L,A2L	; or in the start of dst
	or	A1L,A2L	; or in the end of src
	btst	#0,A2L	; see if the lsb is zero
	bne	byteloop

wordloop:
#ifdef __NORMAL_MODE__
	sub	#2,A1P		; point to word (subs needs a 32-bit register)
#else
	subs	#2,A1P		; point to word
#endif
	mov.w	@A1P,A2		; get word
	mov.w	A2,@-A0P	; save word
	CMPP	A0P,A3P		; at the front again?
	bne	wordloop
	rts			; A0P is back at dst, the return value

byteloop:
#ifdef __NORMAL_MODE__
	sub	#1,A1P		; point to byte
#else
	subs	#1,A1P		; point to byte
#endif
	mov.b	@A1P,A2L	; get byte
	mov.b	A2L,@-A0P	; save byte
	CMPP	A0P,A3P 	; at the front again?
	bne	byteloop

	; return with A0P pointing to dst, the return value
quit:	rts

#endif