150 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			150 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
| #include "setarch.h"
 | |
| 
 | |
| #include "defines.h"
 | |
| 
 | |
| #ifdef __H8300SX__
 | |
| 
 | |
; ---------------------------------------------------------------------
; void *memcpy (void *dst, const void *src, size_t len)   [H8SX build]
;
; In:    er0 = dst, er1 = src, (e)r2 = len
; Out:   er0 = dst (er0 is never written by this routine)
; Saved: er4-er6 are pushed on entry and restored by rts/l
; Uses:  er5 = running source pointer, er6 = running destination
;        pointer, (e)r4 = block count handed to movmd; (e)r2 is
;        modified along the way.
; The LEN macro is supplied by defines.h, which is not part of this
; file; it appears to select the size_t-wide view of a register
; (r2 or er2) - confirm against that header.
; ---------------------------------------------------------------------
	.global _memcpy
_memcpy:
	stm.l	er4-er6,@-er7

	; movmd moves data from @er5 to @er6, so load those two now.
	mov.l	er0,er6
	mov.l	er1,er5

	; Short copies go straight to the bytewise path.  The longword
	; path copes with anything longer than 6 bytes, but its setup
	; checks can cost more than movmd.b on small moves, hence the
	; higher cut-off.  The cost of those checks varies with alignment
	; (cheaper when er0 & 3 == er1 & 3 == er2 & 3, cheapest when all
	; three are 0), so 16 is a compromise between the cases.
	cmp	#16,LEN(r2)
	blo	byte_copy

	; movmd.l needs even addresses.  C = bit 0 of src XOR bit 0 of dst;
	; when exactly one pointer is odd they can never both be made even,
	; so take the bytewise path.
	bld	#0,r5l
	bxor	#0,r6l
	bcs	byte_copy

	; Both pointers have the same parity here.  If they are odd, copy
	; a single byte so that both become even.
	bld	#0,r5l
	bcc	addrs_even
	mov.b	@er5+,@er6+
	sub	#1,LEN(r2)

addrs_even:
	; If bit 1 of the destination is set, copying one word makes the
	; destination longword aligned.  That only really pays off when it
	; aligns the source as well, but it is no worse when it does not,
	; so a "band" test on both pointers is not worth its overhead.
	bld	#1,r6l
	bcc	long_copy
	mov.w	@er5+,@er6+
	sub	#2,LEN(r2)

long_copy:
	; (e)r4 = remaining length / 4, the number of whole longwords.
	mov	LEN(r2),LEN(r4)
	shlr	#2,LEN(r4)

#ifdef __NORMAL_MODE__
	; Pointers and size_t are 16 bits wide, so a single movmd.l covers
	; the whole count.  r4 is never 0 when this point is reached.
	movmd.l
	and.w	#3,r2			; bytes left over after the longwords
byte_copy:
	; Bytewise copy of r2 bytes; a zero count skips the movmd.b.
	mov.w	r2,r4
	beq	done
	movmd.b
done:
	rts/l	er4-er6
#else
	; r4, the low half of the count, drives the first movmd.l.  When it
	; is zero the longword count is a multiple of 0x10000, so enter the
	; loop at the counter update instead.
	mov.w	r4,r4
	beq	long_loop_next

	; The first pass copies r4 (non-zero) longwords; every later pass
	; copies 65536 of them, with e4 counting the passes still owed.
long_loop:
	movmd.l
long_loop_next:
	sub.w	#1,e4
	bhs	long_loop

	; Copy the 0-3 bytes left over.  Falling through to the bytewise
	; code after the "and" would also work; this dedicated tail is
	; quicker and costs only 10 more bytes of code.
	and.w	#3,r2
	beq	done
	mov.w	r2,r4
	movmd.b
done:
	rts/l	er4-er6

byte_copy:
	; Bytewise copy that must cope with every length, zero included.
	; Same scheme as the longword loop: r2 drives the first movmd.b and
	; e2 counts the 65536-byte passes that follow.
	mov.w	r2,r4
	beq	byte_loop_next
byte_loop:
	movmd.b
byte_loop_next:
	sub.w	#1,e2
	bhs	byte_loop
	rts/l	er4-er6
#endif
 | |
| 
 | |
| #else
 | |
| 
 | |
; ---------------------------------------------------------------------
; void *memcpy (void *dst, const void *src, size_t len)
; Build used when __H8300SX__ is not defined.
;
; In:   A0P = dst, A1P = src, A2P = len
; Out:  A0 = dst
; Uses: A3P = copy of dst, serving as the loop end marker; A2 is reused
;       as the data scratch register once the length has been consumed.
; The copy runs backwards, from the last element down to the first.
; MOVP, ADDP, CMPP and the A0..A3 names are macros from defines.h, which
; is not shown here; they look like pointer-width aliases - confirm
; against that header.
; ---------------------------------------------------------------------
	.global _memcpy
_memcpy:
; Disabled code, kept as a record of which register holds which
; argument:
;	MOVP	@(2/4,r7),A0P	; dst
;	MOVP	@(4/8,r7),A1P	; src
;	MOVP	@(6/12,r7),A2P	; len

	MOVP	A0P,A3P	; A3P = dst, the address the loops stop at
	ADDP	A2P,A0P	; A0P = dst + len
	CMPP	A0P,A3P	; len == 0 leaves the two equal
	beq	done

	ADDP	A2P,A1P	; A1P = src + len

	; Word copies are only used when everything is even.  A2L still
	; holds the low byte of len; fold the low bytes of dst + len, dst
	; and src + len into it and test bit 0 of the result.
	or	A0L,A2L	; dst + len
	or	A3L,A2L	; dst
	or	A1L,A2L	; src + len
	btst	#0,A2L	; Z is clear when any of them is odd
	bne	copy_bytes

copy_words:
#ifdef __NORMAL_MODE__
	sub	#2,A1P
#else
	subs	#2,A1P		; step src back one word
#endif
	mov.w	@A1P,A2		; load the word
	mov.w	A2,@-A0P	; pre-decrement dst and store it
	CMPP	A0P,A3P		; back at the start of dst yet?
	bne	copy_words
	rts			; A0P == A3P here, so A0 holds dst

copy_bytes:
#ifdef __NORMAL_MODE__
	sub	#1,A1P
#else
	subs	#1,A1P		; step src back one byte
#endif
	mov.b	@A1P,A2L	; load the byte
	mov.b	A2L,@-A0P	; pre-decrement dst and store it
	CMPP	A0P,A3P 	; back at the start of dst yet?
	bne	copy_bytes

	; A0P has walked back down to dst, which is the return value.
done:	rts
 | |
| 
 | |
| #endif
 |