210 lines
		
	
	
		
			4.8 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			210 lines
		
	
	
		
			4.8 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
| /* Copyright 2003 SuperH Ltd.  */
 | |
| 
 | |
| #include "asm.h"
 | |
| 
 | |
| #ifdef __SH5__
 | |
| #if __SHMEDIA__
 | |
| 
 | |
| #ifdef __LITTLE_ENDIAN__ /* ZPAD_MASK (src, dst): turn a mcmpeq.b result into a mask; callers AND it with the data "to mask out non-zero bytes after first zero byte" */
 | |
| #define ZPAD_MASK(src, dst) addi src, -1, dst /* dst = src - 1 */
 | |
| #else /* !__LITTLE_ENDIAN__: same subtraction, performed on the byte-reversed value */
 | |
| #define ZPAD_MASK(src, dst) \
 | |
|  byterev src, dst; addi dst, -1, dst; byterev dst, dst /* dst = byterev (byterev (src) - 1) */
 | |
| #endif /* __LITTLE_ENDIAN__ */
 | |
| 
 | |
| 
 | |
| /* strncpy, SHmedia version.
 | |
|    Register roles (names as used in the notes at L_found0 and L_small):
 | |
|      r2  - store address; never written here, so unchanged on exit
 | |
|      r3  - read address
 | |
|      r4  - size
 | |
|      r20 - store end address (r2 + r4);  r5 - store end address minus 8
 | |
|      r0  - source quadword currently being processed
 | |
|      r1  - result of mcmpeq.b on r0 against r63
 | |
|      r22 - running store address (see the notes at L_found0)
 | |
|      r6  - read address minus store address, so r22 + r6 addresses the source
 | |
|      r63 - supplies the zero comparand and the zero padding data
 | |
|    Every exit does ptabs r18 / blink, i.e. it jumps to the address held in
 | |
|    r18 (presumably the return address -- confirm against the ABI).
 | |
|    SHHI and SHLO are not defined in this file (presumably asm.h provides
 | |
|    them as endian-dependent shifts -- confirm there).
 | |
|    Paths: L_small handles sizes up to 8.  Otherwise the first quadword is
 | |
|    stored unaligned at r2 and L_scan0 copies one quadword per iteration
 | |
|    until a zero byte shows up (L_found0, which also writes the zero
 | |
|    padding) or the store end gets close (L_end / L_end_early, which finish
 | |
|    with the last 8 source bytes stored so that they end at r20).
 | |
|    We assume that the destination is not in the first 16 bytes of memory.
 | |
|    A typical linker script will put the text section first, and as
 | |
|    this code is longer than 16 bytes, you have to go out of your way
 | |
|     to put data there.  */
 | |
| ENTRY(strncpy)
 | |
|  pt L_small, tr2  // tr2 = L_small, the path for sizes up to 8
 | |
|  ldlo.q r3, 0, r0  // r0 = first (possibly partial) source quadword
 | |
|  shlli r3, 3, r19  // r19 = r3 << 3, shift count for the SHHI below
 | |
|  mcmpeq.b r0, r63, r1  // r1 = per-byte result of comparing r0 with r63
 | |
|  SHHI r1, r19, r7  // r7: nonzero indicates a relevant zero was found in r0
 | |
|  add r2, r4, r20  // r20 = store end address
 | |
|  addi r20, -8, r5  // r5 = store end address minus 8
 | |
|  /* If the size is greater than 8, we know we can read beyond the first
 | |
|     (possibly partial) quadword, and write out a full first and last
 | |
|     (possibly unaligned and/or overlapping) quadword.  */
 | |
|  bge/u r2, r5, tr2 // L_small
 | |
|  pt L_found0, tr0
 | |
|  addi r2, 8, r22  // r22 = store address plus 8, as L_found0 expects
 | |
|  bnei/u r7, 0, tr0  // L_found0
 | |
|  ori r3, -8, r38  // r38 = (r3 & 7) - 8: minus the byte count from r3 to the next 8-byte boundary
 | |
|  pt L_end_early, tr1
 | |
|  sub r2, r38, r22  // r22 = r2 plus that byte count
 | |
|  stlo.q r2, 0, r0  // these two stores write r0 as the full first
 | |
|  sthi.q r2, 7, r0  // (possibly unaligned) quadword at [r2, r2+7]
 | |
|  sub r3, r2, r6  // r6 = read address minus store address
 | |
|  ldx.q r22, r6, r0  // r0 = quadword at r22 + r6 = (r3 & -8) + 8
 | |
|  /* Before each iteration, check that we can store in full the next quad we
 | |
|     are about to fetch.  */
 | |
|  addi r5, -8, r36  // r36 = store end address minus 16
 | |
|  bgtu/u r22, r36, tr1 // L_end_early
 | |
|  pt L_scan0, tr1
 | |
| L_scan0:  // loop: store the quad in r0, leave on a zero byte, fetch the next quad
 | |
|  addi r22, 8, r22  // r22 = store address plus 8 for the quad in r0
 | |
|  mcmpeq.b r0, r63, r1
 | |
|  stlo.q r22, -8, r0
 | |
|  bnei/u r1, 0, tr0   // L_found0
 | |
|  sthi.q r22, -1, r0
 | |
|  ldx.q r22, r6, r0  // fetch the next source quad
 | |
|  bgeu/l r36, r22, tr1 // L_scan0
 | |
| L_end:
 | |
|  // At end; we might re-read a few bytes when we fetch the last quad.
 | |
|  // branch mispredict, so load is ready now.
 | |
|  mcmpeq.b r0, r63, r1
 | |
|  addi r22, 8, r22
 | |
|  bnei/u r1, 0, tr0   // L_found0
 | |
|  add r3, r4, r7  // r7 = read address plus size
 | |
|  ldlo.q r7, -8, r1  // these two loads, OR-ed together below, fetch
 | |
|  ldhi.q r7, -1, r7  // the last quad of the source, [r7-8, r7-1]
 | |
|  ptabs r18, tr0  // tr0 = address in r18, for the final blink
 | |
|  stlo.q r22, -8, r0  // store the current quad (first half)
 | |
|  or r1, r7, r1  // r1 = last source quad
 | |
|  mcmpeq.b r1, r63, r7
 | |
|  sthi.q r22, -1, r0  // store the current quad (second half)
 | |
|  ZPAD_MASK (r7, r7)
 | |
|  and r1, r7, r1 // mask out non-zero bytes after first zero byte
 | |
|  stlo.q r20, -8, r1  // store the last quad so that it ends
 | |
|  sthi.q r20, -1, r1  // at the store end address
 | |
|  blink tr0, r63
 | |
| 
 | |
| L_end_early:
 | |
|  /* Check if we can store the current quad in full.  */
 | |
|  pt L_end, tr1
 | |
|  add r3, r4, r7  // r7 = read address plus size
 | |
|  bgtu/u r5, r22, tr1 // L_end // Not really unlikely, but gap is short.
 | |
|  /* If not, that means we can just proceed to process the last quad.
 | |
|     Two pipeline stalls are unavoidable, as we don't have enough ILP.  */
 | |
|  ldlo.q r7, -8, r1  // same last-quad sequence as in L_end, minus
 | |
|  ldhi.q r7, -1, r7  // the store of the current quad
 | |
|  ptabs r18, tr0
 | |
|  or r1, r7, r1  // r1 = last source quad
 | |
|  mcmpeq.b r1, r63, r7
 | |
|  ZPAD_MASK (r7, r7)
 | |
|  and r1, r7, r1 // mask out non-zero bytes after first zero byte
 | |
|  stlo.q r20, -8, r1
 | |
|  sthi.q r20, -1, r1
 | |
|  blink tr0, r63
 | |
| 
 | |
| L_found0:
 | |
|  // r0: string to store, not yet zero-padding normalized.
 | |
|  // r1: result of mcmpeq.b r0, r63, r1.
 | |
|  // r22: store address plus 8.  I.e. address where zero padding beyond the
 | |
|  //      string in r0 goes.
 | |
|  // r20: store end address.
 | |
|  // r5: store end address minus 8.
 | |
|  pt L_write0_multiquad, tr0
 | |
|  ZPAD_MASK (r1, r1)
 | |
|  and r0, r1, r0 // mask out non-zero bytes after first zero byte
 | |
|  stlo.q r22, -8, r0  // store the normalized quad at [r22-8, r22-1]
 | |
|  sthi.q r22, -1, r0
 | |
|  andi r22, -8, r1 // Check if zeros to write fit in one quad word.
 | |
|  bgtu/l r5, r1, tr0 // L_write0_multiquad
 | |
|  ptabs r18, tr1
 | |
|  sub r20, r22, r1  // r1 = store end address minus r22
 | |
|  shlli r1, 2, r1 // Do shift in two steps so that 64 bit case is
 | |
|  SHLO r0, r1, r0 // handled correctly.
 | |
|  SHLO r0, r1, r0  // total shift is 8 bits per byte counted in the sub above
 | |
|  sthi.q r20, -1, r0  // store the shifted quad so that it ends at r20 - 1
 | |
|  blink tr1, r63
 | |
| 
 | |
| L_write0_multiquad:  // entered with r1 = r22 & -8
 | |
|  pt L_write0_loop, tr0
 | |
|  ptabs r18, tr1
 | |
|  stlo.q r22, 0, r63  // zero padding starting at r22
 | |
|  sthi.q r20, -1, r63  // zero padding ending at r20 - 1
 | |
|  addi r1, 8, r1  // r1 = (r22 & -8) + 8, the next 8-byte-aligned address
 | |
|  bgeu/l r5, r1, tr0 // L_write0_loop
 | |
|  blink tr1, r63
 | |
| 
 | |
| L_write0_loop:  // zero whole aligned quads while r1 <= r5
 | |
|  st.q r1, 0 ,r63
 | |
|  addi r1, 8, r1
 | |
|  bgeu/l r5, r1, tr0 // L_write0_loop
 | |
|  blink tr1, r63
 | |
| 
 | |
| L_small:
 | |
|  // r0: string to store, not yet zero-padding normalized.
 | |
|  // r1: result of mcmpeq.b r0, r63, r1.
 | |
|  // r7: nonzero indicates relevant zero found r0.
 | |
|  // r2: store address.
 | |
|  // r3: read address.
 | |
|  // r4: size, max 8
 | |
|  // r20: store end address.
 | |
|  // r5: store end address minus 8.
 | |
|  pt L_nohi, tr0
 | |
|  pt L_small_storelong, tr1
 | |
|  ptabs r18, tr2  // tr2 = address in r18, used by both exits below
 | |
|  sub r63, r4, r23  // r23 = r63 - r4, i.e. minus the size
 | |
|  bnei/u r7, 0, tr0  // L_nohi
 | |
|  ori r3, -8, r7  // r7 = (r3 & 7) - 8: minus the byte count from r3 to the next 8-byte boundary
 | |
|  bge/l r23, r7, tr0 // L_nohi: size does not reach past that boundary
 | |
|  ldhi.q r3, 7, r1  // otherwise fetch the other part of [r3, r3+7]
 | |
|  or r0, r1, r0  // r0 = the full quadword at r3
 | |
|  mcmpeq.b r0, r63, r1  // redo the zero-byte scan on the full quadword
 | |
| L_nohi:
 | |
|  ZPAD_MASK (r1, r1)
 | |
|  and r0, r1, r0  // mask out non-zero bytes after first zero byte
 | |
|  movi 4, r19
 | |
|  bge/u r4, r19, tr1 // L_small_storelong: size is at least 4
 | |
| 
 | |
|  pt L_small_end, tr0  // size is 0..3 from here on: store byte by byte
 | |
| #ifndef __LITTLE_ENDIAN__
 | |
|  byterev r0, r0  // so that the byte stores below can take bytes from the low end of r0
 | |
| #endif
 | |
|  beqi/u r4, 0, tr0 // L_small_end
 | |
|  st.b r2, 0, r0  // byte 0
 | |
|  beqi/u r4, 1, tr0 // L_small_end
 | |
|  shlri r0, 8, r0
 | |
|  st.b r2, 1, r0  // byte 1
 | |
|  beqi/u r4, 2, tr0 // L_small_end
 | |
|  shlri r0, 8, r0
 | |
|  st.b r2, 2, r0  // byte 2
 | |
| L_small_end:
 | |
|  blink tr2, r63
 | |
| 
 | |
| L_small_storelong:  // size is 4..8: two longword stores that may overlap
 | |
|  shlli r23, 3, r7  // r7 = minus the size, times 8 (a shift count in bits)
 | |
|  SHHI r0, r7, r1  // r1 = r0 shifted so the tail of the data lines up for the store at r20 - 4
 | |
| #ifdef __LITTLE_ENDIAN__
 | |
|  shlri r1, 32, r1
 | |
| #else
 | |
|  shlri r0, 32, r0
 | |
| #endif
 | |
|  stlo.l r2, 0, r0  // first longword, at [r2, r2+3]
 | |
|  sthi.l r2, 3, r0
 | |
|  stlo.l r20, -4, r1  // last longword, at [r20-4, r20-1]
 | |
|  sthi.l r20, -1, r1
 | |
|  blink tr2, r63
 | |
| 
 | |
| #else /* SHcompact */
 | |
| 
 | |
| /* strncpy, SHcompact version.
 | |
|    This code is optimized for size.  Instruction selection is SH5 specific.
 | |
|    SH4 should use a different version.
 | |
|    r2 is the store address, r3 the read address and r4 the size; r2 is
 | |
|    never written.  Once a zero byte has been copied the load is skipped, so
 | |
|    r1 stays 0 and the remaining bytes up to the size are stored as zeros.
 | |
|    bt/s and bf/s are delayed branches: the instruction following each of
 | |
|    them executes whether or not the branch is taken.  */
 | |
| ENTRY(strncpy)
 | |
|  mov #0, r6 /* r6 = 0, the value every copied byte is compared with */
 | |
|  cmp/eq r4, r6 /* T = (size == 0) */
 | |
|  bt return /* nothing to store */
 | |
|  mov r2, r5
 | |
|  add #-1, r5 /* r5 = store address - 1; it is incremented before each store */
 | |
|  add r5, r4 /* r4 = address of the last byte to store */
 | |
| loop:
 | |
|  bt/s found0 /* T set: a zero byte was already copied, skip the load (T is clear on first entry) */
 | |
|  add #1, r5 /* delay slot: advance the store pointer */
 | |
|  mov.b @r3+, r1 /* r1 = next source byte, read pointer post-incremented */
 | |
| found0:
 | |
|  cmp/eq r5,r4 /* T = (this is the last byte to store) */
 | |
|  mov.b r1, @r5 /* store the byte */
 | |
|  bf/s loop /* not the last byte: go round again */
 | |
|  cmp/eq r1, r6 /* delay slot: T = (byte just stored == 0), tested by the bt/s at loop */
 | |
| return:
 | |
|  rts
 | |
|  nop /* delay slot of rts */
 | |
|  
 | |
| #endif /* SHcompact */
 | |
| #endif /* __SH5__ */
 |