253 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			253 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
| /*
 | |
|  * Copyright (c) 2014
 | |
|  *      Imagination Technologies Limited.
 | |
|  *
 | |
|  * Redistribution and use in source and binary forms, with or without
 | |
|  * modification, are permitted provided that the following conditions
 | |
|  * are met:
 | |
|  * 1. Redistributions of source code must retain the above copyright
 | |
|  *    notice, this list of conditions and the following disclaimer.
 | |
|  * 2. Redistributions in binary form must reproduce the above copyright
 | |
|  *    notice, this list of conditions and the following disclaimer in the
 | |
|  *    documentation and/or other materials provided with the distribution.
 | |
|  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 | |
|  *    contributors may be used to endorse or promote products derived from
 | |
|  *    this software without specific prior written permission.
 | |
|  *
 | |
|  * THIS SOFTWARE IS PROVIDED BY IMAGINATION TECHNOLOGIES LIMITED ``AS IS'' AND
 | |
|  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | |
|  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | |
|  * ARE DISCLAIMED. IN NO EVENT SHALL IMAGINATION TECHNOLOGIES LIMITED BE LIABLE
 | |
|  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | |
|  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 | |
|  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 | |
|  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 | |
|  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 | |
|  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 | |
|  * SUCH DAMAGE.
 | |
|  */
 | |
| 
 | |
| #ifdef ANDROID_CHANGES
 | |
| # include "machine/asm.h"
 | |
| # include "machine/regdef.h"
 | |
| #elif _COMPILING_NEWLIB
 | |
| # include "machine/asm.h"
 | |
| # include "machine/regdef.h"
 | |
| #else
 | |
| # include <regdef.h>
 | |
| # include <sys/asm.h>
 | |
| #endif
 | |
| 
 | |
| /* Technically strcmp should not read past the end of the strings being
 | |
|    compared.  We will read a full word that may contain excess bits beyond
 | |
|    the NUL string terminator but unless ENABLE_READAHEAD is set, we will not
 | |
|    read the next word after the end of string.  Setting ENABLE_READAHEAD will
 | |
|    improve performance but is technically illegal based on the definition of
 | |
|    strcmp.  */
 | |
| #ifdef ENABLE_READAHEAD
 | |
| # define DELAY_READ /* empty: the next STRCMP32 load then sits in the branch delay slot */
 | |
| #else
 | |
| # define DELAY_READ nop /* pad the branch delay slot so the next word is not loaded early */
 | |
| #endif /* ENABLE_READAHEAD */
 | |
| 
 | |
| /* Testing on a little endian machine showed using CLZ was a
 | |
|    performance loss, so we are not turning it on by default.  */
 | |
| #if defined(ENABLE_CLZ) && (__mips_isa_rev > 1) /* opt-in, and only for ISA revision 2 or later */
 | |
| # define USE_CLZ /* selects the clz/rotrv variant of L(worddiff) */
 | |
| #endif /* ENABLE_CLZ && __mips_isa_rev > 1 */
 | |
| 
 | |
| /* Some asm.h files do not have the L macro definition.  */
 | |
| #ifndef L
 | |
| # if _MIPS_SIM == _ABIO32
 | |
| #  define L(label) $L ## label /* o32: L(x) pastes to $Lx */
 | |
| # else
 | |
| #  define L(label) .L ## label /* any other ABI: L(x) pastes to .Lx */
 | |
| # endif /* _MIPS_SIM == _ABIO32 */
 | |
| #endif /* !L */
 | |
| 
 | |
| /* Some asm.h files do not have the PTR_ADDIU macro definition.  */
 | |
| #ifndef PTR_ADDIU
 | |
| /* addiu is a 32-bit add, so an ABI with 64-bit pointers needs daddiu.
 | |
|    USE_DOUBLE is not defined in this file; also accept the n64 ABI as
 | |
|    reported through _MIPS_SIM so the fallback is right without it.  */
 | |
| # if defined(USE_DOUBLE) || (defined(_ABI64) && _MIPS_SIM == _ABI64)
 | |
| #  define PTR_ADDIU       daddiu
 | |
| # else
 | |
| #  define PTR_ADDIU       addiu
 | |
| # endif
 | |
| #endif /* !PTR_ADDIU */
 | |
| 
 | |
| /* Allow the routine to be named something else if desired.  */
 | |
| #ifndef STRCMP_NAME
 | |
| # define STRCMP_NAME strcmp /* default symbol name passed to LEAF and END */
 | |
| #endif /* !STRCMP_NAME */
 | |
| /* STRCMP_NAME: compare the zero-terminated byte strings at a0 and a1.
 | |
|    Returns in v0 zero when no difference is found before the terminator,
 | |
|    otherwise the first differing byte of a0 minus that of a1 (both taken
 | |
|    as unsigned bytes).  Scratch registers: v1, t0, t1, t8, t9; a0 and a1
 | |
|    are advanced by the loops.  Assembled with noreorder, so the
 | |
|    instruction written after every branch or jump is its delay slot and
 | |
|    executes whether or not the branch is taken.  */
 | |
| #ifdef ANDROID_CHANGES
 | |
| LEAF(STRCMP_NAME, 0)
 | |
| #else
 | |
| LEAF(STRCMP_NAME)
 | |
| #endif /* ANDROID_CHANGES */
 | |
| 	.set	nomips16
 | |
| 	.set	noreorder
 | |
| 
 | |
| 	or	t0, a0, a1 /* merge both addresses so one test covers both */
 | |
| 	andi	t0,0x3 /* low two bits are non-zero if either is not 4-byte aligned */
 | |
| 	bne	t0, zero, L(byteloop) /* misaligned: use the byte-at-a-time loop */
 | |
| 
 | |
| /* Both strings are 4 byte aligned at this point.  */
 | |
| 
 | |
| 	lui	t8, 0x0101 /* also the delay slot of the bne above; L(byteloop) reloads t8 before reading it */
 | |
| 	ori	t8, t8, 0x0101 /* t8 = 0x01010101 */
 | |
| 	lui	t9, 0x7f7f
 | |
| 	ori	t9, 0x7f7f /* t9 = 0x7f7f7f7f */
 | |
| /* STRCMP32(OFFSET): compare the words at OFFSET(a0) and OFFSET(a1).
 | |
|    Differing words go to L(worddiff) with the words still in v0/v1.
 | |
|    Equal words go to L(returnzero) when v0 holds a zero byte:
 | |
|    t0 = (v0 - t8) & ~(v0 | t9) is non-zero exactly in that case, given
 | |
|    t8 = 0x01010101 and t9 = 0x7f7f7f7f.  The expansion ends on a branch, so
 | |
|    the instruction placed after it becomes that branch's delay slot.  */
 | |
| #define STRCMP32(OFFSET) \
 | |
| 	lw	v0, OFFSET(a0); \
 | |
| 	lw	v1, OFFSET(a1); \
 | |
| 	subu	t0, v0, t8; \
 | |
| 	bne	v0, v1, L(worddiff); \
 | |
| 	nor	t1, v0, t9; \
 | |
| 	and	t0, t0, t1; \
 | |
| 	bne	t0, zero, L(returnzero)
 | |
| 
 | |
| L(wordloop): /* each pass checks eight words, 32 bytes, of both strings */
 | |
| 	STRCMP32(0)
 | |
| 	DELAY_READ
 | |
| 	STRCMP32(4)
 | |
| 	DELAY_READ
 | |
| 	STRCMP32(8)
 | |
| 	DELAY_READ
 | |
| 	STRCMP32(12)
 | |
| 	DELAY_READ
 | |
| 	STRCMP32(16)
 | |
| 	DELAY_READ
 | |
| 	STRCMP32(20)
 | |
| 	DELAY_READ
 | |
| 	STRCMP32(24)
 | |
| 	DELAY_READ
 | |
| 	STRCMP32(28)
 | |
| 	PTR_ADDIU a0, a0, 32 /* delay slot of the last STRCMP32 branch; L(returnzero) does not read a0 */
 | |
| 	b	L(wordloop)
 | |
| 	PTR_ADDIU a1, a1, 32 /* delay slot of the b: a1 advances before the next pass */
 | |
| 
 | |
| L(returnzero): /* the words were equal and v0 contained the terminator */
 | |
| 	j	ra
 | |
| 	move	v0, zero /* delay slot: result is 0 */
 | |
| 
 | |
| L(worddiff):
 | |
| /* v0 and v1 hold the first pair of words that differ.  Find the first byte
 | |
|    in string order that is either a terminator in v0 or differs, and return
 | |
|    the difference of that byte pair.  */
 | |
| #ifdef USE_CLZ
 | |
| /* Build an exact zero-byte mask for v0.  The (v0 - t8) & ~(v0 | t9) test
 | |
|    used by STRCMP32 can also flag 0x01 bytes above a zero byte (borrow from
 | |
|    the subtraction); on big endian those bytes come earlier in the string,
 | |
|    so clz would pick one and "\001" vs "\001\005" would compare equal.  */
 | |
| 	and	t0, v0, t9 /* low seven bits of every byte */
 | |
| 	addu	t0, t0, t9 /* bit 7 set where those bits are non-zero; no carry leaves a byte */
 | |
| 	or	t0, t0, v0 /* bit 7 set in every non-zero byte */
 | |
| 	nor	t1, t0, t9 /* t1 = 0x80 in each zero byte of v0, 0 elsewhere */
 | |
| 	xor	t0, v0, v1 /* bits in which the two words differ */
 | |
| 	or	t0, t0, t1 /* mark bytes that differ or terminate the string */
 | |
| # if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 | |
| 	wsbh	t0, t0
 | |
| 	rotr	t0, t0, 16 /* with wsbh: reverse the bytes so the first string byte is on top */
 | |
| # endif
 | |
| 	clz	t1, t0
 | |
| 	and	t1, 0xf8 /* round the bit position down to a whole byte: 0, 8, 16 or 24 */
 | |
| # if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 | |
| 	neg	t1
 | |
| 	addu	t1, 24 /* t1 = 24 - t1: right-rotate count for that byte */
 | |
| # endif
 | |
| 	rotrv	v0, v0, t1 /* bring the selected byte to bits 7..0 */
 | |
| 	rotrv	v1, v1, t1
 | |
| 	and	v0, v0, 0xff
 | |
| 	and	v1, v1, 0xff
 | |
| 	j	ra
 | |
| 	subu	v0, v0, v1 /* delay slot: result */
 | |
| #else /* USE_CLZ */
 | |
| /* Test the four bytes in string order.  The t0/t1 and t8/t9 pairs alternate
 | |
|    because the instruction after each bne is its delay slot and already
 | |
|    starts the next byte; it must not overwrite the pair the exit subtracts.  */
 | |
| # if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 | |
| 	andi	t0, v0, 0xff /* first string byte is bits 7..0 */
 | |
| 	beq	t0, zero, L(wexit01)
 | |
| 	andi	t1, v1, 0xff /* delay slot: t1 is needed by the exit as well */
 | |
| 	bne	t0, t1, L(wexit01)
 | |
| 
 | |
| 	srl	t8, v0, 8 /* delay slot of the bne above; second byte */
 | |
| 	srl	t9, v1, 8
 | |
| 	andi	t8, t8, 0xff
 | |
| 	beq	t8, zero, L(wexit89)
 | |
| 	andi	t9, t9, 0xff
 | |
| 	bne	t8, t9, L(wexit89)
 | |
| 
 | |
| 	srl	t0, v0, 16 /* delay slot of the bne above; third byte */
 | |
| 	srl	t1, v1, 16
 | |
| 	andi	t0, t0, 0xff
 | |
| 	beq	t0, zero, L(wexit01)
 | |
| 	andi	t1, t1, 0xff
 | |
| 	bne	t0, t1, L(wexit01)
 | |
| 
 | |
| 	srl	t8, v0, 24 /* delay slot of the bne above; fourth byte, falls into L(wexit89) */
 | |
| 	srl	t9, v1, 24
 | |
| # else /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
 | |
| 	srl	t0, v0, 24 /* first string byte is bits 31..24 */
 | |
| 	beq	t0, zero, L(wexit01)
 | |
| 	srl	t1, v1, 24 /* delay slot: t1 is needed by the exit as well */
 | |
| 	bne	t0, t1, L(wexit01)
 | |
| 
 | |
| 	srl	t8, v0, 16 /* delay slot of the bne above; second byte */
 | |
| 	srl	t9, v1, 16
 | |
| 	andi	t8, t8, 0xff
 | |
| 	beq	t8, zero, L(wexit89)
 | |
| 	andi	t9, t9, 0xff
 | |
| 	bne	t8, t9, L(wexit89)
 | |
| 
 | |
| 	srl	t0, v0, 8 /* delay slot of the bne above; third byte */
 | |
| 	srl	t1, v1, 8
 | |
| 	andi	t0, t0, 0xff
 | |
| 	beq	t0, zero, L(wexit01)
 | |
| 	andi	t1, t1, 0xff
 | |
| 	bne	t0, t1, L(wexit01)
 | |
| 
 | |
| 	andi	t8, v0, 0xff /* delay slot of the bne above; fourth byte, falls into L(wexit89) */
 | |
| 	andi	t9, v1, 0xff
 | |
| # endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
 | |
| 
 | |
| L(wexit89):
 | |
| 	j	ra
 | |
| 	subu	v0, t8, t9 /* delay slot: result from the t8/t9 pair */
 | |
| L(wexit01):
 | |
| 	j	ra
 | |
| 	subu	v0, t0, t1 /* delay slot: result from the t0/t1 pair */
 | |
| #endif /* USE_CLZ */
 | |
| 
 | |
| /* BYTECMP01(OFFSET): load the bytes at OFFSET(a0) and OFFSET(a1) into v0
 | |
|    and v1, and leave through L(bexit01) when v0 is the terminator or the
 | |
|    two bytes differ.  The final bne is written without a delay-slot
 | |
|    instruction, so whatever follows the expansion fills it.
 | |
|    It might seem better to do the 'beq' instruction between the two 'lbu'
 | |
|    instructions so that the nop is not needed but testing showed that this
 | |
|    code is actually faster (based on glibc strcmp test).  */
 | |
| #define BYTECMP01(OFFSET) \
 | |
| 	lbu	v0, OFFSET(a0); \
 | |
| 	lbu	v1, OFFSET(a1); \
 | |
| 	beq	v0, zero, L(bexit01); \
 | |
| 	nop; \
 | |
| 	bne	v0, v1, L(bexit01)
 | |
| /* BYTECMP89(OFFSET): the same test using t8/t9, leaving through L(bexit89).  */
 | |
| #define BYTECMP89(OFFSET) \
 | |
| 	lbu	t8, OFFSET(a0); \
 | |
| 	lbu	t9, OFFSET(a1); \
 | |
| 	beq	t8, zero, L(bexit89); \
 | |
| 	nop;	\
 | |
| 	bne	t8, t9, L(bexit89)
 | |
| /* Byte-at-a-time path, eight bytes per pass.  The bne that ends each
 | |
|    BYTECMP macro takes the first lbu of the following macro as its delay
 | |
|    slot.  Alternating the v0/v1 and t8/t9 pairs means that load never
 | |
|    overwrites the bytes the taken exit subtracts.  */
 | |
| L(byteloop):
 | |
| 	BYTECMP01(0)
 | |
| 	BYTECMP89(1)
 | |
| 	BYTECMP01(2)
 | |
| 	BYTECMP89(3)
 | |
| 	BYTECMP01(4)
 | |
| 	BYTECMP89(5)
 | |
| 	BYTECMP01(6)
 | |
| 	BYTECMP89(7)
 | |
| 	PTR_ADDIU a0, a0, 8 /* delay slot of the last BYTECMP89 bne; L(bexit89) does not read a0 */
 | |
| 	b	L(byteloop)
 | |
| 	PTR_ADDIU a1, a1, 8 /* delay slot of the b: a1 advances before the next pass */
 | |
| 
 | |
| L(bexit01):
 | |
| 	j	ra
 | |
| 	subu	v0, v0, v1 /* delay slot: result from the v0/v1 pair (lbu zero-extends) */
 | |
| L(bexit89):
 | |
| 	j	ra
 | |
| 	subu	v0, t8, t9 /* delay slot: result from the t8/t9 pair */
 | |
| 
 | |
| 	.set	at
 | |
| 	.set	reorder /* undo the noreorder set at entry */
 | |
| 
 | |
| END(STRCMP_NAME)
 |