#include "or1k-asm.h"
/*
 * Assembly functions for software multiplication and division.
 */

/*
 * ENTRY(symbol): open the definition of a global function.
 *
 * Emits, in this order: an alignment directive (.align 4), a .global
 * export of symbol, a .type directive marking it as a function, and
 * finally the label itself. The macro does not emit a .size directive;
 * each routine in this file adds its own after its last instruction.
 */
#define ENTRY(symbol)	\
	.align 4	;\
	.global symbol	;\
	.type	symbol, @function ;\
symbol:

#ifdef L__mulsi3
/*
 * __mulsi3: multiply r3 by r4 with a shift-and-add loop.
 *
 *   In:       r3, r4 = the two factors
 *   Out:      r11    = product, accumulated with l.add
 *   Return:   l.jr r9
 *   Clobbers: r3, r4, r5, r6 and the compare flag
 *
 * r5 holds the bits of the first factor that still have to be examined
 * (taken from the least significant end), r4 is the second factor
 * shifted left once per examined bit, and r6 is a zero constant used by
 * the compares.
 */
ENTRY(__mulsi3)
	l.addi	r11,r0,0			/* product = 0 */
	l.sfne	r3,r11
	/* r5 = first factor; when it is zero return the zero product at once. */
OR1K_DELAYED(OR1K_INST(l.ori r5,r3,0), OR1K_INST(l.bnf .Lmulsi3_ret))
	l.addi	r6,r0,0
.Lmulsi3_bit:
	l.andi	r3,r5,1				/* r3 = lowest pending bit */
	l.sfeq	r3,r6
	/* Consume that bit; skip the add when the bit was clear. */
OR1K_DELAYED(OR1K_INST(l.srli r5,r5,1), OR1K_INST(l.bf .Lmulsi3_shift))
	l.add	r11,r11,r4
.Lmulsi3_shift:
	l.sfne	r5,r6
	/* Double the addend; loop while bits of the first factor remain. */
OR1K_DELAYED(OR1K_INST(l.slli r4,r4,1), OR1K_INST(l.bf .Lmulsi3_bit))
.Lmulsi3_ret:
OR1K_DELAYED_NOP(OR1K_INST(l.jr r9))
.size __mulsi3,.-__mulsi3
#endif

#ifdef L__udivsi3
/*
 * __udivsi3 / __udivsi3_internal: unsigned 32-bit division by
 * shift-and-subtract.
 *
 *   In:   r3 = dividend, r4 = divisor
 *   Out:  r11 = quotient, r7 = remainder
 *
 * Both labels name the same code. __udivsi3_internal is a hidden alias
 * used by the signed and modulo routines in this file, which also read
 * the remainder from r7 after the call.
 *
 * A zero divisor returns with r11 = 0 and r7 = 0.
 *
 * r9 is reused as a scratch register, so the return address is kept on
 * the stack for the duration of the routine. Registers written besides
 * r7/r9/r11: r3, r4, r5, r6, r8, r13, r15, r17.
 *
 * NOTE(review): the __udivsi3_internal label is placed before the
 * .align directive that ENTRY emits. If the assembler ever inserts
 * padding at that point the alias would refer to the padding rather
 * than to the first instruction - confirm this cannot happen.
 */
.global __udivsi3_internal
.hidden __udivsi3_internal
__udivsi3_internal:
ENTRY(__udivsi3)
	/* Spill the return address; r9 is overwritten further down. */
	l.addi          r1,r1,-4
        l.sw            0(r1),r9
	/* r11 = quotient (starts at 0), r8 = divisor, r5 = dividend. */
        l.addi          r11,r0,0
        l.addi          r8,r4,0
        l.addi          r5,r3,0
	/* Divisor == 0: leave through 4 with r11 = 0 and r7 = 0. */
        l.sfne          r8,r11
OR1K_DELAYED(
	OR1K_INST(l.addi r7,r0,0),
	OR1K_INST(l.bnf 4f)
)
	/* The following work equally on delay and no-delay implementations */
	/* Divisor > dividend: go to 5 (quotient stays 0, r7 = dividend). */
        l.sfgtu         r8,r5
        l.bf            5f
	/* Divisor == dividend: go to 6 (quotient 1, r7 is still 0). */
        l.sfeq          r8,r5
        l.bf            6f
	/*
	 * r11 is 0 and r8 is known to be non-zero here, so this compare
	 * always sets the flag and the l.bnf below is never taken.
	 */
        l.sfltu         r11,r8

	/* r13 = number of dividend bits that have not been consumed yet. */
OR1K_DELAYED(
	OR1K_INST(l.addi r13,r0,32),
        OR1K_INST(l.bnf 2f)
)
	/* r9 = bit 31 mask, r6 = -1 (added to r13 to count down). */
        l.movhi         r9,hi(0x80000000)
        l.addi          r6,r0,-1
	/*
	 * Phase 1: move dividend bits from the top of r5 into the bottom of
	 * r7, one per pass, until r7 is no longer below the divisor.
	 * r15 keeps r5 as it was before the shift of the current pass.
	 */
1:
        l.and           r3,r5,r9
        l.slli          r4,r7,1
        l.addi          r15,r5,0
        l.srli          r3,r3,31
        l.add           r13,r13,r6
        l.or            r7,r4,r3
        l.sfltu         r7,r8
OR1K_DELAYED(
        OR1K_INST(l.slli r5,r5,1),
        OR1K_INST(l.bf 1b)
)
	/*
	 * Undo the last pass of phase 1: drop the bit just moved into r7,
	 * count it as pending again and restore r5 from r15. r9 becomes the
	 * pass counter of the main loop; leave through 4 if r13 is 0.
	 */
2:
        l.srli          r7,r7,1
        l.addi          r13,r13,1
        l.addi          r9,r0,0
        l.sfltu         r9,r13
OR1K_DELAYED(
        OR1K_INST(l.addi r5,r15,0),
        OR1K_INST(l.bnf 4f)
)
	/* r15 = bit 31 mask from here on, r17 = constant 0. */
        l.movhi         r15,hi(0x80000000)
        l.addi          r17,r0,0
	/*
	 * Main loop, executed r13 times. Each pass moves the next dividend
	 * bit into r7 and computes r6 = r7 - r8. Bit 31 of r6 is used as the
	 * indicator that the subtraction went negative: r4, the quotient bit
	 * for this pass, is 0 when that bit is set and 1 otherwise.
	 */
3:
        l.and           r3,r5,r15
        l.slli          r4,r7,1
        l.srli          r3,r3,31
        l.or            r7,r4,r3
        l.sub           r6,r7,r8
        l.and           r3,r6,r15
        l.srli          r3,r3,31
        l.addi          r4,r0,0
        l.sfne          r3,r4
	/* r3 = quotient shifted left by one; keep r4 = 0 if bit 31 was set. */
OR1K_DELAYED(
        OR1K_INST(l.slli r3,r11,1),
        OR1K_INST(l.bf 1f)
)
        l.addi          r4,r0,1
1:
        l.slli          r5,r5,1
        l.sfne          r4,r17
	/* Append the quotient bit; keep the difference only when it is 1. */
OR1K_DELAYED(
        OR1K_INST(l.or r11,r3,r4),
        OR1K_INST(l.bnf 2f)
)
        l.addi          r7,r6,0
2:
        l.addi          r9,r9,1
        l.sfltu         r9,r13
OR1K_DELAYED_NOP(
        OR1K_INST(l.bf 3b)
)
OR1K_DELAYED_NOP(
	OR1K_INST(l.j 4f)
)
	/* Divisor == dividend: quotient is 1. */
6:
OR1K_DELAYED(
	OR1K_INST(l.addi r11,r0,1),
	OR1K_INST(l.j 4f)
)
	/* Divisor > dividend: the remainder is the dividend itself. */
5:
	l.addi		r7,r5,0
	/* Common exit: reload the return address and release the stack slot. */
4:
        l.lwz           r9,0(r1)
OR1K_DELAYED(
        OR1K_INST(l.addi r1,r1,4),
        OR1K_INST(l.jr r9)
)
.size __udivsi3,.-__udivsi3
#endif


#ifdef L__divsi3
/*
 * __divsi3: signed 32-bit division.
 *
 *   In:   r3 = dividend, r4 = divisor
 *   Out:  r11 = quotient
 *
 * Each negative operand is replaced by its negation (l.sub from r0) and
 * the pair is handed to __udivsi3_internal. The quotient is negated
 * afterwards when exactly one of the operands was negative.
 *
 * r14 counts the negative operands (0, 1 or 2). It is saved on entry and
 * restored on exit, together with r9, which the l.jal overwrites.
 * The operands are adjusted in place in r3/r4, so no extra copy of the
 * dividend is needed before the call.
 */
ENTRY(__divsi3)
	l.addi	r1,r1,-8
	l.sw	0(r1),r9
	l.sw	4(r1),r14
	l.sflts	r3,r0
	/* r14 = 0; l.addi leaves the compare flag alone, so it can sit in the slot. */
OR1K_DELAYED(
	OR1K_INST(l.addi r14,r0,0),
	OR1K_INST(l.bnf 1f)
)
	/* Negative dividend: count it and negate it. */
	l.addi	r14,r0,1
	l.sub	r3,r0,r3
1:
	l.sflts	r4,r0
OR1K_DELAYED_NOP(
	OR1K_INST(l.bnf 2f)
)
	/* Negative divisor: count it and negate it. */
	l.addi	r14,r14,1
	l.sub	r4,r0,r4
2:
OR1K_DELAYED_NOP(
	OR1K_INST(l.jal __udivsi3_internal)
)
	/* Exactly one negative operand: the quotient is negative. */
	l.sfeqi	r14,1
OR1K_DELAYED_NOP(
	OR1K_INST(l.bnf 3f)
)
	l.sub	r11,r0,r11
3:
	l.lwz	r9,0(r1)
	l.lwz	r14,4(r1)
OR1K_DELAYED(
	OR1K_INST(l.addi r1,r1,8),
	OR1K_INST(l.jr r9)
)
.size __divsi3,.-__divsi3
#endif


#ifdef L__umodsi3
/*
 * __umodsi3: unsigned 32-bit remainder.
 *
 * Passes r3 and r4 through unchanged to __udivsi3_internal and returns
 * in r11 the value that routine leaves in r7. The return address is
 * parked on the stack because l.jal overwrites r9.
 */
ENTRY(__umodsi3)
	l.addi	r1,r1,-4			/* one word for r9 */
	l.sw	0(r1),r9
OR1K_DELAYED_NOP(OR1K_INST(l.jal __udivsi3_internal))
	l.addi	r11,r7,0			/* result = r7 */
	l.lwz	r9,0(r1)
	/* Release the stack word and return. */
OR1K_DELAYED(OR1K_INST(l.addi r1,r1,4), OR1K_INST(l.jr r9))
.size __umodsi3,.-__umodsi3
#endif


#ifdef L__modsi3
/*
 * __modsi3: signed 32-bit remainder.
 *
 *   In:   r3 = dividend, r4 = divisor
 *   Out:  r11 = remainder
 *
 * Negative operands are negated before the call to
 * __udivsi3_internal. The value that routine leaves in r7 is copied to
 * r11 and negated when the dividend was negative; the sign of the
 * divisor does not influence the result.
 *
 * r14 records whether the dividend was negative. r9 and r14 are saved
 * on entry and restored on exit.
 */
ENTRY(__modsi3)
	l.addi	r1,r1,-8
	l.sw	0(r1),r9
	l.sw	4(r1),r14
	l.addi	r14,r0,0			/* dividend assumed non-negative */
	l.sflts	r3,r0
OR1K_DELAYED_NOP(OR1K_INST(l.bnf .Lmodsi3_divisor))
	/* Negative dividend: remember it and negate it. */
	l.addi	r14,r0,1
	l.sub	r3,r0,r3
.Lmodsi3_divisor:
	l.sflts	r4,r0
OR1K_DELAYED_NOP(OR1K_INST(l.bnf .Lmodsi3_call))
	/* Negative divisor: negate it. */
	l.sub	r4,r0,r4
.Lmodsi3_call:
OR1K_DELAYED_NOP(OR1K_INST(l.jal __udivsi3_internal))
	l.sfeqi	r14,1
	/* r11 = r7; skip the negation unless the dividend was negative. */
OR1K_DELAYED(OR1K_INST(l.addi r11,r7,0), OR1K_INST(l.bnf .Lmodsi3_exit))
	l.sub	r11,r0,r11
.Lmodsi3_exit:
	l.lwz	r9,0(r1)
	l.lwz	r14,4(r1)
	/* Release the frame and return. */
OR1K_DELAYED(OR1K_INST(l.addi r1,r1,8), OR1K_INST(l.jr r9))
.size __modsi3,.-__modsi3
#endif