/*
 *   MIRACL Comba's method for ultimate speed modular multiplication
 *   mrcomba.tpl 
 *
 *   See "Exponentiation Cryptosystems on the IBM PC", IBM Systems
 *   Journal Vol. 29 No. 4 1990. Comba's method has been extended to 
 *   implement Montgomery reduction. 
 *
 *   Here the inner loops of the basic multiplication, squaring and 
 *   Montgomery's redc() functions are completely unravelled, and 
 *   reorganised for maximum possible speed. 
 *
 *   This approach is recommended for maximum speed where parameters
 *   are fixed and compute resources are constrained. The processor must 
 *   support an unsigned multiply instruction, and should have a carry flag.
 *
 *   This file is a template. To fill in the gaps and create mrcomba.c, 
 *   you must run the mex.c program to insert the C or assembly language 
 *   macros from the appropriate .mcs file. For use with C MR_NOASM must
 *   be defined in mirdef.h
 *
 *   This method would appear to be particularly useful for implementing 
 *   fast Elliptic Curve Cryptosystems over GF(p) and fast 1024-bit RSA
 *   decryption.
 *
 *   The #define MR_COMBA in mirdef.h determines the FIXED size of 
 *   modulus to be used. This *must* be determined at compile time. 
 *
 *   Note that this module can generate a *lot* of code for large values 
 *   of MR_COMBA. This should have a maximum value of 8-16. Any larger 
 *   than that and you should define MR_KCM instead - see mrkcm.tpl
 *
 *   Note that on some processors it is *VITAL* that arrays be aligned on 
 *   4-byte boundaries
 *
 *  **** This code does not like -fomit-frame-pointer using GCC  ***********
 *
 *   Copyright (c) 1988-2001 Shamus Software Ltd.
 */

#include "miracl.h"    


#ifdef MR_COMBA

/* NOTE! z must be distinct from x and y */

asm c_mul(mr_small *a,mr_small *b,mr_small *c)
{
/*** MULTIPLY ***/      /* multiply a by b, result in c */    
  mov $dspmode,1
  xor $r7,$r7
  xor $r1,$r1
  xor $r0,$r0
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*0)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r4(4*0)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*0)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r4(4*1)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*0)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r4(4*2)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*0)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r4(4*3)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*0)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r4(4*4)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*0)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r4(4*5)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r4(4*6)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r4(4*7)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r4(4*8)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r4(4*9)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r4(4*10)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  stw $r4(4*11)
  mov $data0,$r0
  ret
}

void comba_mult(_MIPD_ big x,big y,big z) 
{ /* comba multiplier */
    int i;
    mr_small *a,*b,*c;
   

#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
   
    for (i=2*MR_COMBA;i<(int)(z->len&MR_OBITS);i++) z->w[i]=0;
  
    z->len=2*MR_COMBA;
    a=x->w; b=y->w; c=z->w;

    c_mul(a,b,c);

    if (z->w[2*MR_COMBA-1]==0) mr_lzero(z);
}   
 

/* NOTE! z and x must be distinct */

asm c_sqr(mr_small *a,mr_small *c)
{
/*** SQUARE ***/    /* squares a, result in b */
  mov $dspmode,1
  xor $r7,$r7
  xor $r1,$r1
  xor $r0,$r0
  ldw $r2(4*0)
  mul $r6,$data1,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r3(4*0)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r2(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r3(4*1)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r2(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mul $r6,$data1,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r3(4*2)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r2(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r2(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r3(4*3)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r2(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r2(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*2)
  mul $r6,$data1,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r3(4*4)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r2(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r2(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r2(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r3(4*5)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r2(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r2(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*3)
  mul $r6,$data1,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r3(4*6)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r2(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r2(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r3(4*7)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r2(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*4)
  mul $r6,$data1,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r3(4*8)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r2(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r3(4*9)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*5)
  mul $r6,$data1,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r3(4*10)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  stw $r3(4*11)
  mov $data0,$r0
  ret
}

void comba_square(_MIPD_ big x,big z)  
{ /* super comba squarer */
    int i;
    mr_small *a,*c;
  

#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
 
    for (i=2*MR_COMBA;i<(int)(z->len&MR_OBITS);i++) z->w[i]=0;  
 
    z->len=2*MR_COMBA;
    a=x->w; c=z->w;

    c_sqr(a,c);

    if (z->w[2*MR_COMBA-1]==0) mr_lzero(z); 
}                          

asm c_add(mr_small *a,mr_small *b,mr_small *c,mr_small *carry)
{
/*** ADDITION ***/        /* add a and b, result in c */
  stw $r4,4
  ldw $r3(0)	
  mov $r0,$data1
  ldw $r2(0)
  add $data0,$r0,$data1
  ldw $r3(4*1)
  mov $r0,$data1
  ldw $r2(4*1)
  adc $data0,$r0,$data1 /* 1 */
  ldw $r3(4*2)
  mov $r0,$data1
  ldw $r2(4*2)
  adc $data0,$r0,$data1 /* 2 */
  ldw $r3(4*3)
  mov $r0,$data1
  ldw $r2(4*3)
  adc $data0,$r0,$data1 /* 3 */
  ldw $r3(4*4)
  mov $r0,$data1
  ldw $r2(4*4)
  adc $data0,$r0,$data1 /* 4 */
  ldw $r3(4*5)
  mov $r0,$data1
  ldw $r2(4*5)
  adc $data0,$r0,$data1 /* 5 */
  mov $r0,$0
  adc $r0,$r0,$0
  stw $r5(0)
  mov $data0,$r0
  ret
}

asm c_dec(mr_small *a,mr_small *b,mr_small *carry)
{
/*** DECREMENT ***/        /* decrement b from a */
  stw $r2,4
  ldw $r2(0)
  mov $r0,$data1
  ldw $r3(0)
  sub $data0,$r0,$data1
  ldw $r2(4*1)
  mov $r0,$data1
  ldw $r3(4*1)
  sbb $data0,$r0,$data1 /* 1 */
  ldw $r2(4*2)
  mov $r0,$data1
  ldw $r3(4*2)
  sbb $data0,$r0,$data1 /* 2 */
  ldw $r2(4*3)
  mov $r0,$data1
  ldw $r3(4*3)
  sbb $data0,$r0,$data1 /* 3 */
  ldw $r2(4*4)
  mov $r0,$data1
  ldw $r3(4*4)
  sbb $data0,$r0,$data1 /* 4 */
  ldw $r2(4*5)
  mov $r0,$data1
  ldw $r3(4*5)
  sbb $data0,$r0,$data1 /* 5 */
  mov $r0,$0
  adc $r0,$r0,$0
  stw $r4(0)
  mov $data0,$r0
  ret
}

asm c_sub(mr_small *a,mr_small *b,mr_small *c,mr_small *carry)
{
/*** SUBTRACTION ***/        /* add a and b, result in c */
  stw $r4,4
  ldw $r2(0)	
  mov $r0,$data1
  ldw $r3(0)
  sub $data0,$r0,$data1
  ldw $r2(4*1)
  mov $r0,$data1
  ldw $r3(4*1)
  sbb $data0,$r0,$data1 /* 1  */
  ldw $r2(4*2)
  mov $r0,$data1
  ldw $r3(4*2)
  sbb $data0,$r0,$data1 /* 2  */
  ldw $r2(4*3)
  mov $r0,$data1
  ldw $r3(4*3)
  sbb $data0,$r0,$data1 /* 3  */
  ldw $r2(4*4)
  mov $r0,$data1
  ldw $r3(4*4)
  sbb $data0,$r0,$data1 /* 4  */
  ldw $r2(4*5)
  mov $r0,$data1
  ldw $r3(4*5)
  sbb $data0,$r0,$data1 /* 5  */
  mov $r0,$0
  adc $r0,$r0,$0
  stw $r5(0)
  mov $data0,$r0
  ret
}

asm c_inc(mr_small *a,mr_small *b,mr_small *carry)
{
/*** INCREMENT ***/         /* decrement b from a */
  stw $r2,4
  ldw $r2(0)
  mov $r0,$data1
  ldw $r3(0)
  add $data0,$r0,$data1
  ldw $r2(4*1)
  mov $r0,$data1
  ldw $r3(4*1)
  adc $data0,$r0,$data1 /* 1  */
  ldw $r2(4*2)
  mov $r0,$data1
  ldw $r3(4*2)
  adc $data0,$r0,$data1 /* 2  */
  ldw $r2(4*3)
  mov $r0,$data1
  ldw $r3(4*3)
  adc $data0,$r0,$data1 /* 3  */
  ldw $r2(4*4)
  mov $r0,$data1
  ldw $r3(4*4)
  adc $data0,$r0,$data1 /* 4  */
  ldw $r2(4*5)
  mov $r0,$data1
  ldw $r3(4*5)
  adc $data0,$r0,$data1 /* 5  */
  mov $r0,$0
  adc $r0,$r0,$0
  stw $r4(0)
  mov $data0,$r0
  ret
}

asm c_redc(mr_small *a,mr_small *b,mr_small ndash)
{
/*** REDC ***/             /* Montgomery's redc */
  mov $dspmode,1
  xor $r7,$r7
  xor $r1,$r1
  ldw $r2(0)
  mov $r0,$data1
  mul $r6,$r0,$r4
  stw $r2(4*0)
  mov $data0,$r6
  ldw $r3(0)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0+4)
  add $r0,$data1
  adc $r1,$r1,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  mul $r6,$r0,$r4
  stw $r2(4*1)
  mov $data0,$r6
  ldw $r3(0)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*1+4)
  add $r0,$data1
  adc $r1,$r1,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  mul $r6,$r0,$r4
  stw $r2(4*2)
  mov $data0,$r6
  ldw $r3(0)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*2+4)
  add $r0,$data1
  adc $r1,$r1,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  mul $r6,$r0,$r4
  stw $r2(4*3)
  mov $data0,$r6
  ldw $r3(0)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*3+4)
  add $r0,$data1
  adc $r1,$r1,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  mul $r6,$r0,$r4
  stw $r2(4*4)
  mov $data0,$r6
  ldw $r3(0)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*4+4)
  add $r0,$data1
  adc $r1,$r1,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  mul $r6,$r0,$r4
  stw $r2(4*5)
  mov $data0,$r6
  ldw $r3(0)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*5+4)
  add $r0,$data1
  adc $r1,$r1,$r7
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*1)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r2(4*6)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*6+4)
  add $r0,$data1
  adc $r1,$r1,$r7
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*2)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r2(4*7)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*7+4)
  add $r0,$data1
  adc $r1,$r1,$r7
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*3)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r2(4*8)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*8+4)
  add $r0,$data1
  adc $r1,$r1,$r7
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*4)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r2(4*9)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*9+4)
  add $r0,$data1
  adc $r1,$r1,$r7
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*5)
  mul $r6,$r6,$data1
  add $r0,$r6
  adc $r1,$msw,$r1
  adc $r7,$r7,$0
  stw $r2(4*10)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*10+4)
  add $r0,$data1
  adc $r1,$r1,$r7
  stw $r2(4*11)
  mov $data0,$r0
  stw $r2(4*11+4)
  mov $data0,$r1
  ret
}

/* NOTE! t and z must be distinct! */

void comba_redc(_MIPD_ big t,big z)     
{  /* super comba Montgomery redc() function */                      
    mr_small ndash,carry;

    unsigned int i;
    big w,modulus;
    mr_small *a,*b;
    BOOL need_subtract;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif

#ifdef MR_SPECIAL

/* !!! Implement here a "special" fast method for modular reduction,
   for a particular modulus. Implemented here for 2^192-2^64-1       
   and 2^224-2^96+1 on a 32 bit processor.
   See for example "Software Implementation of the NIST Elliptic
   Curves Over Prime Fields", Brown et al., Report #36, 2000 available
   from www.cacr.math.uwaterloo.ca 

   The generated code can be manually optimised further.....
*/
    int overshoot;
    mr_small k[MR_COMBA];
    mr_small *c;
    modulus=mr_mip->modulus;     
    for (i=MR_COMBA;i<(int)(z->len&MR_OBITS);i++) z->w[i]=0;
 /*      zero(z);   */
    z->len=MR_COMBA;

  #if MIRACL==64

   #if MR_COMBA == 3
/* Special Code for 2^192-2^64-1 - assuming 64-bit processor */

    a=t->w; b=k; c=z->w;
    k[0]=k[1]=a[3]; k[2]=0;

    c_add(a,b,c,&carry);

    overshoot=carry;  
    a=c;  c=t->w;
    k[0]=0;k[1]=k[2]=c[4];

    c_inc(a,b,&carry);

    overshoot+=carry;
    k[0]=k[1]=k[2]=c[5];

    c_inc(a,b,&carry);

    overshoot+=carry;
    b=modulus->w;
    while(overshoot>0)
    {
        c_dec(a,b,&carry);
        overshoot-=carry;
    }
    if (z->w[MR_COMBA-1]>=modulus->w[MR_COMBA-1])
    {
        if (mr_compare(z,modulus)>=0)
        {
	    c_dec(a,b,&carry);
        }
    }
    if (z->w[MR_COMBA-1]==0) mr_lzero(z);

   #endif
  #endif

  #if MIRACL==32

#if MR_COMBA == 8
#ifdef MR_NOFULLWIDTH

/* Modulus is 2^255-19 - Experimental - not tested! */

w->w=&(t->w[10]);
w->len=9;
premult(_MIPP_ w,608,w);
incr(_MIPP_ w,19*(t->w[9]>>21),w);
t->w[9]&=(1<<21)-1;
t->len++;
z->len=10;
for (i=0;i<10;i++) z->w[i]=t->w[i];
comba_sub(z,w,z);


#endif
#endif

  #if MR_COMBA == 6

/* Special Code for 2^192-2^64-1 - assuming 32-bit processor */

    a=t->w; b=k; c=z->w;
    k[0]=k[2]=a[6]; k[1]=k[3]=a[7]; k[4]=k[5]=0; 
    
    c_add(a,b,c,&carry);

    overshoot=carry;  
    a=c;  c=t->w;
    k[0]=k[1]=0; k[2]=k[4]=c[8]; k[3]=k[5]=c[9];

    c_inc(a,b,&carry);

    overshoot+=carry;
    k[0]=k[2]=k[4]=c[10]; k[1]=k[3]=k[5]=c[11];
                       
    c_inc(a,b,&carry);

    overshoot+=carry;
    b=modulus->w;
    while(overshoot>0)
    {
        c_dec(a,b,&carry);
        overshoot-=carry;
    }
    if (z->w[MR_COMBA-1]>=modulus->w[MR_COMBA-1])
    {
        if (mr_compare(z,modulus)>=0)
        {
            c_dec(a,b,&carry);
        }
    }
    if (z->w[MR_COMBA-1]==0) mr_lzero(z);

  #endif

  #if MR_COMBA == 7
/* Special Code for 2^224-2^96+1 - assuming 32-bit processor */

    a=t->w; b=k; c=z->w;
    k[0]=k[1]=k[2]=0; k[3]=a[7]; k[4]=a[8]; k[5]=a[9]; k[6]=a[10];

    c_add(a,b,c,&carry);

    overshoot=carry;
    a=c; c=t->w;
    k[0]=k[1]=k[2]=k[6]=0; k[3]=c[11]; k[4]=c[12]; k[5]=c[13];

    c_inc(a,b,&carry);
    overshoot+=carry;
    k[0]=c[7]; k[1]=c[8]; k[2]=c[9]; k[3]=c[10]; k[4]=c[11]; k[5]=c[12]; k[6]=c[13];
    
    c_dec(a,b,&carry);
    overshoot-=carry;
    k[0]=c[11]; k[1]=c[12]; k[2]=c[13]; k[3]=k[4]=k[5]=k[6]=0;

    c_dec(a,b,&carry);
    overshoot-=carry;
    b=modulus->w;
    while (overshoot>0)
    {
        c_dec(a,b,&carry);
        overshoot-=carry;
    }
    while (overshoot<0)
    {
        c_inc(a,b,&carry);
        overshoot+=carry;
    }
    if (z->w[MR_COMBA-1]>=modulus->w[MR_COMBA-1])
    {
        if (mr_compare(z,modulus)>=0)
        {
            c_dec(a,b,&carry);
        }
    }
    if (z->w[MR_COMBA-1]==0) mr_lzero(z);

  #endif

  #if MR_COMBA == 17

/* Special Code for 2^521-1 - assuming 32-bit processor */

/* split t into 521-bit halves, low half in a, high half in b */

    a=t->w; b=k; c=z->w;

    for (i=0;i<=16;i++)
        b[i]=(a[i+16]>>9)|(a[i+17]<<23);

    b[16]|=(-(a[16]>>9)<<9); /* clever stuff! Set top part of b[16] to minus  *
                              * top part of a[16]. When added they cancel out */

    c_add(a,b,c,&carry);
                             /* ignore carry=1 */
    a=z->w;                   
    b=modulus->w;

    if (z->w[MR_COMBA-1]>=modulus->w[MR_COMBA-1])
    {
        if (mr_compare(z,modulus)>=0)
        {
            c_dec(a,b,&carry);
        }
    }
    if (z->w[MR_COMBA-1]==0) mr_lzero(z);
  #endif
  #endif
#else
    modulus=mr_mip->modulus;  
    ndash=mr_mip->ndash;
    w=mr_mip->w0;
    if (t!=w) copy(t,w);       
    w->len=2*MR_COMBA+1;
    a=w->w; b=modulus->w;

    c_redc(a,b,ndash);   /* reduces a mod b */
    
    for (i=MR_COMBA;i<(int)(z->len&MR_OBITS);i++) z->w[i]=0;
   
    z->len=MR_COMBA;
    for (i=0;i<MR_COMBA;i++) z->w[i]=w->w[i+MR_COMBA];

    need_subtract=FALSE;

    if (w->w[MR_COMBA+MR_COMBA]!=0)
    {
        need_subtract=TRUE;
    }
    else 
    {
        if (z->w[MR_COMBA-1]!=0)
        {
            if (z->w[MR_COMBA-1]>modulus->w[MR_COMBA-1]) need_subtract=TRUE;
            else
            {
                if (z->w[MR_COMBA-1]==modulus->w[MR_COMBA-1])
                {
                    if (mr_compare(z,modulus)>=0) need_subtract=TRUE;
                }
            }
        }
        else mr_lzero(z);
    }

    if (need_subtract)
    {
        a=z->w; b=modulus->w;
        c_dec(a,b,&carry);   
	z->len=MR_COMBA;
        if (z->w[MR_COMBA-1]==0) mr_lzero(z);
    }

#endif
} 

void comba_add(_MIPD_ big x,big y,big w)
{ /* fast modular addition */
    unsigned int i;
    big modulus;
    BOOL dodec;
    mr_small *a,*b,*c;
    mr_small carry;  

#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    modulus=mr_mip->modulus;
    if (w!=x && w!=y) 
    {
        for (i=MR_COMBA;i<(w->len&MR_OBITS);i++) w->w[i]=0;
        /* zero(w); */
    }
    
    a=x->w; b=y->w; c=w->w;

    c_add(a,b,c,&carry);

    w->len=MR_COMBA;

/* if sum is greater than modulus a decrement will be required */

    dodec=FALSE;
    if (carry) dodec=TRUE;  /* possible misprediction here */
    else
    {
        if (w->w[MR_COMBA-1]>modulus->w[MR_COMBA-1]) dodec=TRUE; /* possible misprediction here */
	else
	{
            if (w->w[MR_COMBA-1]==modulus->w[MR_COMBA-1]) /* this will be very rare, so easily predicted */
	    { /* trying to avoid calling this slow function */
	        if (mr_compare(w,modulus)>=0) dodec=TRUE; /* do full comparison */
	    }
        }
    }

    if (dodec)  /* prediction here correlated to earlier predictions, so should predict nicely */
    {
        a=w->w; b=modulus->w;

	c_dec(a,b,&carry);

    }
    if (w->w[MR_COMBA-1]==0) mr_lzero(w);   

}

asm c_add2(mr_small *a,mr_small *b,mr_small *c,mr_small *carry)
{
/*** ADDITION2 ***/        /* add a and b, result in c */
  stw $r4,4
  ldw $r3(0)	
  mov $r0,$data1
  ldw $r2(0)
  add $data0,$r0,$data1
  ldw $r3(4*1)
  mov $r0,$data1
  ldw $r2(4*1)
  adc $data0,$r0,$data1 /* 1 */
  ldw $r3(4*2)
  mov $r0,$data1
  ldw $r2(4*2)
  adc $data0,$r0,$data1 /* 2 */
  ldw $r3(4*3)
  mov $r0,$data1
  ldw $r2(4*3)
  adc $data0,$r0,$data1 /* 3 */
  ldw $r3(4*4)
  mov $r0,$data1
  ldw $r2(4*4)
  adc $data0,$r0,$data1 /* 4 */
  ldw $r3(4*5)
  mov $r0,$data1
  ldw $r2(4*5)
  adc $data0,$r0,$data1 /* 5 */
  ldw $r3(4*6)
  mov $r0,$data1
  ldw $r2(4*6)
  adc $data0,$r0,$data1 /* 6 */
  ldw $r3(4*7)
  mov $r0,$data1
  ldw $r2(4*7)
  adc $data0,$r0,$data1 /* 7 */
  ldw $r3(4*8)
  mov $r0,$data1
  ldw $r2(4*8)
  adc $data0,$r0,$data1 /* 8 */
  ldw $r3(4*9)
  mov $r0,$data1
  ldw $r2(4*9)
  adc $data0,$r0,$data1 /* 9 */
  ldw $r3(4*10)
  mov $r0,$data1
  ldw $r2(4*10)
  adc $data0,$r0,$data1 /* 10 */
  ldw $r3(4*11)
  mov $r0,$data1
  ldw $r2(4*11)
  adc $data0,$r0,$data1 /* 11 */
  mov $r0,$0
  adc $r0,$r0,$0
  stw $r5(0)
  mov $data0,$r0
  ret
}

void comba_double_add(_MIPD_ big x,big y,big w)
{ /* fast modular addition */
    unsigned int i;
    big modulus;
    BOOL dodec;
    mr_small *a,*b,*c;
    mr_small carry;  

#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    modulus=mr_mip->pR;
    if (w!=x && w!=y) 
    {
        for (i=2*MR_COMBA;i<(w->len&MR_OBITS);i++) w->w[i]=0;
        /* zero(w); */
    }
    
    a=x->w; b=y->w; c=w->w;

    c_add2(a,b,c,&carry);

    w->len=2*MR_COMBA;

/* if sum is greater than modulus a decrement will be required */

    dodec=FALSE;
    if (carry) dodec=TRUE;  /* possible misprediction here */
    else
    {
        if (w->w[2*MR_COMBA-1]>modulus->w[2*MR_COMBA-1]) dodec=TRUE; /* possible misprediction here */
	else
	{
            if (w->w[2*MR_COMBA-1]==modulus->w[2*MR_COMBA-1]) /* this will be very rare, so easily predicted */
	    {
	        if (mr_compare(w,modulus)>=0) dodec=TRUE; /* do full comparison */
	    }
	}
    }

    if (dodec)  /* prediction here correlated to earlier predictions, so should predict nicely */
    {
         a=&(w->w[MR_COMBA]); b=&(modulus->w[MR_COMBA]);

	 c_dec(a,b,&carry);
    }
    if (w->w[2*MR_COMBA-1]==0) mr_lzero(w);   

}

void comba_sub(_MIPD_ big x,big y,big w)
{ /* fast modular subtraction */
    unsigned int i;
    big modulus;
    mr_small *a,*b,*c;
    mr_small carry;  


#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    modulus=mr_mip->modulus;
    if (x!=w && y!=w) 
    {
        for (i=MR_COMBA;i<(w->len&MR_OBITS);i++) w->w[i]=0;   
        /* zero(w); */
    }

    a=x->w; b=y->w; c=w->w;

    c_sub(a,b,c,&carry);

    if (carry)
    {
        a=w->w; b=modulus->w;
	c_inc(a,b,&carry);
    }
    w->len=MR_COMBA;
    if (w->w[MR_COMBA-1]==0) mr_lzero(w); 
}

asm c_sub2(mr_small *a,mr_small *b,mr_small *c,mr_small *carry)
{
/*** SUBTRACTION2 ***/             /* sub b from a, result in c */
  stw $r4,4
  ldw $r2(0)	
  mov $r0,$data1
  ldw $r3(0)
  sub $data0,$r0,$data1
  ldw $r2(4*1)
  mov $r0,$data1
  ldw $r3(4*1)
  sbb $data0,$r0,$data1 /* 1  */
  ldw $r2(4*2)
  mov $r0,$data1
  ldw $r3(4*2)
  sbb $data0,$r0,$data1 /* 2  */
  ldw $r2(4*3)
  mov $r0,$data1
  ldw $r3(4*3)
  sbb $data0,$r0,$data1 /* 3  */
  ldw $r2(4*4)
  mov $r0,$data1
  ldw $r3(4*4)
  sbb $data0,$r0,$data1 /* 4  */
  ldw $r2(4*5)
  mov $r0,$data1
  ldw $r3(4*5)
  sbb $data0,$r0,$data1 /* 5  */
  ldw $r2(4*6)
  mov $r0,$data1
  ldw $r3(4*6)
  sbb $data0,$r0,$data1 /* 6  */
  ldw $r2(4*7)
  mov $r0,$data1
  ldw $r3(4*7)
  sbb $data0,$r0,$data1 /* 7  */
  ldw $r2(4*8)
  mov $r0,$data1
  ldw $r3(4*8)
  sbb $data0,$r0,$data1 /* 8  */
  ldw $r2(4*9)
  mov $r0,$data1
  ldw $r3(4*9)
  sbb $data0,$r0,$data1 /* 9  */
  ldw $r2(4*10)
  mov $r0,$data1
  ldw $r3(4*10)
  sbb $data0,$r0,$data1 /* 10  */
  ldw $r2(4*11)
  mov $r0,$data1
  ldw $r3(4*11)
  sbb $data0,$r0,$data1 /* 11  */
  mov $r0,$0
  adc $r0,$r0,$0
  stw $r5(0)
  mov $data0,$r0
  ret
}

void comba_double_sub(_MIPD_ big x,big y,big w)
{ /* fast modular subtraction */
    unsigned int i;
    big modulus;
    mr_small *a,*b,*c;
    mr_small carry;  

#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    modulus=mr_mip->modulus;
    if (x!=w && y!=w) 
    {
        for (i=2*MR_COMBA;i<(w->len&MR_OBITS);i++) w->w[i]=0;   
        /* zero(w); */
    }

    a=x->w; b=y->w; c=w->w;

    c_sub2(a,b,c,&carry);

    if (carry)
    {
        a=&(w->w[MR_COMBA]); b=modulus->w; 

	c_inc(a,b,&carry);
    
    }
    w->len=2*MR_COMBA;
    if (w->w[2*MR_COMBA-1]==0) mr_lzero(w); 
}

void comba_negate(_MIPD_ big x,big w)
{ /* fast modular subtraction */
    unsigned int i;
    big modulus;
    mr_small *a,*b,*c;
    mr_small carry;  

#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    modulus=mr_mip->modulus;
    if (w!=x) 
    {
        for (i=MR_COMBA;i<(w->len&MR_OBITS);i++) w->w[i]=0;
        /* zero(w); */
    }
    a=modulus->w; b=x->w; c=w->w;

    c_sub(a,b,c,&carry);

    w->len=MR_COMBA;
    if (w->w[MR_COMBA-1]==0) mr_lzero(w); 
}

#endif
