/*
 *   MIRACL Comba's method for ultimate speed binary polynomial multiplication
 *   mrcomba2.tpl 
 *
 *   Here the inner loops of the basic multiplication and squaring
 *   algorithms are completely unravelled and reorganised for maximum possible speed.
 *
 *   This approach is recommended for maximum speed where parameters
 *   are fixed and compute resources are constrained. The processor MUST 
 *   support a special binary polynomial multiplication instruction
 *
 *   This file is a template. To fill in the gaps and create mrcomba2.c, 
 *   you must run the mex.c program to insert the C or assembly language 
 *   macros from the appropriate .mcs file. 
 *
 *   This method would appear to be particularly useful for implementing 
 *   fast Elliptic Curve Cryptosystems over GF(2^m) 
 *
 *   The #define MR_COMBA2 in mirdef.h determines the FIXED size of 
 *   modulus to be used. This *must* be determined at compile time. 
 *
 *   Note that this module can generate a *lot* of code for large values 
 *   of MR_COMBA2. This should have a maximum value of 8-16.
 *
 *   Note that on some processors it is *VITAL* that arrays be aligned on 
 *   4-byte boundaries
 *
 *   **** This code does not like -fomit-frame-pointer when using GCC ****
 *
 *   Copyright (c) 2006 Shamus Software Ltd.
 */

#include "miracl.h"    

#ifdef MR_COMBA2

asm c_mul2(mr_small *a,mr_small *b,mr_small *c)
{
/*** MULTIPLY2 ***/      /* multiply a by b, result in c */    
  mov $dspmode,1
  xor $r7,$r7
  xor $r1,$r1
  xor $r0,$r0
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*0)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  stw $r4(4*0)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*1)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*0)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  stw $r4(4*1)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*2)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*1)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*0)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  stw $r4(4*2)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*3)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*2)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*1)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*0)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  stw $r4(4*3)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*4)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*3)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*2)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*1)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*0)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  stw $r4(4*4)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*0)
  mov $r6,$data1
  ldw $r3(4*5)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*4)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*3)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*2)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*1)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*0)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  stw $r4(4*5)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*1)
  mov $r6,$data1
  ldw $r3(4*5)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*4)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*3)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*2)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*1)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  stw $r4(4*6)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*2)
  mov $r6,$data1
  ldw $r3(4*5)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*4)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*3)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*2)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  stw $r4(4*7)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*3)
  mov $r6,$data1
  ldw $r3(4*5)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*4)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*3)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  stw $r4(4*8)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*4)
  mov $r6,$data1
  ldw $r3(4*5)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*4)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  stw $r4(4*9)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  ldw $r2(4*5)
  mov $r6,$data1
  ldw $r3(4*5)
  xmul $r6,$r6,$data1
  xor $r0,$r6
  xor $r1,$msw,$r1
  stw $r4(4*10)
  mov $data0,$r0
  mov $r0,$r1
  mov $r1,$r7
  xor $r7,$r7
  stw $r4(4*11)
  mov $data0,$r0
  ret
}

/* NOTE! z must be distinct from x and y */

void comba_mult2(_MIPD_ big x,big y,big z) 
{ /* comba multiplier */
    int i;
    mr_small *a,*b,*c;
    big w;

#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    w=mr_mip->w0; 
    for (i=2*MR_COMBA2;i<(int)(w->len&MR_OBITS);i++) w->w[i]=0;
    w->len=2*MR_COMBA2;
    a=x->w; b=y->w; c=w->w;

    c_mul2(a,b,c);
	
    mr_lzero(w);
    if (w!=z) copy (w,z); 
}   
 
#endif
