[svn] / ecrypt / trunk / submissions / phelix / i386 / phelix.S  

svn: ecrypt/trunk/submissions/phelix/i386/phelix.S

File: [svn] / ecrypt / trunk / submissions / phelix / i386 / phelix.S (download)
Revision: 125, Wed Dec 21 13:23:19 2005 UTC (7 years, 5 months ago) by cdecanni
File size: 42569 byte(s)
* included i386 assembly implementation.

/* 
 * ----------------------------------------------------------------
 * Phelix encryption/authentication algorithm
 * Author: Doug Whiting, Hifn. 2005.
 *
 * This source code is released to the public domain
 * ----------------------------------------------------------------
 */

/* This file is 32-bit x86 only: refuse to build for any other target. */
#ifndef __i386__
#error architecture is not supported
#endif
	
	.file	"phelix.S"
	.text
	.align 4
	
	/* The structured-programming macros used below (_if/_else/_endif, _ifbrk, */
	/* _rept/_until) are not defined in this file; presumably they come from here. */
	#include "strucmac.S"				/* structured programming macros */

	/* When this symbol is defined, the incremental entry points guarded by */
	/* ".ifdef PHELIX_INCREMENTAL_API" further down are assembled as well. */
	.set	PHELIX_INCREMENTAL_API,1		/* comment this out to exclude incremental calls */

/* concatenate text together (useful in building names inside macros) */
/* Up to eight arguments are pasted back-to-back, with no separator, onto a */
/* single source line which the assembler then processes as a normal statement. */
/* Omitted arguments expand to nothing. */
.macro	strCat	aa,bb,cc,dd,ee,ff,gg,hh
\aa\bb\cc\dd\ee\ff\gg\hh
.endm
/* ---------------------------------------------------------------- */
/* define a global label. Handle linking with and without underscore */
/*  */
/*  phelixName: native entry point name (always defined as a label here) */
/*  ecryptName: optional alias, only emitted when ECRYPT_API is defined */
/*              and the argument is not blank */
/*  */
/*  With MIX_ASM defined, the exported symbols carry an "_ASM" suffix */
/*  and the unsuffixed names are still defined, but not made global. */
/*  Every name is emitted both plain and with a leading underscore. */
.macro	C_global	phelixName,ecryptName
  #ifdef MIX_ASM
    strCat ".global ",\phelixName,"_ASM"
    strCat ".global _",\phelixName,"_ASM"
strCat " ",\phelixName,"_ASM:"
strCat "_",\phelixName,"_ASM:"
  #else
	.global	 \phelixName
	.global _\phelixName
  #endif
 \phelixName:
_\phelixName:
  #ifdef ECRYPT_API
  .ifnc \ecryptName,
	.global	 \ecryptName
	.global	_\ecryptName
	 \ecryptName:
	_\ecryptName:
  .endif
 #endif
.endm

/* ---------------------------------------------------------------- */
	
/* Exported word named _debugPhelix_; nothing in this file reads it. */
C_global _debugPhelix_
		.long	0			/* ignored here, but must be defined for testPhelix.c */

/* NUL-terminated name of the assembler, returned by PhelixCompiler_Name */
AsmName:	.ascii	"gnu.as\0"
		.align 4

/* PhelixCompiler_Name: return eax --> AsmName.  There is no ret of its own: */
/* execution deliberately falls through into PhelixInit, whose body is just ret. */
C_global PhelixCompiler_Name			/* show who assembled us */
		lea		AsmName,%eax
/* Macro name spelled exactly as defined (C_global) rather than relying on */
/* the assembler folding macro names to lower case. */
C_global PhelixInit,ECRYPT_init			/* Init call does nothing */
		ret
/*  */
/* ---------------------------------------------------------------- */
/*  Macros and definitions */
/* ---------------------------------------------------------------- */
/*  */
/*  Phelix rotation constants: the "a" set and the "b" set are the two */
/*  groups of rotate counts used by the roll instructions in the block code */
	.equ	ROT_0a,	 9
	.equ	ROT_1a,	10
	.equ	ROT_2a,	17
	.equ	ROT_3a,	30
	.equ	ROT_4a,	13

	.equ	ROT_0b,	20
	.equ	ROT_1b,	11
	.equ	ROT_2b,	 5
	.equ	ROT_3b,	15
	.equ	ROT_4b,	25

/*  Loop shape and magic values */
	.equ	UNROLL_CNT,	8			/* how many blocks to unroll in inner loop */
	.equ	ZERO_INIT_CNT,	8			/* number of words of init */
	.equ	MAGIC_MAC_XOR,	0x912d94f1	/* special constants */
	.equ	MAGIC_AAD_XOR,	0xaadaadaa
/*  */
/* ----- register assignments */
/*  Z0		equ		eax */
/*  Z1		equ		ebx */
/*  Z2		equ		ecx */
/*  Z3		equ		edx */
/*  Z4		equ		esi */
/*  t0		equ		ebp				#"temp" scratch registers */
/*  t1		equ		edi */
/*  oldZreg	equ		Z4 */
/*  */
/* ---------------------------------------------------------------- */
/*  */
/*  Allocate and define local variables on the stack */
/*  [Note:	We use esp for locals, not ebp, since we need ebp as a variable. */
/* 			Thus, we can't use the assembler stack frame primitives.] */
/*  */
/*  _SO_ is an assemble-time counter: code that has pushed extra words (or is */
/*  running inside a called local routine) adds _SO_ to every esp-relative */
/*  offset so that the local names keep addressing the same stack slots. */
/*  */
	.set	_maxLocalSize_		,0		/* max locals usage in bytes */
	.set	_Phelix_LocalSize	,0		/* starting value: no locals allocated yet */
 	.set	_SO_				,0		/* current stack offset due to calls */
/*  */
/*  _newLocal wCnt,lName: define lName as the current frame offset and */
/*  reserve wCnt 32-bit words after it. */
.macro _newLocal	wCnt,lName			/* macro to define a local variable */
	.set	\lName			 ,_Phelix_LocalSize
	.set	_Phelix_LocalSize,_Phelix_LocalSize+4*(\wCnt)
	/*  keep running tabs on stack usage for locals */
  .if    _maxLocalSize_<_Phelix_LocalSize
	.set _maxLocalSize_,_Phelix_LocalSize
  .endif
.endm
/*  */
/*  _newParm wCnt,name: define two symbols and advance the running offset. */
/*    name      = _pOfs_            (offset from the current base) */
/*    name_LCL  = _pOfs_ - _cpOfs_  (offset relative to callerParms, for use */
/*                                   through a register pointing at callerParms) */
/*  _pOfs_ then moves on by wCnt words.  The same macro is reused for the */
/*  context structure with _pOfs_ restarted at 0. */
.macro	_newParm wCnt,_pp_
	.set \_pp_, _pOfs_
strCat   ".set ",\_pp_,_LCL,",",(_pOfs_-_cpOfs_)
	.set _pOfs_,_pOfs_+4*(\wCnt)
.endm
/*  */
	/*  now define local variables for the Encrypt/Decrypt functions */
	/*  (offsets are from esp after the locals have been allocated; the order */
	/*   X_i_0, X_i_1, oldZ matters because the incremental entry points fill */
	/*   these three with one block copy from the context) */
	_newLocal	1,srcPtr			/* pointer to  input data buffer */
	_newLocal	1,dstPtr			/* pointer to output data buffer */
	_newLocal	1,loopByteCnt		/* inner loop byte counter */
	_newLocal	1,jmpTabPtr			/* pointer to encrypt/decrypt jump table */
	_newLocal	8,X_i_0				/* local copy of the key values */
	_newLocal	8,X_i_1
	_newLocal	4,oldZ				/* "old" Z values */
	_newLocal	1,_i_				/* block number (+8) */
	_newLocal	UNROLL_CNT  ,exitTab/* local jump table for exiting unrolled loop */
	_newLocal	UNROLL_CNT+4,tmpBuf	/* local buffer encryption/decryption blocks */
	_newLocal	1,aadLeft			/* # bytes of aad remaining */
	_newLocal	1,msgLen0			/* initial value of src_ByteCnt */
	_newLocal	1,dstPtr0			/* initial dst pointer */
	_newLocal	1,retAddr			/* local "return" address */

	/*  4 = return address, 8*4 = the eight registers saved by pushal */
	.set	_cpOfs_,4+8*4+_Phelix_LocalSize	/* caller parms offset from esp */
	/*  negative offsets of selected locals relative to callerParms */
	.set		retAddr_LCL,retAddr-_cpOfs_
	.set		dstPtr0_LCL,dstPtr0-_cpOfs_
	.set		msgLen0_LCL,msgLen0-_cpOfs_
	.set		 tmpBuf_LCL, tmpBuf-_cpOfs_
/*  */
/* ---------------------------------------------------------------- */
/*  Define caller's parameters on the stack, relative to esp */
/*  (parameter list of the packet encrypt/decrypt entry points; the */
/*   incremental entry points later redefine some of these names) */
/*  */
	.set	_pOfs_,_cpOfs_

	_newParm	0,callerParms		/* placeholder, no space allocated */
	_newParm	1,ctxt_Ptr			
	_newParm	1,nonce_Ptr
	_newParm	1,aad_Ptr
	_newParm	1,aad_Len
	_newParm	1,src_Ptr
	_newParm	1,dst_Ptr
	_newParm	1,src_ByteCnt
	_newParm	1,mac_Ptr
/*  */
/* ---------------------------------------------------------------- */
/*  Phelix context structure definition */
/*  (byte offsets from the context pointer; the _LCL symbols that _newParm */
/*   also creates for these names are not used by the code in this file) */
	.set	_pOfs_,0

	_newParm	1,keySize			/* size of raw key in bits */
	_newParm	1,macSize			/* size of mac tag in bits */
	_newParm	1,X_1_Bump			/* 4*(keySize/8) + 256*(macSize mod 128) */
	_newParm	8,X_0				/* subkeys */
	_newParm	8,X_1				/* subkeys */
	/*  internal cipher state */
	_newParm	4,old_Z				/* previous Z[4] values for output */
	_newParm	5,_Z_				/* 5 internal state words */
	_newParm	1,blkNum			/* block number (i) */
	_newParm	2,aadLen			/* 64-bit aadLen counter (LSW first) */
	_newParm	1,msgLen			/* 32-bit msgLen counter (mod 2**32) */
	_newParm	1,aadXor			/* aad Xor constant */
/*  */
/* ---------------------------------------------------------------- */
/*  */
/*  _o_: emit each non-blank argument as one statement, in argument order. */
/*  Used to write groups of independent instructions side by side on a */
/*  single source line.  Up to four statements are accepted. */
.macro _o_	op1,op2,op3,cond3		/* shorthand: instantiate 1-3 opcodes */
		\op1
		\op2
		\op3
		\cond3
.endm
/* ---------------------------------------------------------------- */
/*  adjust _SO_ with push/pop operations */
/*  _stackOp op,reg,bump: emit "op %reg" and add bump to _SO_, so that later */
/*  esp-relative operands written as name+_SO_ still reach the same slot. */
/*  A blank reg emits nothing. */
.macro	_stackOp op,reg,bump
	.ifnc  \reg,				/* only do something if reg is not blank */
	  \op %\reg
	  .set	_SO_,_SO_+\bump
	.endif
.endm

/*  _push r0..r6: push up to seven registers, left to right (names without %) */
.macro	_push	r0,r1,r2,r3,r4,r5,r6
	_stackOp	push,\r0,4
	_stackOp	push,\r1,4
	_stackOp	push,\r2,4
	_stackOp	push,\r3,4
	_stackOp	push,\r4,4
	_stackOp	push,\r5,4
	_stackOp	push,\r6,4
.endm
/*  */
/*  _pop r0..r6: pop up to seven registers, left to right; list them in the */
/*  reverse order of the matching _push */
.macro	_pop	r0,r1,r2,r3,r4,r5,r6
	_stackOp	 pop,\r0,-4
	_stackOp	 pop,\r1,-4
	_stackOp	 pop,\r2,-4
	_stackOp	 pop,\r3,-4
	_stackOp	 pop,\r4,-4
	_stackOp	 pop,\r5,-4
	_stackOp	 pop,\r6,-4
.endm
/*  */
/* ---------------------------------------------------------------- */
/*  Init code, jump tables (for lblName = Encrypt/Decrypt) */
/* ---------------------------------------------------------------- */
/*  */
/*  PhelixAlgo lblName emits, at the point of use: */
/*    - the packet entry stub: save all registers, load ebp with the address */
/*      of <lblName>_jmpTab and jump to Phelix_Main */
/*    - <lblName>_jmpTab itself: one .long per unrolled block label */
/*      (<lblName>Blk_0 .. <lblName>Blk_<UNROLL_CNT-1>, each also exported), */
/*      followed by the address of <lblName>_OddBytes */
/*  OddBytes_OFFS is set to the byte offset of that last entry in the table. */
/*  */
.macro	PhelixAlgo lblName
		/*  first, set up the stack frame */
		pushal							/* save all regs on stack */
 strCat	"lea ",\lblName,"_jmpTab,%ebp"	/* handle the encrypt/decrypt difference */
		jmp		Phelix_Main				/* go run the algorithm */
		/*  */
		/*  the jump table for this operation */
		/*  */
		.align	4
strCat	\lblName,"_jmpTab:"
		/* first, a list of "block boundary" targets within unrolled processing loop */
		.irp xxx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
		  .if \xxx < UNROLL_CNT
		    strCat	" .long \lblName","Blk_",\xxx
strCat	" .global \lblName","Blk_",\xxx
		  .endif
		.endr
		/*  next, successive "control" targets within Phelix_Main */
		strCat	".set OddBytes_OFFS,","(.-\lblName","_jmpTab)"
		strCat	".long \lblName","_OddBytes"
.endm	/* PhelixAlgo */

/*  */
/* ---------------------------------------------------------------- */
/*  Common unrolled loop end code for encrypt/decrypt */
/* ---------------------------------------------------------------- */
/*  */
/*  CNT = number of blocks (words) just processed.  Advances srcPtr and */
/*  dstPtr by CNT words and _i_ by CNT, then subtracts CNT*4 from loopByteCnt. */
/*  The final subl is last on purpose: its carry flag (borrow) is what the */
/*  "_until b" following each use of this macro tests. */
/*  Note: the operands here carry no _SO_ term. */
.macro PhelixEndLoop CNT
		addl	$(\CNT)*4,srcPtr(%esp)	 	/* bump the pointers */
		addl	$(\CNT)*4,dstPtr(%esp)   		 
		addl	$(\CNT)	 ,_i_   (%esp)	  	/* bump the count */
		subl	$(\CNT)*4,loopByteCnt(%esp)	/* are we done yet? */
.endm	/* leave here with flags set for loop jmp */
/*  */
/* ---------------------------------------------------------------- */
/*  Common "early exit" code for encrypt/decrypt inner loop */
/* ---------------------------------------------------------------- */
/*  This functionality is required for splicing AAD/text/padding */
/*  */
/*  jTabReg = register already loaded with this block's exitTab entry */
/*  _bn_    = block number within the unrolled loop */
/*  A non-zero entry means "leave the loop after this block": esi is stored */
/*  to oldZ[_bn_ & 3] and control jumps to the address in the register. */
/*  Otherwise esi is stored to the same slot and execution falls through into */
/*  the next block.  No test is generated for the last block of the loop. */
/*  */
.macro PhelixEarlyExit	jTabReg,_bn_
	.if \_bn_ < (UNROLL_CNT-1)			/* don't need early exit at bottom of loop */
		testl %\jTabReg,%\jTabReg		/* time to exit? */
		_if  nz
		  movl %esi,oldZ+4*((\_bn_) & 3)+_SO_(%esp)
		  jmp *%\jTabReg				/* go to "exit" address */
		_endif
	.endif
	movl %esi,oldZ+4*((\_bn_)& 3)+_SO_(%esp)
.endm
/*  */
/* **************************************************************** */
/*  start of actual code (i.e., end of macro definitions) */
/* **************************************************************** */
/*  */
		.align	4
/* ZERO_INIT_CNT zero words: the input fed to the cipher during initialization */
INIT_ZEROES:
		.fill	ZERO_INIT_CNT,4,0
/* MASK_TAB[n] keeps the low n bytes of a word, n = 0..3 */
MASK_TAB:
		.long	0x00000000
		.long	0x000000ff
		.long	0x0000ffff
		.long	0x00ffffff

_PhelixCodeStart_:

/*  */
/* ---------------------------------------------------------------- */
/*  Common control path for Encrypt/Decrypt */
/* ---------------------------------------------------------------- */
/*  In:	ebp --> (const) jump table (Encrypt_jmpTab or Decrypt_jmpTab) */
/* 		stack = pushal frame, return address, then the caller's parameters */
/*  Out:	everything done */
/*  */
/*  Control flow from here on is by explicit jumps rather than call/ret: */
/*  before entering the unrolled block code, the address to come back to is */
/*  planted in the exitTab slot of the last block that should run. */
/*  */
Phelix_Main:
		/* point to callers first parameter (save code size below) */
		leal	callerParms-_Phelix_LocalSize(%esp),%esi
		subl	$_Phelix_LocalSize,%esp	/* make room for locals on stack */
		movl	%ebp,jmpTabPtr(%esp)	/* save jump table pointer */
		call	InitNonce
		/*  */
		/* ################################################################ */
		/*  Finally ready to start running Phelix on some data */
		/* ################################################################ */
		/*  First, process the initialization zeroes (loopByteCnt == 0 from InitNonce) */
		/*  */
		movl	$_ret_InitZeroDone,exitTab+4*(ZERO_INIT_CNT-1)+_SO_(%esp)
		jmp		EncryptBlk_0
		/*  */
		/*  "local" function */
		/*  InitNonce: build the per-nonce working state in the stack locals. */
		/*  In:	esi --> callerParms (ctxt_Ptr and nonce_Ptr are read through it) */
		/*  Out:	X_i_0, X_i_1 filled in; oldZ and exitTab zeroed (by SetTwoKeys) */
		/* 		loopByteCnt = 0, _i_ = 0 */
		/* 		srcPtr --> INIT_ZEROES, dstPtr --> tmpBuf */
		/* 		eax,ebx,ecx,edx,esi = X_0 words 3..7, the first four */
		/* 		xored with nonce words 0..3 */
		/* 		ebp, edi clobbered */
		/*  _SO_ is 4 throughout because of the return address on the stack. */
	.set _SO_,4
InitNonce:
		/*  first, init the local keys on the stack */
		movl	ctxt_Ptr_LCL(%esi),%ebp			/* point to context structure */
		movl	X_1_Bump(%ebp),%edi				/* edi=4*(keySize/8)+256*(macSize mod 128) */
		movl	nonce_Ptr_LCL(%esi),%edx		/* (const) pointer to nonce words */

		_push	esi								/* save esi  (push/pop = smaller than lea esi,callerParms) */
		xor		%esi,%esi						/* use esi as the variable i in SetTwoKeys */
		inc		%esi							/* start with i = 1, since edi = X'_1 = 4*L(U) already */
		call	SetTwoKeys						/* set X_1_n, X_5_n, for n=0,1  [return w/edi == 0] */
		call	SetTwoKeys						/* set X_2_n, X_6_n, for n=0,1 */
		call	SetTwoKeys						/* set X_3_n, X_7_n, for n=0,1 */
		xor		%esi,%esi						/* wrap to i = 0 */
		call	SetTwoKeys						/* set X_0_n, X_4_n, for n=0,1 */
		_pop	esi								/* restore pointer to callerParms */

		/* set up for initialization phase */
		xorl	%ecx,%ecx	
		leal	INIT_ZEROES,%ebp				/* use all zero input words, for i= -8 .. -1 */
		leal	tmpBuf+_SO_(%esp),%edi			/* discard output */
		movl	%ecx,loopByteCnt+_SO_(%esp)		/* initialize loop byte count counter = 0 */
		movl	%ecx,_i_+_SO_(%esp)				/* initialize i = 0 (block number + 8) */
		movl	%ebp,srcPtr+_SO_(%esp)
		movl	%edi,dstPtr+_SO_(%esp)

		/*  now initialize the Zn register values */
		movl	ctxt_Ptr_LCL(%esi),%ebp
		movl	nonce_Ptr_LCL(%esi),%edi

		movl	X_0+12(%ebp),%eax				/* get the X_0 key values */
		movl	X_0+16(%ebp),%ebx
		movl	X_0+20(%ebp),%ecx
		movl	X_0+24(%ebp),%edx
		movl	X_0+28(%ebp),%esi				/* esi no longer points at callerParms */

		xorl	  (%edi),%eax					/* merge in the nonce */
		xorl	 4(%edi),%ebx
		xorl	 8(%edi),%ecx
		xorl	12(%edi),%edx
		ret
.set _SO_,0
		/* ########################################### */
		/*  done with the initial zeroes. */
		/*  From here to _startUserData: run the AAD through the encrypt block */
		/*  code with the output thrown away into tmpBuf.  ebx is xored with */
		/*  MAGIC_AAD_XOR before the AAD and again after it.  Three stages: */
		/*    1. whole unrolled loops (4*UNROLL_CNT bytes each), */
		/*    2. the remaining whole words, */
		/*    3. a final partial word, masked to its valid bytes. */
_ret_InitZeroDone:
	.if UNROLL_CNT > ZERO_INIT_CNT			/* do we need to clear out the return point? */
		xorl	%ebp,%ebp					/* (only if it's not already at the end) */
		movl	%ebp,exitTab+4*(ZERO_INIT_CNT-1)+_SO_(%esp)
	.endif
		/* ################ */
		/*  handle AAD here, looping if needed */
		xorl	$MAGIC_AAD_XOR,%ebx
		movl	aad_Len+_SO_(%esp),%ebp
		testl	%ebp,%ebp
	_if nz									/* if nothing there, skip all aad processing */
		movl	aad_Ptr+_SO_(%esp),%edi
		movl	%ebp,aadLeft+_SO_(%esp)
		movl	%edi, srcPtr+_SO_(%esp)		/* src will come from aad_Ptr */
_aad_Loop:									/* here with ebp == aad_Len */
		leal	tmpBuf+_SO_(%esp),%edi		/* always use tmpBuf for aad dst (discard) */
		movl	%edi,dstPtr+_SO_(%esp)
		movl	aadLeft+_SO_(%esp),%ebp
		subl	$4*UNROLL_CNT,%ebp			/* only do one unrolled loop each time */
		_if ae								/* (since we use tmpBuf to discard ciphertext) */
		  movl	%ebp,aadLeft+_SO_(%esp)
		  xorl	%edi,%edi
		  movl	%edi,loopByteCnt+_SO_(%esp)
		  movl	$_aad_Loop,exitTab+4*(UNROLL_CNT-1)+_SO_(%esp)
		  jmp	EncryptBlk_0
		_endif
		/*  here to handle final partial loop */
		/*  (ebp = aadLeft - 4*UNROLL_CNT; the mask below keeps the whole-word part) */
_aad_PartialLoop:
		andl	$4*(UNROLL_CNT-1),%ebp
		movl	%ebp,loopByteCnt+_SO_(%esp)
		cmpl	$4,%ebp
		_if ae
		  movl		$_ret_aad_1,exitTab-4+_SO_(%esp,%ebp)	/* exit after the last whole word */
		  jmp		EncryptBlk_0
		_ret_aad_1:
		  movl		loopByteCnt+_SO_(%esp),%ebp
		  xorl		%edi,%edi
		  movl		%edi,exitTab-4+_SO_(%esp,%ebp)	/* clear the entry */
		_endif
		/*  here to handle final partial word of AAD */
		movl	aadLeft+_SO_(%esp),%ebp
		movl	%ebp,%edi
		andl	$3,%edi					/* any odd bytes? */
		_ifbrk	z						/* if not, we're done with AAD */
		addl	$4,%ebp
		andl	$4*(UNROLL_CNT-1),%ebp
		movl	%ebp,loopByteCnt+_SO_(%esp)
		_push	esi
		subl	$4,%ebp
		andl	$4*(UNROLL_CNT-1),%ebp	/* ebp = byte offset of the partial word's block */
		movl	srcPtr+_SO_(%esp),%esi
		movl	(%esi,%ebp),%esi		/* get the last AAD word */
		andl	MASK_TAB(,%edi,4),%esi	/* clear out extra bits */
		leal	tmpBuf+_SO_(%esp),%edi
		movl	%esi,(%edi)
		subl	%ebp,%edi				/* bias the pointer: block code adds the block offset back */
		movl	%edi,dstPtr+_SO_(%esp)
		movl	%edi,srcPtr+_SO_(%esp)
		movl	$_ret_aad_2,exitTab+_SO_(%esp,%ebp)
		movl	%ebp,tmpBuf+4+_SO_(%esp)/* save this */
		_pop	esi
		jmp		*Encrypt_jmpTab(%ebp)
_ret_aad_2:
		movl	tmpBuf+4+_SO_(%esp),%ebp
		xorl	%edi,%edi
		movl	%edi,exitTab+_SO_(%esp,%ebp)	/* clear the exitTab entry just used */
	_endif
		xorl	$MAGIC_AAD_XOR,%ebx
		/* ################ */
		/*  process the user data */
		/*  Packet entry: plant _ret_MAC0 as the local return address and pick up */
		/*  src/dst/count from the caller's parameters.  The incremental Bytes */
		/*  entry point jumps straight to processUserData with the same register */
		/*  setup and its own return address already stored in retAddr. */
_startUserData:
		_push	esi							/* use esi as temp pointer  */
		leal	callerParms+_SO_(%esp),%esi	/*   (to save code size in accessing caller parms below) */
		leal	_ret_MAC0,%ebp
		movl	%ebp,retAddr_LCL(%esi)
		movl	src_Ptr_LCL(%esi),%ebp
		movl	%ebp,srcPtr+_SO_(%esp)
		movl	dst_Ptr_LCL(%esi),%edi
		movl	src_ByteCnt_LCL(%esi),%ebp
		/*  enter here from EncryptBytes */
		/*  In:	esi --> callerParms, with the live esi value saved on the stack */
		/* 		edi = destination pointer, ebp = byte count */
		/* 		srcPtr, retAddr and loopByteCnt (loop phase) already set */
processUserData:
		movl	%edi,dstPtr+_SO_(%esp)
		movl	%edi,dstPtr0_LCL(%esi)
		movl	%ebp,msgLen0_LCL(%esi)
		_pop	esi							/* restore esi */
		movl	loopByteCnt+_SO_(%esp),%edi
		andl	$4*(UNROLL_CNT-1),%edi		/* get the loop "phase" */
		subl	%edi,dstPtr+_SO_(%esp)		/* adjust pointers accordingly */
		subl	%edi,srcPtr+_SO_(%esp)
		/* ################ */
		/*  now process the bulk of the data in "full" loop chunks (ebp = src_ByteCnt) */
		addl	%edi,%ebp
		subl	$UNROLL_CNT*4,%ebp			/* enough for one "full" loop? */
		movl	%ebp,loopByteCnt+_SO_(%esp)	/* save the pre-subtracted value for use in the loop */
		_if ae	
		  add	jmpTabPtr+_SO_(%esp),%edi	/* get ready to jump into block processing */
		  movl	$_ret_DataDone1,exitTab+4*(UNROLL_CNT-1)+_SO_(%esp)
		  jmp	*(%edi)						/* go encrypt or decrypt */
_ret_DataDone1:
		  movl	loopByteCnt+_SO_(%esp),%ebp	/* restore ebp = loopByteCnt */
		  xorl	%edi,%edi					/* starting phase is at ??crypt_0 now */
		_endif
		/* ################ */
		/*  now process the remainder of the data, if any (partial loop) */
		andl	$4*(UNROLL_CNT-1),%ebp		/* compute ebp = end phase */
		cmpl	%edi,%ebp					/* any partial loop to do? */
		_if nz
		  movl	%ebp,loopByteCnt+_SO_(%esp)	/* make sure that the exit loop test falls thru */
		  addl	jmpTabPtr+_SO_(%esp),%edi	/* get ready to jump */
		  movl	$_ret_DataDone2,exitTab-4+_SO_(%esp,%ebp)	/* force an exit at the correct point */
		  jmp	*(%edi)
_ret_DataDone2:
		  xorl	%edi,%edi					/* edi = 0 */
		  movl	loopByteCnt+_SO_(%esp),%ebp
		  andl	$4*(UNROLL_CNT-1),%ebp		/* recompute exitTab index */
		  movl	%edi,exitTab-4+_SO_(%esp,%ebp)	/* clear the exitTab entry */
		_endif
		/* ################ */
		/*  special (i.e. UGLY!!) handling when src_ByteCnt isn't a multiple of 4 */
		/*  here with ebp = loopByteCnt AND 4*(UNROLL_CNT-1) */
		/*  The partial word is masked, run as a single word through tmpBuf */
		/*  (source at tmpBuf[0], result at tmpBuf[1], mask kept at tmpBuf[2]), */
		/*  and only the valid bytes are then merged into the real destination. */
		movl	msgLen0+_SO_(%esp),%edi		/* get original msgLen */
		andl	$3,%edi						/* any partial words? (hopefully rare) */
		_if nz
		  movl	$_ret_OddBytes,exitTab+_SO_(%esp,%ebp)
		  orl	%ebp,%edi					/* save word index and odd byte count */
		  movl	%edi,loopByteCnt+_SO_(%esp)	/* 	back into loopByteCnt */
		  _push esi
		  andl	$3,%edi
		  movl	srcPtr+_SO_(%esp),%esi
		  addl	%ebp,%esi
		  _push	ebp
		  movl	MASK_TAB(,%edi,4),%edi		/* get the mask bits */
		  movl	(%esi),%ebp					/* and get the source word */
		  leal	tmpBuf+_SO_(%esp),%esi
		  andl	%edi,%ebp					/* ebp = masked source word */
		  movl	%edi,8(%esi)				/* save the mask bits (for use in Decrypt_OddBytes) */
		  movl	%ebp, (%esi)				/* save the masked source word */
		  _pop	ebp
		  subl	%ebp,%esi					/* adjust src/dst ptrs for hard coded offsets in block code */
		  movl	%esi,srcPtr+_SO_(%esp)		/* set up for "single-word" encrypt in tmpBuf[] */
		  addl	$4,%esi
		  movl	%esi,dstPtr+_SO_(%esp)
		  mov	jmpTabPtr+_SO_(%esp),%edi	/* dispatch to different handler for Encrypt & Decrypt */
		  _pop	esi
		  jmp	*OddBytes_OFFS(%edi)
		  /*  */
		  /*  here to handle the odd-byte encrypt case */
Encrypt_OddBytes:
		  jmp	*Encrypt_jmpTab(%ebp)		/* go encrypt the single word */
		  /*  */
		  /*  here to handle the funky odd-byte decrypt case */
Decrypt_OddBytes:
		  /*  we have to encrypt halfway thru the block to compute keystream :-(( */
		  /* 		(i.e., in order to produce the "full" ciphertext word) */
		  /*  The five state registers and ebp are saved, the first part of a */
		  /*  block is run on them to obtain the keystream word, and they are */
		  /*  then restored so that the real decrypt block starts from the */
		  /*  unmodified state. */
		  _push eax,ebx,ecx,edx,esi,ebp
		  _o_ "addl %edx,%eax","roll $ROT_3b,%edx","mov X_i_0+_SO_(%esp,%ebp),%ebp"	/* get the key word */
		  _o_ "addl %esi,%ebx","roll $ROT_4b,%esi"
		  _o_ "xorl %eax,%ecx","roll $ROT_0a,%eax"
		  _o_ "xorl %ebx,%edx"					 ,"add %edx,%ebp"
		  _o_ "addl %ecx,%esi","roll $ROT_2a,%ecx","mov loopByteCnt+_SO_(%esp),%edi"
		  
		  _o_ "xorl %ebp,%eax","roll $ROT_4a,%esi","and $4*3,%edi"
		  _o_ "addl %eax,%ecx"				 ,"mov oldZ+_SO_(%esp,%edi),%ebp"
		  _o_ "xorl %ecx,%esi"
		  addl	%esi,%ebp					/* now ebp = keystream */
		  movl	tmpBuf+8+_SO_(%esp),%edi	/* get the mask word */
		  notl	%edi						/* toggle the maskbits */
		  andl	%ebp,%edi					/* mask off unused maskbits */
		  xorl	%edi,tmpBuf+_SO_(%esp)		/* re-create the "full" ciphertext word @ tmp src buffer */
		  _pop	ebp,esi,edx,ecx,ebx,eax
		  jmp	*Decrypt_jmpTab(%ebp)		/* go decrypt */
		  /*  "return" here with the dest word computed at [tmpBuf+4] */
_ret_OddBytes:
		  _push	esi,eax
		  leal	callerParms+_SO_(%esp),%esi
		  xorl	%edi,%edi
		  movl	loopByteCnt+_SO_(%esp),%ebp
		  andl	$4*(UNROLL_CNT-1),%ebp
		  movl	%edi,exitTab+_SO_(%esp,%ebp)	/* clear out the exitTab entry we just used */
		  movl	msgLen0+_SO_(%esp),%edi			/* now output just the number of dst bytes specified */
		  movl	%edi,%ebp
		  andl	$3,%ebp
		  xorl	%ebp,%edi						/* clear low 2 bits of count */
		  addl	dstPtr0_LCL(%esi),%edi			/* point to "final" word offset */
		  movl	tmpBuf_LCL+4(%esi),%eax			/* get the dst output word (short offset) */
		  xorl	(%edi),%eax						/* do bit diddling to output just the odd bytes */
		  andl	MASK_TAB(,%ebp,4),%eax
		  xorl	%eax,(%edi)						/* bytes outside the mask keep their old value */
		  _pop	eax,esi
		_endif
		jmp		*retAddr+_SO_(%esp)			/* "return" to whomever */
/*  Packet path comes here (through retAddr) once the user data is done. */
_ret_MAC0:
		/* ################ */
		/*  here to compute and output/compare the MAC */
		movl	mac_Ptr+_SO_(%esp),%ebp
		xorl	aad_Len+_SO_(%esp),%esi
/*  processMAC: shared with the incremental Finalize entry point. */
/*  In:	ebp = where to write the MAC */
/* 		loopByteCnt = current loop phase, low 2 bits = message length mod 4 */
/* 		eax..esi = cipher state */
/*  Runs eight padding blocks and then four more blocks through the encrypt */
/*  code in tmpBuf, copies macSize bits from tmpBuf+8*4 to the MAC pointer, */
/*  then tears down the frame and returns to the original caller. */
processMAC:	
		movl	%ebp,dstPtr0+_SO_(%esp)		/* save MAC ptr */
		xorl	$MAGIC_MAC_XOR,%eax			/* toggle bits to start the MAC */
		_push	esi
		movl	loopByteCnt+_SO_(%esp),%ebp
		movl	%ebp,%edi
		addl	$3,%ebp						/* advance to next full word, if odd bytes */
		andl	$4*(UNROLL_CNT-1),%ebp		/* ebp = next word "offset" within block */
		andl	$3,%edi						/* edi = length of src mod 4 (plaintext for MAC) */
		leal	tmpBuf+_SO_(%esp),%esi
		.set _bb_,0
	.rept 12								/* 8 for padding, 4 for MAC size */
		movl	%edi,_bb_(%esi)				/* fill tmpBuf with L(P) mod 4 */
		.set _bb_,_bb_+4
	.endr
		leal	7*4(%ebp),%edi
		andl	$4*(UNROLL_CNT-1),%edi		/* stop point is after 8 blocks (i+0..i+7) */
		movl	$_ret_MAC1,exitTab+_SO_(%esp,%edi)
		subl	%ebp,%esi					/* set up source/dest pointers */
		movl	%esi,srcPtr+_SO_(%esp)
		movl	%esi,dstPtr+_SO_(%esp)
		addl	$8*4-1,%ebp					/* FUNKY wrap logic requires -1 */
		movl	%ebp,loopByteCnt+_SO_(%esp)
		incl	%ebp						/* undo adjustment */
		andl	$4*(UNROLL_CNT-1),%ebp
		_pop	esi
		jmp		*Encrypt_jmpTab(%ebp)		/* go do the encryption */
		/*  just finished eight blocks of "padding" using L(P) mod 4 */
		/*  now generate the MAC */
_ret_MAC1:
		movl	loopByteCnt+_SO_(%esp),%ebp
		incl	%ebp						/* undo the -1 above */
		andl	$4*(UNROLL_CNT-1),%ebp
		leal	3*4(%ebp),%edi				/* do four more (0..3 -- stop after #3) */
		andl	$4*(UNROLL_CNT-1),%edi
		movl	$_ret_MAC2,exitTab+_SO_(%esp,%edi)
		leal	4*4-1(%ebp),%edi			/* FUNKY wrap logic requires -1 */
		movl	%edi,loopByteCnt+_SO_(%esp)
		jmp		*Encrypt_jmpTab(%ebp)
		/*  */
		/*  here with the MAC computed. eax..esi now can be trashed */
_ret_MAC2:
		cld									/* string copies below must run forward; the packet */
											/* path has not executed cld before this point */
		leal	callerParms+_SO_(%esp),%esi
		movl	ctxt_Ptr_LCL(%esi),%edi
		movl	macSize(%edi),%ecx			/* ecx = # bits in MAC */
		movl	dstPtr0_LCL(%esi),%edi
		leal	tmpBuf+8*4+_SO_(%esp),%esi
		testl	$31,%ecx					/* can we do it one word at a time? */
		_if z
		  shrl	$5,%ecx						/* if so, it's faster */
		  rep	movsl
		_else
		  addl	$7,%ecx						/* round up to byte boundary */
		  shrl	$3,%ecx						/* non-word sizes get the slow treatment */
		  rep	movsb
		_endif
		/* ################ */
		/* tear down the stack and return */
		addl	$_Phelix_LocalSize,%esp
		popal								/* restore all of callers regs */
		ret									/* and return to caller */
/*  */
/* ---------------------------------------------------------------- */
/*  Common subroutine (for use in Phelix_Main) to init subkeys */
/* ---------------------------------------------------------------- */
/*  In:	ebp		-->	pCtxt (const)	 */
/* 		edx		--> nonce (const) */
/* 		edi		=	X' value for I */
/* 		esi		=	value of I (0..3) */
/*  Out:	esi	incremented.  ebp, edx unmodified */
/* 		edi		= 0 */
/* 		oldZ[I] = 0, and exitTab[I], exitTab[I+4], ... = 0 */
/* 		X_i_0, X_i_1 set on stack for both i=I and i=I+4 */
/* 		eax, ebx, ecx clobbered */
/*  Only valid when called from InitNonce: _SO_ = 12 accounts for InitNonce's */
/*  return address, the esi it pushed, and this routine's own return address. */
.set _SO_,12										/* two words on stack before call */
SetTwoKeys:
		movl	X_0+4*0(%ebp,%esi,4),%eax			/* load two key values */
		movl	X_0+4*4(%ebp,%esi,4),%ebx
		movl	%eax,X_i_0+4*0+_SO_(%esp,%esi,4)	/* store the X_i_0 values */
		movl	%ebx,X_i_0+4*4+_SO_(%esp,%esi,4)
		movl	(%edx,%esi,4),%ecx					/* get ecx = N_i */
		addl	%edi,%eax							/* add in 4*L(U), for esi == 1 */
		addl	%edi,%ebx
		addl	%ecx,%ebx							/* add/sub the nonce value */
		subl	%ecx,%eax
		addl	%esi,%eax
		xorl	%edi,%edi							/* set edi = 0 */
		movl	%ebx,X_i_1+4*0+_SO_(%esp,%esi,4)	/* store the X_i_1 values */
		movl	%eax,X_i_1+4*4+_SO_(%esp,%esi,4)
		movl	%edi,oldZ+_SO_(%esp,%esi,4)			/* zero out the oldZ values */
		.set _NN_,0
	.rept UNROLL_CNT/4								/* init the "block exit" jump table: all zeroes */
		movl	%edi,exitTab+_NN_+_SO_(%esp,%esi,4)
		.set _NN_,_NN_ + 16
	.endr
		incl	%esi						/* bump the counter for next call */
		ret
/*  */
.set _SO_,0							/* back to no offset */
/*  */
/* ---------------------------------------------------------------- */
/*  Encryption routines */
/* ---------------------------------------------------------------- */
/*  */
/*  The entry stub and jump table come from PhelixAlgo; what follows is the */
/*  unrolled block loop.  Register use inside a block: */
/*    eax,ebx,ecx,edx,esi = the five state words */
/*    ebp, edi            = scratch (key word, plaintext, keystream, pointers) */
/*  Each block reads its input word at srcPtr + 4*blk, writes its output */
/*  word at dstPtr + 4*blk, and finishes with PhelixEarlyExit. */
/*  */
		.align	4
C_global PhelixEncryptPacket,ECRYPT_AE_encrypt_packet
		PhelixAlgo	Encrypt					/* instantiate the algorithm code */
		/*  */
		/* the main block processing loop */
		/*  */
	_rept
	  .irp _blkNum_,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
	   .if  \_blkNum_ < UNROLL_CNT
strCat	EncryptBlk_,\_blkNum_,":"			/* make a label for re-entry points */
		.set _bb_,\_blkNum_ & 7				/* support UNROLL_CNT > 8 */

		_o_ "addl %edx,%eax","roll $ROT_3b,%edx","movl X_i_0+4*_bb_+_SO_(%esp),%ebp"
		_o_ "addl %esi,%ebx","roll $ROT_4b,%esi"
		_o_ "xorl %eax,%ecx","roll $ROT_0a,%eax","movl srcPtr+_SO_(%esp),%edi"
		_o_ "xorl %ebx,%edx","roll $ROT_1a,%ebx","addl %edx,%ebp"	/* does LEA opcode help here? */
		_o_ "addl %ecx,%esi","roll $ROT_2a,%ecx"

		_o_ "xorl %ebp,%eax","roll $ROT_3a,%edx","movl 4*_bb_(%edi),%ebp"	/* ebp = plaintext */
		_o_ "xorl %esi,%ebx","roll $ROT_4a,%esi","movl oldZ+4*(_bb_&3)+_SO_(%esp),%edi"
		_o_ "addl %eax,%ecx","roll $ROT_0b,%eax"
		_o_ "addl %ebx,%edx","roll $ROT_1b,%ebx","xorl %edx,%ebp"
		_o_ "xorl %ecx,%esi","roll $ROT_2b,%ecx"

		addl %esi,%edi						/* now edi = keystream */
		xorl %edx,%edi						/* set up to compute edi = ciphertext below */

		_o_ "addl %ebp,%eax","roll $ROT_3b,%edx","xorl %ebp,%edi"			/* now edi = ciphertext */
		_o_ "addl %esi,%ebx","roll $ROT_4b,%esi","movl X_i_1+4*_bb_+_SO_(%esp),%ebp"
		_o_ "xorl %eax,%ecx","roll $ROT_0a,%eax"
		_o_ "xorl %ebx,%edx","roll $ROT_1a,%ebx","addl _i_+_SO_(%esp),%ebp"
		_o_ "addl %ecx,%esi","roll $ROT_2a,%ecx","leal _bb_(%ebp,%edx),%ebp"

		_o_ "xorl %ebp,%eax","roll $ROT_3a,%edx","movl dstPtr+_SO_(%esp),%ebp"
		_o_ "xorl %esi,%ebx","roll $ROT_4a,%esi"
		_o_ "addl %eax,%ecx","roll $ROT_0b,%eax","movl %edi,4*_bb_(%ebp)"	/* save ciphertext */
		_o_ "addl %ebx,%edx","roll $ROT_1b,%ebx","movl exitTab+4*\_blkNum_+_SO_(%esp),%edi"
		_o_ "xorl %ecx,%esi","roll $ROT_2b,%ecx"
		
		PhelixEarlyExit edi,\_blkNum_		/* do we need to do an early exit? If so, do it */
	   .endif
      .endr
		PhelixEndLoop	UNROLL_CNT			/* set condition code for _until below */
	_until b
		/* loopByteCnt went below zero: leave through the last exitTab slot */
		jmp	*exitTab+4*(UNROLL_CNT-1)+_SO_(%esp)	/* "return" to do more */
/*  */
/* ---------------------------------------------------------------- */
/*  Decryption routine */
/* ---------------------------------------------------------------- */
/*  */
/*  Same structure and register use as the encrypt loop.  The difference is */
/*  in the middle of each block: the input word is ciphertext, the plaintext */
/*  is recovered from the keystream first and is then what gets mixed back */
/*  into the state. */
/*  */
		.align	4
C_global PhelixDecryptPacket,ECRYPT_AE_decrypt_packet
		PhelixAlgo	Decrypt				/* instantiate the algorithm code */
		/*  */
		/* the main block processing loop */
		/*  */
	_rept
	  .irp _blkNum_,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
	   .if  \_blkNum_ < UNROLL_CNT
strCat	DecryptBlk_,\_blkNum_,":"		/* make a label for re-entry points */
		.set _bb_,\_blkNum_ & 7			/* support UNROLL_CNT > 8 (but not really!) */
		_o_ "addl %edx,%eax","roll $ROT_3b,%edx","movl X_i_0+4*_bb_+_SO_(%esp),%ebp"
		_o_ "addl %esi,%ebx","roll $ROT_4b,%esi"
		_o_ "xorl %eax,%ecx","roll $ROT_0a,%eax","movl srcPtr+_SO_(%esp),%edi"
		_o_ "xorl %ebx,%edx","roll $ROT_1a,%ebx","addl %edx,%ebp"
		_o_ "addl %ecx,%esi","roll $ROT_2a,%ecx"

		_o_ "xorl %ebp,%eax","roll $ROT_3a,%edx","movl 4*_bb_(%edi),%ebp"	/* ebp = ciphertext */
		_o_ "xorl %esi,%ebx","roll $ROT_4a,%esi","movl oldZ+4*(_bb_&3)+_SO_(%esp),%edi"
		_o_ "addl %eax,%ecx","roll $ROT_0b,%eax"
		_o_ "addl %ebx,%edx","roll $ROT_1b,%ebx"
		_o_ "xorl %ecx,%esi","roll $ROT_2b,%ecx"

		addl %esi,%edi						/* set edi = keystream */
		xorl %ebp,%edi						/* now edi = plaintext */
		movl %edx,%ebp
		xorl %edi,%ebp						/* now ebp = plaintext ^ edx */

		_o_ "addl %ebp,%eax","roll $ROT_3b,%edx"
		_o_ "addl %esi,%ebx","roll $ROT_4b,%esi","movl X_i_1+4*_bb_+_SO_(%esp),%ebp"
		_o_ "xorl %eax,%ecx","roll $ROT_0a,%eax"
		_o_ "xorl %ebx,%edx","roll $ROT_1a,%ebx","addl _i_+_SO_(%esp),%ebp"
		_o_ "addl %ecx,%esi","roll $ROT_2a,%ecx","leal _bb_(%ebp,%edx),%ebp"

		_o_ "xorl %ebp,%eax","roll $ROT_3a,%edx","movl dstPtr+_SO_(%esp),%ebp"
		_o_ "xorl %esi,%ebx","roll $ROT_4a,%esi"
		_o_ "addl %eax,%ecx","roll $ROT_0b,%eax","movl %edi,4*_bb_(%ebp)"	/* save plaintext computed above */
		_o_ "addl %ebx,%edx","roll $ROT_1b,%ebx","movl exitTab+4*\_blkNum_+_SO_(%esp),%edi"
		_o_ "xorl %ecx,%esi","roll $ROT_2b,%ecx"

		PhelixEarlyExit edi,\_blkNum_		/* do we need to do an early exit? If so, do it */
	   .endif
      .endr
		PhelixEndLoop	UNROLL_CNT			/* set condition code for _until below */
	_until b
		/* loopByteCnt went below zero: leave through the last exitTab slot */
		jmp		*exitTab+4*(UNROLL_CNT-1)+_SO_(%esp)	/* "return" to do more */
/*  */
_PhelixCodeEnd_:

.ifdef PHELIX_INCREMENTAL_API
/*  */
/* ---------------------------------------------------------------- */
/*  "Incremental" function: SetupNonce */
/* ---------------------------------------------------------------- */
/* 	use same stack as EncryptPacket! */
/*  */
/*  Parameters are read at the same stack positions as the first two packet */
/*  parameters (ctxt_Ptr, nonce_Ptr).  Runs InitNonce and the initialization */
/*  blocks exactly as Phelix_Main does, then stores the resulting state into */
/*  the context: */
/*    _Z_      <- eax..esi (ebx first xored with MAGIC_AAD_XOR) */
/*    aadXor   <- MAGIC_AAD_XOR */
/*    X_1      <- the eight words of X_i_1 */
/*    old_Z    <- oldZ */
/*    msgLen, aadLen <- 0;  blkNum <- _i_ */
/*  All caller registers are preserved (pushal/popal). */
/*  */
C_global PhelixSetupNonce,ECRYPT_AE_ivsetup
		pushal
		lea		callerParms-_Phelix_LocalSize(%esp),%esi
		subl	$_Phelix_LocalSize,%esp
_SO_			=	0
		call	InitNonce
		movl	$_ret_SetupNonceDone,exitTab+4*(ZERO_INIT_CNT-1)+_SO_(%esp)
		jmp		EncryptBlk_0
_ret_SetupNonceDone:
	.if UNROLL_CNT > ZERO_INIT_CNT		/* do we need to clear out the return point? */
		/* build-time guard only: this configuration is not implemented here */
		.err	"Replicate code here from _ret_InitZeroDone"
	.endif
		movl	ctxt_Ptr+_SO_(%esp),%ebp		/* save our context */
		/*  */
		movl	$MAGIC_AAD_XOR,%edi
		xorl	%edi,%ebx
		movl	%edi,aadXor(%ebp)
		/*  */
		movl	%eax,4*0+_Z_(%ebp)
		movl	%ebx,4*1+_Z_(%ebp)
		movl	%ecx,4*2+_Z_(%ebp)
		movl	%edx,4*3+_Z_(%ebp)
		movl	%esi,4*4+_Z_(%ebp)

		/* each pass copies two X_i_1 words and one oldZ word into the context */
		.irp	_nn_,0,1,2,3
		  movl	X_i_1+8*\_nn_  +_SO_(%esp),%eax
		  movl	X_i_1+8*\_nn_+4+_SO_(%esp),%ebx
		  movl	oldZ +4*\_nn_  +_SO_(%esp),%ecx
		  movl	%eax,X_1+  8*\_nn_(%ebp)
		  movl	%ebx,X_1+4+8*\_nn_(%ebp)
		  movl	%ecx,old_Z+4*\_nn_(%ebp)
		.endr
		/*  */
		xorl	%edi,%edi
		movl	%edi,msgLen  (%ebp)
		movl	%edi,aadLen  (%ebp)
		movl	%edi,aadLen+4(%ebp)
		movl	_i_+_SO_(%esp),%edi
		movl	%edi,blkNum(%ebp)
		/*  */
		addl	$_Phelix_LocalSize,%esp
		popal
		ret
/*  */
/* ---------------------------------------------------------------- */
/*  "Incremental" function: EncryptBytes/DecryptBytes */
/* ---------------------------------------------------------------- */
/* 	use same locals stack as EncryptPacket */
/*  */
_pOfs_	=		_cpOfs_
/*  */
		_newParm 1,ctxt_Ptr
		_newParm 1,src_Ptr
		_newParm 1,dst_Ptr
		_newParm 1,bCnt
/*  */
/*
 * PhelixEncryptBytes / ECRYPT_AE_encrypt_bytes
 *
 * Stack parameters (read below through the *_LCL offsets):
 *   ctxt_Ptr, src_Ptr, dst_Ptr, bCnt
 *
 * All general registers are saved with pushal and restored with popal, so
 * nothing is returned in a register.  The routine copies the key material
 * and old Z values from the context into stack locals, loads Z0..Z4 into
 * eax/ebx/ecx/edx/esi, jumps into the shared processUserData code, and on
 * return (at _ret_PhelixBytes) writes the updated state back to the context.
 *
 * PhelixBytes is the entry shared with PhelixDecryptBytes, which arrives
 * here after its own pushal with ebp = Decrypt_jmpTab.
 */
C_global PhelixEncryptBytes,ECRYPT_AE_encrypt_bytes
		pushal
		leal	Encrypt_jmpTab,%ebp
PhelixBytes:
		/*  esi -> caller parms; the offset is biased by -_Phelix_LocalSize */
		/*  because esp has not been lowered yet (done on the next line) */
		leal	callerParms-_Phelix_LocalSize(%esp),%esi
		subl	$_Phelix_LocalSize,%esp
	.set _SO_,0
		movl	%ebp,jmpTabPtr+_SO_(%esp)	/* remember encrypt vs. decrypt jump table */
		/*  copy context to local on stack */
		movl	ctxt_Ptr_LCL(%esi),%ebp
		_push	esi							/* esi is needed as the movsl source below */
		leal	X_0(%ebp),%esi
		leal	X_i_0+_SO_(%esp),%edi
		movl	$8+8+4,%ecx					/* X_0, X_1, and oldZ */
		cld
		rep		movsl						/* copy the context */
		xorl	%eax,%eax	
		movl	$UNROLL_CNT,%ecx			/* zero out exitTab (edi continues right after the copied words) */
		rep		stosl
		_pop	esi							/* esi -> caller parms again */
		leal	_ret_PhelixBytes,%ebp
		movl	%ebp,retAddr_LCL(%esi)		/* set up return address */
		movl	src_Ptr_LCL(%esi),%ebp		/* copy srcPtr and dstPtr */
		movl	%ebp,srcPtr+_SO_(%esp)
		movl	dst_Ptr_LCL(%esi),%ebp
		movl	%ebp,dstPtr+_SO_(%esp)
		movl	ctxt_Ptr_LCL(%esi),%ebp
		movl	blkNum(%ebp),%edi			/* convert blkNum from pCtxt to locals */
		andl	$~(UNROLL_CNT-1),%edi		/* clear the low bits: _i_ is blkNum rounded down (UNROLL_CNT presumably a power of 2) */
		movl	%edi,_i_+_SO_(%esp)
		movl	blkNum(%ebp),%edi
		shll	$2,%edi						/* blkNum*4: convert the word index to a byte count */
		movl	%edi,loopByteCnt+_SO_(%esp)	/* and save it as the "phase" */
		movl	_Z_+4*0(%ebp),%eax			/* load the Z values */
		movl	_Z_+4*1(%ebp),%ebx
		movl	_Z_+4*2(%ebp),%ecx
		movl	_Z_+4*3(%ebp),%edx
		movl	_Z_+4*4(%ebp),%esi
		xorl	aadXor(%ebp),%ebx			/* fold the pending aadXor value into Z1 ... */
		movl	$0,aadXor(%ebp)				/* ... and clear it so it is applied only once */

		/*  NOTE(review): esi (= Z4) is pushed here and not popped on this path; */
		/*  presumably processUserData pops it -- confirm against that routine */
		_push	esi
		leal	callerParms+_SO_(%esp),%esi
		movl	src_Ptr_LCL(%esi),%ebp
		movl	%ebp,srcPtr+_SO_(%esp)
		movl	bCnt_LCL(%esi),%ebp			/* ebp = byte count */
		movl	dst_Ptr_LCL(%esi),%edi		/* edi = destination pointer */
		jmp		processUserData
		/*  never executed (follows an unconditional jmp); presumably present only */
		/*  so the _pop macro rebalances the assembly-time _SO_ offset -- confirm */
		_pop	esi
_ret_PhelixBytes:

		/*  copy modified value back to context */
		movl	ctxt_Ptr+_SO_(%esp),%ebp
		movl	%eax,_Z_+4*0(%ebp)			/* store the values Z0..Z4 */
		movl	%ebx,_Z_+4*1(%ebp)
		movl	%ecx,_Z_+4*2(%ebp)
		movl	%edx,_Z_+4*3(%ebp)
		movl	%esi,_Z_+4*4(%ebp)

		movl	msgLen0+_SO_(%esp),%edi		/* update pCtxt.blkNum */
		movl	%edi,%esi					/* keep the unrounded byte count for msgLen */
		addl	$3,%edi
		shrl	$2,%edi						/* (bytes+3)/4 = words, rounded up */
		addl	%edi,blkNum(%ebp)
		addl	%esi,msgLen(%ebp)			/* accumulate bytes; PhelixFinalize uses only the low 2 bits */

		leal	old_Z(%ebp),%edi
		leal	oldZ+_SO_(%esp),%esi
		movl	$4,%ecx					/* copy back the updated oldZ values */
		rep		movsl

		addl	$_Phelix_LocalSize,%esp
		popal
		ret
		/*  */
		/*  handle decryption here */
/*
 * PhelixDecryptBytes / ECRYPT_AE_decrypt_bytes
 *
 * Same register save as PhelixEncryptBytes, but selects Decrypt_jmpTab in
 * ebp before joining the shared body at PhelixBytes, which stores ebp into
 * its jmpTabPtr local.  Control does not come back here: the shared body
 * performs the popal/ret.
 */
C_global PhelixDecryptBytes,ECRYPT_AE_decrypt_bytes
		pushal
		leal	Decrypt_jmpTab,%ebp
		jmp		PhelixBytes
/*  */
/* ---------------------------------------------------------------- */
/*  "Incremental" function: Finalize (MAC) */
/* ---------------------------------------------------------------- */
/* 	use same locals stack as EncryptPacket */
/*  */
/*  restart the parameter offsets: this entry point takes (ctxt_Ptr, mac_Ptr) */
_pOfs_	=		_cpOfs_
		_newParm	1,ctxt_Ptr
		_newParm	1,mac_Ptr
/*  */
/*
 * PhelixFinalize / ECRYPT_AE_finalize
 *
 * Stack parameters: ctxt_Ptr, mac_Ptr.
 * Rebuilds the same stack locals as PhelixBytes from the context, loads
 * Z0..Z4 into eax/ebx/ecx/edx/esi, mixes in the AAD bookkeeping values and
 * jumps to processMAC with ebp = mac_Ptr.  There is no epilogue here;
 * presumably processMAC releases the locals and does the popal/ret -- confirm.
 */
C_global PhelixFinalize,ECRYPT_AE_finalize
		pushal
		/*  esi -> caller parms (offset biased because esp is lowered on the next line) */
		leal	callerParms-_Phelix_LocalSize(%esp),%esi
		subl	$_Phelix_LocalSize,%esp
	.set _SO_,0
		leal	Encrypt_jmpTab,%ebp			/* the encrypt jump table is used for finalization */
		movl	%ebp,jmpTabPtr+_SO_(%esp)

		/*  copy context to local on stack */
		movl	ctxt_Ptr_LCL(%esi),%ebp
		_push	esi							/* esi is needed as the movsl source below */
		leal	X_0(%ebp),%esi
		leal	X_i_0+_SO_(%esp),%edi
		movl	$8+8+4,%ecx					/* X_0, X_1, and oldZ */
		cld	
		rep		movsl						/* copy the context */
		xorl	%eax,%eax
		movl	$UNROLL_CNT,%ecx			/* zero out exitTab (edi continues right after the copied words) */
		rep		stosl
		_pop	esi							/* esi -> caller parms again */

		movl	ctxt_Ptr_LCL(%esi),%ebp
		movl	blkNum(%ebp),%edi			/* convert blkNum from pCtxt to locals */
		andl	$~(UNROLL_CNT-1),%edi	
		movl	%edi,_i_+_SO_(%esp)

		/*  eax = (4 - msgLen) & 3: bytes needed to round msgLen up to a multiple of 4 */
		movl	msgLen(%ebp),%eax
		subl	$4,%eax
		negl	%eax
		andl	$3,%eax						/* track the low 2 bits of msgLen */
				
		movl	blkNum(%ebp),%edi
		shll	$2,%edi						/* blkNum*4: convert the word index to a byte count */
		subl	%eax,%edi					/* minus the pad bytes computed above */
		movl 	%edi,loopByteCnt+_SO_(%esp)	/* and save it as the "phase" */

		movl	_Z_+4*0(%ebp),%eax			/* load the Z values */
		movl	_Z_+4*1(%ebp),%ebx
		movl	_Z_+4*2(%ebp),%ecx
		movl	_Z_+4*3(%ebp),%edx
		movl	_Z_+4*4(%ebp),%esi

		xorl	aadXor  (%ebp),%ebx			/* Z1 ^= aadXor */
		xorl	aadLen  (%ebp),%esi			/* Z4 ^= low  32 bits of the accumulated AAD length */
		xorl	aadLen+4(%ebp),%ecx			/* Z2 ^= high 32 bits of the accumulated AAD length */
		movl	mac_Ptr+_SO_(%esp),%ebp		/* ebp = where the MAC is to be written */
		jmp		processMAC
/*  */
/*  */
/* ---------------------------------------------------------------- */
/*  "Incremental" function: ProcessAAD */
/* ---------------------------------------------------------------- */
 .set _Phelix_LocalSize,0
		_newLocal	1,aad_I					/* different local stack from above! */
		_newLocal	1,aad_bb
		_newLocal	1,aad_tmp
/*  */
_cpOfs_	=		4+8*4+_Phelix_LocalSize		/* caller parms offset from esp */
_pOfs_	=		_cpOfs_
/*  */
		_newParm	1,ctxt_Ptr
		_newParm	1,aad_Ptr
		_newParm	1,aad_Len
/*  */
/*
 * PhelixProcessAAD / ECRYPT_AE_authenticate_bytes
 *
 * Stack parameters: ctxt_Ptr, aad_Ptr, aad_Len (bytes).
 * Locals: aad_I   = running block number (starts at ctxt blkNum),
 *         aad_bb  = aad_I & 7 for the current block,
 *         aad_tmp = scratch word holding the zero-padded final partial word.
 * All registers are preserved (pushal/popal); nothing is returned.
 *
 * Adds aad_Len to the 64-bit aadLen counter in the context, then runs one
 * block-function iteration per 32-bit AAD word with Z0..Z4 held in
 * eax/ebx/ecx/edx/esi.  A trailing 1..3 bytes are zero-padded into aad_tmp
 * and pushed through the same loop once more.
 *
 * The trailing bytes are fetched with exact-width loads, so no byte past
 * aad_Ptr + aad_Len is ever read.
 */
C_global PhelixProcessAAD,ECRYPT_AE_authenticate_bytes
		pushal
		subl	$_Phelix_LocalSize,%esp
	.set _SO_,0
		movl	ctxt_Ptr+_SO_(%esp),%ebp	/* point to context */
		movl	 aad_Len+_SO_(%esp),%edi
		addl	%edi,aadLen  (%ebp)			/* update accumulated length */
		adcl	$0  ,aadLen+4(%ebp)
		movl	blkNum(%ebp),%edi
		movl	%edi,aad_I+_SO_(%esp)

		movl	_Z_+4*0(%ebp),%eax			/* load the Z values */
		movl	_Z_+4*1(%ebp),%ebx
		movl	_Z_+4*2(%ebp),%ecx
		movl	_Z_+4*3(%ebp),%edx
		movl	_Z_+4*4(%ebp),%esi

		subl	$4,aad_Len+_SO_(%esp)		/* are we done yet? */
		_rept ae
aad_Again:movl	aad_I+_SO_(%esp),%edi
		  andl	$7,%edi
		  movl	ctxt_Ptr+_SO_(%esp),%ebp
		  _o_	"addl %edx,%eax","roll $ROT_3b,%edx","movl X_0(%ebp,%edi,4),%ebp"
		  _o_	"addl %esi,%ebx","roll $ROT_4b,%esi","movl %edi,aad_bb+_SO_(%esp)"
		  _o_	"xorl %eax,%ecx","roll $ROT_0a,%eax","movl aad_Ptr+_SO_(%esp),%edi"
		  _o_	"xorl %ebx,%edx","roll $ROT_1a,%ebx","addl %edx,%ebp"
		  _o_	"addl %ecx,%esi","roll $ROT_2a,%ecx"

		  _o_	"xorl %ebp,%eax","roll $ROT_3a,%edx","movl (%edi),%ebp"		/* ebp = AAD plaintext */
		  _o_	"xorl %esi,%ebx","roll $ROT_4a,%esi","addl $4,%edi"
		  _o_	"addl %eax,%ecx","roll $ROT_0b,%eax","movl %edi,aad_Ptr+_SO_(%esp)"
		  _o_	"addl %ebx,%edx","roll $ROT_1b,%ebx","xorl %edx,%ebp"
		  _o_	"xorl %ecx,%esi","roll $ROT_2b,%ecx","movl aad_bb+_SO_(%esp),%edi"

		  _o_	"addl %ebp,%eax","roll $ROT_3b,%edx","movl ctxt_Ptr+_SO_(%esp),%ebp"
		  _o_	"addl %esi,%ebx","roll $ROT_4b,%esi","movl X_1(%ebp,%edi,4),%ebp"
		  _o_	"xorl %eax,%ecx","roll $ROT_0a,%eax"
		  _o_	"xorl %ebx,%edx","roll $ROT_1a,%ebx","addl aad_I+_SO_(%esp),%ebp"
		  _o_	"addl %ecx,%esi","roll $ROT_2a,%ecx","addl %edx,%ebp"

		  _o_	"xorl %ebp,%eax","roll $ROT_3a,%edx","movl ctxt_Ptr+_SO_(%esp),%ebp"
		  _o_	"xorl %esi,%ebx","roll $ROT_4a,%esi","andl $3,%edi"
		  _o_	"addl %eax,%ecx","roll $ROT_0b,%eax","incl aad_I+_SO_(%esp)"
		  _o_	"addl %ebx,%edx","roll $ROT_1b,%ebx"
		  _o_	"xorl %ecx,%esi","roll $ROT_2b,%ecx","movl %esi,old_Z(%ebp,%edi,4)"
		  subl	$4,aad_Len+_SO_(%esp)		/* are we done yet? */
		_until	b

		/*  note ebp == ctxt_Ptr here */
		movl	aad_Len+_SO_(%esp),%edi		/* at this point, -4 <= aad_Len < 0 */
		andl	$3,%edi						/* any odd bytes left? */
		_if z								/* if not, we're done */
		  movl	aad_I+_SO_(%esp),%edi		/* copy back the updated blkNum  */
		  movl	%edi,blkNum(%ebp)

		  movl	%eax,_Z_+4*0(%ebp)			/* save the Z values */
		  movl	%ebx,_Z_+4*1(%ebp)
		  movl	%ecx,_Z_+4*2(%ebp)
		  movl	%edx,_Z_+4*3(%ebp)
		  movl	%esi,_Z_+4*4(%ebp)

		  /*  clean up the stack and return */
		  addl	$_Phelix_LocalSize,%esp
		  popal
		  ret
		_endif
		/*  here to handle odd AAD bytes: edi = number of bytes left (1..3). */
		/*  Build the zero-padded little-endian word from exactly that many */
		/*  bytes.  Only ebp/edi are free (eax..esi hold Z0..Z4).  movzbl and */
		/*  movzwl leave the flags alone, so both branches below still test */
		/*  the single cmpl. */
		movl	aad_Ptr+_SO_(%esp),%ebp		/* ebp -> first leftover byte */
		cmpl	$2,%edi
		movzbl	(%ebp),%edi					/* 1 byte left : word = b0 */
		jb		aad_TailDone
		movzwl	(%ebp),%edi					/* 2 bytes left: word = b0 | b1<<8 */
		je		aad_TailDone
		movzbl	2(%ebp),%ebp				/* 3 bytes left: also merge b2<<16 */
		shll	$16,%ebp
		orl		%ebp,%edi
aad_TailDone:
		movl	%edi,aad_tmp+_SO_(%esp)		/* store zero-padded word in aad_tmp */
		leal	aad_tmp+_SO_(%esp),%edi
		movl	%edi,aad_Ptr+_SO_(%esp)		/* point aad_Ptr to aad_tmp */
		xorl	%ebp,%ebp					/* fix up the count to not come here again */
		movl	%ebp,aad_Len+_SO_(%esp)
		jmp		aad_Again
/*  */
/* ---------------------------------------------------------------- */
/*  "Incremental" function: SetupKey */
/* ---------------------------------------------------------------- */
/*  */
	.set _Phelix_LocalSize,0
		_newLocal	1,sk_esi
		_newLocal	1,sk_Cnt
/*  */
_cpOfs_	=		4+8*4+_Phelix_LocalSize	/* caller parms offset from esp */
_pOfs_	=		_cpOfs_
/*  */
		_newParm	1,ctxt_Ptr
		_newParm	1,key_Ptr
		_newParm	1,key_Size
		_newParm	1,iv_Size
		_newParm	1,mac_Size
/*  */
/* assert(PHELIX_NONCE_SIZE==ivSize)# Phelix only supports "full" nonces	*/
/* assert( 0  == (keySize%8))#	Phelix only supports byte-sized keys	*/
/* assert(256 >=  keySize)#	 Phelix only supports keys <= 256 bits*/
/*  */
/*
 * PhelixSetupKey / ECRYPT_AE_keysetup
 *
 * Stack parameters: ctxt_Ptr, key_Ptr, key_Size (bits), iv_Size, mac_Size.
 * (iv_Size is not read by this routine.)
 * Locals: sk_esi = keyBytes+64 mixing constant, sk_Cnt = round counter.
 * All registers are preserved (pushal/popal); nothing is returned.
 *
 * Stores keySize, macSize and X_1_Bump in the context, loads the key into
 * the eight X_0 words (zero-padded), then mixes it with eight Feistel
 * rounds that alternate between the two 16-byte halves of X_0.
 *
 * The key is copied byte-exactly: only key_Size/8 bytes are read from
 * key_Ptr, so a key whose length is not a multiple of 4 bytes is never
 * read past its end.  The asserts above are not enforced here; a key_Size
 * over 256 would still write past X_0.
 */
C_global PhelixSetupKey,ECRYPT_AE_keysetup
		pushal
		subl	$_Phelix_LocalSize,%esp
	.set _SO_,0
		movl	ctxt_Ptr+_SO_(%esp),%ebp	/* point to the context to be built */
		movl	key_Size+_SO_(%esp),%eax	/* copy keySize */
		movl	%eax,keySize(%ebp)
		movl	mac_Size+_SO_(%esp),%ebx	/* and macSize */
		movl	%ebx,macSize(%ebp)
		andl	$127,%ebx					/* and compute X1_Bump */
		shll	$8  ,%ebx
		shrl	$1  ,%eax					/* eax = keySize/2 (in bits) */
		addl	%eax,%ebx
		movl	%ebx,X_1_Bump(%ebp)			/* then store it */
		shrl	$2  ,%eax					/* eax = keySize/8 (# bytes of key) */

		/*  now copy in the key bytes -- exactly keySize/8 of them */
		movl	key_Ptr+_SO_(%esp),%esi
		leal	X_0(%ebp),%edi
		movl	%eax,%ecx
		cld
		rep		movsb						/* edi ends just past the last key byte */

		leal	64(%eax),%edx
		movl	%edx,sk_esi+_SO_(%esp)		/* precompute L(U)+64 "constant" for mixing */

		/*  zero out the rest of the context key (nothing to do for a full 32-byte key) */
		movl	$8*4,%ecx
		subl	%eax,%ecx					/* ecx = bytes of X_0 not covered by the key */
		jbe		sk_KeyLoaded
		xorl	%eax,%eax
		rep		stosb
sk_KeyLoaded:
		/*  now run the Feistel network for initial key mixing */
		movl	$128,sk_Cnt+_SO_(%esp)		/* use this as a counter: 128,112,...,16 = 8 rounds */
		_rept
		  movl	sk_Cnt+_SO_(%esp),%edi
		  andl	$16,%edi					/* isolate one bit: byte offset of the source half (0 or 16) */
		  movl	X_0+4*0(%ebp,%edi),%eax
		  movl	X_0+4*1(%ebp,%edi),%ebx
		  movl	X_0+4*2(%ebp,%edi),%ecx
		  movl	X_0+4*3(%ebp,%edi),%edx
		  movl	sk_esi+ _SO_(%esp),%esi
		  .rept 2							/* unroll just a bit */
		    _o_ "addl %edx,%eax","roll $ROT_3b,%edx"
			_o_ "addl %esi,%ebx","roll $ROT_4b,%esi"
			_o_ "xorl %eax,%ecx","roll $ROT_0a,%eax"
			_o_ "xorl %ebx,%edx","roll $ROT_1a,%ebx"
			_o_ "addl %ecx,%esi","roll $ROT_2a,%ecx"

			_o_ "xorl %edx,%eax","roll $ROT_3a,%edx"
			_o_ "xorl %esi,%ebx","roll $ROT_4a,%esi"
			_o_ "addl %eax,%ecx","roll $ROT_0b,%eax"
			_o_ "addl %ebx,%edx","roll $ROT_1b,%ebx"
			_o_ "xorl %ecx,%esi","roll $ROT_2b,%ecx"
		  .endr
		  xorl	$16,%edi					/* go to other half */
		  xorl	%eax,X_0+4*0(%ebp,%edi)		/* perform the Feistel xor */
		  xorl	%ebx,X_0+4*1(%ebp,%edi)
		  xorl	%ecx,X_0+4*2(%ebp,%edi)
		  xorl	%edx,X_0+4*3(%ebp,%edi)
		  subl	$16,sk_Cnt+_SO_(%esp)
		_until be
		/*  clean up the stack and return */
		addl	$_Phelix_LocalSize,%esp
		popal
		ret
/*  */
/* ---------------------------------------------------------------- */
/*  */
/*  returns in eax the distance in bytes from _PhelixCodeStart_ to this routine's first instruction */
C_global PhelixIncremental_CodeSize
		movl	$(. - _PhelixCodeStart_),%eax
		ret
/*  */
.endif /*  _INCREMENTAL_API */
/*  */
/* ---------------------------------------------------------------- */
/*  use this NOP routine to calibrate/check our timing tests */
/* ---------------------------------------------------------------- */
/*  */
/*  does no work: saves and immediately restores all eight general registers, then returns */
C_global PhelixNop
		pusha								/* 32-bit code: same opcode as pushal */
		popa								/* 32-bit code: same opcode as popal */
		ret
/*  */
/* ---------------------------------------------------------------- */
/*  size statistics at compile time */
/* ---------------------------------------------------------------- */
/*  */
/*  returns in eax the assemble-time byte distance between _PhelixCodeStart_ and _PhelixCodeEnd_ */
C_global PhelixProcessPacket_CodeSize,ECRYPT_AE_process_packet_CodeSize
		movl	$(_PhelixCodeEnd_ - _PhelixCodeStart_),%eax
		ret
/*  */
	.end

eSTREAM Project

Powered by ViewCVS 1.0-dev
(Powered by Apache)

ViewCVS and CVS Help