andersk Git - openssh.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* $OpenBSD: umac.c,v 1.1 2007/06/07 19:37:34 pvalchev Exp $ */
	2	/* -----------------------------------------------------------------------
	3	*
	4	* umac.c -- C Implementation UMAC Message Authentication
	5	*
	6	* Version 0.93b of rfc4418.txt -- 2006 July 18
	7	*
	8	* For a full description of UMAC message authentication see the UMAC
	9	* world-wide-web page at http://www.cs.ucdavis.edu/~rogaway/umac
	10	* Please report bugs and suggestions to the UMAC webpage.
	11	*
	12	* Copyright (c) 1999-2006 Ted Krovetz
	13	*
	14	* Permission to use, copy, modify, and distribute this software and
	15	* its documentation for any purpose and with or without fee, is hereby
	16	* granted provided that the above copyright notice appears in all copies
	17	* and in supporting documentation, and that the name of the copyright
	18	* holder not be used in advertising or publicity pertaining to
	19	* distribution of the software without specific, written prior permission.
	20	*
	21	* Comments should be directed to Ted Krovetz (tdk@acm.org)
	22	*
	23	* ---------------------------------------------------------------------- */
	24
	25	/* ////////////////////// IMPORTANT NOTES /////////////////////////////////
	26	*
	27	* 1) This version does not work properly on messages larger than 16MB
	28	*
	29	* 2) If you set the switch to use SSE2, then all data must be 16-byte
	30	* aligned
	31	*
	32	* 3) When calling the function umac(), it is assumed that msg is in
	33	* a writable buffer of length divisible by 32 bytes. The message itself
	34	* does not have to fill the entire buffer, but bytes beyond msg may be
	35	* zeroed.
	36	*
	37	* 4) Three free AES implementations are supported by this implementation of
	38	* UMAC. Paulo Barreto's version is in the public domain and can be found
	39	* at http://www.esat.kuleuven.ac.be/~rijmen/rijndael/ (search for
	40	* "Barreto"). The only two files needed are rijndael-alg-fst.c and
	41	* rijndael-alg-fst.h. Brian Gladman's version is distributed with the GNU
	42	* Public license at http://fp.gladman.plus.com/AES/index.htm. It
	43	* includes a fast IA-32 assembly version. The OpenSSL crypto library is
	44	* the third.
	45	*
	46	* 5) With FORCE_C_ONLY flags set to 0, incorrect results are sometimes
	47	* produced under gcc with optimizations set -O3 or higher. Dunno why.
	48	*
	49	/////////////////////////////////////////////////////////////////////// */
	50
	51	/* ---------------------------------------------------------------------- */
	52	/* --- User Switches ---------------------------------------------------- */
	53	/* ---------------------------------------------------------------------- */
	54
	55	#define UMAC_OUTPUT_LEN 8 /* Alowable: 4, 8, 12, 16 */
	56	/* #define FORCE_C_ONLY 1 ANSI C and 64-bit integers req'd */
	57	/* #define AES_IMPLEMENTAION 1 1 = OpenSSL, 2 = Barreto, 3 = Gladman */
	58	/* #define SSE2 0 Is SSE2 is available? */
	59	/* #define RUN_TESTS 0 Run basic correctness/speed tests */
	60	/* #define UMAC_AE_SUPPORT 0 Enable auhthenticated encrytion */
	61
	62	/* ---------------------------------------------------------------------- */
	63	/* -- Global Includes --------------------------------------------------- */
	64	/* ---------------------------------------------------------------------- */
	65
	66	#include "includes.h"
	67	#include <sys/types.h>
	68
	69	#include "umac.h"
	70	#include <string.h>
	71	#include <stdlib.h>
	72	#include <stddef.h>
	73
	74	/* ---------------------------------------------------------------------- */
	75	/* --- Primitive Data Types --- */
	76	/* ---------------------------------------------------------------------- */
	77
	78	/* The following assumptions may need change on your system */
	79	typedef u_int8_t UINT8; /* 1 byte */
	80	typedef u_int16_t UINT16; /* 2 byte */
	81	typedef u_int32_t UINT32; /* 4 byte */
	82	typedef u_int64_t UINT64; /* 8 bytes */
	83	typedef unsigned int UWORD; /* Register */
	84
	85	/* ---------------------------------------------------------------------- */
	86	/* --- Constants -------------------------------------------------------- */
	87	/* ---------------------------------------------------------------------- */
	88
	89	#define UMAC_KEY_LEN 16 /* UMAC takes 16 bytes of external key */
	90
	91	/* Message "words" are read from memory in an endian-specific manner. */
	92	/* For this implementation to behave correctly, __LITTLE_ENDIAN__ must */
	93	/* be set true if the host computer is little-endian. */
	94
	95	#if BYTE_ORDER == LITTLE_ENDIAN
	96	#define __LITTLE_ENDIAN__ 1
	97	#else
	98	#define __LITTLE_ENDIAN__ 0
	99	#endif
	100
	101	/* ---------------------------------------------------------------------- */
	102	/* ---------------------------------------------------------------------- */
	103	/* ----- Architecture Specific ------------------------------------------ */
	104	/* ---------------------------------------------------------------------- */
	105	/* ---------------------------------------------------------------------- */
	106
	107
	108	/* ---------------------------------------------------------------------- */
	109	/* ---------------------------------------------------------------------- */
	110	/* ----- Primitive Routines --------------------------------------------- */
	111	/* ---------------------------------------------------------------------- */
	112	/* ---------------------------------------------------------------------- */
	113
	114
	115	/* ---------------------------------------------------------------------- */
	116	/* --- 32-bit by 32-bit to 64-bit Multiplication ------------------------ */
	117	/* ---------------------------------------------------------------------- */
	118
	119	#define MUL64(a,b) ((UINT64)((UINT64)(UINT32)(a) * (UINT64)(UINT32)(b)))
	120
	121	/* ---------------------------------------------------------------------- */
	122	/* --- Endian Conversion --- Forcing assembly on some platforms */
	123	/* ---------------------------------------------------------------------- */
	124
	125	#if 0
	126	static UINT32 LOAD_UINT32_REVERSED(void *ptr)
	127	{
	128	UINT32 temp = (UINT32 )ptr;
	129	temp = (temp >> 24) \| ((temp & 0x00FF0000) >> 8 )
	130	\| ((temp & 0x0000FF00) << 8 ) \| (temp << 24);
	131	return (UINT32)temp;
	132	}
	133
	134	static void STORE_UINT32_REVERSED(void *ptr, UINT32 x)
	135	{
	136	UINT32 i = (UINT32)x;
	137	(UINT32 )ptr = (i >> 24) \| ((i & 0x00FF0000) >> 8 )
	138	\| ((i & 0x0000FF00) << 8 ) \| (i << 24);
	139	}
	140	#endif
	141
	142	/* The following definitions use the above reversal-primitives to do the right
	143	* thing on endian specific load and stores.
	144	*/
	145
	146	#define LOAD_UINT32_REVERSED(p) (swap32((UINT32 )(p)))
	147	#define STORE_UINT32_REVERSED(p,v) ((UINT32 )(p) = swap32(v))
	148
	149	#if (__LITTLE_ENDIAN__)
	150	#define LOAD_UINT32_LITTLE(ptr) ((UINT32 )(ptr))
	151	#define STORE_UINT32_BIG(ptr,x) STORE_UINT32_REVERSED(ptr,x)
	152	#else
	153	#define LOAD_UINT32_LITTLE(ptr) LOAD_UINT32_REVERSED(ptr)
	154	#define STORE_UINT32_BIG(ptr,x) ((UINT32 )(ptr) = (UINT32)(x))
	155	#endif
	156
	157
	158
	159	/* ---------------------------------------------------------------------- */
	160	/* ---------------------------------------------------------------------- */
	161	/* ----- Begin KDF & PDF Section ---------------------------------------- */
	162	/* ---------------------------------------------------------------------- */
	163	/* ---------------------------------------------------------------------- */
	164
	165	/* UMAC uses AES with 16 byte block and key lengths */
	166	#define AES_BLOCK_LEN 16
	167
	168	/* OpenSSL's AES */
	169	#include <openssl/aes.h>
	170	typedef AES_KEY aes_int_key[1];
	171	#define aes_encryption(in,out,int_key) \
	172	AES_encrypt((u_char )(in),(u_char )(out),(AES_KEY *)int_key)
	173	#define aes_key_setup(key,int_key) \
	174	AES_set_encrypt_key((u_char )(key),UMAC_KEY_LEN8,int_key)
	175
	176	/* The user-supplied UMAC key is stretched using AES in a counter
	177	* mode to supply all random bits needed by UMAC. The kdf function takes
	178	* an AES internal key representation 'key' and writes a stream of
	179	* 'nbytes' bytes to the memory pointed at by 'buffer_ptr'. Each distinct
	180	* 'ndx' causes a distinct byte stream.
	181	*/
	182	static void kdf(void *buffer_ptr, aes_int_key key, UINT8 ndx, int nbytes)
	183	{
	184	UINT8 in_buf[AES_BLOCK_LEN] = {0};
	185	UINT8 out_buf[AES_BLOCK_LEN];
	186	UINT8 dst_buf = (UINT8 )buffer_ptr;
	187	int i;
	188
	189	/* Setup the initial value */
	190	in_buf[AES_BLOCK_LEN-9] = ndx;
	191	in_buf[AES_BLOCK_LEN-1] = i = 1;
	192
	193	while (nbytes >= AES_BLOCK_LEN) {
	194	aes_encryption(in_buf, out_buf, key);
	195	memcpy(dst_buf,out_buf,AES_BLOCK_LEN);
	196	in_buf[AES_BLOCK_LEN-1] = ++i;
	197	nbytes -= AES_BLOCK_LEN;
	198	dst_buf += AES_BLOCK_LEN;
	199	}
	200	if (nbytes) {
	201	aes_encryption(in_buf, out_buf, key);
	202	memcpy(dst_buf,out_buf,nbytes);
	203	}
	204	}
	205
	206	/* The final UHASH result is XOR'd with the output of a pseudorandom
	207	* function. Here, we use AES to generate random output and
	208	* xor the appropriate bytes depending on the last bits of nonce.
	209	* This scheme is optimized for sequential, increasing big-endian nonces.
	210	*/
	211
	212	typedef struct {
	213	UINT8 cache[AES_BLOCK_LEN]; /* Previous AES output is saved */
	214	UINT8 nonce[AES_BLOCK_LEN]; /* The AES input making above cache */
	215	aes_int_key prf_key; /* Expanded AES key for PDF */
	216	} pdf_ctx;
	217
	218	static void pdf_init(pdf_ctx *pc, aes_int_key prf_key)
	219	{
	220	UINT8 buf[UMAC_KEY_LEN];
	221
	222	kdf(buf, prf_key, 0, UMAC_KEY_LEN);
	223	aes_key_setup(buf, pc->prf_key);
	224
	225	/* Initialize pdf and cache */
	226	memset(pc->nonce, 0, sizeof(pc->nonce));
	227	aes_encryption(pc->nonce, pc->cache, pc->prf_key);
	228	}
	229
	230	static void pdf_gen_xor(pdf_ctx *pc, UINT8 nonce[8], UINT8 buf[8])
	231	{
	232	/* 'ndx' indicates that we'll be using the 0th or 1st eight bytes
	233	* of the AES output. If last time around we returned the ndx-1st
	234	* element, then we may have the result in the cache already.
	235	*/
	236
	237	#if (UMAC_OUTPUT_LEN == 4)
	238	#define LOW_BIT_MASK 3
	239	#elif (UMAC_OUTPUT_LEN == 8)
	240	#define LOW_BIT_MASK 1
	241	#elif (UMAC_OUTPUT_LEN > 8)
	242	#define LOW_BIT_MASK 0
	243	#endif
	244
	245	UINT8 tmp_nonce_lo[4];
	246	#if LOW_BIT_MASK != 0
	247	int ndx = nonce[7] & LOW_BIT_MASK;
	248	#endif
	249	(UINT32 )tmp_nonce_lo = ((UINT32 *)nonce)[1];
	250	tmp_nonce_lo[3] &= ~LOW_BIT_MASK; /* zero last bit */
	251
	252	if ( (((UINT32 )tmp_nonce_lo)[0] != ((UINT32 )pc->nonce)[1]) \|\|
	253	(((UINT32 )nonce)[0] != ((UINT32 )pc->nonce)[0]) )
	254	{
	255	((UINT32 )pc->nonce)[0] = ((UINT32 )nonce)[0];
	256	((UINT32 )pc->nonce)[1] = ((UINT32 )tmp_nonce_lo)[0];
	257	aes_encryption(pc->nonce, pc->cache, pc->prf_key);
	258	}
	259
	260	#if (UMAC_OUTPUT_LEN == 4)
	261	((UINT32 )buf) ^= ((UINT32 *)pc->cache)[ndx];
	262	#elif (UMAC_OUTPUT_LEN == 8)
	263	((UINT64 )buf) ^= ((UINT64 *)pc->cache)[ndx];
	264	#elif (UMAC_OUTPUT_LEN == 12)
	265	((UINT64 )buf)[0] ^= ((UINT64 )pc->cache)[0];
	266	((UINT32 )buf)[2] ^= ((UINT32 )pc->cache)[2];
	267	#elif (UMAC_OUTPUT_LEN == 16)
	268	((UINT64 )buf)[0] ^= ((UINT64 )pc->cache)[0];
	269	((UINT64 )buf)[1] ^= ((UINT64 )pc->cache)[1];
	270	#endif
	271	}
	272
	273	/* ---------------------------------------------------------------------- */
	274	/* ---------------------------------------------------------------------- */
	275	/* ----- Begin NH Hash Section ------------------------------------------ */
	276	/* ---------------------------------------------------------------------- */
	277	/* ---------------------------------------------------------------------- */
	278
	279	/* The NH-based hash functions used in UMAC are described in the UMAC paper
	280	* and specification, both of which can be found at the UMAC website.
	281	* The interface to this implementation has two
	282	* versions, one expects the entire message being hashed to be passed
	283	* in a single buffer and returns the hash result immediately. The second
	284	* allows the message to be passed in a sequence of buffers. In the
	285	* multiple-buffer interface, the client calls the routine nh_update() as
	286	* many times as necessary. When there is no more data to be fed to the
	287	* hash, the client calls nh_final() which calculates the hash output.
	288	* Before beginning another hash calculation the nh_reset() routine
	289	* must be called. The single-buffer routine, nh(), is equivalent to
	290	* the sequence of calls nh_update() and nh_final(); however it is
	291	* optimized and should be preferred whenever the multiple-buffer interface
	292	* is not necessary. When using either interface, it is the client's
	293	* responsibility to pass no more than L1_KEY_LEN bytes per hash result.
	294	*
	295	* The routine nh_init() initializes the nh_ctx data structure and
	296	* must be called once, before any other NH routine.
	297	*/
	298
	299	/* The "nh_aux" routines do the actual NH hashing work. They
	300	* expect buffers to be multiples of L1_PAD_BOUNDARY. These routines
	301	* produce output for all STREAMS NH iterations in one call,
	302	* allowing the parallel implementation of the streams.
	303	*/
	304
	305	#define STREAMS (UMAC_OUTPUT_LEN / 4) /* Number of times hash is applied */
	306	#define L1_KEY_LEN 1024 /* Internal key bytes */
	307	#define L1_KEY_SHIFT 16 /* Toeplitz key shift between streams */
	308	#define L1_PAD_BOUNDARY 32 /* pad message to boundary multiple */
	309	#define ALLOC_BOUNDARY 16 /* Keep buffers aligned to this */
	310	#define HASH_BUF_BYTES 64 /* nh_aux_hb buffer multiple */
	311
	312	typedef struct {
	313	UINT8 nh_key [L1_KEY_LEN + L1_KEY_SHIFT * (STREAMS - 1)]; /* NH Key */
	314	UINT8 data [HASH_BUF_BYTES]; /* Incomming data buffer */
	315	int next_data_empty; /* Bookeeping variable for data buffer. */
	316	int bytes_hashed; /* Bytes (out of L1_KEY_LEN) incorperated. */
	317	UINT64 state[STREAMS]; /* on-line state */
	318	} nh_ctx;
	319
	320
	321	#if (UMAC_OUTPUT_LEN == 4)
	322
	323	static void nh_aux(void kp, void dp, void *hp, UINT32 dlen)
	324	/* NH hashing primitive. Previous (partial) hash result is loaded and
	325	* then stored via hp pointer. The length of the data pointed at by "dp",
	326	* "dlen", is guaranteed to be divisible by L1_PAD_BOUNDARY (32). Key
	327	* is expected to be endian compensated in memory at key setup.
	328	*/
	329	{
	330	UINT64 h;
	331	UWORD c = dlen / 32;
	332	UINT32 k = (UINT32 )kp;
	333	UINT32 d = (UINT32 )dp;
	334	UINT32 d0,d1,d2,d3,d4,d5,d6,d7;
	335	UINT32 k0,k1,k2,k3,k4,k5,k6,k7;
	336
	337	h = ((UINT64 )hp);
	338	do {
	339	d0 = LOAD_UINT32_LITTLE(d+0); d1 = LOAD_UINT32_LITTLE(d+1);
	340	d2 = LOAD_UINT32_LITTLE(d+2); d3 = LOAD_UINT32_LITTLE(d+3);
	341	d4 = LOAD_UINT32_LITTLE(d+4); d5 = LOAD_UINT32_LITTLE(d+5);
	342	d6 = LOAD_UINT32_LITTLE(d+6); d7 = LOAD_UINT32_LITTLE(d+7);
	343	k0 = (k+0); k1 = (k+1); k2 = (k+2); k3 = (k+3);
	344	k4 = (k+4); k5 = (k+5); k6 = (k+6); k7 = (k+7);
	345	h += MUL64((k0 + d0), (k4 + d4));
	346	h += MUL64((k1 + d1), (k5 + d5));
	347	h += MUL64((k2 + d2), (k6 + d6));
	348	h += MUL64((k3 + d3), (k7 + d7));
	349
	350	d += 8;
	351	k += 8;
	352	} while (--c);
	353	((UINT64 )hp) = h;
	354	}
	355
	356	#elif (UMAC_OUTPUT_LEN == 8)
	357
	358	static void nh_aux(void kp, void dp, void *hp, UINT32 dlen)
	359	/* Same as previous nh_aux, but two streams are handled in one pass,
	360	* reading and writing 16 bytes of hash-state per call.
	361	*/
	362	{
	363	UINT64 h1,h2;
	364	UWORD c = dlen / 32;
	365	UINT32 k = (UINT32 )kp;
	366	UINT32 d = (UINT32 )dp;
	367	UINT32 d0,d1,d2,d3,d4,d5,d6,d7;
	368	UINT32 k0,k1,k2,k3,k4,k5,k6,k7,
	369	k8,k9,k10,k11;
	370
	371	h1 = ((UINT64 )hp);
	372	h2 = ((UINT64 )hp + 1);
	373	k0 = (k+0); k1 = (k+1); k2 = (k+2); k3 = (k+3);
	374	do {
	375	d0 = LOAD_UINT32_LITTLE(d+0); d1 = LOAD_UINT32_LITTLE(d+1);
	376	d2 = LOAD_UINT32_LITTLE(d+2); d3 = LOAD_UINT32_LITTLE(d+3);
	377	d4 = LOAD_UINT32_LITTLE(d+4); d5 = LOAD_UINT32_LITTLE(d+5);
	378	d6 = LOAD_UINT32_LITTLE(d+6); d7 = LOAD_UINT32_LITTLE(d+7);
	379	k4 = (k+4); k5 = (k+5); k6 = (k+6); k7 = (k+7);
	380	k8 = (k+8); k9 = (k+9); k10 = (k+10); k11 = (k+11);
	381
	382	h1 += MUL64((k0 + d0), (k4 + d4));
	383	h2 += MUL64((k4 + d0), (k8 + d4));
	384
	385	h1 += MUL64((k1 + d1), (k5 + d5));
	386	h2 += MUL64((k5 + d1), (k9 + d5));
	387
	388	h1 += MUL64((k2 + d2), (k6 + d6));
	389	h2 += MUL64((k6 + d2), (k10 + d6));
	390
	391	h1 += MUL64((k3 + d3), (k7 + d7));
	392	h2 += MUL64((k7 + d3), (k11 + d7));
	393
	394	k0 = k8; k1 = k9; k2 = k10; k3 = k11;
	395
	396	d += 8;
	397	k += 8;
	398	} while (--c);
	399	((UINT64 *)hp)[0] = h1;
	400	((UINT64 *)hp)[1] = h2;
	401	}
	402
	403	#elif (UMAC_OUTPUT_LEN == 12)
	404
	405	static void nh_aux(void kp, void dp, void *hp, UINT32 dlen)
	406	/* Same as previous nh_aux, but two streams are handled in one pass,
	407	* reading and writing 24 bytes of hash-state per call.
	408	*/
	409	{
	410	UINT64 h1,h2,h3;
	411	UWORD c = dlen / 32;
	412	UINT32 k = (UINT32 )kp;
	413	UINT32 d = (UINT32 )dp;
	414	UINT32 d0,d1,d2,d3,d4,d5,d6,d7;
	415	UINT32 k0,k1,k2,k3,k4,k5,k6,k7,
	416	k8,k9,k10,k11,k12,k13,k14,k15;
	417
	418	h1 = ((UINT64 )hp);
	419	h2 = ((UINT64 )hp + 1);
	420	h3 = ((UINT64 )hp + 2);
	421	k0 = (k+0); k1 = (k+1); k2 = (k+2); k3 = (k+3);
	422	k4 = (k+4); k5 = (k+5); k6 = (k+6); k7 = (k+7);
	423	do {
	424	d0 = LOAD_UINT32_LITTLE(d+0); d1 = LOAD_UINT32_LITTLE(d+1);
	425	d2 = LOAD_UINT32_LITTLE(d+2); d3 = LOAD_UINT32_LITTLE(d+3);
	426	d4 = LOAD_UINT32_LITTLE(d+4); d5 = LOAD_UINT32_LITTLE(d+5);
	427	d6 = LOAD_UINT32_LITTLE(d+6); d7 = LOAD_UINT32_LITTLE(d+7);
	428	k8 = (k+8); k9 = (k+9); k10 = (k+10); k11 = (k+11);
	429	k12 = (k+12); k13 = (k+13); k14 = (k+14); k15 = (k+15);
	430
	431	h1 += MUL64((k0 + d0), (k4 + d4));
	432	h2 += MUL64((k4 + d0), (k8 + d4));
	433	h3 += MUL64((k8 + d0), (k12 + d4));
	434
	435	h1 += MUL64((k1 + d1), (k5 + d5));
	436	h2 += MUL64((k5 + d1), (k9 + d5));
	437	h3 += MUL64((k9 + d1), (k13 + d5));
	438
	439	h1 += MUL64((k2 + d2), (k6 + d6));
	440	h2 += MUL64((k6 + d2), (k10 + d6));
	441	h3 += MUL64((k10 + d2), (k14 + d6));
	442
	443	h1 += MUL64((k3 + d3), (k7 + d7));
	444	h2 += MUL64((k7 + d3), (k11 + d7));
	445	h3 += MUL64((k11 + d3), (k15 + d7));
	446
	447	k0 = k8; k1 = k9; k2 = k10; k3 = k11;
	448	k4 = k12; k5 = k13; k6 = k14; k7 = k15;
	449
	450	d += 8;
	451	k += 8;
	452	} while (--c);
	453	((UINT64 *)hp)[0] = h1;
	454	((UINT64 *)hp)[1] = h2;
	455	((UINT64 *)hp)[2] = h3;
	456	}
	457
	458	#elif (UMAC_OUTPUT_LEN == 16)
	459
	460	static void nh_aux(void kp, void dp, void *hp, UINT32 dlen)
	461	/* Same as previous nh_aux, but two streams are handled in one pass,
	462	* reading and writing 24 bytes of hash-state per call.
	463	*/
	464	{
	465	UINT64 h1,h2,h3,h4;
	466	UWORD c = dlen / 32;
	467	UINT32 k = (UINT32 )kp;
	468	UINT32 d = (UINT32 )dp;
	469	UINT32 d0,d1,d2,d3,d4,d5,d6,d7;
	470	UINT32 k0,k1,k2,k3,k4,k5,k6,k7,
	471	k8,k9,k10,k11,k12,k13,k14,k15,
	472	k16,k17,k18,k19;
	473
	474	h1 = ((UINT64 )hp);
	475	h2 = ((UINT64 )hp + 1);
	476	h3 = ((UINT64 )hp + 2);
	477	h4 = ((UINT64 )hp + 3);
	478	k0 = (k+0); k1 = (k+1); k2 = (k+2); k3 = (k+3);
	479	k4 = (k+4); k5 = (k+5); k6 = (k+6); k7 = (k+7);
	480	do {
	481	d0 = LOAD_UINT32_LITTLE(d+0); d1 = LOAD_UINT32_LITTLE(d+1);
	482	d2 = LOAD_UINT32_LITTLE(d+2); d3 = LOAD_UINT32_LITTLE(d+3);
	483	d4 = LOAD_UINT32_LITTLE(d+4); d5 = LOAD_UINT32_LITTLE(d+5);
	484	d6 = LOAD_UINT32_LITTLE(d+6); d7 = LOAD_UINT32_LITTLE(d+7);
	485	k8 = (k+8); k9 = (k+9); k10 = (k+10); k11 = (k+11);
	486	k12 = (k+12); k13 = (k+13); k14 = (k+14); k15 = (k+15);
	487	k16 = (k+16); k17 = (k+17); k18 = (k+18); k19 = (k+19);
	488
	489	h1 += MUL64((k0 + d0), (k4 + d4));
	490	h2 += MUL64((k4 + d0), (k8 + d4));
	491	h3 += MUL64((k8 + d0), (k12 + d4));
	492	h4 += MUL64((k12 + d0), (k16 + d4));
	493
	494	h1 += MUL64((k1 + d1), (k5 + d5));
	495	h2 += MUL64((k5 + d1), (k9 + d5));
	496	h3 += MUL64((k9 + d1), (k13 + d5));
	497	h4 += MUL64((k13 + d1), (k17 + d5));
	498
	499	h1 += MUL64((k2 + d2), (k6 + d6));
	500	h2 += MUL64((k6 + d2), (k10 + d6));
	501	h3 += MUL64((k10 + d2), (k14 + d6));
	502	h4 += MUL64((k14 + d2), (k18 + d6));
	503
	504	h1 += MUL64((k3 + d3), (k7 + d7));
	505	h2 += MUL64((k7 + d3), (k11 + d7));
	506	h3 += MUL64((k11 + d3), (k15 + d7));
	507	h4 += MUL64((k15 + d3), (k19 + d7));
	508
	509	k0 = k8; k1 = k9; k2 = k10; k3 = k11;
	510	k4 = k12; k5 = k13; k6 = k14; k7 = k15;
	511	k8 = k16; k9 = k17; k10 = k18; k11 = k19;
	512
	513	d += 8;
	514	k += 8;
	515	} while (--c);
	516	((UINT64 *)hp)[0] = h1;
	517	((UINT64 *)hp)[1] = h2;
	518	((UINT64 *)hp)[2] = h3;
	519	((UINT64 *)hp)[3] = h4;
	520	}
	521
	522	/* ---------------------------------------------------------------------- */
	523	#endif /* UMAC_OUTPUT_LENGTH */
	524	/* ---------------------------------------------------------------------- */
	525
	526
	527	/* ---------------------------------------------------------------------- */
	528
	529	static void nh_transform(nh_ctx hc, UINT8 buf, UINT32 nbytes)
	530	/* This function is a wrapper for the primitive NH hash functions. It takes
	531	* as argument "hc" the current hash context and a buffer which must be a
	532	* multiple of L1_PAD_BOUNDARY. The key passed to nh_aux is offset
	533	* appropriately according to how much message has been hashed already.
	534	*/
	535	{
	536	UINT8 *key;
	537
	538	key = hc->nh_key + hc->bytes_hashed;
	539	nh_aux(key, buf, hc->state, nbytes);
	540	}
	541
	542	/* ---------------------------------------------------------------------- */
	543
	544	static void endian_convert(void *buf, UWORD bpw, UINT32 num_bytes)
	545	/* We endian convert the keys on little-endian computers to */
	546	/* compensate for the lack of big-endian memory reads during hashing. */
	547	{
	548	UWORD iters = num_bytes / bpw;
	549	if (bpw == 4) {
	550	UINT32 p = (UINT32 )buf;
	551	do {
	552	*p = LOAD_UINT32_REVERSED(p);
	553	p++;
	554	} while (--iters);
	555	} else if (bpw == 8) {
	556	UINT32 p = (UINT32 )buf;
	557	UINT32 t;
	558	do {
	559	t = LOAD_UINT32_REVERSED(p+1);
	560	p[1] = LOAD_UINT32_REVERSED(p);
	561	p[0] = t;
	562	p += 2;
	563	} while (--iters);
	564	}
	565	}
	566	#if (__LITTLE_ENDIAN__)
	567	#define endian_convert_if_le(x,y,z) endian_convert((x),(y),(z))
	568	#else
	569	#define endian_convert_if_le(x,y,z) do{}while(0) /* Do nothing */
	570	#endif
	571
	572	/* ---------------------------------------------------------------------- */
	573
	574	static void nh_reset(nh_ctx *hc)
	575	/* Reset nh_ctx to ready for hashing of new data */
	576	{
	577	hc->bytes_hashed = 0;
	578	hc->next_data_empty = 0;
	579	hc->state[0] = 0;
	580	#if (UMAC_OUTPUT_LEN >= 8)
	581	hc->state[1] = 0;
	582	#endif
	583	#if (UMAC_OUTPUT_LEN >= 12)
	584	hc->state[2] = 0;
	585	#endif
	586	#if (UMAC_OUTPUT_LEN == 16)
	587	hc->state[3] = 0;
	588	#endif
	589
	590	}
	591
	592	/* ---------------------------------------------------------------------- */
	593
	594	static void nh_init(nh_ctx *hc, aes_int_key prf_key)
	595	/* Generate nh_key, endian convert and reset to be ready for hashing. */
	596	{
	597	kdf(hc->nh_key, prf_key, 1, sizeof(hc->nh_key));
	598	endian_convert_if_le(hc->nh_key, 4, sizeof(hc->nh_key));
	599	nh_reset(hc);
	600	}
	601
	602	/* ---------------------------------------------------------------------- */
	603
	604	static void nh_update(nh_ctx hc, UINT8 buf, UINT32 nbytes)
	605	/* Incorporate nbytes of data into a nh_ctx, buffer whatever is not an */
	606	/* even multiple of HASH_BUF_BYTES. */
	607	{
	608	UINT32 i,j;
	609
	610	j = hc->next_data_empty;
	611	if ((j + nbytes) >= HASH_BUF_BYTES) {
	612	if (j) {
	613	i = HASH_BUF_BYTES - j;
	614	memcpy(hc->data+j, buf, i);
	615	nh_transform(hc,hc->data,HASH_BUF_BYTES);
	616	nbytes -= i;
	617	buf += i;
	618	hc->bytes_hashed += HASH_BUF_BYTES;
	619	}
	620	if (nbytes >= HASH_BUF_BYTES) {
	621	i = nbytes & ~(HASH_BUF_BYTES - 1);
	622	nh_transform(hc, buf, i);
	623	nbytes -= i;
	624	buf += i;
	625	hc->bytes_hashed += i;
	626	}
	627	j = 0;
	628	}
	629	memcpy(hc->data + j, buf, nbytes);
	630	hc->next_data_empty = j + nbytes;
	631	}
	632
	633	/* ---------------------------------------------------------------------- */
	634
	635	static void zero_pad(UINT8 *p, int nbytes)
	636	{
	637	/* Write "nbytes" of zeroes, beginning at "p" */
	638	if (nbytes >= (int)sizeof(UWORD)) {
	639	while ((ptrdiff_t)p % sizeof(UWORD)) {
	640	*p = 0;
	641	nbytes--;
	642	p++;
	643	}
	644	while (nbytes >= (int)sizeof(UWORD)) {
	645	(UWORD )p = 0;
	646	nbytes -= sizeof(UWORD);
	647	p += sizeof(UWORD);
	648	}
	649	}
	650	while (nbytes) {
	651	*p = 0;
	652	nbytes--;
	653	p++;
	654	}
	655	}
	656
	657	/* ---------------------------------------------------------------------- */
	658
	659	static void nh_final(nh_ctx hc, UINT8 result)
	660	/* After passing some number of data buffers to nh_update() for integration
	661	* into an NH context, nh_final is called to produce a hash result. If any
	662	* bytes are in the buffer hc->data, incorporate them into the
	663	* NH context. Finally, add into the NH accumulation "state" the total number
	664	* of bits hashed. The resulting numbers are written to the buffer "result".
	665	* If nh_update was never called, L1_PAD_BOUNDARY zeroes are incorporated.
	666	*/
	667	{
	668	int nh_len, nbits;
	669
	670	if (hc->next_data_empty != 0) {
	671	nh_len = ((hc->next_data_empty + (L1_PAD_BOUNDARY - 1)) &
	672	~(L1_PAD_BOUNDARY - 1));
	673	zero_pad(hc->data + hc->next_data_empty,
	674	nh_len - hc->next_data_empty);
	675	nh_transform(hc, hc->data, nh_len);
	676	hc->bytes_hashed += hc->next_data_empty;
	677	} else if (hc->bytes_hashed == 0) {
	678	nh_len = L1_PAD_BOUNDARY;
	679	zero_pad(hc->data, L1_PAD_BOUNDARY);
	680	nh_transform(hc, hc->data, nh_len);
	681	}
	682
	683	nbits = (hc->bytes_hashed << 3);
	684	((UINT64 )result)[0] = ((UINT64 )hc->state)[0] + nbits;
	685	#if (UMAC_OUTPUT_LEN >= 8)
	686	((UINT64 )result)[1] = ((UINT64 )hc->state)[1] + nbits;
	687	#endif
	688	#if (UMAC_OUTPUT_LEN >= 12)
	689	((UINT64 )result)[2] = ((UINT64 )hc->state)[2] + nbits;
	690	#endif
	691	#if (UMAC_OUTPUT_LEN == 16)
	692	((UINT64 )result)[3] = ((UINT64 )hc->state)[3] + nbits;
	693	#endif
	694	nh_reset(hc);
	695	}
	696
	697	/* ---------------------------------------------------------------------- */
	698
	699	static void nh(nh_ctx hc, UINT8 buf, UINT32 padded_len,
	700	UINT32 unpadded_len, UINT8 *result)
	701	/* All-in-one nh_update() and nh_final() equivalent.
	702	* Assumes that padded_len is divisible by L1_PAD_BOUNDARY and result is
	703	* well aligned
	704	*/
	705	{
	706	UINT32 nbits;
	707
	708	/* Initialize the hash state */
	709	nbits = (unpadded_len << 3);
	710
	711	((UINT64 *)result)[0] = nbits;
	712	#if (UMAC_OUTPUT_LEN >= 8)
	713	((UINT64 *)result)[1] = nbits;
	714	#endif
	715	#if (UMAC_OUTPUT_LEN >= 12)
	716	((UINT64 *)result)[2] = nbits;
	717	#endif
	718	#if (UMAC_OUTPUT_LEN == 16)
	719	((UINT64 *)result)[3] = nbits;
	720	#endif
	721
	722	nh_aux(hc->nh_key, buf, result, padded_len);
	723	}
	724
	725	/* ---------------------------------------------------------------------- */
	726	/* ---------------------------------------------------------------------- */
	727	/* ----- Begin UHASH Section -------------------------------------------- */
	728	/* ---------------------------------------------------------------------- */
	729	/* ---------------------------------------------------------------------- */
	730
	731	/* UHASH is a multi-layered algorithm. Data presented to UHASH is first
	732	* hashed by NH. The NH output is then hashed by a polynomial-hash layer
	733	* unless the initial data to be hashed is short. After the polynomial-
	734	* layer, an inner-product hash is used to produce the final UHASH output.
	735	*
	736	* UHASH provides two interfaces, one all-at-once and another where data
	737	* buffers are presented sequentially. In the sequential interface, the
	738	* UHASH client calls the routine uhash_update() as many times as necessary.
	739	* When there is no more data to be fed to UHASH, the client calls
	740	* uhash_final() which
	741	* calculates the UHASH output. Before beginning another UHASH calculation
	742	* the uhash_reset() routine must be called. The all-at-once UHASH routine,
	743	* uhash(), is equivalent to the sequence of calls uhash_update() and
	744	* uhash_final(); however it is optimized and should be
	745	* used whenever the sequential interface is not necessary.
	746	*
	747	* The routine uhash_init() initializes the uhash_ctx data structure and
	748	* must be called once, before any other UHASH routine.
	749	*/
	750
	751	/* ---------------------------------------------------------------------- */
	752	/* ----- Constants and uhash_ctx ---------------------------------------- */
	753	/* ---------------------------------------------------------------------- */
	754
	755	/* ---------------------------------------------------------------------- */
	756	/* ----- Poly hash and Inner-Product hash Constants --------------------- */
	757	/* ---------------------------------------------------------------------- */
	758
	759	/* Primes and masks */
	760	#define p36 ((UINT64)0x0000000FFFFFFFFBull) /* 2^36 - 5 */
	761	#define p64 ((UINT64)0xFFFFFFFFFFFFFFC5ull) /* 2^64 - 59 */
	762	#define m36 ((UINT64)0x0000000FFFFFFFFFull) /* The low 36 of 64 bits */
	763
	764
	765	/* ---------------------------------------------------------------------- */
	766
	767	typedef struct uhash_ctx {
	768	nh_ctx hash; /* Hash context for L1 NH hash */
	769	UINT64 poly_key_8[STREAMS]; /* p64 poly keys */
	770	UINT64 poly_accum[STREAMS]; /* poly hash result */
	771	UINT64 ip_keys[STREAMS4]; / Inner-product keys */
	772	UINT32 ip_trans[STREAMS]; /* Inner-product translation */
	773	UINT32 msg_len; /* Total length of data passed */
	774	/* to uhash */
	775	} uhash_ctx;
	776	typedef struct uhash_ctx *uhash_ctx_t;
	777
	778	/* ---------------------------------------------------------------------- */
	779
	780
	781	/* The polynomial hashes use Horner's rule to evaluate a polynomial one
	782	* word at a time. As described in the specification, poly32 and poly64
	783	* require keys from special domains. The following implementations exploit
	784	* the special domains to avoid overflow. The results are not guaranteed to
	785	* be within Z_p32 and Z_p64, but the Inner-Product hash implementation
	786	* patches any errant values.
	787	*/
	788
	789	static UINT64 poly64(UINT64 cur, UINT64 key, UINT64 data)
	790	{
	791	UINT32 key_hi = (UINT32)(key >> 32),
	792	key_lo = (UINT32)key,
	793	cur_hi = (UINT32)(cur >> 32),
	794	cur_lo = (UINT32)cur,
	795	x_lo,
	796	x_hi;
	797	UINT64 X,T,res;
	798
	799	X = MUL64(key_hi, cur_lo) + MUL64(cur_hi, key_lo);
	800	x_lo = (UINT32)X;
	801	x_hi = (UINT32)(X >> 32);
	802
	803	res = (MUL64(key_hi, cur_hi) + x_hi) * 59 + MUL64(key_lo, cur_lo);
	804
	805	T = ((UINT64)x_lo << 32);
	806	res += T;
	807	if (res < T)
	808	res += 59;
	809
	810	res += data;
	811	if (res < data)
	812	res += 59;
	813
	814	return res;
	815	}
	816
	817
	818	/* Although UMAC is specified to use a ramped polynomial hash scheme, this
	819	* implementation does not handle all ramp levels. Because we don't handle
	820	* the ramp up to p128 modulus in this implementation, we are limited to
	821	* 2^14 poly_hash() invocations per stream (for a total capacity of 2^24
	822	* bytes input to UMAC per tag, ie. 16MB).
	823	*/
	824	static void poly_hash(uhash_ctx_t hc, UINT32 data_in[])
	825	{
	826	int i;
	827	UINT64 data=(UINT64)data_in;
	828
	829	for (i = 0; i < STREAMS; i++) {
	830	if ((UINT32)(data[i] >> 32) == 0xfffffffful) {
	831	hc->poly_accum[i] = poly64(hc->poly_accum[i],
	832	hc->poly_key_8[i], p64 - 1);
	833	hc->poly_accum[i] = poly64(hc->poly_accum[i],
	834	hc->poly_key_8[i], (data[i] - 59));
	835	} else {
	836	hc->poly_accum[i] = poly64(hc->poly_accum[i],
	837	hc->poly_key_8[i], data[i]);
	838	}
	839	}
	840	}
	841
	842
	843	/* ---------------------------------------------------------------------- */
	844
	845
	846	/* The final step in UHASH is an inner-product hash. The poly hash
	847	* produces a result not necessarily WORD_LEN bytes long. The inner-
	848	* product hash breaks the polyhash output into 16-bit chunks and
	849	* multiplies each with a 36 bit key.
	850	*/
	851
	852	static UINT64 ip_aux(UINT64 t, UINT64 *ipkp, UINT64 data)
	853	{
	854	t = t + ipkp[0] * (UINT64)(UINT16)(data >> 48);
	855	t = t + ipkp[1] * (UINT64)(UINT16)(data >> 32);
	856	t = t + ipkp[2] * (UINT64)(UINT16)(data >> 16);
	857	t = t + ipkp[3] * (UINT64)(UINT16)(data);
	858
	859	return t;
	860	}
	861
	862	static UINT32 ip_reduce_p36(UINT64 t)
	863	{
	864	/* Divisionless modular reduction */
	865	UINT64 ret;
	866
	867	ret = (t & m36) + 5 * (t >> 36);
	868	if (ret >= p36)
	869	ret -= p36;
	870
	871	/* return least significant 32 bits */
	872	return (UINT32)(ret);
	873	}
	874
	875
	876	/* If the data being hashed by UHASH is no longer than L1_KEY_LEN, then
	877	* the polyhash stage is skipped and ip_short is applied directly to the
	878	* NH output.
	879	*/
	880	static void ip_short(uhash_ctx_t ahc, UINT8 nh_res, u_char res)
	881	{
	882	UINT64 t;
	883	UINT64 nhp = (UINT64 )nh_res;
	884
	885	t = ip_aux(0,ahc->ip_keys, nhp[0]);
	886	STORE_UINT32_BIG((UINT32 *)res+0, ip_reduce_p36(t) ^ ahc->ip_trans[0]);
	887	#if (UMAC_OUTPUT_LEN >= 8)
	888	t = ip_aux(0,ahc->ip_keys+4, nhp[1]);
	889	STORE_UINT32_BIG((UINT32 *)res+1, ip_reduce_p36(t) ^ ahc->ip_trans[1]);
	890	#endif
	891	#if (UMAC_OUTPUT_LEN >= 12)
	892	t = ip_aux(0,ahc->ip_keys+8, nhp[2]);
	893	STORE_UINT32_BIG((UINT32 *)res+2, ip_reduce_p36(t) ^ ahc->ip_trans[2]);
	894	#endif
	895	#if (UMAC_OUTPUT_LEN == 16)
	896	t = ip_aux(0,ahc->ip_keys+12, nhp[3]);
	897	STORE_UINT32_BIG((UINT32 *)res+3, ip_reduce_p36(t) ^ ahc->ip_trans[3]);
	898	#endif
	899	}
	900
	901	/* If the data being hashed by UHASH is longer than L1_KEY_LEN, then
	902	* the polyhash stage is not skipped and ip_long is applied to the
	903	* polyhash output.
	904	*/
	905	static void ip_long(uhash_ctx_t ahc, u_char *res)
	906	{
	907	int i;
	908	UINT64 t;
	909
	910	for (i = 0; i < STREAMS; i++) {
	911	/* fix polyhash output not in Z_p64 */
	912	if (ahc->poly_accum[i] >= p64)
	913	ahc->poly_accum[i] -= p64;
	914	t = ip_aux(0,ahc->ip_keys+(i*4), ahc->poly_accum[i]);
	915	STORE_UINT32_BIG((UINT32 *)res+i,
	916	ip_reduce_p36(t) ^ ahc->ip_trans[i]);
	917	}
	918	}
	919
	920
	921	/* ---------------------------------------------------------------------- */
	922
	923	/* ---------------------------------------------------------------------- */
	924
	925	/* Reset uhash context for next hash session */
	926	static int uhash_reset(uhash_ctx_t pc)
	927	{
	928	nh_reset(&pc->hash);
	929	pc->msg_len = 0;
	930	pc->poly_accum[0] = 1;
	931	#if (UMAC_OUTPUT_LEN >= 8)
	932	pc->poly_accum[1] = 1;
	933	#endif
	934	#if (UMAC_OUTPUT_LEN >= 12)
	935	pc->poly_accum[2] = 1;
	936	#endif
	937	#if (UMAC_OUTPUT_LEN == 16)
	938	pc->poly_accum[3] = 1;
	939	#endif
	940	return 1;
	941	}
	942
	943	/* ---------------------------------------------------------------------- */
	944
	945	/* Given a pointer to the internal key needed by kdf() and a uhash context,
	946	* initialize the NH context and generate keys needed for poly and inner-
	947	* product hashing. All keys are endian adjusted in memory so that native
	948	* loads cause correct keys to be in registers during calculation.
	949	*/
	950	static void uhash_init(uhash_ctx_t ahc, aes_int_key prf_key)
	951	{
	952	int i;
	953	UINT8 buf[(8STREAMS+4)sizeof(UINT64)];
	954
	955	/* Zero the entire uhash context */
	956	memset(ahc, 0, sizeof(uhash_ctx));
	957
	958	/* Initialize the L1 hash */
	959	nh_init(&ahc->hash, prf_key);
	960
	961	/* Setup L2 hash variables */
	962	kdf(buf, prf_key, 2, sizeof(buf)); /* Fill buffer with index 1 key */
	963	for (i = 0; i < STREAMS; i++) {
	964	/* Fill keys from the buffer, skipping bytes in the buffer not
	965	* used by this implementation. Endian reverse the keys if on a
	966	* little-endian computer.
	967	*/
	968	memcpy(ahc->poly_key_8+i, buf+24*i, 8);
	969	endian_convert_if_le(ahc->poly_key_8+i, 8, 8);
	970	/* Mask the 64-bit keys to their special domain */
	971	ahc->poly_key_8[i] &= ((UINT64)0x01ffffffu << 32) + 0x01ffffffu;
	972	ahc->poly_accum[i] = 1; /* Our polyhash prepends a non-zero word */
	973	}
	974
	975	/* Setup L3-1 hash variables */
	976	kdf(buf, prf_key, 3, sizeof(buf)); /* Fill buffer with index 2 key */
	977	for (i = 0; i < STREAMS; i++)
	978	memcpy(ahc->ip_keys+4i, buf+(8i+4)*sizeof(UINT64),
	979	4*sizeof(UINT64));
	980	endian_convert_if_le(ahc->ip_keys, sizeof(UINT64),
	981	sizeof(ahc->ip_keys));
	982	for (i = 0; i < STREAMS*4; i++)
	983	ahc->ip_keys[i] %= p36; /* Bring into Z_p36 */
	984
	985	/* Setup L3-2 hash variables */
	986	/* Fill buffer with index 4 key */
	987	kdf(ahc->ip_trans, prf_key, 4, STREAMS * sizeof(UINT32));
	988	endian_convert_if_le(ahc->ip_trans, sizeof(UINT32),
	989	STREAMS * sizeof(UINT32));
	990	}
	991
	992	/* ---------------------------------------------------------------------- */
	993
	#if 0
	/* Allocate and initialise a uhash context from a raw key.
	 *
	 * The allocation is padded by ALLOC_BOUNDARY bytes and the returned
	 * pointer is advanced to the next ALLOC_BOUNDARY multiple. The number of
	 * bytes skipped is stored in the byte just before the returned pointer so
	 * that uhash_free() can recover the original address. Returns NULL when
	 * malloc() fails.
	 */
	static uhash_ctx_t uhash_alloc(u_char key[])
	{
	    /* Allocate memory and force to a 16-byte boundary. */
	    uhash_ctx_t ctx;
	    u_char bytes_to_add;
	    aes_int_key prf_key;

	    ctx = (uhash_ctx_t)malloc(sizeof(uhash_ctx)+ALLOC_BOUNDARY);
	    if (ctx) {
	        if (ALLOC_BOUNDARY) {
	            bytes_to_add = ALLOC_BOUNDARY -
	                           ((ptrdiff_t)ctx & (ALLOC_BOUNDARY -1));
	            ctx = (uhash_ctx_t)((u_char *)ctx + bytes_to_add);
	            /* Remember the adjustment for uhash_free() */
	            *((u_char *)ctx - 1) = bytes_to_add;
	        }
	        aes_key_setup(key,prf_key);
	        uhash_init(ctx, prf_key);
	    }
	    return (ctx);
	}
	#endif
	1016
	1017	/* ---------------------------------------------------------------------- */
	1018
	#if 0
	/* Release a context obtained from uhash_alloc(). A NULL ctx is accepted.
	 * Always returns 1.
	 */
	static int uhash_free(uhash_ctx_t ctx)
	{
	    /* Free memory allocated by uhash_alloc */
	    u_char bytes_to_sub;

	    if (ctx) {
	        if (ALLOC_BOUNDARY) {
	            /* The byte before ctx holds the alignment adjustment */
	            bytes_to_sub = *((u_char *)ctx - 1);
	            ctx = (uhash_ctx_t)((u_char *)ctx - bytes_to_sub);
	        }
	        free(ctx);
	    }
	    return (1);
	}
	#endif
	1035	/* ---------------------------------------------------------------------- */
	1036
	1037	static int uhash_update(uhash_ctx_t ctx, u_char *input, long len)
	1038	/* Given len bytes of data, we parse it into L1_KEY_LEN chunks and
	1039	* hash each one with NH, calling the polyhash on each NH output.
	1040	*/
	1041	{
	1042	UWORD bytes_hashed, bytes_remaining;
	1043	UINT8 nh_result[STREAMS*sizeof(UINT64)];
	1044
	1045	if (ctx->msg_len + len <= L1_KEY_LEN) {
	1046	nh_update(&ctx->hash, (UINT8 *)input, len);
	1047	ctx->msg_len += len;
	1048	} else {
	1049
	1050	bytes_hashed = ctx->msg_len % L1_KEY_LEN;
	1051	if (ctx->msg_len == L1_KEY_LEN)
	1052	bytes_hashed = L1_KEY_LEN;
	1053
	1054	if (bytes_hashed + len >= L1_KEY_LEN) {
	1055
	1056	/* If some bytes have been passed to the hash function */
	1057	/* then we want to pass at most (L1_KEY_LEN - bytes_hashed) */
	1058	/* bytes to complete the current nh_block. */
	1059	if (bytes_hashed) {
	1060	bytes_remaining = (L1_KEY_LEN - bytes_hashed);
	1061	nh_update(&ctx->hash, (UINT8 *)input, bytes_remaining);
	1062	nh_final(&ctx->hash, nh_result);
	1063	ctx->msg_len += bytes_remaining;
	1064	poly_hash(ctx,(UINT32 *)nh_result);
	1065	len -= bytes_remaining;
	1066	input += bytes_remaining;
	1067	}
	1068
	1069	/* Hash directly from input stream if enough bytes */
	1070	while (len >= L1_KEY_LEN) {
	1071	nh(&ctx->hash, (UINT8 *)input, L1_KEY_LEN,
	1072	L1_KEY_LEN, nh_result);
	1073	ctx->msg_len += L1_KEY_LEN;
	1074	len -= L1_KEY_LEN;
	1075	input += L1_KEY_LEN;
	1076	poly_hash(ctx,(UINT32 *)nh_result);
	1077	}
	1078	}
	1079
	1080	/* pass remaining < L1_KEY_LEN bytes of input data to NH */
	1081	if (len) {
	1082	nh_update(&ctx->hash, (UINT8 *)input, len);
	1083	ctx->msg_len += len;
	1084	}
	1085	}
	1086
	1087	return (1);
	1088	}
	1089
	1090	/* ---------------------------------------------------------------------- */
	1091
	1092	static int uhash_final(uhash_ctx_t ctx, u_char *res)
	1093	/* Incorporate any pending data, pad, and generate tag */
	1094	{
	1095	UINT8 nh_result[STREAMS*sizeof(UINT64)];
	1096
	1097	if (ctx->msg_len > L1_KEY_LEN) {
	1098	if (ctx->msg_len % L1_KEY_LEN) {
	1099	nh_final(&ctx->hash, nh_result);
	1100	poly_hash(ctx,(UINT32 *)nh_result);
	1101	}
	1102	ip_long(ctx, res);
	1103	} else {
	1104	nh_final(&ctx->hash, nh_result);
	1105	ip_short(ctx,nh_result, res);
	1106	}
	1107	uhash_reset(ctx);
	1108	return (1);
	1109	}
	1110
	1111	/* ---------------------------------------------------------------------- */
	1112
	#if 0
	/* All-at-once UHASH of msg[0..len-1]; the result is written to `res`
	 * and the context is reset afterwards. Always returns 1.
	 */
	static int uhash(uhash_ctx_t ahc, u_char *msg, long len, u_char *res)
	/* assumes that msg is in a writable buffer of length divisible by */
	/* L1_PAD_BOUNDARY. Bytes beyond msg[len] may be zeroed.           */
	{
	    UINT8 nh_result[STREAMS*sizeof(UINT64)];
	    UINT32 nh_len;
	    int extra_zeroes_needed;

	    /* If the message to be hashed is no longer than L1_KEY_LEN, we skip
	     * the polyhash.
	     */
	    if (len <= L1_KEY_LEN) {
	        if (len == 0)                  /* If zero length messages will not */
	            nh_len = L1_PAD_BOUNDARY;  /* be seen, comment out this case   */
	        else
	            nh_len = ((len + (L1_PAD_BOUNDARY - 1)) & ~(L1_PAD_BOUNDARY - 1));
	        extra_zeroes_needed = nh_len - len;
	        zero_pad((UINT8 *)msg + len, extra_zeroes_needed);
	        nh(&ahc->hash, (UINT8 *)msg, nh_len, len, nh_result);
	        ip_short(ahc,nh_result, res);
	    } else {
	        /* Otherwise, we hash each L1_KEY_LEN chunk with NH, passing the NH
	         * output to poly_hash().
	         */
	        do {
	            nh(&ahc->hash, (UINT8 *)msg, L1_KEY_LEN, L1_KEY_LEN, nh_result);
	            poly_hash(ahc,(UINT32 *)nh_result);
	            len -= L1_KEY_LEN;
	            msg += L1_KEY_LEN;
	        } while (len >= L1_KEY_LEN);
	        if (len) {
	            /* Trailing partial chunk: zero-pad up to L1_PAD_BOUNDARY */
	            nh_len = ((len + (L1_PAD_BOUNDARY - 1)) & ~(L1_PAD_BOUNDARY - 1));
	            extra_zeroes_needed = nh_len - len;
	            zero_pad((UINT8 *)msg + len, extra_zeroes_needed);
	            nh(&ahc->hash, (UINT8 *)msg, nh_len, len, nh_result);
	            poly_hash(ahc,(UINT32 *)nh_result);
	        }

	        ip_long(ahc, res);
	    }

	    uhash_reset(ahc);
	    return 1;
	}
	#endif
	1159
	1160	/* ---------------------------------------------------------------------- */
	1161	/* ---------------------------------------------------------------------- */
	1162	/* ----- Begin UMAC Section --------------------------------------------- */
	1163	/* ---------------------------------------------------------------------- */
	1164	/* ---------------------------------------------------------------------- */
	1165
	1166	/* UMAC offers two interfaces: an all-at-once interface where
	1167	* the entire message to be authenticated is passed to UMAC in one buffer,
	1168	* and a sequential interface where the message is presented a little at a
	1169	* time. The all-at-once is more optimized than the sequential version and
	1170	* should be preferred when the sequential interface is not required.
	1171	*/
	1172	struct umac_ctx {
	1173	uhash_ctx hash; /* Hash function for message compression */
	1174	pdf_ctx pdf; /* PDF for hashed output */
	1175	void free_ptr; / Address to free this struct via */
	1176	} umac_ctx;
	1177
	1178	/* ---------------------------------------------------------------------- */
	1179
	#if 0
	/* Start a new authentication on an existing context by resetting its
	 * hash state. Always returns 1.
	 */
	int umac_reset(struct umac_ctx *ctx)
	{
	    (void)uhash_reset(&ctx->hash);
	    return (1);
	}
	#endif
	1188
	1189	/* ---------------------------------------------------------------------- */
	1190
	1191	int umac_delete(struct umac_ctx *ctx)
	1192	/* Deallocate the ctx structure */
	1193	{
	1194	if (ctx) {
	1195	if (ALLOC_BOUNDARY)
	1196	ctx = (struct umac_ctx *)ctx->free_ptr;
	1197	free(ctx);
	1198	}
	1199	return (1);
	1200	}
	1201
	1202	/* ---------------------------------------------------------------------- */
	1203
	1204	struct umac_ctx *umac_new(u_char key[])
	1205	/* Dynamically allocate a umac_ctx struct, initialize variables,
	1206	* generate subkeys from key. Align to 16-byte boundary.
	1207	*/
	1208	{
	1209	struct umac_ctx ctx, octx;
	1210	size_t bytes_to_add;
	1211	aes_int_key prf_key;
	1212
	1213	octx = ctx = malloc(sizeof(*ctx) + ALLOC_BOUNDARY);
	1214	if (ctx) {
	1215	if (ALLOC_BOUNDARY) {
	1216	bytes_to_add = ALLOC_BOUNDARY -
	1217	((ptrdiff_t)ctx & (ALLOC_BOUNDARY - 1));
	1218	ctx = (struct umac_ctx )((u_char )ctx + bytes_to_add);
	1219	}
	1220	ctx->free_ptr = octx;
	1221	aes_key_setup(key,prf_key);
	1222	pdf_init(&ctx->pdf, prf_key);
	1223	uhash_init(&ctx->hash, prf_key);
	1224	}
	1225
	1226	return (ctx);
	1227	}
	1228
	1229	/* ---------------------------------------------------------------------- */
	1230
	1231	int umac_final(struct umac_ctx *ctx, u_char tag[], u_char nonce[8])
	1232	/* Incorporate any pending data, pad, and generate tag */
	1233	{
	1234	uhash_final(&ctx->hash, (u_char *)tag);
	1235	pdf_gen_xor(&ctx->pdf, (UINT8 )nonce, (UINT8 )tag);
	1236
	1237	return (1);
	1238	}
	1239
	1240	/* ---------------------------------------------------------------------- */
	1241
	1242	int umac_update(struct umac_ctx ctx, u_char input, long len)
	1243	/* Given len bytes of data, we parse it into L1_KEY_LEN chunks and */
	1244	/* hash each one, calling the PDF on the hashed output whenever the hash- */
	1245	/* output buffer is full. */
	1246	{
	1247	uhash_update(&ctx->hash, input, len);
	1248	return (1);
	1249	}
	1250
	1251	/* ---------------------------------------------------------------------- */
	1252
	#if 0
	/* All-in-one version: hashes the whole message with uhash() and then
	 * applies pdf_gen_xor() with the nonce to form the tag in `tag`.
	 * Always returns 1.
	 */
	int umac(struct umac_ctx *ctx, u_char *input,
	         long len, u_char tag[],
	         u_char nonce[8])
	{
	    uhash(&ctx->hash, input, len, (u_char *)tag);
	    pdf_gen_xor(&ctx->pdf, (UINT8 *)nonce, (UINT8 *)tag);

	    return (1);
	}
	#endif
	1265
	1266	/* ---------------------------------------------------------------------- */
	1267	/* ---------------------------------------------------------------------- */
	1268	/* ----- End UMAC Section ----------------------------------------------- */
	1269	/* ---------------------------------------------------------------------- */
	1270	/* ---------------------------------------------------------------------- */