2 poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition
8 /* use memcpy() to copy blocks of memory (typically faster) */
10 /* use unaligned little-endian load/store (can be faster) */
11 #define USE_UNALIGNED 0
/* Streaming MAC state passed to poly1305_init/update/blocks/finish. */
13 struct poly1305_context {
/* Staging area for a partial block: poly1305_update appends input bytes here
 * (tracked by the leftover count) and both poly1305_update and poly1305_finish
 * hand this buffer to poly1305_blocks. */
18 unsigned char buffer[POLY1305_BLOCK_SIZE];
/* Alternative word store, selected only when USE_UNALIGNED is defined as 1.
 * NOTE(review): the macro writes through a uint32_t* cast of p, so it depends on
 * the target tolerating unaligned 32-bit stores and on the host being
 * little-endian; in ISO C the cast is undefined behaviour for misaligned or
 * differently-typed storage. The argument v is also expanded unparenthesized. */
22 #if (USE_UNALIGNED == 1)
25 #define U32TO8(p, v) \
27 *((uint32_t *)(p)) = v; \
30 /* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */
/* p must reference at least four readable bytes: p[0] supplies bits 0-7 of the
 * result and p[3] supplies bits 24-31. */
32 U8TO32(const unsigned char *p) {
/* Each byte is converted to uint32_t before it is shifted, so the shift by 24
 * is performed on an unsigned 32-bit value rather than on a promoted int. */
34 (((uint32_t)(p[0] & 0xff)) |
35 ((uint32_t)(p[1] & 0xff) << 8) |
36 ((uint32_t)(p[2] & 0xff) << 16) |
37 ((uint32_t)(p[3] & 0xff) << 24));
40 /* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */
/* p must reference at least four writable bytes (the highest index written is p[3]). */
42 U32TO8(unsigned char *p, uint32_t v) {
/* Byte k receives bits 8k..8k+7 of v; the & 0xff discards everything above
 * the byte being stored. */
44 p[1] = (v >> 8) & 0xff;
45 p[2] = (v >> 16) & 0xff;
46 p[3] = (v >> 24) & 0xff;
/* Load a 32-byte one-time key into st.
 *   key[0..15]  -> clamped multiplier r, stored as five limbs in st->r
 *   key[16..31] -> pad, stored as four 32-bit words in st->pad and added to the
 *                  accumulator by poly1305_finish */
51 poly1305_init(struct poly1305_context *st, const unsigned char key[32]) {
52 /* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff */
/* Limb k holds the 26 bits of r starting at bit 26*k: the load begins at byte
 * 3*k (bit 24*k) and is shifted right by 2*k. Each constant below is the 26-bit
 * limb mask 0x3ffffff with the bits of the clamp above that fall inside that
 * limb cleared; the last limb only has 24 bits available after the >> 8. */
53 st->r[0] = (U8TO32(&key[ 0])) & 0x3ffffff;
54 st->r[1] = (U8TO32(&key[ 3]) >> 2) & 0x3ffff03;
55 st->r[2] = (U8TO32(&key[ 6]) >> 4) & 0x3ffc0ff;
56 st->r[3] = (U8TO32(&key[ 9]) >> 6) & 0x3f03fff;
57 st->r[4] = (U8TO32(&key[12]) >> 8) & 0x00fffff;
66 /* save pad for later */
/* The second half of the key is kept unmodified as four consecutive 32-bit
 * words read with U8TO32. */
67 st->pad[0] = U8TO32(&key[16]);
68 st->pad[1] = U8TO32(&key[20]);
69 st->pad[2] = U8TO32(&key[24]);
70 st->pad[3] = U8TO32(&key[28]);
/* Absorb message bytes from m into the accumulator, one POLY1305_BLOCK_SIZE
 * block per loop iteration. The loop stops as soon as fewer than
 * POLY1305_BLOCK_SIZE bytes remain, so a shorter tail is not consumed here.
 *
 * st    - state holding the multiplier and accumulator
 * m     - input; each iteration reads bytes m[0] through m[15]
 * bytes - number of bytes available at m */
77 poly1305_blocks(struct poly1305_context *st, const unsigned char *m, size_t bytes) {
/* The top limb starts at bit 4*26 = 104, so bit 24 of it is bit 128 of the
 * block. It is OR-ed into every block unless st->final is set. */
78 const uint32_t hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */
/* r*: multiplier limbs, h*: accumulator limbs, d*: 64-bit column sums.
 * NOTE(review): s1..s4 are assigned on lines outside this view - presumably
 * r1..r4 multiplied by 5 for the wrap-around modulo 2^130 - 5; confirm. */
79 uint32_t r0, r1, r2, r3, r4;
80 uint32_t s1, s2, s3, s4;
81 uint32_t h0, h1, h2, h3, h4;
82 uint64_t d0, d1, d2, d3, d4;
102 while(bytes >= POLY1305_BLOCK_SIZE) {
/* Add the block to h: it is cut into 26-bit limbs at bit offsets 0, 26, 52, 78
 * and 104 (byte offset 3*k, right shift 2*k). The top limb takes the remaining
 * 24 bits plus hibit. */
104 h0 += (U8TO32(m + 0)) & 0x3ffffff;
105 h1 += (U8TO32(m + 3) >> 2) & 0x3ffffff;
106 h2 += (U8TO32(m + 6) >> 4) & 0x3ffffff;
107 h3 += (U8TO32(m + 9) >> 6) & 0x3ffffff;
108 h4 += (U8TO32(m + 12) >> 8) | hibit;
/* Multiply h by r limb-by-limb. Casting the left operand to uint64_t makes
 * every product and every sum 64 bits wide. Terms whose limb indices add up to
 * 5 or more use the s* values instead of the r* values. */
111 d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + ((uint64_t)h4 * s1);
112 d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + ((uint64_t)h4 * s2);
113 d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) + ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) + ((uint64_t)h4 * s3);
114 d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) + ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) + ((uint64_t)h4 * s4);
115 d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) + ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) + ((uint64_t)h4 * r0);
117 /* (partial) h %= p */
/* Each column sum is split into its low 26 bits (the new limb) and the bits
 * above them (the carry c).
 * NOTE(review): the statements that add c into the next column fall between the
 * visible lines - confirm against the full file. */
118 c = (uint32_t)(d0 >> 26);
119 h0 = (uint32_t)d0 & 0x3ffffff;
121 c = (uint32_t)(d1 >> 26);
122 h1 = (uint32_t)d1 & 0x3ffffff;
124 c = (uint32_t)(d2 >> 26);
125 h2 = (uint32_t)d2 & 0x3ffffff;
127 c = (uint32_t)(d3 >> 26);
128 h3 = (uint32_t)d3 & 0x3ffffff;
130 c = (uint32_t)(d4 >> 26);
131 h4 = (uint32_t)d4 & 0x3ffffff;
/* step to the next block */
137 m += POLY1305_BLOCK_SIZE;
138 bytes -= POLY1305_BLOCK_SIZE;
/* Finalize the MAC: push any buffered partial block through poly1305_blocks,
 * bring h into the range below p, add the pad saved by poly1305_init and write
 * the tag to mac.
 *
 * st  - state to finalize
 * mac - output; the last word is stored at mac[12..15] */
149 poly1305_finish(struct poly1305_context *st, unsigned char mac[16]) {
150 uint32_t h0, h1, h2, h3, h4, c;
151 uint32_t g0, g1, g2, g3, g4;
155 /* process the remaining block */
/* The loop walks the buffer positions from st->leftover up to the end of the
 * block before the buffer is processed as one full block.
 * NOTE(review): the loop body is outside this view - presumably it fills the
 * tail of the buffer with padding; confirm. */
157 size_t i = st->leftover;
160 for(; i < POLY1305_BLOCK_SIZE; i++) {
165 poly1305_blocks(st, st->buffer, POLY1305_BLOCK_SIZE);
/* Subtracting 2^26 from the top limb (which starts at bit 104) subtracts 2^130
 * from the whole value. If that borrows, the unsigned result wraps and bit 31
 * of g4 becomes set. */
204 g4 = h4 + c - (1 << 26);
206 /* select h if h < p, or h - p if h >= p */
/* (g4 >> 31) is 1 when the subtraction above borrowed and 0 otherwise, so after
 * the - 1 mask is 0 in the borrow case and all ones otherwise. */
207 mask = (g4 >> ((sizeof(uint32_t) * 8) - 1)) - 1;
/* NOTE(review): for these merges to act as a select, g0..g4 must already have
 * been masked and mask inverted by statements not visible here - confirm. */
214 h0 = (h0 & mask) | g0;
215 h1 = (h1 & mask) | g1;
216 h2 = (h2 & mask) | g2;
217 h3 = (h3 & mask) | g3;
218 h4 = (h4 & mask) | g4;
220 /* h = h % (2^128) */
/* Repack the five 26-bit limbs into four 32-bit words. Bits of h4 that would
 * land at position 128 or above are shifted out. The & 0xffffffff has no effect
 * on a uint32_t and only documents the truncation. */
221 h0 = ((h0) | (h1 << 26)) & 0xffffffff;
222 h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
223 h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
224 h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
226 /* mac = (h + pad) % (2^128) */
/* Word-wise addition with carry: the uint64_t cast makes each sum 64 bits wide,
 * and (f >> 32) feeds the overflow of one word into the next.
 * NOTE(review): the write-back of the low 32 bits of f into h0..h3 is not
 * visible here - confirm. */
227 f = (uint64_t)h0 + st->pad[0] ;
229 f = (uint64_t)h1 + st->pad[1] + (f >> 32);
231 f = (uint64_t)h2 + st->pad[2] + (f >> 32);
233 f = (uint64_t)h3 + st->pad[3] + (f >> 32);
/* last of the tag words, stored at byte offset 12 */
239 U32TO8(mac + 12, h3);
241 /* zero out the state */
/* Feed message bytes from m into st. Input is handled in three stages:
 *   1. top up a partial block left in st->buffer by an earlier call,
 *   2. process whole blocks directly from m,
 *   3. buffer whatever remains for the next call or for poly1305_finish.
 *
 * st    - running state
 * m     - input bytes
 * bytes - number of bytes at m */
259 poly1305_update(struct poly1305_context *st, const unsigned char *m, size_t bytes) {
262 /* handle leftover */
/* want starts as the number of bytes missing from the buffered block.
 * NOTE(review): any clamping of want to bytes, and the matching adjustment of
 * m and bytes, happen on lines outside this view - confirm. */
264 size_t want = (POLY1305_BLOCK_SIZE - st->leftover);
270 for(i = 0; i < want; i++) {
271 st->buffer[st->leftover + i] = m[i];
276 st->leftover += want;
/* buffer still short of a full block: it cannot be processed yet */
278 if(st->leftover < POLY1305_BLOCK_SIZE) {
/* buffer is full: absorb it as one block */
282 poly1305_blocks(st, st->buffer, POLY1305_BLOCK_SIZE);
286 /* process full blocks */
287 if(bytes >= POLY1305_BLOCK_SIZE) {
/* Round bytes down to a multiple of the block size; the bit mask is only
 * correct when POLY1305_BLOCK_SIZE is a power of two. */
288 size_t want = (bytes & ~(POLY1305_BLOCK_SIZE - 1));
289 poly1305_blocks(st, m, want);
/* Keep the remaining bytes after the ones already buffered; the memcpy and the
 * byte loop are two build-time alternatives for the same copy. */
296 #if (USE_MEMCPY == 1)
297 memcpy(st->buffer + st->leftover, m, bytes);
300 for(i = 0; i < bytes; i++) {
301 st->buffer[st->leftover + i] = m[i];
305 st->leftover += bytes;
310 * Poly1305 tag generation. This builds the MAC input (the ciphertext, zero padding up to
311 * a 16-byte boundary, then the length fields) as outlined in RFC 7539 (obsoleted by RFC 8439) and calculates the tag.
313 * \param key 32 byte secret one-time key for poly1305
314 * \param ct ciphertext
315 * \param ct_len ciphertext length in bytes
316 * \param tag pointer to 16 bytes for tag storage
/* One-shot tag computation over ct using a context local to this call.
 * NOTE(review): ct_len is a signed int but poly1305_update takes a size_t, so a
 * negative ct_len would be converted to a very large byte count; callers must
 * pass ct_len >= 0. */
319 poly1305_get_tag(const unsigned char key[32], const void *ct, int ct_len, unsigned char tag[16]) {
320 struct poly1305_context ctx;
/* source of zero bytes for the padding step below */
323 unsigned char pad[16];
325 poly1305_init(&ctx, key);
326 memset(&pad, 0, sizeof(pad));
328 /* payload and padding */
329 poly1305_update(&ctx, ct, ct_len);
/* Number of ciphertext bytes past the last 16-byte boundary; the update below
 * appends the zero bytes needed to reach the next boundary.
 * NOTE(review): the condition guarding that update is not visible here; when
 * left_over is 0 an unguarded call would hash 16 extra zero bytes - confirm. */
330 left_over = ct_len % 16;
333 poly1305_update(&ctx, pad, 16 - left_over);
/* Two 8-byte length fields are hashed straight from the in-memory bytes of len.
 * NOTE(review): that is host byte order, so it matches a little-endian length
 * encoding only on little-endian hosts. The assignments to len before each call
 * are outside this view - confirm which lengths are hashed. */
338 poly1305_update(&ctx, (unsigned char *)&len, 8);
340 poly1305_update(&ctx, (unsigned char *)&len, 8);
341 poly1305_finish(&ctx, tag);