27#if (CRYPTOPP_AESNI_AVAILABLE)
29# include <emmintrin.h>
30# include <smmintrin.h>
31# include <wmmintrin.h>
35#if (CRYPTOPP_BOOL_ARMV8)
37# if (CRYPTOPP_ARM_NEON_HEADER)
40# if (CRYPTOPP_ARM_ACLE_HEADER)
50#if defined(CRYPTOPP_POWER8_AES_AVAILABLE)
55#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
60#ifndef EXCEPTION_EXECUTE_HANDLER
61# define EXCEPTION_EXECUTE_HANDLER 1
65extern const char RIJNDAEL_SIMD_FNAME[] = __FILE__;
71#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
/// \brief Pointer to a signal handler, as taken and returned by signal().
typedef void (*SigHandler)(int);

/// \brief Jump target that SigIllHandler unwinds to.
/// \details Must be armed with setjmp() before SIGILL can be delivered.
static jmp_buf s_jmpSIGILL;

/// \brief SIGILL handler used while probing for CPU instructions.
/// \param signum signal number (unused)
/// \details Never returns normally. Control resumes at the setjmp() call that
///  armed s_jmpSIGILL, which then returns 1.
static void SigIllHandler(int signum)
{
    (void)signum;
    longjmp(s_jmpSIGILL, 1);
}
// Runtime probe for the ARM AES instructions.
// NOTE(review): the enclosing function header, the braces, the __try/else arms
// and the return statements are not present in this listing; the comments
// below describe only the lines that are visible.
83#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)
86#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
88#elif (CRYPTOPP_ARM_AES_AVAILABLE)
89# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
// MS-style path: a fault in the instructions below is meant to land in the
// __except handler further down.
90 volatile bool result =
true;
// Execute one AES encrypt and one AES decrypt instruction on all-zero inputs.
94 uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0);
95 uint8x16_t r1 = vaeseq_u8(data, key);
96 uint8x16_t r2 = vaesdq_u8(data, key);
// Fold one lane of each output into result so both instructions feed the value.
100 result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7));
102 __except (EXCEPTION_EXECUTE_HANDLER)
// Signal-based path: trap SIGILL around the same instructions.
111 volatile bool result =
true;
// Install SigIllHandler for SIGILL and remember the previous handler.
113 volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
114 if (oldHandler == SIG_ERR)
// Query the current signal mask; with a null new set nothing is changed.
117 volatile sigset_t oldMask;
118 if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
// Reading the mask failed: put the previous SIGILL handler back.
120 signal(SIGILL, oldHandler);
// A non-zero return here means SigIllHandler ran longjmp(s_jmpSIGILL, 1).
124 if (setjmp(s_jmpSIGILL))
// Execute the AES encrypt, decrypt and inverse-mix-columns instructions.
128 uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0);
129 uint8x16_t r1 = vaeseq_u8(data, key);
130 uint8x16_t r2 = vaesdq_u8(data, key);
132 r2 = vaesimcq_u8(r2);
// Fold one lane of each output into result so the instructions feed the value.
135 result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7));
// Restore the saved signal mask and the previous SIGILL handler.
138 sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
139 signal(SIGILL, oldHandler);
150#if (CRYPTOPP_ARM_AES_AVAILABLE)
152ANONYMOUS_NAMESPACE_BEGIN
154inline void ARMV8_Enc_Block(uint64x2_t &data,
const word32 *subkeys,
unsigned int rounds)
157 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
158 uint8x16_t block = vreinterpretq_u8_u64(data);
161 block = vaeseq_u8(block, vld1q_u8(keys+0*16));
163 block = vaesmcq_u8(block);
165 for (
unsigned int i=1; i<rounds-1; i+=2)
168 block = vaeseq_u8(block, vld1q_u8(keys+i*16));
170 block = vaesmcq_u8(block);
172 block = vaeseq_u8(block, vld1q_u8(keys+(i+1)*16));
174 block = vaesmcq_u8(block);
178 block = vaeseq_u8(block, vld1q_u8(keys+(rounds-1)*16));
180 block = veorq_u8(block, vld1q_u8(keys+rounds*16));
182 data = vreinterpretq_u64_u8(block);
185inline void ARMV8_Enc_6_Blocks(uint64x2_t &data0, uint64x2_t &data1,
186 uint64x2_t &data2, uint64x2_t &data3, uint64x2_t &data4, uint64x2_t &data5,
187 const word32 *subkeys,
unsigned int rounds)
190 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
192 uint8x16_t block0 = vreinterpretq_u8_u64(data0);
193 uint8x16_t block1 = vreinterpretq_u8_u64(data1);
194 uint8x16_t block2 = vreinterpretq_u8_u64(data2);
195 uint8x16_t block3 = vreinterpretq_u8_u64(data3);
196 uint8x16_t block4 = vreinterpretq_u8_u64(data4);
197 uint8x16_t block5 = vreinterpretq_u8_u64(data5);
200 for (
unsigned int i=0; i<rounds-1; ++i)
202 key = vld1q_u8(keys+i*16);
204 block0 = vaeseq_u8(block0, key);
206 block0 = vaesmcq_u8(block0);
208 block1 = vaeseq_u8(block1, key);
210 block1 = vaesmcq_u8(block1);
212 block2 = vaeseq_u8(block2, key);
214 block2 = vaesmcq_u8(block2);
216 block3 = vaeseq_u8(block3, key);
218 block3 = vaesmcq_u8(block3);
220 block4 = vaeseq_u8(block4, key);
222 block4 = vaesmcq_u8(block4);
224 block5 = vaeseq_u8(block5, key);
226 block5 = vaesmcq_u8(block5);
230 key = vld1q_u8(keys+(rounds-1)*16);
231 block0 = vaeseq_u8(block0, key);
232 block1 = vaeseq_u8(block1, key);
233 block2 = vaeseq_u8(block2, key);
234 block3 = vaeseq_u8(block3, key);
235 block4 = vaeseq_u8(block4, key);
236 block5 = vaeseq_u8(block5, key);
239 key = vld1q_u8(keys+rounds*16);
240 data0 = vreinterpretq_u64_u8(veorq_u8(block0, key));
241 data1 = vreinterpretq_u64_u8(veorq_u8(block1, key));
242 data2 = vreinterpretq_u64_u8(veorq_u8(block2, key));
243 data3 = vreinterpretq_u64_u8(veorq_u8(block3, key));
244 data4 = vreinterpretq_u64_u8(veorq_u8(block4, key));
245 data5 = vreinterpretq_u64_u8(veorq_u8(block5, key));
248inline void ARMV8_Dec_Block(uint64x2_t &data,
const word32 *subkeys,
unsigned int rounds)
251 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
252 uint8x16_t block = vreinterpretq_u8_u64(data);
255 block = vaesdq_u8(block, vld1q_u8(keys+0*16));
257 block = vaesimcq_u8(block);
259 for (
unsigned int i=1; i<rounds-1; i+=2)
262 block = vaesdq_u8(block, vld1q_u8(keys+i*16));
264 block = vaesimcq_u8(block);
266 block = vaesdq_u8(block, vld1q_u8(keys+(i+1)*16));
268 block = vaesimcq_u8(block);
272 block = vaesdq_u8(block, vld1q_u8(keys+(rounds-1)*16));
274 block = veorq_u8(block, vld1q_u8(keys+rounds*16));
276 data = vreinterpretq_u64_u8(block);
279inline void ARMV8_Dec_6_Blocks(uint64x2_t &data0, uint64x2_t &data1,
280 uint64x2_t &data2, uint64x2_t &data3, uint64x2_t &data4, uint64x2_t &data5,
281 const word32 *subkeys,
unsigned int rounds)
284 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
286 uint8x16_t block0 = vreinterpretq_u8_u64(data0);
287 uint8x16_t block1 = vreinterpretq_u8_u64(data1);
288 uint8x16_t block2 = vreinterpretq_u8_u64(data2);
289 uint8x16_t block3 = vreinterpretq_u8_u64(data3);
290 uint8x16_t block4 = vreinterpretq_u8_u64(data4);
291 uint8x16_t block5 = vreinterpretq_u8_u64(data5);
294 for (
unsigned int i=0; i<rounds-1; ++i)
296 key = vld1q_u8(keys+i*16);
298 block0 = vaesdq_u8(block0, key);
300 block0 = vaesimcq_u8(block0);
302 block1 = vaesdq_u8(block1, key);
304 block1 = vaesimcq_u8(block1);
306 block2 = vaesdq_u8(block2, key);
308 block2 = vaesimcq_u8(block2);
310 block3 = vaesdq_u8(block3, key);
312 block3 = vaesimcq_u8(block3);
314 block4 = vaesdq_u8(block4, key);
316 block4 = vaesimcq_u8(block4);
318 block5 = vaesdq_u8(block5, key);
320 block5 = vaesimcq_u8(block5);
324 key = vld1q_u8(keys+(rounds-1)*16);
325 block0 = vaesdq_u8(block0, key);
326 block1 = vaesdq_u8(block1, key);
327 block2 = vaesdq_u8(block2, key);
328 block3 = vaesdq_u8(block3, key);
329 block4 = vaesdq_u8(block4, key);
330 block5 = vaesdq_u8(block5, key);
333 key = vld1q_u8(keys+rounds*16);
334 data0 = vreinterpretq_u64_u8(veorq_u8(block0, key));
335 data1 = vreinterpretq_u64_u8(veorq_u8(block1, key));
336 data2 = vreinterpretq_u64_u8(veorq_u8(block2, key));
337 data3 = vreinterpretq_u64_u8(veorq_u8(block3, key));
338 data4 = vreinterpretq_u64_u8(veorq_u8(block4, key));
339 data5 = vreinterpretq_u64_u8(veorq_u8(block5, key));
342ANONYMOUS_NAMESPACE_END
344size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(
const word32 *subKeys,
size_t rounds,
345 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
348 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
351size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(
const word32 *subKeys,
size_t rounds,
352 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
355 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
362#if (CRYPTOPP_AESNI_AVAILABLE)
364ANONYMOUS_NAMESPACE_BEGIN
367CRYPTOPP_ALIGN_DATA(16)
368const
word32 s_rconLE[] = {
369 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
372inline void AESNI_Enc_Block(__m128i &block,
MAYBE_CONST word32 *subkeys,
unsigned int rounds)
374 const __m128i* skeys =
reinterpret_cast<const __m128i*
>(subkeys);
376 block = _mm_xor_si128(block, skeys[0]);
377 for (
unsigned int i=1; i<rounds-1; i+=2)
379 block = _mm_aesenc_si128(block, skeys[i]);
380 block = _mm_aesenc_si128(block, skeys[i+1]);
382 block = _mm_aesenc_si128(block, skeys[rounds-1]);
383 block = _mm_aesenclast_si128(block, skeys[rounds]);
386inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
389 const __m128i* skeys =
reinterpret_cast<const __m128i*
>(subkeys);
391 __m128i rk = skeys[0];
392 block0 = _mm_xor_si128(block0, rk);
393 block1 = _mm_xor_si128(block1, rk);
394 block2 = _mm_xor_si128(block2, rk);
395 block3 = _mm_xor_si128(block3, rk);
396 for (
unsigned int i=1; i<rounds; i++)
399 block0 = _mm_aesenc_si128(block0, rk);
400 block1 = _mm_aesenc_si128(block1, rk);
401 block2 = _mm_aesenc_si128(block2, rk);
402 block3 = _mm_aesenc_si128(block3, rk);
405 block0 = _mm_aesenclast_si128(block0, rk);
406 block1 = _mm_aesenclast_si128(block1, rk);
407 block2 = _mm_aesenclast_si128(block2, rk);
408 block3 = _mm_aesenclast_si128(block3, rk);
411inline void AESNI_Dec_Block(__m128i &block,
MAYBE_CONST word32 *subkeys,
unsigned int rounds)
413 const __m128i* skeys =
reinterpret_cast<const __m128i*
>(subkeys);
415 block = _mm_xor_si128(block, skeys[0]);
416 for (
unsigned int i=1; i<rounds-1; i+=2)
418 block = _mm_aesdec_si128(block, skeys[i]);
419 block = _mm_aesdec_si128(block, skeys[i+1]);
421 block = _mm_aesdec_si128(block, skeys[rounds-1]);
422 block = _mm_aesdeclast_si128(block, skeys[rounds]);
425inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
428 const __m128i* skeys =
reinterpret_cast<const __m128i*
>(subkeys);
430 __m128i rk = skeys[0];
431 block0 = _mm_xor_si128(block0, rk);
432 block1 = _mm_xor_si128(block1, rk);
433 block2 = _mm_xor_si128(block2, rk);
434 block3 = _mm_xor_si128(block3, rk);
435 for (
unsigned int i=1; i<rounds; i++)
438 block0 = _mm_aesdec_si128(block0, rk);
439 block1 = _mm_aesdec_si128(block1, rk);
440 block2 = _mm_aesdec_si128(block2, rk);
441 block3 = _mm_aesdec_si128(block3, rk);
444 block0 = _mm_aesdeclast_si128(block0, rk);
445 block1 = _mm_aesdeclast_si128(block1, rk);
446 block2 = _mm_aesdeclast_si128(block2, rk);
447 block3 = _mm_aesdeclast_si128(block3, rk);
450ANONYMOUS_NAMESPACE_END
// Expands userKey (keyLen bytes) into the round-key array rk using AES-NI and
// SSE4 intrinsics. No argument validation is visible here.
// NOTE(review): the loop header, several branch headers, the braces and the
// pointer advance are not present in this listing; the comments below describe
// only the visible statements.
452void Rijndael_UncheckedSetKey_SSE4_AESNI(
const byte *userKey,
size_t keyLen,
word32 *rk)
// keyLen of 16, 24 or 32 gives 10, 12 or 14 rounds.
454 const size_t rounds = keyLen / 4 + 6;
// rc walks the round-constant table, one entry per expansion step.
455 const word32 *rc = s_rconLE;
// temp starts as the last 16 bytes of the user key (unaligned load).
457 __m128i temp = _mm_loadu_si128(
M128_CAST(userKey+keyLen-16));
// The first keyLen bytes of the schedule are the user key itself.
458 std::memcpy(rk, userKey, keyLen);
// end points one past the 4*(rounds+1) words of the schedule, measured from
// the initial rk.
461 const size_t keySize = 4*(rounds+1);
462 const word32* end = rk + keySize;
// Next group of four words: the first mixes in lane 3 of the key-generation
// assist result (called with rcon 0) and the next round constant; the other
// three chain by XOR with the word just produced.
466 rk[keyLen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
467 rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
468 rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
469 rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
// True once the group just written ends exactly at the end of the schedule.
471 if (rk + keyLen/4 + 4 == end)
// NOTE(review): presumably the keyLen == 24 branch - two extra words, then the
// newest word goes into lane 3 of temp. Confirm against the full source.
476 rk[10] = rk[ 4] ^ rk[ 9];
477 rk[11] = rk[ 5] ^ rk[10];
478 temp = _mm_insert_epi32(temp, rk[11], 3);
// 32-byte keys: an extra four-word step that uses lane 2 of the assist result
// and no round constant; temp lane 3 is refreshed before and after.
480 else if (keyLen == 32)
482 temp = _mm_insert_epi32(temp, rk[11], 3);
483 rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
484 rk[13] = rk[ 5] ^ rk[12];
485 rk[14] = rk[ 6] ^ rk[13];
486 rk[15] = rk[ 7] ^ rk[14];
487 temp = _mm_insert_epi32(temp, rk[15], 3);
// NOTE(review): presumably the remaining (16-byte key) branch - confirm.
491 temp = _mm_insert_epi32(temp, rk[7], 3);
// Operates in place on an existing key schedule of `rounds` rounds.
// NOTE(review): most of the body (declarations of i, j and temp, the loop body
// and anything after the loop) is not present in this listing.
498void Rijndael_UncheckedSetKeyRev_AESNI(
word32 *key,
unsigned int rounds)
// i walks up from word 4 and j walks down from word 4*rounds-4, four words
// (one round key) at a time, until they meet.
505 for (i = 4, j = 4*rounds-4; i < j; i += 4, j -= 4)
// Apply the AES inverse-mix-columns transform to the round key at key+i.
507 temp = _mm_aesimc_si128(*
M128_CAST(key+i));
// Encryption entry point for processing a run of blocks with AES-NI.
// NOTE(review): the lines that define sk, ib and xb, and the start of the call
// expression, are not present in this listing.
515size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(
const word32 *subKeys,
size_t rounds,
516 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
// Tail of the forwarding call: rounds, outBlocks, length and flags are passed
// through unchanged; sk, ib and xb stand in for subKeys, inBlocks and xorBlocks.
524 sk, rounds, ib, xb, outBlocks, length, flags);
// Decryption entry point for processing a run of blocks with AES-NI.
// NOTE(review): the lines that define sk, ib and xb, and the start of the call
// expression, are not present in this listing.
527size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(
const word32 *subKeys,
size_t rounds,
528 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
// Tail of the forwarding call: rounds, outBlocks, length and flags are passed
// through unchanged; sk, ib and xb stand in for subKeys, inBlocks and xorBlocks.
535 sk, rounds, ib, xb, outBlocks, length, flags);
542#if (CRYPTOPP_POWER8_AES_AVAILABLE)
544ANONYMOUS_NAMESPACE_BEGIN
547CRYPTOPP_ALIGN_DATA(16)
548static const uint32_t s_rconBE[] = {
549 0x01000000, 0x02000000, 0x04000000, 0x08000000,
550 0x10000000, 0x20000000, 0x40000000, 0x80000000,
551 0x1B000000, 0x36000000
// Single-block encryption worker for POWER8; block is passed by reference.
// NOTE(review): the first-key load, the loop body and the final rounds are not
// present in this listing.
554inline void POWER8_Enc_Block(
uint32x4_p &block,
const word32 *subkeys,
unsigned int rounds)
// View the round keys as raw bytes.
557 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
// Visits odd indices 1, 3, ... below rounds-1, i.e. two rounds per iteration.
562 for (
size_t i=1; i<rounds-1; i+=2)
// NOTE(review): the function header is not present in this listing; this is
// presumably the six-block counterpart of POWER8_Enc_Block - confirm. The
// definition of k and the loop body are also missing.
// View the round keys as raw bytes.
577 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
// XOR the same vector k into each of the six blocks.
580 block0 =
VecXor(block0, k);
581 block1 =
VecXor(block1, k);
582 block2 =
VecXor(block2, k);
583 block3 =
VecXor(block3, k);
584 block4 =
VecXor(block4, k);
585 block5 =
VecXor(block5, k);
// One iteration per index 1 .. rounds-1.
587 for (
size_t i=1; i<rounds; ++i)
// Single-block decryption worker for POWER8; block is passed by reference.
// NOTE(review): the first-key load, the loop body and the final rounds are not
// present in this listing.
607inline void POWER8_Dec_Block(
uint32x4_p &block,
const word32 *subkeys,
unsigned int rounds)
// View the round keys as raw bytes.
610 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
// Counts down from rounds-1 in steps of two while i > 1, so the key indices
// are visited from the high end of the schedule.
615 for (
size_t i=rounds-1; i>1; i-=2)
// NOTE(review): the function header is not present in this listing; this is
// presumably the six-block counterpart of POWER8_Dec_Block - confirm. The
// definition of k and the loop body are also missing.
// View the round keys as raw bytes.
630 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
// XOR the same vector k into each of the six blocks.
633 block0 =
VecXor(block0, k);
634 block1 =
VecXor(block1, k);
635 block2 =
VecXor(block2, k);
636 block3 =
VecXor(block3, k);
637 block4 =
VecXor(block4, k);
638 block5 =
VecXor(block5, k);
// Counts down one index at a time from rounds-1 to 1.
640 for (
size_t i=rounds-1; i>0; --i)
660ANONYMOUS_NAMESPACE_END
// Expands userKey (keyLen bytes) into a round-key schedule, using the byte
// table Se for substitutions. No argument validation is visible here.
// NOTE(review): the definitions of rkey, temp and i, the loop and branch
// headers, the braces and both loop bodies at the end are not present in this
// listing; Se is presumably the AES S-box - confirm against the caller.
662void Rijndael_UncheckedSetKey_POWER8(
const byte* userKey,
size_t keyLen,
word32* rk,
const byte* Se)
// keyLen of 16, 24 or 32 gives 10, 12 or 14 rounds.
664 const size_t rounds = keyLen / 4 + 6;
// rc walks the round-constant table, one entry per expansion step.
665 const word32 *rc = s_rconBE;
// end points one past the 4*(rounds+1) words of the schedule, measured from rkey.
671 const size_t keySize = 4*(rounds+1);
672 const word32* end = rkey + keySize;
// Take the last word of the current key group.
676 temp = rkey[keyLen/4-1];
// Substitute each byte of temp through Se and place the results one byte
// position over (GETBYTE 2,1,0,3 land in bytes 3,2,1,0 of x).
677 word32 x = (
word32(Se[GETBYTE(temp, 2)]) << 24) ^ (
word32(Se[GETBYTE(temp, 1)]) << 16) ^
678 (
word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
// Next group of four words: the first mixes in x and the next round constant;
// the other three chain by XOR with the word just produced.
679 rkey[keyLen/4] = rkey[0] ^ x ^ *(rc++);
680 rkey[keyLen/4+1] = rkey[1] ^ rkey[keyLen/4];
681 rkey[keyLen/4+2] = rkey[2] ^ rkey[keyLen/4+1];
682 rkey[keyLen/4+3] = rkey[3] ^ rkey[keyLen/4+2];
// True once the group just written ends exactly at the end of the schedule.
684 if (rkey + keyLen/4 + 4 == end)
// NOTE(review): presumably the keyLen == 24 branch - two extra words. Confirm.
689 rkey[10] = rkey[ 4] ^ rkey[ 9];
690 rkey[11] = rkey[ 5] ^ rkey[10];
// 32-byte keys: an extra four-word step. The bytes of temp are substituted
// through Se in place (no byte rotation) and no round constant is used.
692 else if (keyLen == 32)
695 rkey[12] = rkey[ 4] ^ (
word32(Se[GETBYTE(temp, 3)]) << 24) ^ (
word32(Se[GETBYTE(temp, 2)]) << 16) ^ (
word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
696 rkey[13] = rkey[ 5] ^ rkey[12];
697 rkey[14] = rkey[ 6] ^ rkey[13];
698 rkey[15] = rkey[ 7] ^ rkey[14];
// Little-endian builds only: a permute mask that reverses the order of the
// four 32-bit words in a vector while keeping the bytes inside each word.
703#if (CRYPTOPP_LITTLE_ENDIAN)
705 const uint8x16_p mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
// First pass steps two round keys (8 words) per iteration.
708 for (i=0; i<rounds; i+=2, rkey+=8)
// Second pass continues from the same i, one round key (4 words) at a time,
// through index rounds.
714 for ( ; i<rounds+1; i++, rkey+=4)
// Encryption entry point for processing a run of blocks on POWER8/Altivec.
// NOTE(review): the start of the call expression (callee and worker
// functions) is not present in this listing.
719size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(
const word32 *subKeys,
size_t rounds,
720 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
// Tail of the forwarding call: all seven parameters are passed through unchanged.
723 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
// Decryption entry point for processing a run of blocks on POWER8/Altivec.
// NOTE(review): the start of the call expression (callee and worker
// functions) is not present in this listing.
726size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(
const word32 *subKeys,
size_t rounds,
727 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
// Tail of the forwarding call: all seven parameters are passed through unchanged.
730 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
Template for AdvancedProcessBlocks and SIMD processing.
#define MAYBE_UNCONST_CAST(T, x)
SunCC workaround.
size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 6 blocks.
#define M128_CAST(x)
Clang workaround.
size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 6 blocks.
#define MAYBE_CONST
SunCC workaround.
size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 4 blocks.
Library configuration file.
unsigned int word32
32-bit unsigned datatype
@ BIG_ENDIAN_ORDER
byte order is big-endian
Utility functions for the Crypto++ library.
bool IsAlignedOn(const void *ptr, unsigned int alignment)
Determines whether ptr is aligned to a minimum value.
void vec_swap(T &a, T &b)
Swaps two variables which are arrays.
void GetUserKey(ByteOrder order, T *out, size_t outlen, const byte *in, size_t inlen)
Copy bytes in a buffer to an array of elements in big-endian order.
Crypto++ library namespace.
Support functions for PowerPC and vector operations.
uint32x4_p VecLoadAligned(const byte src[16])
Loads a vector from an aligned byte array.
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
T1 VecEncryptLast(const T1 state, const T2 key)
Final round of AES encryption.
T1 VecEncrypt(const T1 state, const T2 key)
One round of AES encryption.
T1 VecDecryptLast(const T1 state, const T2 key)
Final round of AES decryption.
T1 VecDecrypt(const T1 state, const T2 key)
One round of AES decryption.
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.