// NOTE(review): fragmentary excerpt. The function name, template header, braces,
// some declarations and any else/if guards are not visible here; the comments
// below describe only the lines that are visible.
//
// Visible parameters of a 16-byte block-processing routine built on NEON intrinsics:
//   subKeys   - passed through to func6/func1 untouched
//   rounds    - cast to unsigned int and passed to func6/func1
//   inBlocks  - source bytes, loaded 16 at a time with vld1q_u8
//   xorBlocks - optional bytes XORed into the blocks; may be NULLPTR
//   outBlocks - destination, stored 16 bytes at a time with vst1q_u8
//   length    - number of bytes to process
//   flags     - bit set of BT_* options tested below
101 const W *subKeys,
size_t rounds,
const byte *inBlocks,
102 const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
// Counter step: 1<<24 in the fourth 32-bit word.
// NOTE(review): on a little-endian target this is a 1 in byte 15 of the vector -- confirm for big-endian builds.
109 const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
110 const uint32x4_t s_one = vld1q_u32(w_one);
112 const size_t blockSize = 16;
// The input pointer does not advance when the input is a counter block or when
// the caller asked for pointers not to be incremented.
115 size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
// A null xorBlocks gets a zero stride.
116 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
117 size_t outIncrement = (flags &
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
// At most one of xorInput/xorOutput is true; both are false when xorBlocks is null.
120 const bool xorInput = (xorBlocks != NULLPTR) && (flags &
EnumToInt(BT_XorInput));
121 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags &
EnumToInt(BT_XorInput));
// Reverse direction: point all three buffers at their last block and negate the
// strides. The strides are size_t, so 0-x wraps modulo 2^N (well defined for
// unsigned types). PtrAdd is defined elsewhere -- presumably pointer plus byte offset.
123 if (flags & BT_ReverseDirection)
125 inBlocks =
PtrAdd(inBlocks, length - blockSize);
126 xorBlocks =
PtrAdd(xorBlocks, length - blockSize);
127 outBlocks =
PtrAdd(outBlocks, length - blockSize);
128 inIncrement = 0-inIncrement;
129 xorIncrement = 0-xorIncrement;
130 outIncrement = 0-outIncrement;
// Parallel path: six blocks per pass while at least 6*16 bytes remain.
133 if (flags & BT_AllowParallel)
135 while (length >= 6*blockSize)
137 uint64x2_t block0, block1, block2, block3, block4, block5;
// Counter input: block0 is the counter as stored; block1..block5 are formed by
// adding the step repeatedly, and block5+step is written back through a
// const_cast so the counter buffer ends up six steps ahead.
// vaddq_u64 adds per 64-bit lane, so a carry never crosses a lane boundary.
138 if (flags & BT_InBlockIsCounter)
140 const uint64x2_t one = vreinterpretq_u64_u32(s_one);
141 block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
142 block1 = vaddq_u64(block0, one);
143 block2 = vaddq_u64(block1, one);
144 block3 = vaddq_u64(block2, one);
145 block4 = vaddq_u64(block3, one);
146 block5 = vaddq_u64(block4, one);
147 vst1q_u8(
const_cast<byte*
>(inBlocks),
148 vreinterpretq_u8_u64(vaddq_u64(block5, one)));
// NOTE(review): presumably the non-counter branch (the `else` is not visible):
// load six blocks, stepping inBlocks by inIncrement after each load.
152 block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
153 inBlocks =
PtrAdd(inBlocks, inIncrement);
154 block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
155 inBlocks =
PtrAdd(inBlocks, inIncrement);
156 block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
157 inBlocks =
PtrAdd(inBlocks, inIncrement);
158 block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
159 inBlocks =
PtrAdd(inBlocks, inIncrement);
160 block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
161 inBlocks =
PtrAdd(inBlocks, inIncrement);
162 block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
163 inBlocks =
PtrAdd(inBlocks, inIncrement);
// XOR six blocks read from xorBlocks into the working blocks.
// NOTE(review): presumably guarded by xorInput; the guard is not visible in this excerpt.
168 block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
169 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
170 block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
171 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
172 block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
173 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
174 block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
175 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
176 block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
177 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
178 block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
179 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Six-block transform. func6 is defined outside this excerpt; the blocks are
// stored as results afterwards, so it presumably updates them by reference.
182 func6(block0, block1, block2, block3, block4, block5, subKeys,
static_cast<unsigned int>(rounds));
// Second XOR pass, after the transform.
// NOTE(review): presumably guarded by xorOutput; the guard is not visible in this excerpt.
186 block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
187 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
188 block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
189 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
190 block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
191 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
192 block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
193 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
194 block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
195 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
196 block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
197 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Write the six results, stepping outBlocks by outIncrement after each store.
200 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
201 outBlocks =
PtrAdd(outBlocks, outIncrement);
202 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
203 outBlocks =
PtrAdd(outBlocks, outIncrement);
204 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
205 outBlocks =
PtrAdd(outBlocks, outIncrement);
206 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
207 outBlocks =
PtrAdd(outBlocks, outIncrement);
208 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
209 outBlocks =
PtrAdd(outBlocks, outIncrement);
210 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
211 outBlocks =
PtrAdd(outBlocks, outIncrement);
213 length -= 6*blockSize;
// Tail: one block per pass while at least 16 bytes remain.
// NOTE(review): the declaration of `block`, the xorInput/xorOutput guards and the
// length decrement for this loop are not visible in this excerpt.
217 while (length >= blockSize)
220 block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
223 block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
// Counter input: bump only the last byte in place (a byte increment wraps at 256
// without touching byte 14).
225 if (flags & BT_InBlockIsCounter)
226 const_cast<byte *
>(inBlocks)[15]++;
228 func1(block, subKeys,
static_cast<unsigned int>(rounds));
231 block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
233 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
235 inBlocks =
PtrAdd(inBlocks, inIncrement);
236 outBlocks =
PtrAdd(outBlocks, outIncrement);
237 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// NOTE(review): fragmentary excerpt. The function name, template header, braces
// and any else/if guards are not visible here; the comments below describe only
// the lines that are visible.
//
// Visible parameters of a 16-byte block-processing routine built on NEON intrinsics:
//   subKeys   - passed through to func4/func1 untouched
//   rounds    - cast to unsigned int and passed to func4/func1
//   inBlocks  - source bytes, loaded 16 at a time with vld1q_u8
//   xorBlocks - optional bytes XORed into the blocks; may be NULLPTR
//   outBlocks - destination, stored 16 bytes at a time with vst1q_u8
//   length    - number of bytes to process
//   flags     - bit set of BT_* options tested below
255 const W *subKeys,
size_t rounds,
const byte *inBlocks,
256 const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
// Counter step: 1<<24 in the fourth 32-bit word.
// NOTE(review): on a little-endian target this is a 1 in byte 15 of the vector -- confirm for big-endian builds.
263 const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
264 const uint32x4_t s_one = vld1q_u32(w_one);
266 const size_t blockSize = 16;
// The input pointer does not advance when the input is a counter block or when
// the caller asked for pointers not to be incremented.
269 size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
// A null xorBlocks gets a zero stride.
270 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
271 size_t outIncrement = (flags &
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
// At most one of xorInput/xorOutput is true; both are false when xorBlocks is null.
274 const bool xorInput = (xorBlocks != NULLPTR) && (flags &
EnumToInt(BT_XorInput));
275 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags &
EnumToInt(BT_XorInput));
// Reverse direction: point all three buffers at their last block and negate the
// strides. The strides are size_t, so 0-x wraps modulo 2^N (well defined for
// unsigned types). PtrAdd is defined elsewhere -- presumably pointer plus byte offset.
277 if (flags & BT_ReverseDirection)
279 inBlocks =
PtrAdd(inBlocks, length - blockSize);
280 xorBlocks =
PtrAdd(xorBlocks, length - blockSize);
281 outBlocks =
PtrAdd(outBlocks, length - blockSize);
282 inIncrement = 0-inIncrement;
283 xorIncrement = 0-xorIncrement;
284 outIncrement = 0-outIncrement;
// Parallel path: four blocks per pass while at least 4*16 bytes remain.
287 if (flags & BT_AllowParallel)
289 while (length >= 4*blockSize)
291 uint32x4_t block0, block1, block2, block3;
// Counter input: block0 is the counter as stored; block1..block3 add the step
// repeatedly. The blocks are typed as 32-bit lanes, but the addition is done on
// the 64-bit reinterpretation, so the step lands in the upper 64-bit lane.
// block3+step is written back through a const_cast, leaving the counter buffer
// four steps ahead.
292 if (flags & BT_InBlockIsCounter)
294 const uint32x4_t one = s_one;
295 block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
296 block1 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block0), vreinterpretq_u64_u32(one)));
297 block2 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block1), vreinterpretq_u64_u32(one)));
298 block3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block2), vreinterpretq_u64_u32(one)));
299 vst1q_u8(
const_cast<byte*
>(inBlocks), vreinterpretq_u8_u64(vaddq_u64(
300 vreinterpretq_u64_u32(block3), vreinterpretq_u64_u32(one))));
// NOTE(review): presumably the non-counter branch (the `else` is not visible):
// load four blocks, stepping inBlocks by inIncrement after each load.
304 block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
305 inBlocks =
PtrAdd(inBlocks, inIncrement);
306 block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
307 inBlocks =
PtrAdd(inBlocks, inIncrement);
308 block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
309 inBlocks =
PtrAdd(inBlocks, inIncrement);
310 block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
311 inBlocks =
PtrAdd(inBlocks, inIncrement);
// XOR four blocks read from xorBlocks into the working blocks.
// NOTE(review): presumably guarded by xorInput; the guard is not visible in this excerpt.
316 block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
317 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
318 block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
319 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
320 block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
321 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
322 block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
323 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Four-block transform. func4 is defined outside this excerpt; the blocks are
// stored as results afterwards, so it presumably updates them by reference.
326 func4(block0, block1, block2, block3, subKeys,
static_cast<unsigned int>(rounds));
// Second XOR pass, after the transform.
// NOTE(review): presumably guarded by xorOutput; the guard is not visible in this excerpt.
330 block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
331 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
332 block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
333 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
334 block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
335 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
336 block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
337 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Write the four results, stepping outBlocks by outIncrement after each store.
340 vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
341 outBlocks =
PtrAdd(outBlocks, outIncrement);
342 vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
343 outBlocks =
PtrAdd(outBlocks, outIncrement);
344 vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
345 outBlocks =
PtrAdd(outBlocks, outIncrement);
346 vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
347 outBlocks =
PtrAdd(outBlocks, outIncrement);
349 length -= 4*blockSize;
// Tail: one block per pass while at least 16 bytes remain.
// NOTE(review): the xorInput/xorOutput guards and the length decrement for this
// loop are not visible in this excerpt.
353 while (length >= blockSize)
355 uint32x4_t block = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
358 block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
// Counter input: bump only the last byte in place (a byte increment wraps at 256
// without touching byte 14).
360 if (flags & BT_InBlockIsCounter)
361 const_cast<byte *
>(inBlocks)[15]++;
363 func1(block, subKeys,
static_cast<unsigned int>(rounds));
366 block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
368 vst1q_u8(outBlocks, vreinterpretq_u8_u32(block));
370 inBlocks =
PtrAdd(inBlocks, inIncrement);
371 outBlocks =
PtrAdd(outBlocks, outIncrement);
372 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// NOTE(review): fragmentary excerpt. The function name, template header, braces
// and any else/if guards are not visible here; the comments below describe only
// the lines that are visible.
//
// Visible parameters of a 16-byte block-processing routine built on NEON intrinsics:
//   subKeys   - passed through to func6/func2 untouched
//   rounds    - cast to unsigned int and passed to func6/func2
//   inBlocks  - source bytes, loaded 16 at a time with vld1q_u8
//   xorBlocks - optional bytes XORed into the blocks; may be NULLPTR
//   outBlocks - destination, stored 16 bytes at a time with vst1q_u8
//   length    - number of bytes to process
//   flags     - bit set of BT_* options tested below
389 const W *subKeys,
size_t rounds,
const byte *inBlocks,
390 const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
// Counter step: 1<<24 in the fourth 32-bit word.
// NOTE(review): on a little-endian target this is a 1 in byte 15 of the vector -- confirm for big-endian builds.
397 const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
398 const uint32x4_t s_one = vld1q_u32(w_one);
400 const size_t blockSize = 16;
// The input pointer does not advance when the input is a counter block or when
// the caller asked for pointers not to be incremented.
403 size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
// A null xorBlocks gets a zero stride.
404 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
405 size_t outIncrement = (flags &
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
// At most one of xorInput/xorOutput is true; both are false when xorBlocks is null.
408 const bool xorInput = (xorBlocks != NULLPTR) && (flags &
EnumToInt(BT_XorInput));
409 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags &
EnumToInt(BT_XorInput));
// Reverse direction: point all three buffers at their last block and negate the
// strides. The strides are size_t, so 0-x wraps modulo 2^N (well defined for
// unsigned types). PtrAdd is defined elsewhere -- presumably pointer plus byte offset.
411 if (flags & BT_ReverseDirection)
413 inBlocks =
PtrAdd(inBlocks, length - blockSize);
414 xorBlocks =
PtrAdd(xorBlocks, length - blockSize);
415 outBlocks =
PtrAdd(outBlocks, length - blockSize);
416 inIncrement = 0-inIncrement;
417 xorIncrement = 0-xorIncrement;
418 outIncrement = 0-outIncrement;
// Parallel path, first stage: six blocks per pass while at least 6*16 bytes remain.
421 if (flags & BT_AllowParallel)
423 while (length >= 6*blockSize)
425 uint64x2_t block0, block1, block2, block3, block4, block5;
// Counter input: block0 is the counter as stored; block1..block5 are formed by
// adding the step repeatedly, and block5+step is written back through a
// const_cast so the counter buffer ends up six steps ahead.
// vaddq_u64 adds per 64-bit lane, so a carry never crosses a lane boundary.
426 if (flags & BT_InBlockIsCounter)
428 const uint64x2_t one = vreinterpretq_u64_u32(s_one);
429 block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
430 block1 = vaddq_u64(block0, one);
431 block2 = vaddq_u64(block1, one);
432 block3 = vaddq_u64(block2, one);
433 block4 = vaddq_u64(block3, one);
434 block5 = vaddq_u64(block4, one);
435 vst1q_u8(
const_cast<byte*
>(inBlocks),
436 vreinterpretq_u8_u64(vaddq_u64(block5, one)));
// NOTE(review): presumably the non-counter branch (the `else` is not visible):
// load six blocks, stepping inBlocks by inIncrement after each load.
440 block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
441 inBlocks =
PtrAdd(inBlocks, inIncrement);
442 block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
443 inBlocks =
PtrAdd(inBlocks, inIncrement);
444 block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
445 inBlocks =
PtrAdd(inBlocks, inIncrement);
446 block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
447 inBlocks =
PtrAdd(inBlocks, inIncrement);
448 block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
449 inBlocks =
PtrAdd(inBlocks, inIncrement);
450 block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
451 inBlocks =
PtrAdd(inBlocks, inIncrement);
// XOR six blocks read from xorBlocks into the working blocks.
// NOTE(review): presumably guarded by xorInput; the guard is not visible in this excerpt.
456 block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
457 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
458 block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
459 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
460 block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
461 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
462 block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
463 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
464 block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
465 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
466 block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
467 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Six-block transform. func6 is defined outside this excerpt; the blocks are
// stored as results afterwards, so it presumably updates them by reference.
470 func6(block0, block1, block2, block3, block4, block5, subKeys,
static_cast<unsigned int>(rounds));
// Second XOR pass, after the transform.
// NOTE(review): presumably guarded by xorOutput; the guard is not visible in this excerpt.
474 block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
475 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
476 block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
477 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
478 block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
479 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
480 block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
481 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
482 block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
483 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
484 block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
485 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Write the six results, stepping outBlocks by outIncrement after each store.
488 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
489 outBlocks =
PtrAdd(outBlocks, outIncrement);
490 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
491 outBlocks =
PtrAdd(outBlocks, outIncrement);
492 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
493 outBlocks =
PtrAdd(outBlocks, outIncrement);
494 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
495 outBlocks =
PtrAdd(outBlocks, outIncrement);
496 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
497 outBlocks =
PtrAdd(outBlocks, outIncrement);
498 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
499 outBlocks =
PtrAdd(outBlocks, outIncrement);
501 length -= 6*blockSize;
// Second stage: two blocks per pass while at least 2*16 bytes remain.
// NOTE(review): whether this loop sits inside the BT_AllowParallel branch cannot
// be determined here because the braces are not visible.
504 while (length >= 2*blockSize)
506 uint64x2_t block0, block1;
// Counter input: block1 is block0 plus one step; block1+step is written back so
// the counter buffer ends up two steps ahead.
507 if (flags & BT_InBlockIsCounter)
509 const uint64x2_t one = vreinterpretq_u64_u32(s_one);
510 block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
511 block1 = vaddq_u64(block0, one);
512 vst1q_u8(
const_cast<byte*
>(inBlocks),
513 vreinterpretq_u8_u64(vaddq_u64(block1, one)));
// NOTE(review): presumably the non-counter branch (the `else` is not visible).
517 block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
518 inBlocks =
PtrAdd(inBlocks, inIncrement);
519 block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
520 inBlocks =
PtrAdd(inBlocks, inIncrement);
// NOTE(review): presumably guarded by xorInput; the guard is not visible in this excerpt.
525 block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
526 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
527 block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
528 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Two-block transform; func2 is defined outside this excerpt.
531 func2(block0, block1, subKeys,
static_cast<unsigned int>(rounds));
// NOTE(review): presumably guarded by xorOutput; the guard is not visible in this excerpt.
535 block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
536 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
537 block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
538 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
541 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
542 outBlocks =
PtrAdd(outBlocks, outIncrement);
543 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
544 outBlocks =
PtrAdd(outBlocks, outIncrement);
546 length -= 2*blockSize;
// Tail: one block per pass while at least 16 bytes remain. The two-block
// transform is reused with an all-zero second block; only `block` is stored.
// NOTE(review): the xorInput/xorOutput guards and the length decrement for this
// loop are not visible in this excerpt.
550 while (length >= blockSize)
552 uint64x2_t block, zero = {0,0};
553 block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
556 block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
// Counter input: bump only the last byte in place (a byte increment wraps at 256
// without touching byte 14).
558 if (flags & BT_InBlockIsCounter)
559 const_cast<byte *
>(inBlocks)[15]++;
561 func2(block, zero, subKeys,
static_cast<unsigned int>(rounds));
564 block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
566 vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
568 inBlocks =
PtrAdd(inBlocks, inIncrement);
569 outBlocks =
PtrAdd(outBlocks, outIncrement);
570 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// NOTE(review): fragmentary excerpt. The function name, template header, braces,
// the input loads and any else/if guards are not visible here; the comments
// below describe only the lines that are visible.
//
// Visible parameters of a 16-byte block-processing routine built on SSE intrinsics:
//   subKeys   - passed through to func6/func2 untouched (MAYBE_CONST is defined elsewhere)
//   rounds    - cast to unsigned int and passed to func6/func2
//   inBlocks  - source bytes
//   xorBlocks - optional bytes XORed into the blocks; may be NULLPTR
//   outBlocks - destination, stored 16 bytes at a time with _mm_storeu_si128
//   length    - number of bytes to process
//   flags     - bit set of BT_* options tested below
636 MAYBE_CONST W *subKeys,
size_t rounds,
const byte *inBlocks,
637 const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
644 const size_t blockSize = 16;
// The input pointer does not advance when the input is a counter block or when
// the caller asked for pointers not to be incremented.
647 size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
// A null xorBlocks gets a zero stride.
648 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
649 size_t outIncrement = (flags &
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
// At most one of xorInput/xorOutput is true; both are false when xorBlocks is null.
652 const bool xorInput = (xorBlocks != NULLPTR) && (flags &
EnumToInt(BT_XorInput));
653 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags &
EnumToInt(BT_XorInput));
// Reverse direction: point all three buffers at their last block and negate the
// strides. The strides are size_t, so 0-x wraps modulo 2^N (well defined for
// unsigned types). PtrAdd is defined elsewhere -- presumably pointer plus byte offset.
655 if (flags & BT_ReverseDirection)
657 inBlocks =
PtrAdd(inBlocks, length - blockSize);
658 xorBlocks =
PtrAdd(xorBlocks, length - blockSize);
659 outBlocks =
PtrAdd(outBlocks, length - blockSize);
660 inIncrement = 0-inIncrement;
661 xorIncrement = 0-xorIncrement;
662 outIncrement = 0-outIncrement;
// Parallel path, first stage: six blocks per pass while at least 6*16 bytes remain.
665 if (flags & BT_AllowParallel)
667 while (length >= 6*blockSize)
669 __m128i block0, block1, block2, block3, block4, block5;
// Counter input. The step is 1<<24 in the highest 32-bit element (_mm_set_epi32
// takes the highest element first), and _mm_add_epi32 adds per 32-bit element,
// so a carry never leaves that element. block1..block5 add the step repeatedly;
// block5+step is written back to the counter buffer.
// NOTE(review): the load of block0 is not visible in this excerpt.
670 if (flags & BT_InBlockIsCounter)
673 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
675 block1 = _mm_add_epi32(block0, s_one);
676 block2 = _mm_add_epi32(block1, s_one);
677 block3 = _mm_add_epi32(block2, s_one);
678 block4 = _mm_add_epi32(block3, s_one);
679 block5 = _mm_add_epi32(block4, s_one);
680 _mm_storeu_si128(
M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
// NOTE(review): presumably the non-counter branch; only the six pointer steps are
// visible, the matching loads are not.
685 inBlocks =
PtrAdd(inBlocks, inIncrement);
687 inBlocks =
PtrAdd(inBlocks, inIncrement);
689 inBlocks =
PtrAdd(inBlocks, inIncrement);
691 inBlocks =
PtrAdd(inBlocks, inIncrement);
693 inBlocks =
PtrAdd(inBlocks, inIncrement);
695 inBlocks =
PtrAdd(inBlocks, inIncrement);
// XOR six blocks read (unaligned) from xorBlocks into the working blocks.
// NOTE(review): presumably guarded by xorInput; the guard is not visible in this excerpt.
700 block0 = _mm_xor_si128(block0, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
701 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
702 block1 = _mm_xor_si128(block1, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
703 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
704 block2 = _mm_xor_si128(block2, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
705 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
706 block3 = _mm_xor_si128(block3, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
707 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
708 block4 = _mm_xor_si128(block4, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
709 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
710 block5 = _mm_xor_si128(block5, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
711 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Six-block transform. func6 is defined outside this excerpt; the blocks are
// stored as results afterwards, so it presumably updates them by reference.
714 func6(block0, block1, block2, block3, block4, block5, subKeys,
static_cast<unsigned int>(rounds));
// Second XOR pass, after the transform.
// NOTE(review): presumably guarded by xorOutput; the guard is not visible in this excerpt.
718 block0 = _mm_xor_si128(block0, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
719 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
720 block1 = _mm_xor_si128(block1, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
721 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
722 block2 = _mm_xor_si128(block2, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
723 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
724 block3 = _mm_xor_si128(block3, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
725 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
726 block4 = _mm_xor_si128(block4, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
727 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
728 block5 = _mm_xor_si128(block5, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
729 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Write the six results (unaligned stores), stepping outBlocks by outIncrement.
732 _mm_storeu_si128(
M128_CAST(outBlocks), block0);
733 outBlocks =
PtrAdd(outBlocks, outIncrement);
734 _mm_storeu_si128(
M128_CAST(outBlocks), block1);
735 outBlocks =
PtrAdd(outBlocks, outIncrement);
736 _mm_storeu_si128(
M128_CAST(outBlocks), block2);
737 outBlocks =
PtrAdd(outBlocks, outIncrement);
738 _mm_storeu_si128(
M128_CAST(outBlocks), block3);
739 outBlocks =
PtrAdd(outBlocks, outIncrement);
740 _mm_storeu_si128(
M128_CAST(outBlocks), block4);
741 outBlocks =
PtrAdd(outBlocks, outIncrement);
742 _mm_storeu_si128(
M128_CAST(outBlocks), block5);
743 outBlocks =
PtrAdd(outBlocks, outIncrement);
745 length -= 6*blockSize;
// Second stage: two blocks per pass while at least 2*16 bytes remain.
// NOTE(review): whether this loop sits inside the BT_AllowParallel branch cannot
// be determined here because the braces are not visible.
748 while (length >= 2*blockSize)
750 __m128i block0, block1;
// Counter input: block1 is block0 plus one step; block1+step is written back.
// NOTE(review): the load of block0 is not visible in this excerpt.
751 if (flags & BT_InBlockIsCounter)
754 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
756 block1 = _mm_add_epi32(block0, s_one);
757 _mm_storeu_si128(
M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
// NOTE(review): presumably the non-counter branch; the matching loads are not visible.
762 inBlocks =
PtrAdd(inBlocks, inIncrement);
764 inBlocks =
PtrAdd(inBlocks, inIncrement);
// NOTE(review): presumably guarded by xorInput; the guard is not visible in this excerpt.
769 block0 = _mm_xor_si128(block0, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
770 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
771 block1 = _mm_xor_si128(block1, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
772 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Two-block transform; func2 is defined outside this excerpt.
775 func2(block0, block1, subKeys,
static_cast<unsigned int>(rounds));
// NOTE(review): presumably guarded by xorOutput; the guard is not visible in this excerpt.
779 block0 = _mm_xor_si128(block0, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
780 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
781 block1 = _mm_xor_si128(block1, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
782 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
785 _mm_storeu_si128(
M128_CAST(outBlocks), block0);
786 outBlocks =
PtrAdd(outBlocks, outIncrement);
787 _mm_storeu_si128(
M128_CAST(outBlocks), block1);
788 outBlocks =
PtrAdd(outBlocks, outIncrement);
790 length -= 2*blockSize;
// Tail: one block per pass while at least 16 bytes remain. The two-block
// transform is reused with an all-zero second block; only `block` is stored.
// NOTE(review): the load of `block`, the xorInput/xorOutput guards and the length
// decrement for this loop are not visible in this excerpt.
794 while (length >= blockSize)
796 __m128i block, zero = _mm_setzero_si128();
800 block = _mm_xor_si128(block, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
// Counter input: bump only the last byte in place (a byte increment wraps at 256
// without touching byte 14).
802 if (flags & BT_InBlockIsCounter)
803 const_cast<byte *
>(inBlocks)[15]++;
805 func2(block, zero, subKeys,
static_cast<unsigned int>(rounds));
808 block = _mm_xor_si128(block, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
810 _mm_storeu_si128(
M128_CAST(outBlocks), block);
812 inBlocks =
PtrAdd(inBlocks, inIncrement);
813 outBlocks =
PtrAdd(outBlocks, outIncrement);
814 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// NOTE(review): fragmentary excerpt. The function name, template header, braces,
// the input loads and any else/if guards are not visible here; the comments
// below describe only the lines that are visible.
//
// Visible parameters of a 16-byte block-processing routine built on SSE intrinsics:
//   subKeys   - passed through to func4/func1 untouched (MAYBE_CONST is defined elsewhere)
//   rounds    - cast to unsigned int and passed to func4/func1
//   inBlocks  - source bytes
//   xorBlocks - optional bytes XORed into the blocks; may be NULLPTR
//   outBlocks - destination, stored 16 bytes at a time with _mm_storeu_si128
//   length    - number of bytes to process
//   flags     - bit set of BT_* options tested below
831 MAYBE_CONST W *subKeys,
size_t rounds,
const byte *inBlocks,
832 const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
839 const size_t blockSize = 16;
// The input pointer does not advance when the input is a counter block or when
// the caller asked for pointers not to be incremented.
842 size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
// A null xorBlocks gets a zero stride.
843 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
844 size_t outIncrement = (flags &
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
// At most one of xorInput/xorOutput is true; both are false when xorBlocks is null.
847 const bool xorInput = (xorBlocks != NULLPTR) && (flags &
EnumToInt(BT_XorInput));
848 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags &
EnumToInt(BT_XorInput));
// Reverse direction: point all three buffers at their last block and negate the
// strides. The strides are size_t, so 0-x wraps modulo 2^N (well defined for
// unsigned types). PtrAdd is defined elsewhere -- presumably pointer plus byte offset.
850 if (flags & BT_ReverseDirection)
852 inBlocks =
PtrAdd(inBlocks, length - blockSize);
853 xorBlocks =
PtrAdd(xorBlocks, length - blockSize);
854 outBlocks =
PtrAdd(outBlocks, length - blockSize);
855 inIncrement = 0-inIncrement;
856 xorIncrement = 0-xorIncrement;
857 outIncrement = 0-outIncrement;
// Parallel path: four blocks per pass while at least 4*16 bytes remain.
860 if (flags & BT_AllowParallel)
862 while (length >= 4*blockSize)
864 __m128i block0, block1, block2, block3;
// Counter input. The step is 1<<24 in the highest 32-bit element (_mm_set_epi32
// takes the highest element first), and _mm_add_epi32 adds per 32-bit element,
// so a carry never leaves that element. block1..block3 add the step repeatedly;
// block3+step is written back to the counter buffer.
// NOTE(review): the load of block0 is not visible in this excerpt.
865 if (flags & BT_InBlockIsCounter)
868 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
870 block1 = _mm_add_epi32(block0, s_one);
871 block2 = _mm_add_epi32(block1, s_one);
872 block3 = _mm_add_epi32(block2, s_one);
873 _mm_storeu_si128(
M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
// NOTE(review): presumably the non-counter branch; only the four pointer steps
// are visible, the matching loads are not.
878 inBlocks =
PtrAdd(inBlocks, inIncrement);
880 inBlocks =
PtrAdd(inBlocks, inIncrement);
882 inBlocks =
PtrAdd(inBlocks, inIncrement);
884 inBlocks =
PtrAdd(inBlocks, inIncrement);
// XOR four blocks read (unaligned) from xorBlocks into the working blocks.
// NOTE(review): presumably guarded by xorInput; the guard is not visible in this excerpt.
889 block0 = _mm_xor_si128(block0, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
890 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
891 block1 = _mm_xor_si128(block1, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
892 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
893 block2 = _mm_xor_si128(block2, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
894 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
895 block3 = _mm_xor_si128(block3, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
896 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Four-block transform. func4 is defined outside this excerpt; the blocks are
// stored as results afterwards, so it presumably updates them by reference.
899 func4(block0, block1, block2, block3, subKeys,
static_cast<unsigned int>(rounds));
// Second XOR pass, after the transform.
// NOTE(review): presumably guarded by xorOutput; the guard is not visible in this excerpt.
903 block0 = _mm_xor_si128(block0, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
904 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
905 block1 = _mm_xor_si128(block1, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
906 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
907 block2 = _mm_xor_si128(block2, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
908 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
909 block3 = _mm_xor_si128(block3, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
910 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Write the four results (unaligned stores), stepping outBlocks by outIncrement.
913 _mm_storeu_si128(
M128_CAST(outBlocks), block0);
914 outBlocks =
PtrAdd(outBlocks, outIncrement);
915 _mm_storeu_si128(
M128_CAST(outBlocks), block1);
916 outBlocks =
PtrAdd(outBlocks, outIncrement);
917 _mm_storeu_si128(
M128_CAST(outBlocks), block2);
918 outBlocks =
PtrAdd(outBlocks, outIncrement);
919 _mm_storeu_si128(
M128_CAST(outBlocks), block3);
920 outBlocks =
PtrAdd(outBlocks, outIncrement);
922 length -= 4*blockSize;
// Tail: one block per pass while at least 16 bytes remain.
// NOTE(review): the declaration and load of `block`, the xorInput/xorOutput
// guards and the length decrement for this loop are not visible in this excerpt.
926 while (length >= blockSize)
931 block = _mm_xor_si128(block, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
// Counter input: bump only the last byte in place (a byte increment wraps at 256
// without touching byte 14).
933 if (flags & BT_InBlockIsCounter)
934 const_cast<byte *
>(inBlocks)[15]++;
936 func1(block, subKeys,
static_cast<unsigned int>(rounds));
939 block = _mm_xor_si128(block, _mm_loadu_si128(
CONST_M128_CAST(xorBlocks)));
941 _mm_storeu_si128(
M128_CAST(outBlocks), block);
943 inBlocks =
PtrAdd(inBlocks, inIncrement);
944 outBlocks =
PtrAdd(outBlocks, outIncrement);
945 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// NOTE(review): fragmentary excerpt. The function name, template header, braces,
// the definition of s_one, the block declarations and every load/XOR/store
// statement are not visible here; mostly the pointer bookkeeping remains. The
// comments below describe only the lines that are visible.
//
// Visible parameters of a 16-byte block-processing routine:
//   subKeys   - passed through to func4/func1 untouched
//   rounds    - passed to func4/func1 as is (no narrowing cast here)
//   inBlocks  - source bytes
//   xorBlocks - optional XOR source; may be NULLPTR
//   outBlocks - destination
//   length    - number of bytes to process
//   flags     - bit set of BT_* options tested below
972 const W *subKeys,
size_t rounds,
const byte *inBlocks,
973 const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
// NOTE(review): what this conditional selects (presumably an endian-specific
// s_one) is not visible in this excerpt.
980#if (CRYPTOPP_LITTLE_ENDIAN)
986 const size_t blockSize = 16;
// The input pointer does not advance when the input is a counter block or when
// the caller asked for pointers not to be incremented.
989 size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
// A null xorBlocks gets a zero stride.
990 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
991 size_t outIncrement = (flags &
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
// At most one of xorInput/xorOutput is true; both are false when xorBlocks is null.
994 const bool xorInput = (xorBlocks != NULLPTR) && (flags &
EnumToInt(BT_XorInput));
995 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags &
EnumToInt(BT_XorInput));
// Reverse direction: point all three buffers at their last block and negate the
// strides. The strides are size_t, so 0-x wraps modulo 2^N (well defined for
// unsigned types). PtrAdd is defined elsewhere -- presumably pointer plus byte offset.
997 if (flags & BT_ReverseDirection)
999 inBlocks =
PtrAdd(inBlocks, length - blockSize);
1000 xorBlocks =
PtrAdd(xorBlocks, length - blockSize);
1001 outBlocks =
PtrAdd(outBlocks, length - blockSize);
1002 inIncrement = 0-inIncrement;
1003 xorIncrement = 0-xorIncrement;
1004 outIncrement = 0-outIncrement;
// Parallel path: four blocks per pass while at least 4*16 bytes remain.
1007 if (flags & BT_AllowParallel)
1009 while (length >= 4*blockSize)
// Counter input: block1..block3 are formed by adding s_one repeatedly to block0.
// VecAdd and s_one are defined outside this excerpt.
1013 if (flags & BT_InBlockIsCounter)
1016 block1 =
VecAdd(block0, s_one);
1017 block2 =
VecAdd(block1, s_one);
1018 block3 =
VecAdd(block2, s_one);
// NOTE(review): this pass consumes four blocks (three VecAdd steps above and
// `length -= 4*blockSize` below), yet the last counter byte is advanced by 6.
// A second pass of this loop would then start two counter values too far along.
// Looks like it should be `+= 4` -- confirm against the caller's counter
// handling and a multi-pass counter-mode test before changing.
1028 const_cast<byte*
>(inBlocks)[15] += 6;
// NOTE(review): presumably the non-counter branch; only the four pointer steps
// are visible, the matching loads are not.
1033 inBlocks =
PtrAdd(inBlocks, inIncrement);
1035 inBlocks =
PtrAdd(inBlocks, inIncrement);
1037 inBlocks =
PtrAdd(inBlocks, inIncrement);
1039 inBlocks =
PtrAdd(inBlocks, inIncrement);
// NOTE(review): presumably the xorInput pass; only the pointer steps are visible.
1045 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1047 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1049 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1051 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Four-block transform; func4 is defined outside this excerpt.
1054 func4(block0, block1, block2, block3, subKeys, rounds);
// NOTE(review): presumably the xorOutput pass; only the pointer steps are visible.
1059 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1061 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1063 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1065 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// NOTE(review): presumably one store precedes each step; the stores are not visible.
1069 outBlocks =
PtrAdd(outBlocks, outIncrement);
1071 outBlocks =
PtrAdd(outBlocks, outIncrement);
1073 outBlocks =
PtrAdd(outBlocks, outIncrement);
1075 outBlocks =
PtrAdd(outBlocks, outIncrement);
1077 length -= 4*blockSize;
// Tail: one block per pass while at least 16 bytes remain.
1081 while (length >= blockSize)
// Counter input: bump only the last byte in place (a byte increment wraps at 256
// without touching byte 14).
1088 if (flags & BT_InBlockIsCounter)
1089 const_cast<byte *
>(inBlocks)[15]++;
1091 func1(block, subKeys, rounds);
1098 inBlocks =
PtrAdd(inBlocks, inIncrement);
1099 outBlocks =
PtrAdd(outBlocks, outIncrement);
1100 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1101 length -= blockSize;
// NOTE(review): fragmentary excerpt. The function name, template header, braces,
// the definition of s_one and every load/XOR/store statement are not visible
// here; mostly the pointer bookkeeping remains. The comments below describe only
// the lines that are visible.
//
// Visible parameters of a 16-byte block-processing routine:
//   subKeys   - passed through to func6/func1 untouched
//   rounds    - passed to func6/func1 as is (no narrowing cast here)
//   inBlocks  - source bytes
//   xorBlocks - optional XOR source; may be NULLPTR
//   outBlocks - destination
//   length    - number of bytes to process
//   flags     - bit set of BT_* options tested below
1117 const W *subKeys,
size_t rounds,
const byte *inBlocks,
1118 const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
// NOTE(review): what this conditional selects (presumably an endian-specific
// s_one) is not visible in this excerpt.
1125#if (CRYPTOPP_LITTLE_ENDIAN)
1131 const size_t blockSize = 16;
// The input pointer does not advance when the input is a counter block or when
// the caller asked for pointers not to be incremented.
1134 size_t inIncrement = (flags & (
EnumToInt(BT_InBlockIsCounter)|
EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
// A null xorBlocks gets a zero stride.
1135 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1136 size_t outIncrement = (flags &
EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;
// At most one of xorInput/xorOutput is true; both are false when xorBlocks is null.
1139 const bool xorInput = (xorBlocks != NULLPTR) && (flags &
EnumToInt(BT_XorInput));
1140 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags &
EnumToInt(BT_XorInput));
// Reverse direction: point all three buffers at their last block and negate the
// strides. The strides are size_t, so 0-x wraps modulo 2^N (well defined for
// unsigned types). PtrAdd is defined elsewhere -- presumably pointer plus byte offset.
1142 if (flags & BT_ReverseDirection)
1144 inBlocks =
PtrAdd(inBlocks, length - blockSize);
1145 xorBlocks =
PtrAdd(xorBlocks, length - blockSize);
1146 outBlocks =
PtrAdd(outBlocks, length - blockSize);
1147 inIncrement = 0-inIncrement;
1148 xorIncrement = 0-xorIncrement;
1149 outIncrement = 0-outIncrement;
// Parallel path: six blocks per pass while at least 6*16 bytes remain.
1152 if (flags & BT_AllowParallel)
1154 while (length >= 6*blockSize)
1156 uint32x4_p block0, block1, block2, block3, block4, block5;
// Counter input: block1..block5 are formed by adding s_one repeatedly to block0.
// VecAdd and s_one are defined outside this excerpt, as is the load of block0.
1158 if (flags & BT_InBlockIsCounter)
1161 block1 =
VecAdd(block0, s_one);
1162 block2 =
VecAdd(block1, s_one);
1163 block3 =
VecAdd(block2, s_one);
1164 block4 =
VecAdd(block3, s_one);
1165 block5 =
VecAdd(block4, s_one);
// Write `temp` back over the counter buffer through a const_cast.
// NOTE(review): how `temp` is computed is not visible in this excerpt --
// presumably the counter advanced past the six blocks just formed.
1182 VecStoreBE(temp,
const_cast<byte*
>(inBlocks));
// NOTE(review): presumably the non-counter branch; only the six pointer steps are
// visible, the matching loads are not.
1187 inBlocks =
PtrAdd(inBlocks, inIncrement);
1189 inBlocks =
PtrAdd(inBlocks, inIncrement);
1191 inBlocks =
PtrAdd(inBlocks, inIncrement);
1193 inBlocks =
PtrAdd(inBlocks, inIncrement);
1195 inBlocks =
PtrAdd(inBlocks, inIncrement);
1197 inBlocks =
PtrAdd(inBlocks, inIncrement);
// NOTE(review): presumably the xorInput pass; only the pointer steps are visible.
1203 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1205 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1207 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1209 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1211 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1213 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Six-block transform; func6 is defined outside this excerpt.
1216 func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
// NOTE(review): presumably the xorOutput pass; only the pointer steps are visible.
1221 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1223 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1225 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1227 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1229 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1231 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// NOTE(review): presumably one store precedes each step; the stores are not visible.
1235 outBlocks =
PtrAdd(outBlocks, outIncrement);
1237 outBlocks =
PtrAdd(outBlocks, outIncrement);
1239 outBlocks =
PtrAdd(outBlocks, outIncrement);
1241 outBlocks =
PtrAdd(outBlocks, outIncrement);
1243 outBlocks =
PtrAdd(outBlocks, outIncrement);
1245 outBlocks =
PtrAdd(outBlocks, outIncrement);
1247 length -= 6*blockSize;
// Tail: one block per pass while at least 16 bytes remain.
1251 while (length >= blockSize)
// Counter input: bump only the last byte in place (a byte increment wraps at 256
// without touching byte 14).
1258 if (flags & BT_InBlockIsCounter)
1259 const_cast<byte *
>(inBlocks)[15]++;
1261 func1(block, subKeys, rounds);
1268 inBlocks =
PtrAdd(inBlocks, inIncrement);
1269 outBlocks =
PtrAdd(outBlocks, outIncrement);
1270 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1271 length -= blockSize;