Crypto++ 8.9
Free C++ class library of cryptographic schemes
ppc_simd.h
Go to the documentation of this file.
1// ppc_simd.h - written and placed in public domain by Jeffrey Walton
2
3/// \file ppc_simd.h
4/// \brief Support functions for PowerPC and vector operations
5/// \details This header provides an agnostic interface into Clang, GCC
6/// and IBM XL C/C++ compilers modulo their different built-in functions
7/// for accessing vector instructions.
8/// \details The abstractions are necessary to support back to GCC 4.8 and
9/// XLC 11 and 12. GCC 4.8 and 4.9 are still popular, and they are the
10/// default compiler for GCC112, GCC119 and others on the compile farm.
11/// Older IBM XL C/C++ compilers also have the need due to lack of
12/// <tt>vec_xl</tt> and <tt>vec_xst</tt> support on some platforms. Modern
13/// compilers provide best support and don't need many of the hacks
14/// below.
15/// \details The library is tested with the following PowerPC machines and
16/// compilers. GCC110, GCC111, GCC112, GCC119 and GCC135 are provided by
17/// the <A HREF="https://cfarm.tetaneutral.net/">GCC Compile Farm</A>
18/// - PowerMac G5, OSX 10.5, POWER4, Apple GCC 4.0
19/// - PowerMac G5, OSX 10.5, POWER4, Macports GCC 5.0
20/// - GCC110, Linux, POWER7, GCC 4.8.5
21/// - GCC110, Linux, POWER7, XLC 12.01
22/// - GCC111, AIX, POWER7, GCC 4.8.1
23/// - GCC111, AIX, POWER7, XLC 12.01
24/// - GCC112, Linux, POWER8, GCC 4.8.5
25/// - GCC112, Linux, POWER8, XLC 13.01
26/// - GCC112, Linux, POWER8, Clang 7.0
27/// - GCC119, AIX, POWER8, GCC 7.2.0
28/// - GCC119, AIX, POWER8, XLC 13.01
29/// - GCC135, Linux, POWER9, GCC 7.0
30/// \details 12 machines are used for testing because the three compilers form
31/// five or six profiles. The profiles are listed below.
32/// - GCC (Linux GCC, Macports GCC, etc. Consistent across machines)
33/// - XLC 13.0 and earlier (all IBM components)
34/// - XLC 13.1 and later on Linux (LLVM front-end, no compatibility macros)
35/// - XLC 13.1 and later on Linux (LLVM front-end, -qxlcompatmacros option)
36/// - early LLVM Clang (traditional Clang compiler)
37/// - late LLVM Clang (traditional Clang compiler)
38/// \details The LLVM front-end makes it tricky to write portable code because
39/// LLVM pretends to be other compilers but cannot consume other compilers'
40/// builtins. When using XLC with -qxlcompatmacros the compiler pretends to
41/// be GCC, Clang and XLC all at once but it can only consume its own variety
42/// of builtins.
43/// \details At Crypto++ 8.0 the various <tt>Vector{FuncName}</tt> were
44/// renamed to <tt>Vec{FuncName}</tt>. For example, <tt>VectorAnd</tt> was
45/// changed to <tt>VecAnd</tt>. The name change helped consolidate two
46/// slightly different implementations.
47/// \details At Crypto++ 8.3 the library added select 64-bit functions for
48/// 32-bit Altivec. For example, <tt>VecAdd64</tt> and <tt>VecSub64</tt>
49/// take 32-bit vectors and add or subtract them as if they were vectors
50/// with two 64-bit elements. The functions dramatically improve performance
51/// for some algorithms on some platforms, like SIMON128 and SPECK128 on
52/// Power6 and earlier. For example, SPECK128 improved from 70 cpb to
53/// 10 cpb on an old PowerMac. Use the functions like shown below.
54/// <pre>
55/// \#if defined(_ARCH_PWR8)
56/// \# define speck128_t uint64x2_p
57/// \#else
58/// \# define speck128_t uint32x4_p
59/// \#endif
60///
61/// speck128_t rk, x1, x2, y1, y2;
62/// rk = (speck128_t)VecLoadAligned(ptr);
63/// x1 = VecRotateRight64<8>(x1);
64/// x1 = VecAdd64(x1, y1);
65/// ...</pre>
66/// \since Crypto++ 6.0, LLVM Clang compiler support since Crypto++ 8.0
67
68// Use __ALTIVEC__, _ARCH_PWR7, __VSX__, and _ARCH_PWR8 when detecting
69// actual availability of the feature for the source file being compiled.
70// The preprocessor macros depend on compiler options like -maltivec; and
71// not compiler versions.
72
73// For GCC see https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions.html
74// For XLC see the Compiler Reference manual. For Clang you have to experiment.
75// Clang does not document the compiler options, does not reject options it does
76// not understand, and pretends to be other compilers even though it cannot
77// process the builtins and intrinsics. Clang will waste hours of your time.
78
79// DO NOT USE this pattern in VecLoad and VecStore. We have to use the
80// code paths guarded by preprocessor macros because XLC 12 generates
81// bad code in some places. To verify the bad code generation test on
82// GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.
83//
84// inline uint32x4_p VecLoad(const byte src[16])
85// {
86// #if defined(__VSX__) || defined(_ARCH_PWR8)
87// return (uint32x4_p) *(uint8x16_p*)((byte*)src);
88// #else
89// return VecLoad_ALTIVEC(src);
90// #endif
91// }
92
93// We should be able to perform the load using inline asm on Power7 with
94// VSX or Power8. The inline asm will avoid C undefined behavior due to
95// casting from byte* to word32*. We are safe because our byte* are
96// 16-byte aligned for Altivec. Below is the big endian load. Little
97// endian would need to follow with xxpermdi for the reversal.
98//
99// __asm__ ("lxvw4x %x0, %1, %2" : "=wa"(v) : "r"(0), "r"(src) : );
100
101// GCC and XLC use integer math for the address (D-form or byte-offset
102// in the ISA manual). LLVM uses pointer math for the address (DS-form
103// or indexed in the ISA manual). To keep them consistent we calculate
104// the address from the offset and pass to a load or store function
105// using a 0 offset.
106
107#ifndef CRYPTOPP_PPC_CRYPTO_H
108#define CRYPTOPP_PPC_CRYPTO_H
109
110#include "config.h"
111#include "misc.h"
112
113#if defined(__ALTIVEC__)
114# include <altivec.h>
115# undef vector
116# undef pixel
117# undef bool
118#endif
119
// XL C++ on AIX does not define __VSX__ and offers no option to
// turn it on, so it is defined here for the code below. Keep this
// define in sync with the one in test_ppc_power7.cpp.
#if !defined(CRYPTOPP_DISABLE_POWER7)
# if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
#  define __VSX__ 1
# endif
#endif

// XL C++ on AIX does not define __CRYPTO__ and offers no option to
// turn it on, so it is defined here for the code below. Keep this
// define in sync with the one in test_ppc_power8.cpp.
#if !defined(CRYPTOPP_DISABLE_POWER8)
# if defined(_AIX) && defined(_ARCH_PWR8) && defined(__xlC__)
#  define __CRYPTO__ 1
# endif
#endif

/// \brief Cast array to vector pointer
/// \details CONST_V8_CAST casts a const byte array to
///  <tt>unsigned char*</tt> for use with the vector built-ins. The
///  Power ABI says source arrays are non-const, so this define removes
///  the const. XLC++ will fail the compile if the source array is const.
#define CONST_V8_CAST(x) ((unsigned char*)(x))
/// \brief Cast array to vector pointer
/// \details CONST_V32_CAST casts a const word array to
///  <tt>unsigned int*</tt> for use with the vector built-ins. The
///  Power ABI says source arrays are non-const, so this define removes
///  the const. XLC++ will fail the compile if the source array is const.
#define CONST_V32_CAST(x) ((unsigned int*)(x))
/// \brief Cast array to vector pointer
/// \details CONST_V64_CAST casts a const double word array to
///  <tt>unsigned long long*</tt> for use with the vector built-ins. The
///  Power ABI says source arrays are non-const, so this define removes
///  the const. XLC++ will fail the compile if the source array is const.
#define CONST_V64_CAST(x) ((unsigned long long*)(x))
/// \brief Cast array to vector pointer
/// \details NCONST_V8_CAST casts a byte array to
///  <tt>unsigned char*</tt>. The expansion is the same as
///  CONST_V8_CAST; this name is used where the array is a destination,
///  as in the store functions.
#define NCONST_V8_CAST(x) ((unsigned char*)(x))
/// \brief Cast array to vector pointer
/// \details NCONST_V32_CAST casts a word array to
///  <tt>unsigned int*</tt>. The expansion is the same as
///  CONST_V32_CAST.
#define NCONST_V32_CAST(x) ((unsigned int*)(x))
/// \brief Cast array to vector pointer
/// \details NCONST_V64_CAST casts a double word array to
///  <tt>unsigned long long*</tt>. The expansion is the same as
///  CONST_V64_CAST.
#define NCONST_V64_CAST(x) ((unsigned long long*)(x))

// VecLoad_ALTIVEC and VecStore_ALTIVEC are too noisy on modern
// compilers, so -Wdeprecated is silenced from here on.
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated"
#endif
183
184NAMESPACE_BEGIN(CryptoPP)
185
186#if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
187
188/// \brief Vector of 8-bit elements
189/// \par Wraps
190/// __vector unsigned char
191/// \since Crypto++ 6.0
192typedef __vector unsigned char uint8x16_p;
193/// \brief Vector of 16-bit elements
194/// \par Wraps
195/// __vector unsigned short
196/// \since Crypto++ 6.0
197typedef __vector unsigned short uint16x8_p;
198/// \brief Vector of 32-bit elements
199/// \par Wraps
200/// __vector unsigned int
201/// \since Crypto++ 6.0
202typedef __vector unsigned int uint32x4_p;
203
204#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
205/// \brief Vector of 64-bit elements
206/// \details uint64x2_p is available on POWER7 with VSX and above. Most
207/// supporting functions, like 64-bit <tt>vec_add</tt> (<tt>vaddudm</tt>)
208/// and <tt>vec_sub</tt> (<tt>vsubudm</tt>), did not arrive until POWER8.
209/// \par Wraps
210/// __vector unsigned long long
211/// \since Crypto++ 6.0
212typedef __vector unsigned long long uint64x2_p;
213#endif // VSX or ARCH_PWR8
214
215/// \brief The 0 vector
216/// \return a 32-bit vector of 0's
217/// \since Crypto++ 8.0
219{
220 const uint32x4_p v = {0,0,0,0};
221 return v;
222}
223
224/// \brief The 1 vector
225/// \return a 32-bit vector of 1's
226/// \since Crypto++ 8.0
228{
229 const uint32x4_p v = {1,1,1,1};
230 return v;
231}
232
233/// \brief Reverse bytes in a vector
234/// \tparam T vector type
235/// \param data the vector
236/// \return vector
237/// \details VecReverse() reverses the bytes in a vector
238/// \par Wraps
239/// vec_perm
240/// \since Crypto++ 6.0
241template <class T>
242inline T VecReverse(const T data)
243{
244#if defined(CRYPTOPP_BIG_ENDIAN)
245 const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
246 return (T)vec_perm(data, data, mask);
247#else
248 const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
249 return (T)vec_perm(data, data, mask);
250#endif
251}
252
/// \brief Reverse bytes in a vector on little-endian systems
/// \tparam T vector type
/// \param data the vector
/// \return the byte-reversed vector when <tt>CRYPTOPP_LITTLE_ENDIAN</tt>
///  is defined, otherwise <tt>data</tt> unchanged
/// \details VecReverseLE() reverses the bytes in a vector on
///  little-endian systems and is a no-op elsewhere.
/// \par Wraps
///  vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseLE(const T data)
{
#if !defined(CRYPTOPP_LITTLE_ENDIAN)
    // Not little-endian: nothing to do.
    return data;
#else
    const uint8x16_p reversal = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, reversal);
#endif
}
272
/// \brief Reverse bytes in a vector on big-endian systems
/// \tparam T vector type
/// \param data the vector
/// \return the byte-reversed vector when <tt>CRYPTOPP_BIG_ENDIAN</tt>
///  is defined, otherwise <tt>data</tt> unchanged
/// \details VecReverseBE() reverses the bytes in a vector on
///  big-endian systems and is a no-op elsewhere.
/// \par Wraps
///  vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseBE(const T data)
{
#if !defined(CRYPTOPP_BIG_ENDIAN)
    // Not big-endian: nothing to do.
    return data;
#else
    const uint8x16_p reversal = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, reversal);
#endif
}
292
293/// \name LOAD OPERATIONS
294//@{
295
296/// \brief Loads a vector from a byte array
297/// \param src the byte array
298/// \details Loads a vector in native endian format from a byte array.
299/// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
300/// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
301/// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>. The fixups using
302/// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so
303/// you should provide aligned memory addresses.
304/// \par Wraps
305/// vec_ld, vec_lvsl, vec_perm
306/// \sa VecLoad, VecLoadAligned
307/// \since Crypto++ 6.0
308inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
309{
310 // Avoid IsAlignedOn for convenience.
311 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
312 if (addr % 16 == 0)
313 {
314 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
315 }
316 else
317 {
318 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
319 const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
320 const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
321 const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
322 return (uint32x4_p)vec_perm(low, high, perm);
323 }
324}
325
326/// \brief Loads a vector from a byte array
327/// \param src the byte array
328/// \param off offset into the src byte array
329/// \details Loads a vector in native endian format from a byte array.
330/// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
331/// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
332/// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>.
333/// \details The fixups using <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are
334/// relatively expensive so you should provide aligned memory addresses.
335/// \par Wraps
336/// vec_ld, vec_lvsl, vec_perm
337/// \sa VecLoad, VecLoadAligned
338/// \since Crypto++ 6.0
339inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
340{
341 // Avoid IsAlignedOn for convenience.
342 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
343 if (addr % 16 == 0)
344 {
345 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
346 }
347 else
348 {
349 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
350 const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
351 const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
352 const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
353 return (uint32x4_p)vec_perm(low, high, perm);
354 }
355}
356
357/// \brief Loads a vector from a byte array
358/// \param src the byte array
359/// \details VecLoad() loads a vector from a byte array.
360/// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
361/// The instruction does not require aligned effective memory addresses.
362/// VecLoad_ALTIVEC() is used if POWER9 is not available.
363/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
364/// are required to fix up unaligned memory addresses.
365/// \par Wraps
366/// vec_xl on POWER9 and above, Altivec load on POWER8 and below
367/// \sa VecLoad_ALTIVEC, VecLoadAligned
368/// \since Crypto++ 6.0
369inline uint32x4_p VecLoad(const byte src[16])
370{
371 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
372 // word pointers. The ISA lacks loads for short* and char*.
373 // Power9/ISA 3.0 provides vec_xl for all datatypes.
374
375 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
376 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
377 CRYPTOPP_UNUSED(addr);
378
379#if defined(_ARCH_PWR9)
380 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
381#else
383#endif
384}
385
386/// \brief Loads a vector from a byte array
387/// \param src the byte array
388/// \param off offset into the src byte array
389/// \details VecLoad() loads a vector from a byte array.
390/// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
391/// The instruction does not require aligned effective memory addresses.
392/// VecLoad_ALTIVEC() is used if POWER9 is not available.
393/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
394/// are required to fix up unaligned memory addresses.
395/// \par Wraps
396/// vec_xl on POWER9 and above, Altivec load on POWER8 and below
397/// \sa VecLoad_ALTIVEC, VecLoadAligned
398/// \since Crypto++ 6.0
399inline uint32x4_p VecLoad(int off, const byte src[16])
400{
401 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
402 // word pointers. The ISA lacks loads for short* and char*.
403 // Power9/ISA 3.0 provides vec_xl for all datatypes.
404
405 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
406 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
407 CRYPTOPP_UNUSED(addr);
408
409#if defined(_ARCH_PWR9)
410 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
411#else
413#endif
414}
415
416/// \brief Loads a vector from a word array
417/// \param src the word array
418/// \details VecLoad() loads a vector from a word array.
419/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
420/// The instruction does not require aligned effective memory addresses.
421/// VecLoad_ALTIVEC() is used if POWER7 is not available.
422/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
423/// are required to fix up unaligned memory addresses.
424/// \par Wraps
425/// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
426/// \sa VecLoad_ALTIVEC, VecLoadAligned
427/// \since Crypto++ 8.0
428inline uint32x4_p VecLoad(const word32 src[4])
429{
430 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
431 // word pointers. The ISA lacks loads for short* and char*.
432 // Power9/ISA 3.0 provides vec_xl for all datatypes.
433
434 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
435 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
436 CRYPTOPP_UNUSED(addr);
437
438#if defined(_ARCH_PWR9)
439 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
440#elif defined(__VSX__) || defined(_ARCH_PWR8)
441 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
442#else
444#endif
445}
446
447/// \brief Loads a vector from a word array
448/// \param src the word array
449/// \param off offset into the word array
450/// \details VecLoad() loads a vector from a word array.
451/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
452/// The instruction does not require aligned effective memory addresses.
453/// VecLoad_ALTIVEC() is used if POWER7 is not available.
454/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
455/// are required to fix up unaligned memory addresses.
456/// \par Wraps
457/// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
458/// \sa VecLoad_ALTIVEC, VecLoadAligned
459/// \since Crypto++ 8.0
460inline uint32x4_p VecLoad(int off, const word32 src[4])
461{
462 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
463 // word pointers. The ISA lacks loads for short* and char*.
464 // Power9/ISA 3.0 provides vec_xl for all datatypes.
465
466 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
467 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
468 CRYPTOPP_UNUSED(addr);
469
470#if defined(_ARCH_PWR9)
471 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
472#elif defined(__VSX__) || defined(_ARCH_PWR8)
473 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
474#else
476#endif
477}
478
479#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
480
481/// \brief Loads a vector from a double word array
482/// \param src the double word array
483/// \details VecLoad() loads a vector from a double word array.
484/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
485/// The instruction does not require aligned effective memory addresses.
486/// VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
487/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
488/// are required to fix up unaligned memory addresses.
489/// \details VecLoad() with 64-bit elements is available on POWER7 and above.
490/// \par Wraps
491/// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
492/// \sa VecLoad_ALTIVEC, VecLoadAligned
493/// \since Crypto++ 8.0
494inline uint64x2_p VecLoad(const word64 src[2])
495{
496 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
497 // word pointers. The ISA lacks loads for short* and char*.
498 // Power9/ISA 3.0 provides vec_xl for all datatypes.
499
500 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
501 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
502 CRYPTOPP_UNUSED(addr);
503
504#if defined(_ARCH_PWR9)
505 return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
506#elif defined(__VSX__) || defined(_ARCH_PWR8)
507 // The 32-bit cast is not a typo. Compiler workaround.
508 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
509#else
511#endif
512}
513
514/// \brief Loads a vector from a double word array
515/// \param src the double word array
516/// \param off offset into the double word array
517/// \details VecLoad() loads a vector from a double word array.
518/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
519/// The instruction does not require aligned effective memory addresses.
520/// VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
521/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
522/// are required to fix up unaligned memory addresses.
523/// \details VecLoad() with 64-bit elements is available on POWER8 and above.
524/// \par Wraps
525/// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
526/// \sa VecLoad_ALTIVEC, VecLoadAligned
527/// \since Crypto++ 8.0
528inline uint64x2_p VecLoad(int off, const word64 src[2])
529{
530 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
531 // word pointers. The ISA lacks loads for short* and char*.
532 // Power9/ISA 3.0 provides vec_xl for all datatypes.
533
534 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
535 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
536 CRYPTOPP_UNUSED(addr);
537
538#if defined(_ARCH_PWR9)
539 return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
540#elif defined(__VSX__) || defined(_ARCH_PWR8)
541 // The 32-bit cast is not a typo. Compiler workaround.
542 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
543#else
545#endif
546}
547
548#endif // VSX or ARCH_PWR8
549
550/// \brief Loads a vector from an aligned byte array
551/// \param src the byte array
552/// \details VecLoadAligned() loads a vector from an aligned byte array.
553/// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
554/// <tt>vec_ld</tt> is used if POWER9 is not available. The effective
555/// address of <tt>src</tt> must be 16-byte aligned for Altivec.
556/// \par Wraps
557/// vec_xl on POWER9, vec_ld on POWER8 and below
558/// \sa VecLoad_ALTIVEC, VecLoad
559/// \since Crypto++ 8.0
560inline uint32x4_p VecLoadAligned(const byte src[16])
561{
562 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
563 // word pointers. The ISA lacks loads for short* and char*.
564 // Power9/ISA 3.0 provides vec_xl for all datatypes.
565
566 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
567 CRYPTOPP_ASSERT(addr % 16 == 0);
568 CRYPTOPP_UNUSED(addr);
569
570#if defined(_ARCH_PWR9)
571 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
572#else
573 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
574#endif
575}
576
577/// \brief Loads a vector from an aligned byte array
578/// \param src the byte array
579/// \param off offset into the src byte array
580/// \details VecLoadAligned() loads a vector from an aligned byte array.
581/// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
582/// <tt>vec_ld</tt> is used if POWER9 is not available. The effective
583/// address of <tt>src</tt> must be 16-byte aligned for Altivec.
584/// \par Wraps
585/// vec_xl on POWER9, vec_ld on POWER8 and below
586/// \sa VecLoad_ALTIVEC, VecLoad
587/// \since Crypto++ 8.0
588inline uint32x4_p VecLoadAligned(int off, const byte src[16])
589{
590 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
591 // word pointers. The ISA lacks loads for short* and char*.
592 // Power9/ISA 3.0 provides vec_xl for all datatypes.
593
594 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
595 CRYPTOPP_ASSERT(addr % 16 == 0);
596 CRYPTOPP_UNUSED(addr);
597
598#if defined(_ARCH_PWR9)
599 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
600#else
601 return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
602#endif
603}
604
605/// \brief Loads a vector from an aligned word array
606/// \param src the word array
607/// \details VecLoadAligned() loads a vector from an aligned word array.
608/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
609/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
610/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
611/// \par Wraps
612/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
613/// \sa VecLoad_ALTIVEC, VecLoad
614/// \since Crypto++ 8.0
615inline uint32x4_p VecLoadAligned(const word32 src[4])
616{
617 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
618 // word pointers. The ISA lacks loads for short* and char*.
619 // Power9/ISA 3.0 provides vec_xl for all datatypes.
620
621 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
622 CRYPTOPP_ASSERT(addr % 16 == 0);
623 CRYPTOPP_UNUSED(addr);
624
625#if defined(_ARCH_PWR9)
626 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
627#elif defined(__VSX__) || defined(_ARCH_PWR8)
628 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(src));
629#else
630 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
631#endif
632}
633
634/// \brief Loads a vector from an aligned word array
635/// \param src the word array
636/// \param off offset into the src word array
637/// \details VecLoadAligned() loads a vector from an aligned word array.
638/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
639/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
640/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
641/// \par Wraps
642/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
643/// \sa VecLoad_ALTIVEC, VecLoad
644/// \since Crypto++ 8.0
645inline uint32x4_p VecLoadAligned(int off, const word32 src[4])
646{
647 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
648 // word pointers. The ISA lacks loads for short* and char*.
649 // Power9/ISA 3.0 provides vec_xl for all datatypes.
650
651 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
652 CRYPTOPP_ASSERT(addr % 16 == 0);
653 CRYPTOPP_UNUSED(addr);
654
655#if defined(_ARCH_PWR9)
656 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
657#elif defined(__VSX__) || defined(_ARCH_PWR8)
658 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
659#else
660 return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
661#endif
662}
663
664#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
665
666/// \brief Loads a vector from an aligned double word array
667/// \param src the double word array
668/// \details VecLoadAligned() loads a vector from an aligned double word array.
669/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
670/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
671/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
672/// \par Wraps
673/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
674/// \sa VecLoad_ALTIVEC, VecLoad
675/// \since Crypto++ 8.0
676inline uint64x2_p VecLoadAligned(const word64 src[4])
677{
678 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
679 // word pointers. The ISA lacks loads for short* and char*.
680 // Power9/ISA 3.0 provides vec_xl for all datatypes.
681
682 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
683 CRYPTOPP_ASSERT(addr % 16 == 0);
684 CRYPTOPP_UNUSED(addr);
685
686#if defined(_ARCH_PWR9)
687 return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
688#elif defined(__VSX__) || defined(_ARCH_PWR8)
689 // The 32-bit cast is not a typo. Compiler workaround.
690 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(src));
691#else
692 return (uint64x2_p)vec_ld(0, CONST_V8_CAST(src));
693#endif
694}
695
696/// \brief Loads a vector from an aligned double word array
697/// \param src the double word array
698/// \param off offset into the src double word array
699/// \details VecLoadAligned() loads a vector from an aligned double word array.
700/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
701/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
702/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
703/// \par Wraps
704/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
705/// \sa VecLoad_ALTIVEC, VecLoad
706/// \since Crypto++ 8.0
707inline uint64x2_p VecLoadAligned(int off, const word64 src[4])
708{
709 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
710 // word pointers. The ISA lacks loads for short* and char*.
711 // Power9/ISA 3.0 provides vec_xl for all datatypes.
712
713 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
714 CRYPTOPP_ASSERT(addr % 16 == 0);
715 CRYPTOPP_UNUSED(addr);
716
717#if defined(_ARCH_PWR9)
718 return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
719#elif defined(__VSX__) || defined(_ARCH_PWR8)
720 // The 32-bit cast is not a typo. Compiler workaround.
721 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
722#else
723 return (uint64x2_p)vec_ld(off, CONST_V8_CAST(src));
724#endif
725}
726
727#endif
728
729/// \brief Loads a vector from a byte array
730/// \param src the byte array
731/// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
732/// will reverse all bytes in the array on a little endian system.
733/// \details VecLoadBE() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
734/// The instruction does not require aligned effective memory addresses.
735/// VecLoad_ALTIVEC() is used if POWER7 or VSX are not available.
736/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
737/// are required to fix up unaligned memory addresses.
738/// \par Wraps
739/// vec_xl on POWER8, Altivec load on POWER7 and below
740/// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
741/// \since Crypto++ 6.0
742inline uint32x4_p VecLoadBE(const byte src[16])
743{
744 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
745 // word pointers. The ISA lacks loads for short* and char*.
746 // Power9/ISA 3.0 provides vec_xl for all datatypes.
747
748 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
749 // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
750 CRYPTOPP_UNUSED(addr);
751
752#if defined(_ARCH_PWR9)
753 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
754 return (uint32x4_p)vec_xl_be(0, CONST_V8_CAST(src));
755#elif defined(CRYPTOPP_BIG_ENDIAN)
757#else
759#endif
760}
761
762/// \brief Loads a vector from a byte array
763/// \param src the byte array
764/// \param off offset into the src byte array
765/// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
766/// will reverse all bytes in the array on a little endian system.
767/// \details VecLoadBE() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
768/// The instruction does not require aligned effective memory addresses.
769/// VecLoad_ALTIVEC() is used if POWER7 is not available.
770/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
771/// are required to fix up unaligned memory addresses.
772/// \par Wraps
773/// vec_xl on POWER8, Altivec load on POWER7 and below
774/// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
775/// \since Crypto++ 6.0
776inline uint32x4_p VecLoadBE(int off, const byte src[16])
777{
778 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
779 // word pointers. The ISA lacks loads for short* and char*.
780 // Power9/ISA 3.0 provides vec_xl for all datatypes.
781
782 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
783 // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
784 CRYPTOPP_UNUSED(addr);
785
786#if defined(_ARCH_PWR9)
787 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
788 return (uint32x4_p)vec_xl_be(off, CONST_V8_CAST(src));
789#elif defined(CRYPTOPP_BIG_ENDIAN)
791#else
793#endif
794}
795
796//@}
797
798/// \name STORE OPERATIONS
799//@{
800
801/// \brief Stores a vector to a byte array
802/// \tparam T vector type
803/// \param data the vector
804/// \param dest the byte array
805/// \details VecStore_ALTIVEC() stores a vector to a byte array.
806/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
807/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
808/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
809/// memory addresses.
810/// \details VecStore_ALTIVEC() is used when POWER7 or above
811/// and unaligned loads is not available.
812/// \par Wraps
813/// vec_st, vec_ste, vec_lvsr, vec_perm
814/// \sa VecStore, VecStoreAligned
815/// \since Crypto++ 8.0
816template<class T>
817inline void VecStore_ALTIVEC(const T data, byte dest[16])
818{
819 // Avoid IsAlignedOn for convenience.
820 uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
821 if (addr % 16 == 0)
822 {
823 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
824 }
825 else
826 {
827 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
828 uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
829 vec_ste((uint8x16_p) perm, 0, (unsigned char*) NCONST_V8_CAST(addr));
830 vec_ste((uint16x8_p) perm, 1, (unsigned short*)NCONST_V8_CAST(addr));
831 vec_ste((uint32x4_p) perm, 3, (unsigned int*) NCONST_V8_CAST(addr));
832 vec_ste((uint32x4_p) perm, 4, (unsigned int*) NCONST_V8_CAST(addr));
833 vec_ste((uint32x4_p) perm, 8, (unsigned int*) NCONST_V8_CAST(addr));
834 vec_ste((uint32x4_p) perm, 12, (unsigned int*) NCONST_V8_CAST(addr));
835 vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
836 vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
837 }
838}
839
840/// \brief Stores a vector to a byte array
841/// \tparam T vector type
842/// \param data the vector
843/// \param off offset into the dest byte array
844/// \param dest the byte array
845/// \details VecStore_ALTIVEC() stores a vector to a byte array.
846/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
847/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
848/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
849/// memory addresses.
850/// \details VecStore_ALTIVEC() is used when POWER7 or above
851/// and unaligned loads is not available.
852/// \par Wraps
853/// vec_st, vec_ste, vec_lvsr, vec_perm
854/// \sa VecStore, VecStoreAligned
855/// \since Crypto++ 8.0
856template<class T>
857inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
858{
859 // Avoid IsAlignedOn for convenience.
860 uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
861 if (addr % 16 == 0)
862 {
863 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
864 }
865 else
866 {
867 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
868 uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
869 vec_ste((uint8x16_p) perm, 0, (unsigned char*) NCONST_V8_CAST(addr));
870 vec_ste((uint16x8_p) perm, 1, (unsigned short*)NCONST_V8_CAST(addr));
871 vec_ste((uint32x4_p) perm, 3, (unsigned int*) NCONST_V8_CAST(addr));
872 vec_ste((uint32x4_p) perm, 4, (unsigned int*) NCONST_V8_CAST(addr));
873 vec_ste((uint32x4_p) perm, 8, (unsigned int*) NCONST_V8_CAST(addr));
874 vec_ste((uint32x4_p) perm, 12, (unsigned int*) NCONST_V8_CAST(addr));
875 vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
876 vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
877 }
878}
879
880/// \brief Stores a vector to a byte array
881/// \tparam T vector type
882/// \param data the vector
883/// \param dest the byte array
884/// \details VecStore() stores a vector to a byte array.
885/// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
886/// The instruction does not require aligned effective memory addresses.
887/// VecStore_ALTIVEC() is used if POWER9 is not available.
888/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
889/// are required to fix up unaligned memory addresses.
890/// \par Wraps
891/// vec_xst on POWER9 and above, Altivec store on POWER8 and below
892/// \sa VecStore_ALTIVEC, VecStoreAligned
893/// \since Crypto++ 6.0
894template<class T>
895inline void VecStore(const T data, byte dest[16])
896{
897 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
898 // word pointers. The ISA lacks loads for short* and char*.
899 // Power9/ISA 3.0 provides vec_xl for all datatypes.
900
901 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
902 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
903 CRYPTOPP_UNUSED(addr);
904
905#if defined(_ARCH_PWR9)
906 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
907#else
909#endif
910}
911
912/// \brief Stores a vector to a byte array
913/// \tparam T vector type
914/// \param data the vector
915/// \param off offset into the dest byte array
916/// \param dest the byte array
917/// \details VecStore() stores a vector to a byte array.
918/// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
919/// The instruction does not require aligned effective memory addresses.
920/// VecStore_ALTIVEC() is used if POWER9 is not available.
921/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
922/// are required to fix up unaligned memory addresses.
923/// \par Wraps
924/// vec_xst on POWER9 and above, Altivec store on POWER8 and below
925/// \sa VecStore_ALTIVEC, VecStoreAligned
926/// \since Crypto++ 6.0
927template<class T>
928inline void VecStore(const T data, int off, byte dest[16])
929{
930 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
931 // word pointers. The ISA lacks loads for short* and char*.
932 // Power9/ISA 3.0 provides vec_xl for all datatypes.
933
934 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
935 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
936 CRYPTOPP_UNUSED(addr);
937
938#if defined(_ARCH_PWR9)
939 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
940#else
942#endif
943}
944
945/// \brief Stores a vector to a word array
946/// \tparam T vector type
947/// \param data the vector
948/// \param dest the word array
949/// \details VecStore() stores a vector to a word array.
950/// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
951/// The instruction does not require aligned effective memory addresses.
952/// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
953/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
954/// are required to fix up unaligned memory addresses.
955/// \par Wraps
956/// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
957/// \sa VecStore_ALTIVEC, VecStoreAligned
958/// \since Crypto++ 8.0
959template<class T>
960inline void VecStore(const T data, word32 dest[4])
961{
962 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
963 // word pointers. The ISA lacks stores for short* and char*.
964 // Power9/ISA 3.0 provides vec_xst for all datatypes.
965
966 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
967 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
968 CRYPTOPP_UNUSED(addr);
969
970#if defined(_ARCH_PWR9)
971 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
972#elif defined(__VSX__) || defined(_ARCH_PWR8)
973 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
974#else
976#endif
977}
978
979/// \brief Stores a vector to a word array
980/// \tparam T vector type
981/// \param data the vector
982/// \param off offset into the dest word array
983/// \param dest the word array
984/// \details VecStore() stores a vector to a word array.
985/// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
986/// The instruction does not require aligned effective memory addresses.
987/// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
988/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
989/// are required to fix up unaligned memory addresses.
990/// \par Wraps
991/// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
992/// \sa VecStore_ALTIVEC, VecStoreAligned
993/// \since Crypto++ 8.0
994template<class T>
995inline void VecStore(const T data, int off, word32 dest[4])
996{
997 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
998 // word pointers. The ISA lacks stores for short* and char*.
999 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1000
1001 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1002 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1003 CRYPTOPP_UNUSED(addr);
1004
1005#if defined(_ARCH_PWR9)
1006 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1007#elif defined(__VSX__) || defined(_ARCH_PWR8)
1008 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1009#else
1011#endif
1012}
1013
1014/// \brief Stores a vector to a word array
1015/// \tparam T vector type
1016/// \param data the vector
1017/// \param dest the word array
1018/// \details VecStore() stores a vector to a word array.
1019/// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1020/// The instruction does not require aligned effective memory addresses.
1021/// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
1022/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1023/// are required to fix up unaligned memory addresses.
1024/// \details VecStore() with 64-bit elements is available on POWER8 and above.
1025/// \par Wraps
1026/// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
1027/// \sa VecStore_ALTIVEC, VecStoreAligned
1028/// \since Crypto++ 8.0
1029template<class T>
1030inline void VecStore(const T data, word64 dest[2])
1031{
1032 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1033 // word pointers. The ISA lacks stores for short* and char*.
1034 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1035
1036 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1037 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1038 CRYPTOPP_UNUSED(addr);
1039
1040#if defined(_ARCH_PWR9)
1041 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1042#elif defined(__VSX__) || defined(_ARCH_PWR8)
1043 // 32-bit cast is not a typo. Compiler workaround.
1044 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1045#else
1047#endif
1048}
1049
1050/// \brief Stores a vector to a word array
1051/// \tparam T vector type
1052/// \param data the vector
1053/// \param off offset into the dest word array
1054/// \param dest the word array
1055/// \details VecStore() stores a vector to a word array.
1056/// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1057/// The instruction does not require aligned effective memory addresses.
1058/// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
1059/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1060/// are required to fix up unaligned memory addresses.
1061/// \details VecStore() with 64-bit elements is available on POWER8 and above.
1062/// \par Wraps
1063/// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
1064/// \sa VecStore_ALTIVEC, VecStoreAligned
1065/// \since Crypto++ 8.0
1066template<class T>
1067inline void VecStore(const T data, int off, word64 dest[2])
1068{
1069 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1070 // word pointers. The ISA lacks stores for short* and char*.
1071 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1072
1073 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1074 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1075 CRYPTOPP_UNUSED(addr);
1076
1077#if defined(_ARCH_PWR9)
1078 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1079#elif defined(__VSX__) || defined(_ARCH_PWR8)
1080 // 32-bit cast is not a typo. Compiler workaround.
1081 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1082#else
1084#endif
1085}
1086
1087/// \brief Stores a vector to a byte array
1088/// \tparam T vector type
1089/// \param data the vector
1090/// \param dest the byte array
1091/// \details VecStoreAligned() stores a vector from an aligned byte array.
1092/// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1093/// <tt>vec_st</tt> is used if POWER9 is not available. The effective
1094/// address of <tt>dest</tt> must be 16-byte aligned for Altivec.
1095/// \par Wraps
1096/// vec_xst on POWER9 or above, vec_st on POWER8 and below
1097/// \sa VecStore_ALTIVEC, VecStore
1098/// \since Crypto++ 8.0
1099template<class T>
1100inline void VecStoreAligned(const T data, byte dest[16])
1101{
1102 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
1103 // word pointers. The ISA lacks loads for short* and char*.
1104 // Power9/ISA 3.0 provides vec_xl for all datatypes.
1105
1106 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1107 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1108 CRYPTOPP_UNUSED(addr);
1109
1110#if defined(_ARCH_PWR9)
1111 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1112#else
1113 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1114#endif
1115}
1116
1117/// \brief Stores a vector to a byte array
1118/// \tparam T vector type
1119/// \param data the vector
1120/// \param off offset into the dest byte array
1121/// \param dest the byte array
1122/// \details VecStoreAligned() stores a vector from an aligned byte array.
1123/// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1124/// <tt>vec_st</tt> is used if POWER9 is not available. The effective
1125/// address of <tt>dest</tt> must be 16-byte aligned for Altivec.
1126/// \par Wraps
1127/// vec_xst on POWER9 or above, vec_st on POWER8 and below
1128/// \sa VecStore_ALTIVEC, VecStore
1129/// \since Crypto++ 8.0
1130template<class T>
1131inline void VecStoreAligned(const T data, int off, byte dest[16])
1132{
1133 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
1134 // word pointers. The ISA lacks loads for short* and char*.
1135 // Power9/ISA 3.0 provides vec_xl for all datatypes.
1136
1137 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1138 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1139 CRYPTOPP_UNUSED(addr);
1140
1141#if defined(_ARCH_PWR9)
1142 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1143#else
1144 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1145#endif
1146}
1147
1148/// \brief Stores a vector to a word array
1149/// \tparam T vector type
1150/// \param data the vector
1151/// \param dest the word array
1152/// \details VecStoreAligned() stores a vector from an aligned word array.
1153/// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1154/// POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
1155/// is used if POWER7 is not available. The effective address of <tt>dest</tt>
1156/// must be 16-byte aligned for Altivec.
1157/// \par Wraps
1158/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1159/// \sa VecStore_ALTIVEC, VecStore
1160/// \since Crypto++ 8.0
1161template<class T>
1162inline void VecStoreAligned(const T data, word32 dest[4])
1163{
1164 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1165 // word pointers. The ISA lacks stores for short* and char*.
1166 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1167
1168 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1169 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1170 CRYPTOPP_UNUSED(addr);
1171
1172#if defined(_ARCH_PWR9)
1173 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1174#elif defined(__VSX__) || defined(_ARCH_PWR8)
1175 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1176#else
1177 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1178#endif
1179}
1180
1181/// \brief Stores a vector to a word array
1182/// \tparam T vector type
1183/// \param data the vector
1184/// \param off offset into the dest word array
1185/// \param dest the word array
1186/// \details VecStoreAligned() stores a vector from an aligned word array.
1187/// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1188/// POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
1189/// is used if POWER7 is not available. The effective address of <tt>dest</tt>
1190/// must be 16-byte aligned for Altivec.
1191/// \par Wraps
1192/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1193/// \sa VecStore_ALTIVEC, VecStore
1194/// \since Crypto++ 8.0
1195template<class T>
1196inline void VecStoreAligned(const T data, int off, word32 dest[4])
1197{
1198 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1199 // word pointers. The ISA lacks stores for short* and char*.
1200 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1201
1202 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1203 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1204 CRYPTOPP_UNUSED(addr);
1205
1206#if defined(_ARCH_PWR9)
1207 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1208#elif defined(__VSX__) || defined(_ARCH_PWR8)
1209 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1210#else
1211 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1212#endif
1213}
1214
1215/// \brief Stores a vector to a byte array
1216/// \tparam T vector type
1217/// \param data the vector
1218/// \param dest the byte array
1219/// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
1220/// will reverse all bytes in the array on a little endian system.
1221/// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1222/// The instruction does not require aligned effective memory addresses.
1223/// VecStore_ALTIVEC() is used if POWER7 is not available.
1224/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1225/// are required to fix up unaligned memory addresses.
1226/// \par Wraps
1227/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1228/// \sa VecStore_ALTIVEC, VecStoreAligned
1229/// \since Crypto++ 6.0
1230template <class T>
1231inline void VecStoreBE(const T data, byte dest[16])
1232{
1233 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1234 // word pointers. The ISA lacks stores for short* and char*.
1235 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1236
1237 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1238 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1239 CRYPTOPP_UNUSED(addr);
1240
1241#if defined(_ARCH_PWR9)
1242 vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1243#elif defined(CRYPTOPP_BIG_ENDIAN)
1244 VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1245#else
1247#endif
1248}
1249
1250/// \brief Stores a vector to a byte array
1251/// \tparam T vector type
1252/// \param data the vector
1253/// \param off offset into the dest byte array
1254/// \param dest the byte array
1255/// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
1256/// will reverse all bytes in the array on a little endian system.
1257/// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1258/// The instruction does not require aligned effective memory addresses.
1259/// VecStore_ALTIVEC() is used if POWER7 is not available.
1260/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1261/// are required to fix up unaligned memory addresses.
1262/// \par Wraps
1263/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1264/// \sa VecStore_ALTIVEC, VecStoreAligned
1265/// \since Crypto++ 6.0
1266template <class T>
1267inline void VecStoreBE(const T data, int off, byte dest[16])
1268{
1269 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1270 // word pointers. The ISA lacks stores for short* and char*.
1271 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1272
1273 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1274 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1275 CRYPTOPP_UNUSED(addr);
1276
1277#if defined(_ARCH_PWR9)
1278 vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1279#elif defined(CRYPTOPP_BIG_ENDIAN)
1280 VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1281#else
1283#endif
1284}
1285
1286/// \brief Stores a vector to a word array
1287/// \tparam T vector type
1288/// \param data the vector
1289/// \param dest the word array
1290/// \details VecStoreBE() stores a vector to a word array. VecStoreBE
1291/// will reverse all bytes in the array on a little endian system.
1292/// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1293/// The instruction does not require aligned effective memory addresses.
1294/// VecStore_ALTIVEC() is used if POWER7 is not available.
1295/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1296/// are required to fix up unaligned memory addresses.
1297/// \par Wraps
1298/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1299/// \sa VecStore_ALTIVEC, VecStoreAligned
1300/// \since Crypto++ 8.0
1301template <class T>
1302inline void VecStoreBE(const T data, word32 dest[4])
1303{
1304 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1305 // word pointers. The ISA lacks stores for short* and char*.
1306 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1307
1308 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1309 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1310 CRYPTOPP_UNUSED(addr);
1311
1312#if defined(_ARCH_PWR9)
1313 vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1314#elif defined(CRYPTOPP_BIG_ENDIAN)
1315 VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1316#else
1318#endif
1319}
1320
1321/// \brief Stores a vector to a word array
1322/// \tparam T vector type
1323/// \param data the vector
1324/// \param off offset into the dest word array
1325/// \param dest the word array
1326/// \details VecStoreBE() stores a vector to a word array. VecStoreBE
1327/// will reverse all words in the array on a little endian system.
1328/// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1329/// The instruction does not require aligned effective memory addresses.
1330/// VecStore_ALTIVEC() is used if POWER7 is not available.
1331/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1332/// are required to fix up unaligned memory addresses.
1333/// \par Wraps
1334/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1335/// \sa VecStore_ALTIVEC, VecStoreAligned
1336/// \since Crypto++ 8.0
1337template <class T>
1338inline void VecStoreBE(const T data, int off, word32 dest[4])
1339{
1340 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1341 // word pointers. The ISA lacks stores for short* and char*.
1342 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1343
1344 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1345 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1346 CRYPTOPP_UNUSED(addr);
1347
1348#if defined(_ARCH_PWR9)
1349 vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1350#elif defined(CRYPTOPP_BIG_ENDIAN)
1351 VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1352#else
1354#endif
1355}
1356
1357//@}
1358
1359/// \name LOGICAL OPERATIONS
1360//@{
1361
/// \brief AND two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd() computes <tt>vec1 & vec2</tt>. vec2 is first
///  cast to the type of vec1, and the result has the type of vec1.
/// \par Wraps
///  vec_and
/// \sa VecAnd64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAnd(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the first operand's type.
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
1380
/// \brief OR two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr() computes <tt>vec1 | vec2</tt>. vec2 is first
///  cast to the type of vec1, and the result has the type of vec1.
/// \par Wraps
///  vec_or
/// \sa VecOr64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecOr(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the first operand's type.
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
1399
/// \brief XOR two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor() computes <tt>vec1 ^ vec2</tt>. vec2 is first
///  cast to the type of vec1, and the result has the type of vec1.
/// \par Wraps
///  vec_xor
/// \sa VecXor64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecXor(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the first operand's type.
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
1418
1419//@}
1420
1421/// \name ARITHMETIC OPERATIONS
1422//@{
1423
/// \brief Add two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAdd() computes <tt>vec1 + vec2</tt>. vec2 is first
///  cast to the type of vec1, and the result has the type of vec1.
/// \par Wraps
///  vec_add
/// \sa VecAdd64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAdd(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the first operand's type.
    const T1 rhs = (T1)vec2;
    return (T1)vec_add(vec1, rhs);
}
1442
/// \brief Subtract two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecSub() computes <tt>vec1 - vec2</tt>. vec2 is first
///  cast to the type of vec1, and the result has the type of vec1.
/// \par Wraps
///  vec_sub
/// \sa VecSub64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecSub(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the first operand's type.
    const T1 rhs = (T1)vec2;
    return (T1)vec_sub(vec1, rhs);
}
1460
1461//@}
1462
1463/// \name PERMUTE OPERATIONS
1464//@{
1465
1466/// \brief Permutes a vector
1467/// \tparam T1 vector type
1468/// \tparam T2 vector type
1469/// \param vec the vector
1470/// \param mask vector mask
1471/// \return vector
1472/// \details VecPermute() creates a new vector from vec according to mask.
1473/// mask is an uint8x16_p vector. The return vector is the same type as vec.
1474/// \par Wraps
1475/// vec_perm
1476/// \since Crypto++ 6.0
1477template <class T1, class T2>
1478inline T1 VecPermute(const T1 vec, const T2 mask)
1479{
1480 return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
1481}
1482
1483/// \brief Permutes two vectors
1484/// \tparam T1 vector type
1485/// \tparam T2 vector type
1486/// \param vec1 the first vector
1487/// \param vec2 the second vector
1488/// \param mask vector mask
1489/// \return vector
1490/// \details VecPermute() creates a new vector from vec1 and vec2 according to mask.
1491/// mask is an uint8x16_p vector. The return vector is the same type as vec.
1492/// \par Wraps
1493/// vec_perm
1494/// \since Crypto++ 6.0
1495template <class T1, class T2>
1496inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
1497{
1498 return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
1499}
1500
1501//@}
1502
1503/// \name SHIFT AND ROTATE OPERATIONS
1504//@{
1505
1506/// \brief Shift a vector left
1507/// \tparam C shift byte count
1508/// \tparam T vector type
1509/// \param vec the vector
1510/// \return vector
1511/// \details VecShiftLeftOctet() returns a new vector after shifting the
1512/// concatenation of the zero vector and the source vector by the specified
1513/// number of bytes. The return vector is the same type as vec.
1514/// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,
1515/// c)</tt>. On little endian machines VecShiftLeftOctet() is translated to
1516/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1517/// if on a big endian machine as shown below.
1518/// <pre>
1519/// uint8x16_p x = VecLoad(ptr);
1520/// uint8x16_p y = VecShiftLeftOctet<12>(x);
1521/// </pre>
1522/// \par Wraps
1523/// vec_sld
1524/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1525/// endian sensitive?</A> on Stack Overflow
1526/// \since Crypto++ 6.0
1527template <unsigned int C, class T>
1528inline T VecShiftLeftOctet(const T vec)
1529{
1530 const T zero = {0};
1531 if (C >= 16)
1532 {
1533 // Out of range
1534 return zero;
1535 }
1536 else if (C == 0)
1537 {
1538 // Noop
1539 return vec;
1540 }
1541 else
1542 {
1543#if defined(CRYPTOPP_BIG_ENDIAN)
1544 enum { R=C&0xf };
1545 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1546#else
1547 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1548 return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1549#endif
1550 }
1551}
1552
1553/// \brief Shift a vector right
1554/// \tparam C shift byte count
1555/// \tparam T vector type
1556/// \param vec the vector
1557/// \return vector
1558/// \details VecShiftRightOctet() returns a new vector after shifting the
1559/// concatenation of the zero vector and the source vector by the specified
1560/// number of bytes. The return vector is the same type as vec.
1561/// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(a, z,
1562/// c)</tt>. On little endian machines VecShiftRightOctet() is translated to
1563/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1564/// if on a big endian machine as shown below.
1565/// <pre>
1566/// uint8x16_p x = VecLoad(ptr);
1567/// uint8x16_p y = VecShiftRightOctet<12>(y);
1568/// </pre>
1569/// \par Wraps
1570/// vec_sld
1571/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1572/// endian sensitive?</A> on Stack Overflow
1573/// \since Crypto++ 6.0
1574template <unsigned int C, class T>
1575inline T VecShiftRightOctet(const T vec)
1576{
1577 const T zero = {0};
1578 if (C >= 16)
1579 {
1580 // Out of range
1581 return zero;
1582 }
1583 else if (C == 0)
1584 {
1585 // Noop
1586 return vec;
1587 }
1588 else
1589 {
1590#if defined(CRYPTOPP_BIG_ENDIAN)
1591 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1592 return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1593#else
1594 enum { R=C&0xf };
1595 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1596#endif
1597 }
1598}
1599
1600/// \brief Rotate a vector left
1601/// \tparam C shift byte count
1602/// \tparam T vector type
1603/// \param vec the vector
1604/// \return vector
1605/// \details VecRotateLeftOctet() returns a new vector after rotating the
1606/// concatenation of the source vector with itself by the specified
1607/// number of bytes. The return vector is the same type as vec.
1608/// \par Wraps
1609/// vec_sld
1610/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1611/// endian sensitive?</A> on Stack Overflow
1612/// \since Crypto++ 6.0
1613template <unsigned int C, class T>
1614inline T VecRotateLeftOctet(const T vec)
1615{
1616#if defined(CRYPTOPP_BIG_ENDIAN)
1617 enum { R = C&0xf };
1618 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1619#else
1620 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1621 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1622#endif
1623}
1624
1625/// \brief Rotate a vector right
1626/// \tparam C shift byte count
1627/// \tparam T vector type
1628/// \param vec the vector
1629/// \return vector
1630/// \details VecRotateRightOctet() returns a new vector after rotating the
1631/// concatenation of the source vector with itself by the specified
1632/// number of bytes. The return vector is the same type as vec.
1633/// \par Wraps
1634/// vec_sld
1635/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1636/// endian sensitive?</A> on Stack Overflow
1637/// \since Crypto++ 6.0
1638template <unsigned int C, class T>
1639inline T VecRotateRightOctet(const T vec)
1640{
1641#if defined(CRYPTOPP_BIG_ENDIAN)
1642 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1643 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1644#else
1645 enum { R = C&0xf };
1646 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1647#endif
1648}
1649
1650/// \brief Rotate a vector left
1651/// \tparam C rotate bit count
1652/// \param vec the vector
1653/// \return vector
1654/// \details VecRotateLeft() rotates each element in a vector by
1655/// bit count. The return vector is the same type as vec.
1656/// \par Wraps
1657/// vec_rl
1658/// \since Crypto++ 7.0
1659template<unsigned int C>
1661{
1662 const uint32x4_p m = {C, C, C, C};
1663 return vec_rl(vec, m);
1664}
1665
1666/// \brief Rotate a vector right
1667/// \tparam C rotate bit count
1668/// \param vec the vector
1669/// \return vector
1670/// \details VecRotateRight() rotates each element in a vector
1671/// by bit count. The return vector is the same type as vec.
1672/// \par Wraps
1673/// vec_rl
1674/// \since Crypto++ 7.0
1675template<unsigned int C>
1677{
1678 const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
1679 return vec_rl(vec, m);
1680}
1681
1682/// \brief Shift a vector left
1683/// \tparam C shift bit count
1684/// \param vec the vector
1685/// \return vector
1686/// \details VecShiftLeft() rotates each element in a vector
1687/// by bit count. The return vector is the same type as vec.
1688/// \par Wraps
1689/// vec_sl
1690/// \since Crypto++ 8.1
1691template<unsigned int C>
1693{
1694 const uint32x4_p m = {C, C, C, C};
1695 return vec_sl(vec, m);
1696}
1697
1698/// \brief Shift a vector right
1699/// \tparam C shift bit count
1700/// \param vec the vector
1701/// \return vector
1702/// \details VecShiftRight() rotates each element in a vector
1703/// by bit count. The return vector is the same type as vec.
1704/// \par Wraps
1705/// vec_rl
1706/// \since Crypto++ 8.1
1707template<unsigned int C>
1709{
1710 const uint32x4_p m = {C, C, C, C};
1711 return vec_sr(vec, m);
1712}
1713
1714// 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
1715#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1716
1717/// \brief Rotate a vector left
1718/// \tparam C rotate bit count
1719/// \param vec the vector
1720/// \return vector
1721/// \details VecRotateLeft() rotates each element in a vector
1722/// by bit count. The return vector is the same type as vec.
1723/// \details VecRotateLeft() with 64-bit elements is available on
1724/// POWER8 and above.
1725/// \par Wraps
1726/// vec_rl
1727/// \since Crypto++ 8.0
1728template<unsigned int C>
1730{
1731 const uint64x2_p m = {C, C};
1732 return vec_rl(vec, m);
1733}
1734
1735/// \brief Shift a vector left
1736/// \tparam C shift bit count
1737/// \param vec the vector
1738/// \return vector
1739/// \details VecShiftLeft() rotates each element in a vector
1740/// by bit count. The return vector is the same type as vec.
1741/// \details VecShiftLeft() with 64-bit elements is available on
1742/// POWER8 and above.
1743/// \par Wraps
1744/// vec_sl
1745/// \since Crypto++ 8.1
1746template<unsigned int C>
1748{
1749 const uint64x2_p m = {C, C};
1750 return vec_sl(vec, m);
1751}
1752
1753/// \brief Rotate a vector right
1754/// \tparam C rotate bit count
1755/// \param vec the vector
1756/// \return vector
1757/// \details VecRotateRight() rotates each element in a vector
1758/// by bit count. The return vector is the same type as vec.
1759/// \details VecRotateRight() with 64-bit elements is available on
1760/// POWER8 and above.
1761/// \par Wraps
1762/// vec_rl
1763/// \since Crypto++ 8.0
1764template<unsigned int C>
1766{
1767 const uint64x2_p m = {64-C, 64-C};
1768 return vec_rl(vec, m);
1769}
1770
1771/// \brief Shift a vector right
1772/// \tparam C shift bit count
1773/// \param vec the vector
1774/// \return vector
1775/// \details VecShiftRight() rotates each element in a vector
1776/// by bit count. The return vector is the same type as vec.
1777/// \details VecShiftRight() with 64-bit elements is available on
1778/// POWER8 and above.
1779/// \par Wraps
1780/// vec_sr
1781/// \since Crypto++ 8.1
1782template<unsigned int C>
1784{
1785 const uint64x2_p m = {C, C};
1786 return vec_sr(vec, m);
1787}
1788
1789#endif // ARCH_PWR8
1790
1791//@}
1792
1793/// \name OTHER OPERATIONS
1794//@{
1795
/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecMergeLow() forwards both vectors to <tt>vec_mergel</tt>.
/// \par Wraps
///  vec_mergel
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeLow(const T vec1, const T vec2)
{
    return vec_mergel(vec1, vec2);
}
1809
/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecMergeHigh() forwards both vectors to <tt>vec_mergeh</tt>.
/// \par Wraps
///  vec_mergeh
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeHigh(const T vec1, const T vec2)
{
    return vec_mergeh(vec1, vec2);
}
1823
1824/// \brief Broadcast 32-bit word to a vector
1825/// \param val the 32-bit value
1826/// \return vector
1827/// \par Wraps
1828/// vec_splats
1829/// \since Crypto++ 8.3
1831{
1832 // Fix spurious GCC warning???
1833 CRYPTOPP_UNUSED(val);
1834
1835 // Apple Altivec and XL C++ do not offer vec_splats.
1836 // GCC offers vec_splats back to -mcpu=power4.
1837#if defined(_ARCH_PWR4) && defined(__GNUC__)
1838 return vec_splats(val);
1839#else
1840 //const word32 x[4] = {val,val,val,val};
1841 //return VecLoad(x);
1842 const word32 x[4] = {val};
1843 return vec_splat(VecLoad(x),0);
1844#endif
1845}
1846
1847/// \brief Broadcast 32-bit element to a vector
1848/// \tparam the element number
1849/// \param val the 32-bit value
1850/// \return vector
1851/// \par Wraps
1852/// vec_splat
1853/// \since Crypto++ 8.3
1854template <unsigned int N>
1856{
1857 return vec_splat(val, N);
1858}
1859
1860#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1861/// \brief Broadcast 64-bit double word to a vector
1862/// \param val the 64-bit value
1863/// \return vector
1864/// \par Wraps
1865/// vec_splats
1866/// \since Crypto++ 8.3
1868{
1869 // The PPC64 ABI says so.
1870 return vec_splats((unsigned long long)val);
1871}
1872
1873/// \brief Broadcast 64-bit element to a vector
1874/// \tparam the element number
1875/// \param val the 64-bit value
1876/// \return vector
1877/// \par Wraps
1878/// vec_splat
1879/// \since Crypto++ 8.3
1880template <unsigned int N>
1882{
1883#if defined(__VSX__) || defined(_ARCH_PWR8)
1884 return vec_splat(val, N);
1885#else
1886 enum {E=N&1};
1887 if (E == 0)
1888 {
1889 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
1890 return vec_perm(val, val, m);
1891 }
1892 else // (E == 1)
1893 {
1894 const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
1895 return vec_perm(val, val, m);
1896 }
1897#endif
1898}
1899#endif
1900
1901/// \brief Extract a dword from a vector
1902/// \tparam T vector type
1903/// \param val the vector
1904/// \return vector created from low dword
1905/// \details VecGetLow() extracts the low dword from a vector. The low dword
1906/// is composed of the least significant bits and occupies bytes 8 through 15
1907/// when viewed as a big endian array. The return vector is the same type as
1908/// the original vector and padded with 0's in the most significant bit positions.
1909/// \par Wraps
1910/// vec_sld
1911/// \since Crypto++ 7.0
1912template <class T>
1913inline T VecGetLow(const T val)
1914{
1915#if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1916 const T zero = {0};
1917 return (T)VecMergeLow((uint64x2_p)zero, (uint64x2_p)val);
1918#else
1919 return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
1920#endif
1921}
1922
1923/// \brief Extract a dword from a vector
1924/// \tparam T vector type
1925/// \param val the vector
1926/// \return vector created from high dword
1927/// \details VecGetHigh() extracts the high dword from a vector. The high dword
1928/// is composed of the most significant bits and occupies bytes 0 through 7
1929/// when viewed as a big endian array. The return vector is the same type as
1930/// the original vector and padded with 0's in the most significant bit positions.
1931/// \par Wraps
1932/// vec_sld
1933/// \since Crypto++ 7.0
1934template <class T>
1935inline T VecGetHigh(const T val)
1936{
1937#if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1938 const T zero = {0};
1939 return (T)VecMergeHigh((uint64x2_p)zero, (uint64x2_p)val);
1940#else
1941 return VecShiftRightOctet<8>(val);
1942#endif
1943}
1944
1945/// \brief Exchange high and low double words
1946/// \tparam T vector type
1947/// \param vec the vector
1948/// \return vector
1949/// \par Wraps
1950/// vec_sld
1951/// \since Crypto++ 7.0
1952template <class T>
1953inline T VecSwapWords(const T vec)
1954{
1955 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
1956}
1957
1958//@}
1959
1960/// \name COMPARISON
1961//@{
1962
1963/// \brief Compare two vectors
1964/// \tparam T1 vector type
1965/// \tparam T2 vector type
1966/// \param vec1 the first vector
1967/// \param vec2 the second vector
1968/// \return true if vec1 equals vec2, false otherwise
1969/// \details VecEqual() performs a bitwise compare. The vector element types do
1970/// not matter.
1971/// \par Wraps
1972/// vec_all_eq
1973/// \since Crypto++ 8.0
1974template <class T1, class T2>
1975inline bool VecEqual(const T1 vec1, const T2 vec2)
1976{
1977 return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1978}
1979
1980/// \brief Compare two vectors
1981/// \tparam T1 vector type
1982/// \tparam T2 vector type
1983/// \param vec1 the first vector
1984/// \param vec2 the second vector
1985/// \return true if vec1 does not equal vec2, false otherwise
1986/// \details VecNotEqual() performs a bitwise compare. The vector element types do
1987/// not matter.
1988/// \par Wraps
1989/// vec_all_eq
1990/// \since Crypto++ 8.0
1991template <class T1, class T2>
1992inline bool VecNotEqual(const T1 vec1, const T2 vec2)
1993{
1994 return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1995}
1996
1997//@}
1998
1999////////////////// 32-bit Altivec /////////////////
2000
2001/// \name 32-BIT ALTIVEC
2002//@{
2003
2004/// \brief Add two vectors as if uint64x2_p
2005/// \param vec1 the first vector
2006/// \param vec2 the second vector
2007/// \return vector
2008/// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
2009/// if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
2010/// the carries from the elements.
2011/// \par Wraps
2012/// vec_add for POWER8, vec_addc, vec_perm, vec_add for Altivec
2013/// \since Crypto++ 8.3
2014inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2015{
2016 // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8
2017#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2018 return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
2019#else
2020 // The carry mask selects carrys for elements 1 and 3 and sets
2021 // remaining elements to 0. The results is then shifted so the
2022 // carried values are added to elements 0 and 2.
2023#if defined(CRYPTOPP_BIG_ENDIAN)
2024 const uint32x4_p zero = {0, 0, 0, 0};
2025 const uint32x4_p mask = {0, 1, 0, 1};
2026#else
2027 const uint32x4_p zero = {0, 0, 0, 0};
2028 const uint32x4_p mask = {1, 0, 1, 0};
2029#endif
2030
2031 uint32x4_p cy = vec_addc(vec1, vec2);
2032 uint32x4_p res = vec_add(vec1, vec2);
2033 cy = vec_and(mask, cy);
2034 cy = vec_sld (cy, zero, 4);
2035 return vec_add(res, cy);
2036#endif
2037}
2038
2039#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2040/// \brief Add two vectors as if uint64x2_p
2041/// \param vec1 the first vector
2042/// \param vec2 the second vector
2043/// \return vector
2044/// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
2045/// if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
2046/// the carries from the elements.
2047/// \par Wraps
2048/// vec_add for POWER8
2049/// \since Crypto++ 8.3
2050inline uint64x2_p VecAdd64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2051{
2052 // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8
2053 const uint64x2_p res = vec_add(vec1, vec2);
2054
2055#if defined(CRYPTOPP_DEBUG)
2056 // Test 32-bit add in debug builds while we are here.
2057 const uint32x4_p x = (uint32x4_p)vec1;
2058 const uint32x4_p y = (uint32x4_p)vec2;
2059 const uint32x4_p r = VecAdd64(x, y);
2060
2061 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2062#endif
2063
2064 return res;
2065}
2066#endif
2067
2068/// \brief Subtract two vectors as if uint64x2_p
2069/// \param vec1 the first vector
2070/// \param vec2 the second vector
2071/// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
2072/// if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
2073/// manages the borrows from the elements.
2074/// \par Wraps
2075/// vec_sub for POWER8, vec_subc, vec_andc, vec_perm, vec_sub for Altivec
2076/// \since Crypto++ 8.3
2077inline uint32x4_p VecSub64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2078{
2079#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2080 // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8
2081 return (uint32x4_p)vec_sub((uint64x2_p)vec1, (uint64x2_p)vec2);
2082#else
2083 // The borrow mask selects borrows for elements 1 and 3 and sets
2084 // remaining elements to 0. The results is then shifted so the
2085 // borrowed values are subtracted from elements 0 and 2.
2086#if defined(CRYPTOPP_BIG_ENDIAN)
2087 const uint32x4_p zero = {0, 0, 0, 0};
2088 const uint32x4_p mask = {0, 1, 0, 1};
2089#else
2090 const uint32x4_p zero = {0, 0, 0, 0};
2091 const uint32x4_p mask = {1, 0, 1, 0};
2092#endif
2093
2094 // subc sets the complement of borrow, so we have to
2095 // un-complement it using andc.
2096 uint32x4_p bw = vec_subc(vec1, vec2);
2097 uint32x4_p res = vec_sub(vec1, vec2);
2098 bw = vec_andc(mask, bw);
2099 bw = vec_sld (bw, zero, 4);
2100 return vec_sub(res, bw);
2101#endif
2102}
2103
2104#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2105/// \brief Subtract two vectors as if uint64x2_p
2106/// \param vec1 the first vector
2107/// \param vec2 the second vector
2108/// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
2109/// if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
2110/// manages the borrows from the elements.
2111/// \par Wraps
2112/// vec_sub for POWER8
2113/// \since Crypto++ 8.3
2114inline uint64x2_p VecSub64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2115{
2116 // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8
2117 const uint64x2_p res = vec_sub(vec1, vec2);
2118
2119#if defined(CRYPTOPP_DEBUG)
2120 // Test 32-bit sub in debug builds while we are here.
2121 const uint32x4_p x = (uint32x4_p)vec1;
2122 const uint32x4_p y = (uint32x4_p)vec2;
2123 const uint32x4_p r = VecSub64(x, y);
2124
2125 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2126#endif
2127
2128 return res;
2129}
2130#endif
2131
2132/// \brief Rotate a vector left as if uint64x2_p
2133/// \tparam C rotate bit count
2134/// \param vec the vector
2135/// \return vector
2136/// \details VecRotateLeft() rotates each element in a vector by bit count.
2137/// vec is rotated as if uint64x2_p.
2138/// \par Wraps
2139/// vec_rl
2140/// \since Crypto++ 8.3
2141template<unsigned int C>
2143{
2144#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2145 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2146 return (uint32x4_p)VecRotateLeft<C>((uint64x2_p)vec);
2147#else
2148 // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2149 enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2150
2151 // Get the low bits, shift them to high bits
2152 uint32x4_p t1 = VecShiftLeft<S32>(vec);
2153 // Get the high bits, shift them to low bits
2154 uint32x4_p t2 = VecShiftRight<32-S32>(vec);
2155
2156 if (S64 == 0)
2157 {
2158 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2159 return VecPermute(vec, m);
2160 }
2161 else if (S64 == 32)
2162 {
2163 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2164 return VecPermute(vec, m);
2165 }
2166 else if (BR) // Big rotate amount?
2167 {
2168 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2169 t1 = VecPermute(t1, m);
2170 }
2171 else
2172 {
2173 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2174 t2 = VecPermute(t2, m);
2175 }
2176
2177 return vec_or(t1, t2);
2178#endif
2179}
2180
2181/// \brief Rotate a vector left as if uint64x2_p
2182/// \param vec the vector
2183/// \return vector
2184/// \details VecRotateLeft<8>() rotates each element in a vector
2185/// by 8-bits. vec is rotated as if uint64x2_p. This specialization
2186/// is used by algorithms like Speck128.
2187/// \par Wraps
2188/// vec_rl
2189/// \since Crypto++ 8.3
2190template<>
2192{
2193#if (CRYPTOPP_BIG_ENDIAN)
2194 const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2195 return VecPermute(vec, m);
2196#else
2197 const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2198 return VecPermute(vec, m);
2199#endif
2200}
2201
2202#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2203/// \brief Rotate a vector left as if uint64x2_p
2204/// \tparam C rotate bit count
2205/// \param vec the vector
2206/// \return vector
2207/// \details VecRotateLeft64() rotates each element in a vector by
2208/// bit count. vec is rotated as if uint64x2_p.
2209/// \par Wraps
2210/// vec_rl
2211/// \since Crypto++ 8.3
2212template<unsigned int C>
2214{
2215 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2216 const uint64x2_p res = VecRotateLeft<C>(vec);
2217
2218#if defined(CRYPTOPP_DEBUG)
2219 // Test 32-bit rotate in debug builds while we are here.
2220 const uint32x4_p x = (uint32x4_p)vec;
2221 const uint32x4_p r = VecRotateLeft64<C>(x);
2222
2223 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2224#endif
2225
2226 return res;
2227}
2228#endif
2229
2230/// \brief Rotate a vector right as if uint64x2_p
2231/// \tparam C rotate bit count
2232/// \param vec the vector
2233/// \return vector
2234/// \details VecRotateRight64() rotates each element in a vector by
2235/// bit count. vec is rotated as if uint64x2_p.
2236/// \par Wraps
2237/// vec_rl
2238/// \since Crypto++ 8.3
2239template<unsigned int C>
2241{
2242#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2243 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2244 return (uint32x4_p)VecRotateRight<C>((uint64x2_p)vec);
2245#else
2246 // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2247 enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2248
2249 // Get the low bits, shift them to high bits
2250 uint32x4_p t1 = VecShiftRight<S32>(vec);
2251 // Get the high bits, shift them to low bits
2252 uint32x4_p t2 = VecShiftLeft<32-S32>(vec);
2253
2254 if (S64 == 0)
2255 {
2256 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2257 return VecPermute(vec, m);
2258 }
2259 else if (S64 == 32)
2260 {
2261 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2262 return VecPermute(vec, m);
2263 }
2264 else if (BR) // Big rotate amount?
2265 {
2266 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2267 t1 = VecPermute(t1, m);
2268 }
2269 else
2270 {
2271 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2272 t2 = VecPermute(t2, m);
2273 }
2274
2275 return vec_or(t1, t2);
2276#endif
2277}
2278
2279/// \brief Rotate a vector right as if uint64x2_p
2280/// \param vec the vector
2281/// \return vector
2282/// \details VecRotateRight64<8>() rotates each element in a vector
2283/// by 8-bits. vec is rotated as if uint64x2_p. This specialization
2284/// is used by algorithms like Speck128.
2285/// \details vec is rotated as if uint64x2_p.
2286/// \par Wraps
2287/// vec_rl
2288/// \since Crypto++ 8.3
2289template<>
2291{
2292#if (CRYPTOPP_BIG_ENDIAN)
2293 const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2294 return VecPermute(vec, m);
2295#else
2296 const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2297 return VecPermute(vec, m);
2298#endif
2299}
2300
2301#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2302/// \brief Rotate a vector right as if uint64x2_p
2303/// \tparam C rotate bit count
2304/// \param vec the vector
2305/// \return vector
2306/// \details VecRotateRight64() rotates each element in a vector by
2307/// bit count. vec is rotated as if uint64x2_p.
2308/// \par Wraps
2309/// vec_rl
2310/// \since Crypto++ 8.3
2311template<unsigned int C>
2313{
2314 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2315 const uint64x2_p res = VecRotateRight<C>(vec);
2316
2317#if defined(CRYPTOPP_DEBUG)
2318 // Test 32-bit rotate in debug builds while we are here.
2319 const uint32x4_p x = (uint32x4_p)vec;
2320 const uint32x4_p r = VecRotateRight64<C>(x);
2321
2322 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2323#endif
2324
2325 return res;
2326}
2327#endif
2328
/// \brief AND two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd64() performs <tt>vec1 & vec2</tt>.
///  vec2 is cast to the same type as vec1. The return vector
///  is the same type as vec1.
/// \details VecAnd64() is a convenience function that simply performs a VecAnd().
/// \par Wraps
///  vec_and
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecAnd64(const T1 vec1, const T2 vec2)
{
    return (T1)vec_and(vec1, (T1)vec2);
}
2347
/// \brief OR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr64() performs <tt>vec1 | vec2</tt>.
///  vec2 is cast to the same type as vec1. The return vector
///  is the same type as vec1.
/// \details VecOr64() is a convenience function that simply performs a VecOr().
/// \par Wraps
///  vec_or
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecOr64(const T1 vec1, const T2 vec2)
{
    return (T1)vec_or(vec1, (T1)vec2);
}
2366
/// \brief XOR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor64() performs <tt>vec1 ^ vec2</tt>.
///  vec2 is cast to the same type as vec1. The return vector
///  is the same type as vec1.
/// \details VecXor64() is a convenience function that simply performs a VecXor().
/// \par Wraps
///  vec_xor
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecXor64(const T1 vec1, const T2 vec2)
{
    return (T1)vec_xor(vec1, (T1)vec2);
}
2385
2386/// \brief Broadcast 64-bit double word to a vector
2387/// \param val the 64-bit value
2388/// \return vector
2389/// \par Wraps
2390/// vec_splats
2391/// \since Crypto++ 8.3
2393{
2394#if defined(_ARCH_PWR8)
2395 // The PPC64 ABI says so.
2396 return (uint32x4_p)vec_splats((unsigned long long)val);
2397#else
2398 const word64 x[2] = {val,val};
2399 return (uint32x4_p)VecLoad((const word32*)x);
2400#endif
2401}
2402
2403/// \brief Broadcast 64-bit element to a vector as if uint64x2_p
2404/// \tparam the element number
2405/// \param val the 64-bit value
2406/// \return vector
2407/// \par Wraps
2408/// vec_splat
2409/// \since Crypto++ 8.3
2410template <unsigned int N>
2412{
2413#if defined(__VSX__) || defined(_ARCH_PWR8)
2414 return (uint32x4_p)vec_splat((uint64x2_p)val, N);
2415#else
2416 enum {E=N&1};
2417 if (E == 0)
2418 {
2419 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
2420 return (uint32x4_p)vec_perm(val, val, m);
2421 }
2422 else // (E == 1)
2423 {
2424 const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
2425 return (uint32x4_p)vec_perm(val, val, m);
2426 }
2427#endif
2428}
2429
2430#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2431/// \brief Broadcast 64-bit element to a vector
2432/// \tparam the element number
2433/// \param val the 64-bit value
2434/// \return vector
2435/// \since Crypto++ 8.3
2436template <unsigned int N>
2438{
2439 return vec_splat(val, N);
2440}
2441#endif
2442
2443//@}
2444
2445//////////////////////// Power8 Crypto ////////////////////////
2446
2447// __CRYPTO__ alone is not enough. Clang will define __CRYPTO__
2448// when it is not available, like with Power7. Sigh...
2449#if (defined(_ARCH_PWR8) && defined(__CRYPTO__)) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2450
2451/// \name POLYNOMIAL MULTIPLICATION
2452//@{
2453
2454/// \brief Polynomial multiplication
2455/// \param a the first term
2456/// \param b the second term
2457/// \return vector product
2458/// \details VecPolyMultiply() performs polynomial multiplication. POWER8
2459/// polynomial multiplication multiplies the high and low terms, and then
2460/// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
2461/// al*bl</tt>. It is different behavior than Intel polynomial
2462/// multiplication. To obtain a single product without the XOR, then set
2463/// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
2464/// results in <tt>0*bh XOR al*bl = al*bl</tt>.
2465/// \par Wraps
2466/// __vpmsumw, __builtin_altivec_crypto_vpmsumw and __builtin_crypto_vpmsumw.
2467/// \since Crypto++ 8.1
2469{
2470#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2471 return __vpmsumw (a, b);
2472#elif defined(__clang__)
2473 return __builtin_altivec_crypto_vpmsumw (a, b);
2474#else
2475 return __builtin_crypto_vpmsumw (a, b);
2476#endif
2477}
2478
2479/// \brief Polynomial multiplication
2480/// \param a the first term
2481/// \param b the second term
2482/// \return vector product
2483/// \details VecPolyMultiply() performs polynomial multiplication. POWER8
2484/// polynomial multiplication multiplies the high and low terms, and then
2485/// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
2486/// al*bl</tt>. It is different behavior than Intel polynomial
2487/// multiplication. To obtain a single product without the XOR, then set
2488/// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
2489/// results in <tt>0*bh XOR al*bl = al*bl</tt>.
2490/// \par Wraps
2491/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2492/// \since Crypto++ 8.1
2494{
2495#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2496 return __vpmsumd (a, b);
2497#elif defined(__clang__)
2498 return __builtin_altivec_crypto_vpmsumd (a, b);
2499#else
2500 return __builtin_crypto_vpmsumd (a, b);
2501#endif
2502}
2503
2504/// \brief Polynomial multiplication
2505/// \param a the first term
2506/// \param b the second term
2507/// \return vector product
2508/// \details VecIntelMultiply00() performs polynomial multiplication and presents
2509/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
2510/// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
2511/// are multiplied.
2512/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2513/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2514/// \par Wraps
2515/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2516/// \since Crypto++ 8.0
2518{
2519#if defined(CRYPTOPP_BIG_ENDIAN)
2521#else
2522 return VecPolyMultiply(VecGetHigh(a), VecGetHigh(b));
2523#endif
2524}
2525
2526/// \brief Polynomial multiplication
2527/// \param a the first term
2528/// \param b the second term
2529/// \return vector product
2530/// \details VecIntelMultiply01 performs() polynomial multiplication and presents
2531/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
2532/// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
2533/// 64-bits of <tt>b</tt> are multiplied.
2534/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2535/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2536/// \par Wraps
2537/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2538/// \since Crypto++ 8.0
2540{
2541#if defined(CRYPTOPP_BIG_ENDIAN)
2543#else
2544 return VecPolyMultiply(a, VecGetHigh(b));
2545#endif
2546}
2547
2548/// \brief Polynomial multiplication
2549/// \param a the first term
2550/// \param b the second term
2551/// \return vector product
2552/// \details VecIntelMultiply10() performs polynomial multiplication and presents
2553/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
2554/// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
2555/// 64-bits of <tt>b</tt> are multiplied.
2556/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2557/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2558/// \par Wraps
2559/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2560/// \since Crypto++ 8.0
2562{
2563#if defined(CRYPTOPP_BIG_ENDIAN)
2565#else
2566 return VecPolyMultiply(VecGetHigh(a), b);
2567#endif
2568}
2569
2570/// \brief Polynomial multiplication
2571/// \param a the first term
2572/// \param b the second term
2573/// \return vector product
2574/// \details VecIntelMultiply11() performs polynomial multiplication and presents
2575/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
2576/// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
2577/// are multiplied.
2578/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2579/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2580/// \par Wraps
2581/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2582/// \since Crypto++ 8.0
2584{
2585#if defined(CRYPTOPP_BIG_ENDIAN)
2586 return VecSwapWords(VecPolyMultiply(VecGetLow(a), b));
2587#else
2588 return VecPolyMultiply(VecGetLow(a), b);
2589#endif
2590}
2591
2592//@}
2593
2594/// \name AES ENCRYPTION
2595//@{
2596
2597/// \brief One round of AES encryption
2598/// \tparam T1 vector type
2599/// \tparam T2 vector type
2600/// \param state the state vector
2601/// \param key the subkey vector
2602/// \details VecEncrypt() performs one round of AES encryption of state
2603/// using subkey key. The return vector is the same type as state.
2604/// \details VecEncrypt() is available on POWER8 and above.
2605/// \par Wraps
2606/// __vcipher, __builtin_altivec_crypto_vcipher, __builtin_crypto_vcipher
2607/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2608template <class T1, class T2>
2609inline T1 VecEncrypt(const T1 state, const T2 key)
2610{
2611#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2612 return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
2613#elif defined(__clang__)
2614 return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2615#elif defined(__GNUC__)
2616 return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2617#else
2618 CRYPTOPP_ASSERT(0);
2619#endif
2620}
2621
2622/// \brief Final round of AES encryption
2623/// \tparam T1 vector type
2624/// \tparam T2 vector type
2625/// \param state the state vector
2626/// \param key the subkey vector
2627/// \details VecEncryptLast() performs the final round of AES encryption
2628/// of state using subkey key. The return vector is the same type as state.
2629/// \details VecEncryptLast() is available on POWER8 and above.
2630/// \par Wraps
2631/// __vcipherlast, __builtin_altivec_crypto_vcipherlast, __builtin_crypto_vcipherlast
2632/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2633template <class T1, class T2>
2634inline T1 VecEncryptLast(const T1 state, const T2 key)
2635{
2636#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2637 return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
2638#elif defined(__clang__)
2639 return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2640#elif defined(__GNUC__)
2641 return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2642#else
2643 CRYPTOPP_ASSERT(0);
2644#endif
2645}
2646
2647/// \brief One round of AES decryption
2648/// \tparam T1 vector type
2649/// \tparam T2 vector type
2650/// \param state the state vector
2651/// \param key the subkey vector
2652/// \details VecDecrypt() performs one round of AES decryption of state
2653/// using subkey key. The return vector is the same type as state.
2654/// \details VecDecrypt() is available on POWER8 and above.
2655/// \par Wraps
2656/// __vncipher, __builtin_altivec_crypto_vncipher, __builtin_crypto_vncipher
2657/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2658template <class T1, class T2>
2659inline T1 VecDecrypt(const T1 state, const T2 key)
2660{
2661#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2662 return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
2663#elif defined(__clang__)
2664 return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2665#elif defined(__GNUC__)
2666 return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2667#else
2668 CRYPTOPP_ASSERT(0);
2669#endif
2670}
2671
2672/// \brief Final round of AES decryption
2673/// \tparam T1 vector type
2674/// \tparam T2 vector type
2675/// \param state the state vector
2676/// \param key the subkey vector
2677/// \details VecDecryptLast() performs the final round of AES decryption
2678/// of state using subkey key. The return vector is the same type as state.
2679/// \details VecDecryptLast() is available on POWER8 and above.
2680/// \par Wraps
2681/// __vncipherlast, __builtin_altivec_crypto_vncipherlast, __builtin_crypto_vncipherlast
2682/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2683template <class T1, class T2>
2684inline T1 VecDecryptLast(const T1 state, const T2 key)
2685{
2686#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2687 return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
2688#elif defined(__clang__)
2689 return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2690#elif defined(__GNUC__)
2691 return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2692#else
2693 CRYPTOPP_ASSERT(0);
2694#endif
2695}
2696
2697//@}
2698
2699/// \name SHA DIGESTS
2700//@{
2701
2702/// \brief SHA256 Sigma functions
2703/// \tparam func function
2704/// \tparam fmask function mask
2705/// \tparam T vector type
2706/// \param data the block to transform
2707/// \details VecSHA256() selects sigma0, sigma1, Sigma0, Sigma1 based on
2708/// func and fmask. The return vector is the same type as data.
2709/// \details VecSHA256() is available on POWER8 and above.
2710/// \par Wraps
2711/// __vshasigmaw, __builtin_altivec_crypto_vshasigmaw, __builtin_crypto_vshasigmaw
2712/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2713template <int func, int fmask, class T>
2714inline T VecSHA256(const T data)
2715{
2716#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2717 return (T)__vshasigmaw((uint32x4_p)data, func, fmask);
2718#elif defined(__clang__)
2719 return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2720#elif defined(__GNUC__)
2721 return (T)__builtin_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2722#else
2723 CRYPTOPP_ASSERT(0);
2724#endif
2725}
2726
2727/// \brief SHA512 Sigma functions
2728/// \tparam func function
2729/// \tparam fmask function mask
2730/// \tparam T vector type
2731/// \param data the block to transform
2732/// \details VecSHA512() selects sigma0, sigma1, Sigma0, Sigma1 based on
2733/// func and fmask. The return vector is the same type as data.
2734/// \details VecSHA512() is available on POWER8 and above.
2735/// \par Wraps
2736/// __vshasigmad, __builtin_altivec_crypto_vshasigmad, __builtin_crypto_vshasigmad
2737/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2738template <int func, int fmask, class T>
2739inline T VecSHA512(const T data)
2740{
2741#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2742 return (T)__vshasigmad((uint64x2_p)data, func, fmask);
2743#elif defined(__clang__)
2744 return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2745#elif defined(__GNUC__)
2746 return (T)__builtin_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2747#else
2748 CRYPTOPP_ASSERT(0);
2749#endif
2750}
2751
2752//@}
2753
2754#endif // __CRYPTO__
2755
2756#endif // _ALTIVEC_
2757
2758NAMESPACE_END
2759
2760#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
2761# pragma GCC diagnostic pop
2762#endif
2763
2764#endif // CRYPTOPP_PPC_CRYPTO_H
Library configuration file.
unsigned int word32
32-bit unsigned datatype
Definition config_int.h:72
unsigned long long word64
64-bit unsigned datatype
Definition config_int.h:101
Utility functions for the Crypto++ library.
Crypto++ library namespace.
uint32x4_p VecZero()
The 0 vector.
Definition ppc_simd.h:218
uint32x4_p VecRotateRight(const uint32x4_p vec)
Rotate a vector right.
Definition ppc_simd.h:1676
T1 VecOr(const T1 vec1, const T2 vec2)
OR two vectors.
Definition ppc_simd.h:1395
T VecSHA512(const T data)
SHA512 Sigma functions.
Definition ppc_simd.h:2739
uint32x4_p VecLoadBE(const byte src[16])
Loads a vector from a byte array.
Definition ppc_simd.h:742
void VecStore_ALTIVEC(const T data, byte dest[16])
Stores a vector to a byte array.
Definition ppc_simd.h:817
T1 VecOr64(const T1 vec1, const T2 vec2)
OR two vectors as if uint64x2_p.
Definition ppc_simd.h:2362
uint32x4_p VecLoadAligned(const byte src[16])
Loads a vector from an aligned byte array.
Definition ppc_simd.h:560
T VecRotateRightOctet(const T vec)
Rotate a vector right.
Definition ppc_simd.h:1639
T VecShiftRightOctet(const T vec)
Shift a vector right.
Definition ppc_simd.h:1575
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition ppc_simd.h:202
void VecStoreBE(const T data, byte dest[16])
Stores a vector to a byte array.
Definition ppc_simd.h:1231
T VecShiftLeftOctet(const T vec)
Shift a vector left.
Definition ppc_simd.h:1528
T VecSHA256(const T data)
SHA256 Sigma functions.
Definition ppc_simd.h:2714
uint32x4_p VecSub64(const uint32x4_p &vec1, const uint32x4_p &vec2)
Subtract two vectors as if uint64x2_p.
Definition ppc_simd.h:2077
uint32x4_p VecLoad_ALTIVEC(const byte src[16])
Loads a vector from a byte array.
Definition ppc_simd.h:308
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
Definition ppc_simd.h:1478
uint64x2_p VecIntelMultiply00(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition ppc_simd.h:2517
T VecReverseLE(const T data)
Reverse bytes in a vector.
Definition ppc_simd.h:263
T VecMergeHigh(const T vec1, const T vec2)
Merge two vectors.
Definition ppc_simd.h:1819
uint32x4_p VecSplatElement(const uint32x4_p val)
Broadcast 32-bit element to a vector.
Definition ppc_simd.h:1855
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition ppc_simd.h:192
bool VecNotEqual(const T1 vec1, const T2 vec2)
Compare two vectors.
Definition ppc_simd.h:1992
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition ppc_simd.h:1414
__vector unsigned long long uint64x2_p
Vector of 64-bit elements.
Definition ppc_simd.h:212
T1 VecSub(const T1 vec1, const T2 vec2)
Subtract two vectors.
Definition ppc_simd.h:1456
void VecStoreAligned(const T data, byte dest[16])
Stores a vector to a byte array.
Definition ppc_simd.h:1100
#define NCONST_V32_CAST(x)
Cast array to vector pointer.
Definition ppc_simd.h:169
bool VecEqual(const T1 vec1, const T2 vec2)
Compare two vectors.
Definition ppc_simd.h:1975
T1 VecEncryptLast(const T1 state, const T2 key)
Final round of AES encryption.
Definition ppc_simd.h:2634
uint32x4_p VecSplatElement64(const uint32x4_p val)
Broadcast 64-bit element to a vector as if uint64x2_p.
Definition ppc_simd.h:2411
T VecMergeLow(const T vec1, const T vec2)
Merge two vectors.
Definition ppc_simd.h:1805
#define CONST_V8_CAST(x)
Cast array to vector pointer.
Definition ppc_simd.h:145
T1 VecXor64(const T1 vec1, const T2 vec2)
XOR two vectors as if uint64x2_p.
Definition ppc_simd.h:2381
T1 VecEncrypt(const T1 state, const T2 key)
One round of AES encryption.
Definition ppc_simd.h:2609
T1 VecDecryptLast(const T1 state, const T2 key)
Final round of AES decryption.
Definition ppc_simd.h:2684
uint32x4_p VecPolyMultiply(const uint32x4_p &a, const uint32x4_p &b)
Polynomial multiplication.
Definition ppc_simd.h:2468
uint32x4_p VecRotateRight64(const uint32x4_p vec)
Rotate a vector right as if uint64x2_p.
Definition ppc_simd.h:2240
T1 VecAdd(const T1 vec1, const T2 vec2)
Add two vectors.
Definition ppc_simd.h:1438
uint32x4_p VecRotateLeft(const uint32x4_p vec)
Rotate a vector left.
Definition ppc_simd.h:1660
T VecRotateLeftOctet(const T vec)
Rotate a vector left.
Definition ppc_simd.h:1614
uint32x4_p VecSplatWord64(word64 val)
Broadcast 64-bit double word to a vector.
Definition ppc_simd.h:2392
uint64x2_p VecIntelMultiply11(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition ppc_simd.h:2583
T1 VecAnd(const T1 vec1, const T2 vec2)
AND two vectors.
Definition ppc_simd.h:1376
uint32x4_p VecShiftRight(const uint32x4_p vec)
Shift a vector right.
Definition ppc_simd.h:1708
T VecGetHigh(const T val)
Extract a dword from a vector.
Definition ppc_simd.h:1935
uint32x4_p VecRotateRight64< 8 >(const uint32x4_p vec)
Rotate a vector right as if uint64x2_p.
Definition ppc_simd.h:2290
T1 VecDecrypt(const T1 state, const T2 key)
One round of AES decryption.
Definition ppc_simd.h:2659
#define NCONST_V8_CAST(x)
Cast array to vector pointer.
Definition ppc_simd.h:163
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
Definition ppc_simd.h:895
T VecReverse(const T data)
Reverse bytes in a vector.
Definition ppc_simd.h:242
uint32x4_p VecShiftLeft(const uint32x4_p vec)
Shift a vector left.
Definition ppc_simd.h:1692
#define CONST_V32_CAST(x)
Cast array to vector pointer.
Definition ppc_simd.h:151
uint32x4_p VecSplatWord(word32 val)
Broadcast 32-bit word to a vector.
Definition ppc_simd.h:1830
uint64x2_p VecIntelMultiply01(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition ppc_simd.h:2539
uint32x4_p VecOne()
The 1 vector.
Definition ppc_simd.h:227
T VecReverseBE(const T data)
Reverse bytes in a vector.
Definition ppc_simd.h:283
T VecGetLow(const T val)
Extract a dword from a vector.
Definition ppc_simd.h:1913
uint32x4_p VecAdd64(const uint32x4_p &vec1, const uint32x4_p &vec2)
Add two vectors as if uint64x2_p.
Definition ppc_simd.h:2014
T VecSwapWords(const T vec)
Exchange high and low double words.
Definition ppc_simd.h:1953
__vector unsigned short uint16x8_p
Vector of 16-bit elements.
Definition ppc_simd.h:197
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
Definition ppc_simd.h:369
uint32x4_p VecRotateLeft64(const uint32x4_p vec)
Rotate a vector left as if uint64x2_p.
Definition ppc_simd.h:2142
uint64x2_p VecIntelMultiply10(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition ppc_simd.h:2561
uint32x4_p VecRotateLeft64< 8 >(const uint32x4_p vec)
Rotate a vector left as if uint64x2_p.
Definition ppc_simd.h:2191
T1 VecAnd64(const T1 vec1, const T2 vec2)
AND two vectors as if uint64x2_p.
Definition ppc_simd.h:2343
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition trap.h:68