Crypto++ 8.9
Free C++ class library of cryptographic schemes
arm_simd.h
Go to the documentation of this file.
1// arm_simd.h - written and placed in public domain by Jeffrey Walton
2
3/// \file arm_simd.h
4/// \brief Support functions for ARM and vector operations
5
6#ifndef CRYPTOPP_ARM_SIMD_H
7#define CRYPTOPP_ARM_SIMD_H
8
9#include "config.h"
10
11#if (CRYPTOPP_ARM_NEON_HEADER)
12# include <stdint.h>
13# include <arm_neon.h>
14#endif
15
16#if (CRYPTOPP_ARM_ACLE_HEADER)
17# include <stdint.h>
18# include <arm_acle.h>
19#endif
20
21#if (CRYPTOPP_ARM_CRC32_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
22/// \name CRC32 checksum
23//@{
24
25/// \brief CRC32 checksum
26/// \param crc the starting crc value
27/// \param val the value to checksum
28/// \return CRC32 value
29/// \since Crypto++ 8.6
30inline uint32_t CRC32B (uint32_t crc, uint8_t val)
31{
32#if defined(CRYPTOPP_MSC_VERSION)
33 return __crc32b(crc, val);
34#else
35 __asm__ ("crc32b %w0, %w0, %w1 \n\t"
36 :"+r" (crc) : "r" (val) );
37 return crc;
38#endif
39}
40
41/// \brief CRC32 checksum
42/// \param crc the starting crc value
43/// \param val the value to checksum
44/// \return CRC32 value
45/// \since Crypto++ 8.6
46inline uint32_t CRC32W (uint32_t crc, uint32_t val)
47{
48#if defined(CRYPTOPP_MSC_VERSION)
49 return __crc32w(crc, val);
50#else
51 __asm__ ("crc32w %w0, %w0, %w1 \n\t"
52 :"+r" (crc) : "r" (val) );
53 return crc;
54#endif
55}
56
57/// \brief CRC32 checksum
58/// \param crc the starting crc value
59/// \param vals the values to checksum
60/// \return CRC32 value
61/// \since Crypto++ 8.6
62inline uint32_t CRC32Wx4 (uint32_t crc, const uint32_t vals[4])
63{
64#if defined(CRYPTOPP_MSC_VERSION)
65 return __crc32w(__crc32w(__crc32w(__crc32w(
66 crc, vals[0]), vals[1]), vals[2]), vals[3]);
67#else
68 __asm__ ("crc32w %w0, %w0, %w1 \n\t"
69 "crc32w %w0, %w0, %w2 \n\t"
70 "crc32w %w0, %w0, %w3 \n\t"
71 "crc32w %w0, %w0, %w4 \n\t"
72 :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
73 "r" (vals[2]), "r" (vals[3]));
74 return crc;
75#endif
76}
77
78//@}
/// \name CRC32-C checksum
//@{
81/// \brief CRC32-C checksum
82/// \param crc the starting crc value
83/// \param val the value to checksum
84/// \return CRC32-C value
85/// \since Crypto++ 8.6
86inline uint32_t CRC32CB (uint32_t crc, uint8_t val)
87{
88#if defined(CRYPTOPP_MSC_VERSION)
89 return __crc32cb(crc, val);
90#else
91 __asm__ ("crc32cb %w0, %w0, %w1 \n\t"
92 :"+r" (crc) : "r" (val) );
93 return crc;
94#endif
95}
96
97/// \brief CRC32-C checksum
98/// \param crc the starting crc value
99/// \param val the value to checksum
100/// \return CRC32-C value
101/// \since Crypto++ 8.6
102inline uint32_t CRC32CW (uint32_t crc, uint32_t val)
103{
104#if defined(CRYPTOPP_MSC_VERSION)
105 return __crc32cw(crc, val);
106#else
107 __asm__ ("crc32cw %w0, %w0, %w1 \n\t"
108 :"+r" (crc) : "r" (val) );
109 return crc;
110#endif
111}
112
113/// \brief CRC32-C checksum
114/// \param crc the starting crc value
115/// \param vals the values to checksum
116/// \return CRC32-C value
117/// \since Crypto++ 8.6
118inline uint32_t CRC32CWx4 (uint32_t crc, const uint32_t vals[4])
119{
120#if defined(CRYPTOPP_MSC_VERSION)
121 return __crc32cw(__crc32cw(__crc32cw(__crc32cw(
122 crc, vals[0]), vals[1]), vals[2]), vals[3]);
123#else
124 __asm__ ("crc32cw %w0, %w0, %w1 \n\t"
125 "crc32cw %w0, %w0, %w2 \n\t"
126 "crc32cw %w0, %w0, %w3 \n\t"
127 "crc32cw %w0, %w0, %w4 \n\t"
128 :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
129 "r" (vals[2]), "r" (vals[3]));
130 return crc;
131#endif
132}
133//@}
134#endif // CRYPTOPP_ARM_CRC32_AVAILABLE
135
136#if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
137/// \name Polynomial multiplication
138//@{
139
140/// \brief Polynomial multiplication
141/// \param a the first value
142/// \param b the second value
143/// \return vector product
144/// \details PMULL_00() performs polynomial multiplication and presents
145/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
146/// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
147/// are multiplied.
148/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
149/// is MSB and numbered 127, while the rightmost bit is LSB and
150/// numbered 0.
151/// \since Crypto++ 8.0
152inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
153{
154#if defined(CRYPTOPP_MSC_VERSION)
155 const __n64 x = { vgetq_lane_u64(a, 0) };
156 const __n64 y = { vgetq_lane_u64(b, 0) };
157 return vmull_p64(x, y);
158#elif defined(__GNUC__)
159 uint64x2_t r;
160 __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
161 :"=w" (r) : "w" (a), "w" (b) );
162 return r;
163#else
164 return (uint64x2_t)(vmull_p64(
165 vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
166 vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
167#endif
168}
169
170/// \brief Polynomial multiplication
171/// \param a the first value
172/// \param b the second value
173/// \return vector product
174/// \details PMULL_01 performs() polynomial multiplication and presents
175/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
176/// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
177/// 64-bits of <tt>b</tt> are multiplied.
178/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
179/// is MSB and numbered 127, while the rightmost bit is LSB and
180/// numbered 0.
181/// \since Crypto++ 8.0
182inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
183{
184#if defined(CRYPTOPP_MSC_VERSION)
185 const __n64 x = { vgetq_lane_u64(a, 0) };
186 const __n64 y = { vgetq_lane_u64(b, 1) };
187 return vmull_p64(x, y);
188#elif defined(__GNUC__)
189 uint64x2_t r;
190 __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
191 :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
192 return r;
193#else
194 return (uint64x2_t)(vmull_p64(
195 vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
196 vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
197#endif
198}
199
200/// \brief Polynomial multiplication
201/// \param a the first value
202/// \param b the second value
203/// \return vector product
204/// \details PMULL_10() performs polynomial multiplication and presents
205/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
206/// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
207/// 64-bits of <tt>b</tt> are multiplied.
208/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
209/// is MSB and numbered 127, while the rightmost bit is LSB and
210/// numbered 0.
211/// \since Crypto++ 8.0
212inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
213{
214#if defined(CRYPTOPP_MSC_VERSION)
215 const __n64 x = { vgetq_lane_u64(a, 1) };
216 const __n64 y = { vgetq_lane_u64(b, 0) };
217 return vmull_p64(x, y);
218#elif defined(__GNUC__)
219 uint64x2_t r;
220 __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
221 :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
222 return r;
223#else
224 return (uint64x2_t)(vmull_p64(
225 vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
226 vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
227#endif
228}
229
230/// \brief Polynomial multiplication
231/// \param a the first value
232/// \param b the second value
233/// \return vector product
234/// \details PMULL_11() performs polynomial multiplication and presents
235/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
236/// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
237/// are multiplied.
238/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
239/// is MSB and numbered 127, while the rightmost bit is LSB and
240/// numbered 0.
241/// \since Crypto++ 8.0
242inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
243{
244#if defined(CRYPTOPP_MSC_VERSION)
245 const __n64 x = { vgetq_lane_u64(a, 1) };
246 const __n64 y = { vgetq_lane_u64(b, 1) };
247 return vmull_p64(x, y);
248#elif defined(__GNUC__)
249 uint64x2_t r;
250 __asm__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
251 :"=w" (r) : "w" (a), "w" (b) );
252 return r;
253#else
254 return (uint64x2_t)(vmull_p64(
255 vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
256 vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
257#endif
258}
259
260/// \brief Polynomial multiplication
261/// \param a the first value
262/// \param b the second value
263/// \return vector product
264/// \details PMULL() performs vmull_p64(). PMULL is provided as
265/// GCC inline assembly due to Clang and lack of support for the intrinsic.
266/// \since Crypto++ 8.0
267inline uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
268{
269#if defined(CRYPTOPP_MSC_VERSION)
270 const __n64 x = { vgetq_lane_u64(a, 0) };
271 const __n64 y = { vgetq_lane_u64(b, 0) };
272 return vmull_p64(x, y);
273#elif defined(__GNUC__)
274 uint64x2_t r;
275 __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
276 :"=w" (r) : "w" (a), "w" (b) );
277 return r;
278#else
279 return (uint64x2_t)(vmull_p64(
280 vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
281 vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
282#endif
283}
284
285/// \brief Polynomial multiplication
286/// \param a the first value
287/// \param b the second value
288/// \return vector product
289/// \details PMULL_HIGH() performs vmull_high_p64(). PMULL_HIGH is provided as
290/// GCC inline assembly due to Clang and lack of support for the intrinsic.
291/// \since Crypto++ 8.0
292inline uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
293{
294#if defined(CRYPTOPP_MSC_VERSION)
295 const __n64 x = { vgetq_lane_u64(a, 1) };
296 const __n64 y = { vgetq_lane_u64(b, 1) };
297 return vmull_p64(x, y);
298#elif defined(__GNUC__)
299 uint64x2_t r;
300 __asm__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
301 :"=w" (r) : "w" (a), "w" (b) );
302 return r;
303#else
304 return (uint64x2_t)(vmull_p64(
305 vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
306 vgetq_lane_u64(vreinterpretq_u64_u8(b),1))));
307#endif
308}
309
310/// \brief Vector extraction
311/// \tparam C the byte count
312/// \param a the first value
313/// \param b the second value
314/// \return vector
315/// \details VEXT_U8() extracts the first <tt>C</tt> bytes of vector
316/// <tt>a</tt> and the remaining bytes in <tt>b</tt>. VEXT_U8 is provided
317/// as GCC inline assembly due to Clang and lack of support for the intrinsic.
318/// \since Crypto++ 8.0
319template <unsigned int C>
320inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
321{
322 // https://github.com/weidai11/cryptopp/issues/366
323#if defined(CRYPTOPP_MSC_VERSION)
324 return vreinterpretq_u64_u8(vextq_u8(
325 vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C));
326#else
327 uint64x2_t r;
328 __asm__ ("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
329 :"=w" (r) : "w" (a), "w" (b), "I" (C) );
330 return r;
331#endif
332}
333
334//@}
335#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
336
337#if CRYPTOPP_ARM_SHA3_AVAILABLE || defined(CRYPTOPP_DOXYGEN_PROCESSING)
338/// \name ARMv8.2 operations
339//@{
340
341/// \brief Three-way XOR
342/// \param a the first value
343/// \param b the second value
344/// \param c the third value
345/// \return three-way exclusive OR of the values
346/// \details VEOR3() performs veor3q_u64(). VEOR3 is provided as GCC inline assembly due
347/// to Clang and lack of support for the intrinsic.
348/// \details VEOR3 requires ARMv8.2.
349/// \since Crypto++ 8.6
350inline uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
351{
352#if defined(CRYPTOPP_MSC_VERSION)
353 return veor3q_u64(a, b, c);
354#else
355 uint64x2_t r;
356 __asm__ ("eor3 %0.16b, %1.16b, %2.16b, %3.16b \n\t"
357 :"=w" (r) : "w" (a), "w" (b), "w" (c));
358 return r;
359#endif
360}
361
362/// \brief XOR and rotate
363/// \param a the first value
364/// \param b the second value
365/// \param c the third value
366/// \return two-way exclusive OR of the values, then rotated by c
367/// \details VXARQ() performs vxarq_u64(). VXARQ is provided as GCC inline assembly due
368/// to Clang and lack of support for the intrinsic.
369/// \details VXARQ requires ARMv8.2.
370/// \since Crypto++ 8.6
371inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int c)
372{
373#if defined(CRYPTOPP_MSC_VERSION)
374 return vxarq_u64(a, b, c);
375#else
376 uint64x2_t r;
377 __asm__ ("xar %0.2d, %1.2d, %2.2d, %3 \n\t"
378 :"=w" (r) : "w" (a), "w" (b), "I" (c));
379 return r;
380#endif
381}
382
383/// \brief XOR and rotate
384/// \tparam C the rotate amount
385/// \param a the first value
386/// \param b the second value
387/// \return two-way exclusive OR of the values, then rotated by C
388/// \details VXARQ() performs vxarq_u64(). VXARQ is provided as GCC inline assembly due
389/// to Clang and lack of support for the intrinsic.
390/// \details VXARQ requires ARMv8.2.
391/// \since Crypto++ 8.6
392template <unsigned int C>
393inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b)
394{
395#if defined(CRYPTOPP_MSC_VERSION)
396 return vxarq_u64(a, b, C);
397#else
398 uint64x2_t r;
399 __asm__ ("xar %0.2d, %1.2d, %2.2d, %3 \n\t"
400 :"=w" (r) : "w" (a), "w" (b), "I" (C));
401 return r;
402#endif
403}
404
405/// \brief XOR and rotate
406/// \param a the first value
407/// \param b the second value
408/// \return two-way exclusive OR of the values, then rotated 1-bit
409/// \details VRAX1() performs vrax1q_u64(). VRAX1 is provided as GCC inline assembly due
410/// to Clang and lack of support for the intrinsic.
411/// \details VRAX1 requires ARMv8.2.
412/// \since Crypto++ 8.6
413inline uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
414{
415#if defined(CRYPTOPP_MSC_VERSION)
416 return vrax1q_u64(a, b);
417#else
418 uint64x2_t r;
419 __asm__ ("rax1 %0.2d, %1.2d, %2.2d \n\t"
420 :"=w" (r) : "w" (a), "w" (b));
421 return r;
422#endif
423}
424//@}
425#endif // CRYPTOPP_ARM_SHA3_AVAILABLE
426
427#endif // CRYPTOPP_ARM_SIMD_H
uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int c)
XOR and rotate.
Definition arm_simd.h:371
uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition arm_simd.h:152
uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
XOR and rotate.
Definition arm_simd.h:413
uint32_t CRC32CWx4(uint32_t crc, const uint32_t vals[4])
CRC32-C checksum.
Definition arm_simd.h:118
uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition arm_simd.h:242
uint32_t CRC32CB(uint32_t crc, uint8_t val)
CRC32-C checksum.
Definition arm_simd.h:86
uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition arm_simd.h:292
uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition arm_simd.h:182
uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
Three-way XOR.
Definition arm_simd.h:350
uint32_t CRC32W(uint32_t crc, uint32_t val)
CRC32 checksum.
Definition arm_simd.h:46
uint32_t CRC32B(uint32_t crc, uint8_t val)
CRC32 checksum.
Definition arm_simd.h:30
uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition arm_simd.h:212
uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition arm_simd.h:267
uint32_t CRC32CW(uint32_t crc, uint32_t val)
CRC32-C checksum.
Definition arm_simd.h:102
uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
Vector extraction.
Definition arm_simd.h:320
uint32_t CRC32Wx4(uint32_t crc, const uint32_t vals[4])
CRC32 checksum.
Definition arm_simd.h:62
Library configuration file.