Crypto++ 8.9
Free C++ class library of cryptographic schemes
xts.cpp
1// xts.cpp - written and placed in the public domain by Jeffrey Walton
2
// Aarch32, Aarch64 and X86_64 include SIMD as part of the base
// architecture, so the SIMD code below can be used on them without an
// architecture option and without runtime tests. Altivec is the
// exception: it needs an architecture switch, so its code path is only
// compiled when __ALTIVEC__ is defined. The updated XorBuffer gains
// 0.3 to 1.5 cpb on these architectures for 16-byte block sizes.
9
10#include "pch.h"
11
12#include "xts.h"
13#include "misc.h"
14#include "modes.h"
15#include "cpu.h"
16
17#if defined(CRYPTOPP_DEBUG)
18# include "aes.h"
19# include "threefish.h"
20#endif
21
22// 0.3 to 0.4 cpb profit
23#if defined(__SSE2__) || defined(_M_X64)
24# include <emmintrin.h>
25#endif
26
27#if defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
28# if (CRYPTOPP_ARM_NEON_HEADER) || (CRYPTOPP_ARM_ASIMD_AVAILABLE)
29# include <arm_neon.h>
30# endif
31#endif
32
33#if defined(__ALTIVEC__)
34# include "ppc_simd.h"
35#endif
36
37ANONYMOUS_NAMESPACE_BEGIN
38
39using namespace CryptoPP;
40
41#if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
42
43using CryptoPP::AES;
44using CryptoPP::XTS_Mode;
45using CryptoPP::Threefish512;
46
47void Modes_TestInstantiations()
48{
49 XTS_Mode<AES>::Encryption m0;
50 XTS_Mode<AES>::Decryption m1;
51 XTS_Mode<AES>::Encryption m2;
52 XTS_Mode<AES>::Decryption m3;
53
54#if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
55 XTS_Mode<Threefish512>::Encryption m4;
56 XTS_Mode<Threefish512>::Decryption m5;
57#endif
58}
59#endif // CRYPTOPP_DEBUG
60
61inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count)
62{
63 CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
64
65#if defined(CRYPTOPP_DISABLE_ASM)
66 xorbuf(output, input, mask, count);
67
68#elif defined(__SSE2__) || defined(_M_X64)
69 for (size_t i=0; i<count; i+=16)
70 _mm_storeu_si128(M128_CAST(output+i),
71 _mm_xor_si128(
72 _mm_loadu_si128(CONST_M128_CAST(input+i)),
73 _mm_loadu_si128(CONST_M128_CAST(mask+i))));
74
75#elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
76 for (size_t i=0; i<count; i+=16)
77 vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i)));
78
79#elif defined(__ALTIVEC__)
80 for (size_t i=0; i<count; i+=16)
81 VecStore(VecXor(VecLoad(input+i), VecLoad(mask+i)), output+i);
82
83#else
84 xorbuf(output, input, mask, count);
85#endif
86}
87
88inline void XorBuffer(byte *buf, const byte *mask, size_t count)
89{
90 XorBuffer(buf, buf, mask, count);
91}
92
93// Borrowed from CMAC, but little-endian representation
94inline void GF_Double(byte *out, const byte* in, unsigned int len)
95{
96#if defined(CRYPTOPP_WORD128_AVAILABLE)
97 word128 carry = 0, x;
98 for (size_t i=0, idx=0; i<len/16; ++i, idx+=16)
99 {
100 x = GetWord<word128>(false, LITTLE_ENDIAN_ORDER, in+idx);
101 word128 y = (x >> 127); x = (x << 1) + carry;
102 PutWord<word128>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
103 carry = y;
104 }
105#elif defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__)
106 word64 carry = 0, x;
107 for (size_t i=0, idx=0; i<len/8; ++i, idx+=8)
108 {
109 x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
110 word64 y = (x >> 63); x = (x << 1) + carry;
111 PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
112 carry = y;
113 }
114#else
115 word32 carry = 0, x;
116 for (size_t i=0, idx=0; i<len/4; ++i, idx+=4)
117 {
118 x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
119 word32 y = (x >> 31); x = (x << 1) + carry;
120 PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
121 carry = y;
122 }
123#endif
124
125#if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
126
128 CRYPTOPP_ASSERT(len >= 16);
129 CRYPTOPP_ASSERT(len <= 128);
130
131 byte* k = out;
132 if (carry)
133 {
134 switch (len)
135 {
136 case 16:
137 {
138 const size_t LEIDX = 16-1;
139 k[LEIDX-15] ^= 0x87;
140 break;
141 }
142 case 32:
143 {
144 // https://crypto.stackexchange.com/q/9815/10496
145 // Polynomial x^256 + x^10 + x^5 + x^2 + 1
146 const size_t LEIDX = 32-1;
147 k[LEIDX-30] ^= 4;
148 k[LEIDX-31] ^= 0x25;
149 break;
150 }
151 case 64:
152 {
153 // https://crypto.stackexchange.com/q/9815/10496
154 // Polynomial x^512 + x^8 + x^5 + x^2 + 1
155 const size_t LEIDX = 64-1;
156 k[LEIDX-62] ^= 1;
157 k[LEIDX-63] ^= 0x25;
158 break;
159 }
160 case 128:
161 {
162 // https://crypto.stackexchange.com/q/9815/10496
163 // Polynomial x^1024 + x^19 + x^6 + x + 1
164 const size_t LEIDX = 128-1;
165 k[LEIDX-125] ^= 8;
166 k[LEIDX-126] ^= 0x00;
167 k[LEIDX-127] ^= 0x43;
168 break;
169 }
170 default:
172 }
173 }
174#else
175 CRYPTOPP_ASSERT(len == 16);
176
177 byte* k = out;
178 if (carry)
179 {
180 k[0] ^= 0x87;
181 return;
182 }
183#endif // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
184}
185
186inline void GF_Double(byte *inout, unsigned int len)
187{
188 GF_Double(inout, inout, len);
189}
190
191ANONYMOUS_NAMESPACE_END
192
193NAMESPACE_BEGIN(CryptoPP)
194
195void XTS_ModeBase::ThrowIfInvalidBlockSize(size_t length)
196{
197#if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
198 CRYPTOPP_ASSERT(length >= 16 && length <= 128 && IsPowerOf2(length));
199 if (length < 16 || length > 128 || !IsPowerOf2(length))
200 throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not valid");
201#else
202 CRYPTOPP_ASSERT(length == 16);
203 if (length != 16)
204 throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
205#endif
206}
207
209{
210 CRYPTOPP_ASSERT(length % 2 == 0);
211 if (!GetBlockCipher().IsValidKeyLength((length+1)/2))
212 throw InvalidKeyLength(AlgorithmName(), length);
213}
214
215void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &params)
216{
219
220 const size_t klen = length/2;
221 AccessBlockCipher().SetKey(key+0, klen, params);
222 AccessTweakCipher().SetKey(key+klen, klen, params);
223
224 ResizeBuffers();
225
226 size_t ivLength;
227 const byte *iv = GetIVAndThrowIfInvalid(params, ivLength);
228 Resynchronize(iv, (int)ivLength);
229}
230
231void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
232{
234 std::memcpy(m_xregister, m_register, ivLength);
235 GetTweakCipher().ProcessBlock(m_xregister);
236}
237
239{
240 SecByteBlock iv(GetTweakCipher().BlockSize());
241 PutWord<word64>(false, order, iv, sector);
242 std::memset(iv+8, 0x00, iv.size()-8);
243
245 std::memcpy(m_xregister, iv, iv.size());
246 GetTweakCipher().ProcessBlock(m_xregister);
247}
248
249void XTS_ModeBase::ResizeBuffers()
250{
251 BlockOrientedCipherModeBase::ResizeBuffers();
252 m_xworkspace.New(GetBlockCipher().BlockSize()*ParallelBlocks);
253 m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks);
254}
255
256// ProcessData runs either 12-4-1 blocks, 8-2-1 or 4-1 blocks. Which is
257// selected depends on ParallelBlocks in the header file. 12-4-1 or 8-2-1
258// can be used on Aarch64 and PowerPC. Intel should use 4-1 due to lack
259// of registers. The unneeded code paths should be removed by optimizer.
260// The extra gyrations save us 1.8 cpb on Aarch64 and 2.1 cpb on PowerPC.
261void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
262{
263 // data unit is multiple of 16 bytes
264 CRYPTOPP_ASSERT(length % BlockSize() == 0);
265
266 enum { lastParallelBlock = ParallelBlocks-1 };
267 const unsigned int blockSize = GetBlockCipher().BlockSize();
268 const size_t parallelSize = blockSize*ParallelBlocks;
269
270 // encrypt the data unit, optimal size at a time
271 while (length >= parallelSize)
272 {
273 // m_xregister[0] always points to the next tweak.
274 GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
275 GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
276 GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
277
278 if (ParallelBlocks > 4)
279 {
280 GF_Double(m_xregister+4*blockSize, m_xregister+3*blockSize, blockSize);
281 GF_Double(m_xregister+5*blockSize, m_xregister+4*blockSize, blockSize);
282 GF_Double(m_xregister+6*blockSize, m_xregister+5*blockSize, blockSize);
283 GF_Double(m_xregister+7*blockSize, m_xregister+6*blockSize, blockSize);
284 }
285 if (ParallelBlocks > 8)
286 {
287 GF_Double(m_xregister+8*blockSize, m_xregister+7*blockSize, blockSize);
288 GF_Double(m_xregister+9*blockSize, m_xregister+8*blockSize, blockSize);
289 GF_Double(m_xregister+10*blockSize, m_xregister+9*blockSize, blockSize);
290 GF_Double(m_xregister+11*blockSize, m_xregister+10*blockSize, blockSize);
291 }
292
293 // merge the tweak into the input block
294 XorBuffer(m_xworkspace, inString, m_xregister, parallelSize);
295
296 // encrypt one block, merge the tweak into the output block
297 GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
298 outString, parallelSize, BlockTransformation::BT_AllowParallel);
299
300 // m_xregister[0] always points to the next tweak.
301 GF_Double(m_xregister+0, m_xregister+lastParallelBlock*blockSize, blockSize);
302
303 inString += parallelSize;
304 outString += parallelSize;
305 length -= parallelSize;
306 }
307
308 // encrypt the data unit, 4 blocks at a time
309 while (ParallelBlocks == 12 && length >= blockSize*4)
310 {
311 // m_xregister[0] always points to the next tweak.
312 GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
313 GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
314 GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
315
316 // merge the tweak into the input block
317 XorBuffer(m_xworkspace, inString, m_xregister, blockSize*4);
318
319 // encrypt one block, merge the tweak into the output block
320 GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
321 outString, blockSize*4, BlockTransformation::BT_AllowParallel);
322
323 // m_xregister[0] always points to the next tweak.
324 GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);
325
326 inString += blockSize*4;
327 outString += blockSize*4;
328 length -= blockSize*4;
329 }
330
331 // encrypt the data unit, 2 blocks at a time
332 while (ParallelBlocks == 8 && length >= blockSize*2)
333 {
334 // m_xregister[0] always points to the next tweak.
335 GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
336
337 // merge the tweak into the input block
338 XorBuffer(m_xworkspace, inString, m_xregister, blockSize*2);
339
340 // encrypt one block, merge the tweak into the output block
341 GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
342 outString, blockSize*2, BlockTransformation::BT_AllowParallel);
343
344 // m_xregister[0] always points to the next tweak.
345 GF_Double(m_xregister+0, m_xregister+1*blockSize, blockSize);
346
347 inString += blockSize*2;
348 outString += blockSize*2;
349 length -= blockSize*2;
350 }
351
352 // encrypt the data unit, blocksize at a time
353 while (length)
354 {
355 // merge the tweak into the input block
356 XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
357
358 // encrypt one block
359 GetBlockCipher().ProcessBlock(m_xworkspace);
360
361 // merge the tweak into the output block
362 XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
363
364 // Multiply T by alpha
365 GF_Double(m_xregister, blockSize);
366
367 inString += blockSize;
368 outString += blockSize;
369 length -= blockSize;
370 }
371}
372
373size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
374{
375 // need at least a full AES block
376 CRYPTOPP_ASSERT(inLength >= BlockSize());
377
378 if (inLength < BlockSize())
379 throw InvalidArgument("XTS: message is too short for ciphertext stealing");
380
382 return ProcessLastPlainBlock(outString, outLength, inString, inLength);
383 else
384 return ProcessLastCipherBlock(outString, outLength, inString, inLength);
385}
386
387size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
388{
389 // ensure output buffer is large enough
390 CRYPTOPP_ASSERT(outLength >= inLength);
391
392 const unsigned int blockSize = GetBlockCipher().BlockSize();
393 const size_t blocks = inLength / blockSize;
394 const size_t tail = inLength % blockSize;
395 outLength = inLength;
396
397 if (tail == 0)
398 {
399 // Allow ProcessData to handle all the full blocks
400 ProcessData(outString, inString, inLength);
401 return inLength;
402 }
403 else if (blocks > 1)
404 {
405 // Allow ProcessData to handle full blocks except one
406 const size_t head = (blocks-1)*blockSize;
407 ProcessData(outString, inString, inLength-head);
408
409 outString += head;
410 inString += head; inLength -= head;
411 }
412
413 ///// handle the full block /////
414
415 // merge the tweak into the input block
416 XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
417
418 // encrypt one block
419 GetBlockCipher().ProcessBlock(m_xworkspace);
420
421 // merge the tweak into the output block
422 XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
423
424 // Multiply T by alpha
425 GF_Double(m_xregister, blockSize);
426
427 ///// handle final partial block /////
428
429 inString += blockSize;
430 outString += blockSize;
431 const size_t len = inLength-blockSize;
432
433 // copy in the final plaintext bytes
434 std::memcpy(m_xworkspace, inString, len);
435 // and copy out the final ciphertext bytes
436 std::memcpy(outString, outString-blockSize, len);
437 // "steal" ciphertext to complete the block
438 std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);
439
440 // merge the tweak into the input block
441 XorBuffer(m_xworkspace, m_xregister, blockSize);
442
443 // encrypt one block
444 GetBlockCipher().ProcessBlock(m_xworkspace);
445
446 // merge the tweak into the previous output block
447 XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);
448
449 return outLength;
450}
451
452size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
453{
454 // ensure output buffer is large enough
455 CRYPTOPP_ASSERT(outLength >= inLength);
456
457 const unsigned int blockSize = GetBlockCipher().BlockSize();
458 const size_t blocks = inLength / blockSize;
459 const size_t tail = inLength % blockSize;
460 outLength = inLength;
461
462 if (tail == 0)
463 {
464 // Allow ProcessData to handle all the full blocks
465 ProcessData(outString, inString, inLength);
466 return inLength;
467 }
468 else if (blocks > 1)
469 {
470 // Allow ProcessData to handle full blocks except one
471 const size_t head = (blocks-1)*blockSize;
472 ProcessData(outString, inString, inLength-head);
473
474 outString += head;
475 inString += head; inLength -= head;
476 }
477
478 #define poly1 (m_xregister+0*blockSize)
479 #define poly2 (m_xregister+1*blockSize)
480 GF_Double(poly2, poly1, blockSize);
481
482 ///// handle final partial block /////
483
484 inString += blockSize;
485 outString += blockSize;
486 const size_t len = inLength-blockSize;
487
488 // merge the tweak into the input block
489 XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);
490
491 // encrypt one block
492 GetBlockCipher().ProcessBlock(m_xworkspace);
493
494 // merge the tweak into the output block
495 XorBuffer(m_xworkspace, poly2, blockSize);
496
497 // copy in the final plaintext bytes
498 std::memcpy(outString-blockSize, inString, len);
499 // and copy out the final ciphertext bytes
500 std::memcpy(outString, m_xworkspace, len);
501 // "steal" ciphertext to complete the block
502 std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);
503
504 ///// handle the full previous block /////
505
506 inString -= blockSize;
507 outString -= blockSize;
508
509 // merge the tweak into the input block
510 XorBuffer(m_xworkspace, outString, poly1, blockSize);
511
512 // encrypt one block
513 GetBlockCipher().ProcessBlock(m_xworkspace);
514
515 // merge the tweak into the output block
516 XorBuffer(outString, m_xworkspace, poly1, blockSize);
517
518 return outLength;
519}
520
521NAMESPACE_END
#define M128_CAST(x)
Clang workaround.
Definition adv_simd.h:609
#define CONST_M128_CAST(x)
Clang workaround.
Definition adv_simd.h:614
Class file for the AES cipher (Rijndael)
bool IsForwardTransformation() const
Determines if the cipher is being operated in its forward direction.
Definition modes.h:258
void Resynchronize(const byte *iv, int length=-1)
Resynchronize with an IV.
Definition modes.h:260
@ BT_AllowParallel
Allow parallel transformations.
Definition cryptlib.h:930
An invalid argument was detected.
Definition cryptlib.h:208
Exception thrown when an invalid key length is encountered.
Definition simple.h:56
Interface for retrieving values given their names.
Definition cryptlib.h:327
void New(size_type newSize)
Change size without preserving contents.
Definition secblock.h:1126
size_type size() const
Provides the count of elements in the SecBlock.
Definition secblock.h:867
SecBlock typedef.
Definition secblock.h:1226
virtual void SetKey(const byte *key, size_t length, const NameValuePairs &params=g_nullNameValuePairs)
Sets or reset the key of this object.
XTS block cipher mode of operation default implementation.
Definition xts.h:50
bool IsValidKeyLength(size_t keylength) const
Returns whether keylength is a valid key length.
Definition xts.h:74
void SetKey(const byte *key, size_t length, const NameValuePairs &params=g_nullNameValuePairs)
Sets or reset the key of this object.
Definition xts.cpp:215
void ProcessData(byte *outString, const byte *inString, size_t length)
Encrypt or decrypt an array of bytes.
Definition xts.cpp:261
void Resynchronize(const byte *iv, int ivLength=-1)
Resynchronize with an IV.
Definition xts.cpp:231
size_t ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
Encrypt or decrypt the last block of data.
Definition xts.cpp:373
unsigned int BlockSize() const
Provides the block size of the cipher.
Definition xts.h:84
void ThrowIfInvalidKeyLength(size_t length)
Validates the key length.
Definition xts.cpp:208
void ThrowIfInvalidBlockSize(size_t length)
Validates the block size.
Definition xts.cpp:195
std::string AlgorithmName() const
Provides the name of this algorithm.
Definition xts.h:61
__uint128_t word128
128-bit unsigned datatype
Definition config_int.h:119
unsigned int word32
32-bit unsigned datatype
Definition config_int.h:72
unsigned long long word64
64-bit unsigned datatype
Definition config_int.h:101
Functions for CPU features and intrinsics.
ByteOrder
Provides the byte ordering.
Definition cryptlib.h:148
@ LITTLE_ENDIAN_ORDER
byte order is little-endian
Definition cryptlib.h:150
Utility functions for the Crypto++ library.
bool IsPowerOf2(const T &value)
Tests whether a value is a power of 2.
Definition misc.h:1215
CRYPTOPP_DLL void xorbuf(byte *buf, const byte *mask, size_t count)
Performs an XOR of a buffer with a mask.
Classes for block cipher modes of operation.
Crypto++ library namespace.
Precompiled header file.
Support functions for PowerPC and vector operations.
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition ppc_simd.h:1414
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
Definition ppc_simd.h:895
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
Definition ppc_simd.h:369
Classes for the Threefish block cipher.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition trap.h:68
Classes for XTS block cipher mode of operation.