// 32-bit x86 MMX/SSE2 assembly path; built only when both
// CRYPTOPP_SSE2_ASM_AVAILABLE and CRYPTOPP_BOOL_X86 are non-zero.
// Register roles visible in this code: eax addresses the 64-bit state words,
// edx addresses the lookup table, mm1/mm2 hold state words 1 and 2.
56#if CRYPTOPP_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
// edx = address of table.
65 AS2( lea edx, [table])
// Load state words 1 and 2 into their working registers.
70 AS2( movq mm1, [eax+1*8])
72 AS2( movq mm2, [eax+2*8])
// mm7 / mm6 = the first two 64-bit entries stored directly after the four
// 2048-byte sub-tables.
73 AS2( movq mm7, [edx+4*2048+0*8])
74 AS2( movq mm6, [edx+4*2048+1*8])
// Round esp down to a 16-byte boundary.
// NOTE(review): the previous esp value must be recoverable afterwards --
// confirm where it is saved and restored.
76 AS2( and esp, 0xfffffff0)
// SSE2_round(a,b,c,x,mul): one round on the MMX registers a, b and c.
// mm3 collects the XOR of entries taken from sub-tables 0, 1, 2, 3 (in that
// order) and mm4 collects entries from sub-tables 3, 2, 1, 0. Each sub-table
// is 2048 bytes (8-byte entries) starting at edx and is indexed by edi.
// pextrw copies 16-bit lanes 2 and 3 of c into ecx.
// NOTE(review): this appears to be the MMX counterpart of the portable
// round() macro (mm3 ~ the term subtracted from a, mm4 ~ the term added to
// b) -- confirm against the full macro body.
80#define SSE2_round(a,b,c,x,mul) \
84 AS2( movq mm3, [edx+0*2048+edi*8])\
86 AS2( movq mm4, [edx+3*2048+edi*8])\
89 AS2( pxor mm3, [edx+1*2048+edi*8])\
91 AS2( pxor mm4, [edx+2*2048+edi*8])\
92 AS3( pextrw ecx, c, 2)\
94 AS2( pxor mm3, [edx+2*2048+edi*8])\
96 AS2( pxor mm4, [edx+1*2048+edi*8])\
97 AS3( pextrw ecx, c, 3)\
99 AS2( pxor mm3, [edx+3*2048+edi*8])\
102 AS2( pxor mm4, [edx+0*2048+edi*8])\
// SSE2_mul_5 / SSE2_mul_7 / SSE2_mul_9: helper macros taking one register
// argument b.
// NOTE(review): presumably each multiplies the 64-bit value in b by the
// constant in its name (5, 7, 9), matching the mul values passed to
// SSE2_pass -- confirm against the macro bodies.
106#define SSE2_mul_5(b) \
111#define SSE2_mul_7(b) \
116#define SSE2_mul_9(b) \
// SSE2_pass(A,B,C,mul,X): applies SSE2_round with the register roles rotated
// (A,B,C) -> (B,C,A) -> (C,A,B). The x operand of each round is the address
// expression X + k*8 + ebx for k = 0, 1, 2, i.e. consecutive 8-byte words
// relative to X offset by ebx. A conditional jump (je) to the label
// label2_<mul> sits between the second and third round; mul is pasted into
// the label name.
// NOTE(review): ebx looks like a byte offset that steps through the words
// at X across iterations -- confirm against the full macro body.
125#define SSE2_pass(A,B,C,mul,X) \
128 SSE2_round(A,B,C,X+0*8+ebx,mul)\
129 SSE2_round(B,C,A,X+1*8+ebx,mul)\
131 ASJ( je, label2_##mul, f)\
132 SSE2_round(C,A,B,X+2*8+ebx,mul)\
// SSE2_key_schedule(Y,X): MMX key schedule that writes eight 64-bit words
// to Y.
//  - First sweep: words 0..7 are derived from X with a repeating
//    psubq / pxor / paddq chain carried in mm4, each result being stored
//    to the matching Y slot.
//  - Second sweep: Y[0..6] are updated in place with the same chain.
//  - Last step: mm4 is XORed with the constant at [edx+4*2048+2*8] and the
//    result is subtracted from Y[7].
// mm3 starts as X[7] and is the subtrahend of every psubq except the last
// one, where mm3 is reloaded from Y[7] and mm4 is subtracted from it.
// Y and X may name the same buffer (the macro is also invoked that way):
// every X word is read before the matching Y word is written.
// Clobbers mm3 and mm4.
// NOTE(review): the sub/xor/add ordering mirrors the portable
// key_schedule() macro -- confirm the two stay in sync when editing either.
137#define SSE2_key_schedule(Y,X) \
138 AS2( movq mm3, [X+7*8])\
140 AS2( movq mm4, [X+0*8])\
141 AS2( psubq mm4, mm3)\
142 AS2( movq [Y+0*8], mm4)\
143 AS2( pxor mm4, [X+1*8])\
145 AS2( movq [Y+1*8], mm4)\
146 AS2( paddq mm4, [X+2*8])\
149 AS2( movq [Y+2*8], mm4)\
151 AS2( movq mm4, [X+3*8])\
152 AS2( psubq mm4, mm3)\
153 AS2( movq [Y+3*8], mm4)\
154 AS2( pxor mm4, [X+4*8])\
156 AS2( movq [Y+4*8], mm4)\
157 AS2( paddq mm4, [X+5*8])\
160 AS2( movq [Y+5*8], mm4)\
162 AS2( movq mm4, [X+6*8])\
163 AS2( psubq mm4, mm3)\
164 AS2( movq [Y+6*8], mm4)\
165 AS2( pxor mm4, [X+7*8])\
167 AS2( movq [Y+7*8], mm4)\
168 AS2( paddq mm4, [Y+0*8])\
171 AS2( movq [Y+0*8], mm4)\
173 AS2( movq mm4, [Y+1*8])\
174 AS2( psubq mm4, mm3)\
175 AS2( movq [Y+1*8], mm4)\
176 AS2( pxor mm4, [Y+2*8])\
178 AS2( movq [Y+2*8], mm4)\
179 AS2( paddq mm4, [Y+3*8])\
182 AS2( movq [Y+3*8], mm4)\
184 AS2( movq mm4, [Y+4*8])\
185 AS2( psubq mm4, mm3)\
186 AS2( movq [Y+4*8], mm4)\
187 AS2( pxor mm4, [Y+5*8])\
188 AS2( movq [Y+5*8], mm4)\
189 AS2( paddq mm4, [Y+6*8])\
190 AS2( movq [Y+6*8], mm4)\
191 AS2( pxor mm4, [edx+4*2048+2*8])\
192 AS2( movq mm3, [Y+7*8])\
193 AS2( psubq mm3, mm4)\
194 AS2( movq [Y+7*8], mm3)
// Pass 1: mul = 5, words read directly from the input block addressed by esi.
196 SSE2_pass(mm0, mm1, mm2, 5, esi)
// Build the next schedule from the input block into the buffer at esp+4.
197 SSE2_key_schedule(esp+4, esi)
// Pass 2: mul = 7, register roles rotated, words read from the buffer at esp+4.
198 SSE2_pass(mm2, mm0, mm1, 7, esp+4)
// Re-schedule in place: source and destination are both the buffer at esp+4.
199 SSE2_key_schedule(esp+4, esp+4)
// Pass 3: mul = 9, register roles rotated once more.
200 SSE2_pass(mm1, mm2, mm0, 9, esp+4)
// Feed-forward into the state words at eax: word 0 is XORed with its previous
// value and stored back.
202 AS2( pxor mm0, [eax+0*8])
203 AS2( movq [eax+0*8], mm0)
// Store word 1 from mm1.
205 AS2( movq [eax+1*8], mm1)
// Word 2 has its previous value added before being stored back.
206 AS2( paddq mm2, [eax+2*8])
207 AS2( movq [eax+2*8], mm2)
216 :
// Input operands: state in eax ("a"), data in esi ("S"), table in edx ("d").
"a" (state),
"S" (data),
"d" (table)
217 :
// Clobbers: ecx and edi are used as scratch registers; "memory" and "cc"
// tell the compiler that memory and the condition codes are modified.
// NOTE(review): the assembly also writes MMX registers and adjusts esp,
// neither of which is listed here -- confirm they are restored/handled
// before the asm block ends.
"%ecx",
"%edi",
"memory",
"cc"
// t2, t3, t4: aliases for the sections of table that start 256, 512 and 768
// elements past its beginning.
230#define t2 (table+256)
231#define t3 (table+256*2)
232#define t4 (table+256*3)
// round(a,b,c,x,mul): portable form of one round.
//  - a is decreased by the XOR of t1, t2, t3, t4 entries selected through
//    GETBYTE(c, 0), GETBYTE(c, 2), GETBYTE(c, 4), GETBYTE(c, 6).
//  - b is increased by the XOR of t4, t3, t2, t1 entries selected through
//    GETBYTE(c, 1), GETBYTE(c, 3), GETBYTE(c, 5), GETBYTE(c, 7).
// Arguments are expanded textually and c is evaluated several times, so pass
// plain variables only.
234#define round(a,b,c,x,mul) \
236 a -= t1[GETBYTE(c,0)] ^ t2[GETBYTE(c,2)] ^ t3[GETBYTE(c,4)] ^ t4[GETBYTE(c,6)]; \
237 b += t4[GETBYTE(c,1)] ^ t3[GETBYTE(c,3)] ^ t2[GETBYTE(c,5)] ^ t1[GETBYTE(c,7)]; \
// pass(a,b,c,mul,X): applies round() with the roles of a, b, c rotated
// (a,b,c) -> (b,c,a) -> (c,a,b), consuming X[i+0], X[i+1] and X[i+2].
// i is a variable of the macro's own scope, not a macro parameter.
240#define pass(a,b,c,mul,X) {\
244 round(a,b,c,X[i+0],mul); \
245 round(b,c,a,X[i+1],mul); \
248 round(c,a,b,X[i+2],mul); \
// key_schedule(Y,X): portable key schedule producing the eight 64-bit words
// Y[0..7] from X[0..7].
// The first eight statements fill Y[k] from X[k] and the previously computed
// Y words using a repeating subtract / XOR / add pattern; the subtract steps
// mix in either a constant (0xA5A5A5A5A5A5A5A5 for Y[0]) or a complemented,
// shifted earlier word (<<19 for Y[3], >>23 for Y[6]).
// The remaining statements update Y in place: Y[1] and Y[4] use the same
// shifted-complement terms, and Y[7] is reduced by Y[6] ^ 0x0123456789ABCDEF.
// X and Y are expanded textually and indexed directly.
252#define key_schedule(Y,X) \
253 Y[0] = X[0] - (X[7]^W64LIT(0xA5A5A5A5A5A5A5A5)); \
254 Y[1] = X[1] ^ Y[0]; \
255 Y[2] = X[2] + Y[1]; \
256 Y[3] = X[3] - (Y[2] ^ ((~Y[1])<<19)); \
257 Y[4] = X[4] ^ Y[3]; \
258 Y[5] = X[5] + Y[4]; \
259 Y[6] = X[6] - (Y[5] ^ ((~Y[4])>>23)); \
260 Y[7] = X[7] ^ Y[6]; \
262 Y[1] -= Y[0] ^ ((~Y[7])<<19); \
265 Y[4] -= Y[3] ^ ((~Y[2])>>23); \
268 Y[7] -= Y[6] ^ W64LIT(0x0123456789ABCDEF)
// Derive the schedule Y from the input block.
271 key_schedule(Y,data);
// Feed-forward: combine the working values a, b, c with the previous state
// (XOR for word 0, subtraction for word 1, addition for word 2) and store
// the result back into state.
276 state[0] = a ^ state[0];
277 state[1] = b - state[1];
278 state[2] = c + state[2];