-rw-r--r-- 31634 libmceliece-20241009/crypto_kem/348864/avx/transpose_64x64_asm.q raw
# transpose_64x64_asm
#
# Fully unrolled routine that works in place on the 512 bytes starting at
# input_0, read and written as 64-bit words at byte offsets 0..504.
# NOTE(review): the routine name indicates a 64x64 bit-matrix transpose.
# input_0 and the MASK* constants are defined outside this file, so the
# mask bit patterns (and hence the exact permutation) cannot be confirmed
# from here.
#
# Structure
#   Phase 1: 8 passes. Pass k loads the eight words at byte offsets
#            8*k + 64*j (j = 0..7), runs three exchange rounds with shift
#            distances 32, 16 and 8, and stores element 0 of every register
#            back to the offset it was loaded from.
#   Phase 2: 8 passes. Pass k loads the eight consecutive words at byte
#            offsets 64*k .. 64*k + 56, runs three exchange rounds with
#            shift distances 4, 2 and 1, and writes the results back with
#            four 128-bit stores covering the same 64 bytes.
#
# Exchange step on a register pair (a, b), mask pair (m0, m1), distance s:
#   phase 1:  a' = (a & m0) | (b << s)           b' = (a >> s) | (b & m1)
#   phase 2:  a' = (a & m0) | ((b & m0) << s)    b' = ((a & m1) >> s) | (b & m1)
# Register pairs per round (identical in both phases):
#   round 1: (r0,r4) (r1,r5) (r2,r6) (r3,r7)
#   round 2: (r0,r2) (r1,r3) (r4,r6) (r5,r7)
#   round 3: (r0,r1) (r2,r3) (r4,r5) (r6,r7)
#
# NOTE(review): the `2x` / `4x` / `8x` prefixes are presumably the lane count
# of the packed shift on a 128-bit register (64-/32-/16-bit lanes), and the
# `x2` load suffix presumably duplicates the 64-bit word into both halves of
# the register. Confirm against the qhasm instruction description in use.
#
# Statement order is significant (one statement per emitted instruction);
# do not reorder.

# Working registers for the eight words processed per pass.
reg128 r0
reg128 r1
reg128 r2
reg128 r3
reg128 r4
reg128 r5
reg128 r6
reg128 r7

# t0 carries the 128-bit values stored in phase 2.
# t1 is declared but not referenced by any statement in this routine.
reg128 t0
reg128 t1

# Temporaries of one exchange step.
reg128 v00
reg128 v01
reg128 v10
reg128 v11

# 64-bit scratch used for the phase 1 stores.
int64 buf

# Mask registers; loaded once per phase.
reg128 mask0
reg128 mask1
reg128 mask2
reg128 mask3
reg128 mask4
reg128 mask5

enter transpose_64x64_asm

# ===================================================================
# Phase 1: masks for shift distances 32 (mask0/1), 16 (mask2/3), 8 (mask4/5)
# ===================================================================
mask0 aligned= mem128[ MASK5_0 ]
mask1 aligned= mem128[ MASK5_1 ]
mask2 aligned= mem128[ MASK4_0 ]
mask3 aligned= mem128[ MASK4_1 ]
mask4 aligned= mem128[ MASK3_0 ]
mask5 aligned= mem128[ MASK3_1 ]

# ---- Phase 1, pass 0: words at byte offsets 0, 64, ..., 448 ----
r0 = mem64[ input_0 + 0 ] x2
r1 = mem64[ input_0 + 64 ] x2
r2 = mem64[ input_0 + 128 ] x2
r3 = mem64[ input_0 + 192 ] x2
r4 = mem64[ input_0 + 256 ] x2
r5 = mem64[ input_0 + 320 ] x2
r6 = mem64[ input_0 + 384 ] x2
r7 = mem64[ input_0 + 448 ] x2

# round 1: distance 32
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 16
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 8
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11

# write element 0 of each register back to its source offset
buf = r0[0]
mem64[ input_0 + 0 ] = buf
buf = r1[0]
mem64[ input_0 + 64 ] = buf
buf = r2[0]
mem64[ input_0 + 128 ] = buf
buf = r3[0]
mem64[ input_0 + 192 ] = buf
buf = r4[0]
mem64[ input_0 + 256 ] = buf
buf = r5[0]
mem64[ input_0 + 320 ] = buf
buf = r6[0]
mem64[ input_0 + 384 ] = buf
buf = r7[0]
mem64[ input_0 + 448 ] = buf

# ---- Phase 1, pass 1: words at byte offsets 8, 72, ..., 456 ----
r0 = mem64[ input_0 + 8 ] x2
r1 = mem64[ input_0 + 72 ] x2
r2 = mem64[ input_0 + 136 ] x2
r3 = mem64[ input_0 + 200 ] x2
r4 = mem64[ input_0 + 264 ] x2
r5 = mem64[ input_0 + 328 ] x2
r6 = mem64[ input_0 + 392 ] x2
r7 = mem64[ input_0 + 456 ] x2

# round 1: distance 32
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 16
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 8
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11

# write element 0 of each register back to its source offset
buf = r0[0]
mem64[ input_0 + 8 ] = buf
buf = r1[0]
mem64[ input_0 + 72 ] = buf
buf = r2[0]
mem64[ input_0 + 136 ] = buf
buf = r3[0]
mem64[ input_0 + 200 ] = buf
buf = r4[0]
mem64[ input_0 + 264 ] = buf
buf = r5[0]
mem64[ input_0 + 328 ] = buf
buf = r6[0]
mem64[ input_0 + 392 ] = buf
buf = r7[0]
mem64[ input_0 + 456 ] = buf

# ---- Phase 1, pass 2: words at byte offsets 16, 80, ..., 464 ----
r0 = mem64[ input_0 + 16 ] x2
r1 = mem64[ input_0 + 80 ] x2
r2 = mem64[ input_0 + 144 ] x2
r3 = mem64[ input_0 + 208 ] x2
r4 = mem64[ input_0 + 272 ] x2
r5 = mem64[ input_0 + 336 ] x2
r6 = mem64[ input_0 + 400 ] x2
r7 = mem64[ input_0 + 464 ] x2

# round 1: distance 32
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 16
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 8
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11

# write element 0 of each register back to its source offset
buf = r0[0]
mem64[ input_0 + 16 ] = buf
buf = r1[0]
mem64[ input_0 + 80 ] = buf
buf = r2[0]
mem64[ input_0 + 144 ] = buf
buf = r3[0]
mem64[ input_0 + 208 ] = buf
buf = r4[0]
mem64[ input_0 + 272 ] = buf
buf = r5[0]
mem64[ input_0 + 336 ] = buf
buf = r6[0]
mem64[ input_0 + 400 ] = buf
buf = r7[0]
mem64[ input_0 + 464 ] = buf

# ---- Phase 1, pass 3: words at byte offsets 24, 88, ..., 472 ----
r0 = mem64[ input_0 + 24 ] x2
r1 = mem64[ input_0 + 88 ] x2
r2 = mem64[ input_0 + 152 ] x2
r3 = mem64[ input_0 + 216 ] x2
r4 = mem64[ input_0 + 280 ] x2
r5 = mem64[ input_0 + 344 ] x2
r6 = mem64[ input_0 + 408 ] x2
r7 = mem64[ input_0 + 472 ] x2

# round 1: distance 32
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 16
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 8
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11

# write element 0 of each register back to its source offset
buf = r0[0]
mem64[ input_0 + 24 ] = buf
buf = r1[0]
mem64[ input_0 + 88 ] = buf
buf = r2[0]
mem64[ input_0 + 152 ] = buf
buf = r3[0]
mem64[ input_0 + 216 ] = buf
buf = r4[0]
mem64[ input_0 + 280 ] = buf
buf = r5[0]
mem64[ input_0 + 344 ] = buf
buf = r6[0]
mem64[ input_0 + 408 ] = buf
buf = r7[0]
mem64[ input_0 + 472 ] = buf

# ---- Phase 1, pass 4: words at byte offsets 32, 96, ..., 480 ----
r0 = mem64[ input_0 + 32 ] x2
r1 = mem64[ input_0 + 96 ] x2
r2 = mem64[ input_0 + 160 ] x2
r3 = mem64[ input_0 + 224 ] x2
r4 = mem64[ input_0 + 288 ] x2
r5 = mem64[ input_0 + 352 ] x2
r6 = mem64[ input_0 + 416 ] x2
r7 = mem64[ input_0 + 480 ] x2

# round 1: distance 32
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 16
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 8
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11

# write element 0 of each register back to its source offset
buf = r0[0]
mem64[ input_0 + 32 ] = buf
buf = r1[0]
mem64[ input_0 + 96 ] = buf
buf = r2[0]
mem64[ input_0 + 160 ] = buf
buf = r3[0]
mem64[ input_0 + 224 ] = buf
buf = r4[0]
mem64[ input_0 + 288 ] = buf
buf = r5[0]
mem64[ input_0 + 352 ] = buf
buf = r6[0]
mem64[ input_0 + 416 ] = buf
buf = r7[0]
mem64[ input_0 + 480 ] = buf

# ---- Phase 1, pass 5: words at byte offsets 40, 104, ..., 488 ----
r0 = mem64[ input_0 + 40 ] x2
r1 = mem64[ input_0 + 104 ] x2
r2 = mem64[ input_0 + 168 ] x2
r3 = mem64[ input_0 + 232 ] x2
r4 = mem64[ input_0 + 296 ] x2
r5 = mem64[ input_0 + 360 ] x2
r6 = mem64[ input_0 + 424 ] x2
r7 = mem64[ input_0 + 488 ] x2

# round 1: distance 32
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 16
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 8
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11

# write element 0 of each register back to its source offset
buf = r0[0]
mem64[ input_0 + 40 ] = buf
buf = r1[0]
mem64[ input_0 + 104 ] = buf
buf = r2[0]
mem64[ input_0 + 168 ] = buf
buf = r3[0]
mem64[ input_0 + 232 ] = buf
buf = r4[0]
mem64[ input_0 + 296 ] = buf
buf = r5[0]
mem64[ input_0 + 360 ] = buf
buf = r6[0]
mem64[ input_0 + 424 ] = buf
buf = r7[0]
mem64[ input_0 + 488 ] = buf

# ---- Phase 1, pass 6: words at byte offsets 48, 112, ..., 496 ----
r0 = mem64[ input_0 + 48 ] x2
r1 = mem64[ input_0 + 112 ] x2
r2 = mem64[ input_0 + 176 ] x2
r3 = mem64[ input_0 + 240 ] x2
r4 = mem64[ input_0 + 304 ] x2
r5 = mem64[ input_0 + 368 ] x2
r6 = mem64[ input_0 + 432 ] x2
r7 = mem64[ input_0 + 496 ] x2

# round 1: distance 32
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 16
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 8
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11

# write element 0 of each register back to its source offset
buf = r0[0]
mem64[ input_0 + 48 ] = buf
buf = r1[0]
mem64[ input_0 + 112 ] = buf
buf = r2[0]
mem64[ input_0 + 176 ] = buf
buf = r3[0]
mem64[ input_0 + 240 ] = buf
buf = r4[0]
mem64[ input_0 + 304 ] = buf
buf = r5[0]
mem64[ input_0 + 368 ] = buf
buf = r6[0]
mem64[ input_0 + 432 ] = buf
buf = r7[0]
mem64[ input_0 + 496 ] = buf

# ---- Phase 1, pass 7: words at byte offsets 56, 120, ..., 504 ----
r0 = mem64[ input_0 + 56 ] x2
r1 = mem64[ input_0 + 120 ] x2
r2 = mem64[ input_0 + 184 ] x2
r3 = mem64[ input_0 + 248 ] x2
r4 = mem64[ input_0 + 312 ] x2
r5 = mem64[ input_0 + 376 ] x2
r6 = mem64[ input_0 + 440 ] x2
r7 = mem64[ input_0 + 504 ] x2

# round 1: distance 32
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 16
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 8
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11

# write element 0 of each register back to its source offset
buf = r0[0]
mem64[ input_0 + 56 ] = buf
buf = r1[0]
mem64[ input_0 + 120 ] = buf
buf = r2[0]
mem64[ input_0 + 184 ] = buf
buf = r3[0]
mem64[ input_0 + 248 ] = buf
buf = r4[0]
mem64[ input_0 + 312 ] = buf
buf = r5[0]
mem64[ input_0 + 376 ] = buf
buf = r6[0]
mem64[ input_0 + 440 ] = buf
buf = r7[0]
mem64[ input_0 + 504 ] = buf

# ===================================================================
# Phase 2: masks for shift distances 4 (mask0/1), 2 (mask2/3), 1 (mask4/5)
# Both operands are masked before shifting here, and every shift uses
# the `2x` form.
# ===================================================================
mask0 aligned= mem128[ MASK2_0 ]
mask1 aligned= mem128[ MASK2_1 ]
mask2 aligned= mem128[ MASK1_0 ]
mask3 aligned= mem128[ MASK1_1 ]
mask4 aligned= mem128[ MASK0_0 ]
mask5 aligned= mem128[ MASK0_1 ]

# ---- Phase 2, pass 0: words at byte offsets 0 .. 56 ----
r0 = mem64[ input_0 + 0 ] x2
r1 = mem64[ input_0 + 8 ] x2
r2 = mem64[ input_0 + 16 ] x2
r3 = mem64[ input_0 + 24 ] x2
r4 = mem64[ input_0 + 32 ] x2
r5 = mem64[ input_0 + 40 ] x2
r6 = mem64[ input_0 + 48 ] x2
r7 = mem64[ input_0 + 56 ] x2

# round 1: distance 4
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 2
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 1
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11

# combine element 0 of two registers into one 128-bit store
# NOTE(review): half ordering of `rA[0]rB[0]` follows qhasm semantics — confirm
t0 = r0[0]r1[0]
mem128[ input_0 + 0 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 16 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 32 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 48 ] = t0

# ---- Phase 2, pass 1: words at byte offsets 64 .. 120 ----
r0 = mem64[ input_0 + 64 ] x2
r1 = mem64[ input_0 + 72 ] x2
r2 = mem64[ input_0 + 80 ] x2
r3 = mem64[ input_0 + 88 ] x2
r4 = mem64[ input_0 + 96 ] x2
r5 = mem64[ input_0 + 104 ] x2
r6 = mem64[ input_0 + 112 ] x2
r7 = mem64[ input_0 + 120 ] x2

# round 1: distance 4
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 2
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 1
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11

# combine element 0 of two registers into one 128-bit store
t0 = r0[0]r1[0]
mem128[ input_0 + 64 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 80 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 96 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 112 ] = t0

# ---- Phase 2, pass 2: words at byte offsets 128 .. 184 ----
r0 = mem64[ input_0 + 128 ] x2
r1 = mem64[ input_0 + 136 ] x2
r2 = mem64[ input_0 + 144 ] x2
r3 = mem64[ input_0 + 152 ] x2
r4 = mem64[ input_0 + 160 ] x2
r5 = mem64[ input_0 + 168 ] x2
r6 = mem64[ input_0 + 176 ] x2
r7 = mem64[ input_0 + 184 ] x2

# round 1: distance 4
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 2
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 1
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11

# combine element 0 of two registers into one 128-bit store
t0 = r0[0]r1[0]
mem128[ input_0 + 128 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 144 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 160 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 176 ] = t0

# ---- Phase 2, pass 3: words at byte offsets 192 .. 248 ----
r0 = mem64[ input_0 + 192 ] x2
r1 = mem64[ input_0 + 200 ] x2
r2 = mem64[ input_0 + 208 ] x2
r3 = mem64[ input_0 + 216 ] x2
r4 = mem64[ input_0 + 224 ] x2
r5 = mem64[ input_0 + 232 ] x2
r6 = mem64[ input_0 + 240 ] x2
r7 = mem64[ input_0 + 248 ] x2

# round 1: distance 4
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 2
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 1
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11

# combine element 0 of two registers into one 128-bit store
t0 = r0[0]r1[0]
mem128[ input_0 + 192 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 208 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 224 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 240 ] = t0

# ---- Phase 2, pass 4: words at byte offsets 256 .. 312 ----
r0 = mem64[ input_0 + 256 ] x2
r1 = mem64[ input_0 + 264 ] x2
r2 = mem64[ input_0 + 272 ] x2
r3 = mem64[ input_0 + 280 ] x2
r4 = mem64[ input_0 + 288 ] x2
r5 = mem64[ input_0 + 296 ] x2
r6 = mem64[ input_0 + 304 ] x2
r7 = mem64[ input_0 + 312 ] x2

# round 1: distance 4
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 2
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 1
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11

# combine element 0 of two registers into one 128-bit store
t0 = r0[0]r1[0]
mem128[ input_0 + 256 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 272 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 288 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 304 ] = t0

# ---- Phase 2, pass 5: words at byte offsets 320 .. 376 ----
r0 = mem64[ input_0 + 320 ] x2
r1 = mem64[ input_0 + 328 ] x2
r2 = mem64[ input_0 + 336 ] x2
r3 = mem64[ input_0 + 344 ] x2
r4 = mem64[ input_0 + 352 ] x2
r5 = mem64[ input_0 + 360 ] x2
r6 = mem64[ input_0 + 368 ] x2
r7 = mem64[ input_0 + 376 ] x2

# round 1: distance 4
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 2
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 1
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11

# combine element 0 of two registers into one 128-bit store
t0 = r0[0]r1[0]
mem128[ input_0 + 320 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 336 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 352 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 368 ] = t0

# ---- Phase 2, pass 6: words at byte offsets 384 .. 440 ----
r0 = mem64[ input_0 + 384 ] x2
r1 = mem64[ input_0 + 392 ] x2
r2 = mem64[ input_0 + 400 ] x2
r3 = mem64[ input_0 + 408 ] x2
r4 = mem64[ input_0 + 416 ] x2
r5 = mem64[ input_0 + 424 ] x2
r6 = mem64[ input_0 + 432 ] x2
r7 = mem64[ input_0 + 440 ] x2

# round 1: distance 4
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 2
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 1
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11

# combine element 0 of two registers into one 128-bit store
t0 = r0[0]r1[0]
mem128[ input_0 + 384 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 400 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 416 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 432 ] = t0

# ---- Phase 2, pass 7: words at byte offsets 448 .. 504 ----
r0 = mem64[ input_0 + 448 ] x2
r1 = mem64[ input_0 + 456 ] x2
r2 = mem64[ input_0 + 464 ] x2
r3 = mem64[ input_0 + 472 ] x2
r4 = mem64[ input_0 + 480 ] x2
r5 = mem64[ input_0 + 488 ] x2
r6 = mem64[ input_0 + 496 ] x2
r7 = mem64[ input_0 + 504 ] x2

# round 1: distance 4
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11

v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11

v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11

v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11

# round 2: distance 2
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11

v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11

v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11

# round 3: distance 1
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11

v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11

v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11

v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11

# combine element 0 of two registers into one 128-bit store
t0 = r0[0]r1[0]
mem128[ input_0 + 448 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 464 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 480 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 496 ] = t0

return