-rw-r--r-- 31634 libmceliece-20241009/crypto_kem/348864/avx/transpose_64x64_asm.q raw
# Register/variable declarations for transpose_64x64_asm.
# r0..r7: the eight 64-bit words handled in each unrolled step.
reg128 r0
reg128 r1
reg128 r2
reg128 r3
reg128 r4
reg128 r5
reg128 r6
reg128 r7
# t0: staging register for the 128-bit stores in the second pass.
# (A second staging register, t1, used to be declared here but was never
# referenced by the routine, so the declaration has been dropped.)
reg128 t0
# v00/v01/v10/v11: temporaries holding the masked / shifted halves that are
# OR-ed together to rebuild each register pair.
reg128 v00
reg128 v01
reg128 v10
reg128 v11
# buf: 64-bit scalar used to move element 0 of a vector register to memory.
int64 buf
# mask0..mask5: bit-selection constants; loaded from MASK5_*/MASK4_*/MASK3_*
# for the first pass and from MASK2_*/MASK1_*/MASK0_* for the second pass.
reg128 mask0
reg128 mask1
reg128 mask2
reg128 mask3
reg128 mask4
reg128 mask5
# transpose_64x64_asm: operates in place on the 512 bytes at input_0
# (64 words of 64 bits, byte offsets 0..511 are read and written).
# The routine runs two passes: the first combines words that are 64 bytes
# apart using shifts of 32/16/8, the second combines 8 consecutive words
# using shifts of 4/2/1.
# NOTE(review): by its name this is a 64x64 bit-matrix transpose — confirm with callers.
enter transpose_64x64_asm
# Constants for the first pass. The MASK* symbols are defined outside this file;
# presumably MASKk_0 / MASKk_1 select the low / high half of each 2^(k+1)-bit
# group — TODO confirm against the constant table.
# NOTE(review): "aligned=" presumably requires the constants to be 16-byte aligned.
mask0 aligned= mem128[ MASK5_0 ]
mask1 aligned= mem128[ MASK5_1 ]
mask2 aligned= mem128[ MASK4_0 ]
mask3 aligned= mem128[ MASK4_1 ]
mask4 aligned= mem128[ MASK3_0 ]
mask5 aligned= mem128[ MASK3_1 ]
# Pass 1, step 0: the eight words at input_0 + 0 + 64*j, j = 0..7.
# "x2" presumably duplicates the 64-bit word into both halves of the register;
# only element 0 is written back at the end of the step.
# NOTE(review): 2x/4x/8x are read here as shifts on 64/32/16-bit lanes — confirm.
r0 = mem64[ input_0 + 0 ] x2
r1 = mem64[ input_0 + 64 ] x2
r2 = mem64[ input_0 + 128 ] x2
r3 = mem64[ input_0 + 192 ] x2
r4 = mem64[ input_0 + 256 ] x2
r5 = mem64[ input_0 + 320 ] x2
r6 = mem64[ input_0 + 384 ] x2
r7 = mem64[ input_0 + 448 ] x2
# Shift 32 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11
# Shift 16 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11
# Shift 8 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11
# Store element 0 of r0..r7 back to the same eight offsets, via the scalar buf.
buf = r0[0]
mem64[ input_0 + 0 ] = buf
buf = r1[0]
mem64[ input_0 + 64 ] = buf
buf = r2[0]
mem64[ input_0 + 128 ] = buf
buf = r3[0]
mem64[ input_0 + 192 ] = buf
buf = r4[0]
mem64[ input_0 + 256 ] = buf
buf = r5[0]
mem64[ input_0 + 320 ] = buf
buf = r6[0]
mem64[ input_0 + 384 ] = buf
buf = r7[0]
mem64[ input_0 + 448 ] = buf
# Pass 1, step 1: the eight words at input_0 + 8 + 64*j, j = 0..7.
# Same operation sequence as every other pass-1 step; only the offsets differ.
r0 = mem64[ input_0 + 8 ] x2
r1 = mem64[ input_0 + 72 ] x2
r2 = mem64[ input_0 + 136 ] x2
r3 = mem64[ input_0 + 200 ] x2
r4 = mem64[ input_0 + 264 ] x2
r5 = mem64[ input_0 + 328 ] x2
r6 = mem64[ input_0 + 392 ] x2
r7 = mem64[ input_0 + 456 ] x2
# Shift 32 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11
# Shift 16 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11
# Shift 8 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11
# Store element 0 of r0..r7 back to the same eight offsets, via the scalar buf.
buf = r0[0]
mem64[ input_0 + 8 ] = buf
buf = r1[0]
mem64[ input_0 + 72 ] = buf
buf = r2[0]
mem64[ input_0 + 136 ] = buf
buf = r3[0]
mem64[ input_0 + 200 ] = buf
buf = r4[0]
mem64[ input_0 + 264 ] = buf
buf = r5[0]
mem64[ input_0 + 328 ] = buf
buf = r6[0]
mem64[ input_0 + 392 ] = buf
buf = r7[0]
mem64[ input_0 + 456 ] = buf
# Pass 1, step 2: the eight words at input_0 + 16 + 64*j, j = 0..7.
# Same operation sequence as every other pass-1 step; only the offsets differ.
r0 = mem64[ input_0 + 16 ] x2
r1 = mem64[ input_0 + 80 ] x2
r2 = mem64[ input_0 + 144 ] x2
r3 = mem64[ input_0 + 208 ] x2
r4 = mem64[ input_0 + 272 ] x2
r5 = mem64[ input_0 + 336 ] x2
r6 = mem64[ input_0 + 400 ] x2
r7 = mem64[ input_0 + 464 ] x2
# Shift 32 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11
# Shift 16 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11
# Shift 8 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11
# Store element 0 of r0..r7 back to the same eight offsets, via the scalar buf.
buf = r0[0]
mem64[ input_0 + 16 ] = buf
buf = r1[0]
mem64[ input_0 + 80 ] = buf
buf = r2[0]
mem64[ input_0 + 144 ] = buf
buf = r3[0]
mem64[ input_0 + 208 ] = buf
buf = r4[0]
mem64[ input_0 + 272 ] = buf
buf = r5[0]
mem64[ input_0 + 336 ] = buf
buf = r6[0]
mem64[ input_0 + 400 ] = buf
buf = r7[0]
mem64[ input_0 + 464 ] = buf
# Pass 1, step 3: the eight words at input_0 + 24 + 64*j, j = 0..7.
# Same operation sequence as every other pass-1 step; only the offsets differ.
r0 = mem64[ input_0 + 24 ] x2
r1 = mem64[ input_0 + 88 ] x2
r2 = mem64[ input_0 + 152 ] x2
r3 = mem64[ input_0 + 216 ] x2
r4 = mem64[ input_0 + 280 ] x2
r5 = mem64[ input_0 + 344 ] x2
r6 = mem64[ input_0 + 408 ] x2
r7 = mem64[ input_0 + 472 ] x2
# Shift 32 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11
# Shift 16 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11
# Shift 8 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11
# Store element 0 of r0..r7 back to the same eight offsets, via the scalar buf.
buf = r0[0]
mem64[ input_0 + 24 ] = buf
buf = r1[0]
mem64[ input_0 + 88 ] = buf
buf = r2[0]
mem64[ input_0 + 152 ] = buf
buf = r3[0]
mem64[ input_0 + 216 ] = buf
buf = r4[0]
mem64[ input_0 + 280 ] = buf
buf = r5[0]
mem64[ input_0 + 344 ] = buf
buf = r6[0]
mem64[ input_0 + 408 ] = buf
buf = r7[0]
mem64[ input_0 + 472 ] = buf
# Pass 1, step 4: the eight words at input_0 + 32 + 64*j, j = 0..7.
# Same operation sequence as every other pass-1 step; only the offsets differ.
r0 = mem64[ input_0 + 32 ] x2
r1 = mem64[ input_0 + 96 ] x2
r2 = mem64[ input_0 + 160 ] x2
r3 = mem64[ input_0 + 224 ] x2
r4 = mem64[ input_0 + 288 ] x2
r5 = mem64[ input_0 + 352 ] x2
r6 = mem64[ input_0 + 416 ] x2
r7 = mem64[ input_0 + 480 ] x2
# Shift 32 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11
# Shift 16 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11
# Shift 8 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11
# Store element 0 of r0..r7 back to the same eight offsets, via the scalar buf.
buf = r0[0]
mem64[ input_0 + 32 ] = buf
buf = r1[0]
mem64[ input_0 + 96 ] = buf
buf = r2[0]
mem64[ input_0 + 160 ] = buf
buf = r3[0]
mem64[ input_0 + 224 ] = buf
buf = r4[0]
mem64[ input_0 + 288 ] = buf
buf = r5[0]
mem64[ input_0 + 352 ] = buf
buf = r6[0]
mem64[ input_0 + 416 ] = buf
buf = r7[0]
mem64[ input_0 + 480 ] = buf
# Pass 1, step 5: the eight words at input_0 + 40 + 64*j, j = 0..7.
# Same operation sequence as every other pass-1 step; only the offsets differ.
r0 = mem64[ input_0 + 40 ] x2
r1 = mem64[ input_0 + 104 ] x2
r2 = mem64[ input_0 + 168 ] x2
r3 = mem64[ input_0 + 232 ] x2
r4 = mem64[ input_0 + 296 ] x2
r5 = mem64[ input_0 + 360 ] x2
r6 = mem64[ input_0 + 424 ] x2
r7 = mem64[ input_0 + 488 ] x2
# Shift 32 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11
# Shift 16 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11
# Shift 8 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11
# Store element 0 of r0..r7 back to the same eight offsets, via the scalar buf.
buf = r0[0]
mem64[ input_0 + 40 ] = buf
buf = r1[0]
mem64[ input_0 + 104 ] = buf
buf = r2[0]
mem64[ input_0 + 168 ] = buf
buf = r3[0]
mem64[ input_0 + 232 ] = buf
buf = r4[0]
mem64[ input_0 + 296 ] = buf
buf = r5[0]
mem64[ input_0 + 360 ] = buf
buf = r6[0]
mem64[ input_0 + 424 ] = buf
buf = r7[0]
mem64[ input_0 + 488 ] = buf
# Pass 1, step 6: the eight words at input_0 + 48 + 64*j, j = 0..7.
# Same operation sequence as every other pass-1 step; only the offsets differ.
r0 = mem64[ input_0 + 48 ] x2
r1 = mem64[ input_0 + 112 ] x2
r2 = mem64[ input_0 + 176 ] x2
r3 = mem64[ input_0 + 240 ] x2
r4 = mem64[ input_0 + 304 ] x2
r5 = mem64[ input_0 + 368 ] x2
r6 = mem64[ input_0 + 432 ] x2
r7 = mem64[ input_0 + 496 ] x2
# Shift 32 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11
# Shift 16 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11
# Shift 8 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11
# Store element 0 of r0..r7 back to the same eight offsets, via the scalar buf.
buf = r0[0]
mem64[ input_0 + 48 ] = buf
buf = r1[0]
mem64[ input_0 + 112 ] = buf
buf = r2[0]
mem64[ input_0 + 176 ] = buf
buf = r3[0]
mem64[ input_0 + 240 ] = buf
buf = r4[0]
mem64[ input_0 + 304 ] = buf
buf = r5[0]
mem64[ input_0 + 368 ] = buf
buf = r6[0]
mem64[ input_0 + 432 ] = buf
buf = r7[0]
mem64[ input_0 + 496 ] = buf
# Pass 1, step 7 (last): the eight words at input_0 + 56 + 64*j, j = 0..7.
# Same operation sequence as every other pass-1 step; only the offsets differ.
r0 = mem64[ input_0 + 56 ] x2
r1 = mem64[ input_0 + 120 ] x2
r2 = mem64[ input_0 + 184 ] x2
r3 = mem64[ input_0 + 248 ] x2
r4 = mem64[ input_0 + 312 ] x2
r5 = mem64[ input_0 + 376 ] x2
r6 = mem64[ input_0 + 440 ] x2
r7 = mem64[ input_0 + 504 ] x2
# Shift 32 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
2x v10 = r4 << 32
2x v01 = r0 unsigned>> 32
v11 = r4 & mask1
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
2x v10 = r5 << 32
2x v01 = r1 unsigned>> 32
v11 = r5 & mask1
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
2x v10 = r6 << 32
2x v01 = r2 unsigned>> 32
v11 = r6 & mask1
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
2x v10 = r7 << 32
2x v01 = r3 unsigned>> 32
v11 = r7 & mask1
r3 = v00 | v10
r7 = v01 | v11
# Shift 16 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
4x v10 = r2 << 16
4x v01 = r0 unsigned>> 16
v11 = r2 & mask3
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
4x v10 = r3 << 16
4x v01 = r1 unsigned>> 16
v11 = r3 & mask3
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
4x v10 = r6 << 16
4x v01 = r4 unsigned>> 16
v11 = r6 & mask3
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
4x v10 = r7 << 16
4x v01 = r5 unsigned>> 16
v11 = r7 & mask3
r5 = v00 | v10
r7 = v01 | v11
# Shift 8 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
8x v10 = r1 << 8
8x v01 = r0 unsigned>> 8
v11 = r1 & mask5
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
8x v10 = r3 << 8
8x v01 = r2 unsigned>> 8
v11 = r3 & mask5
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
8x v10 = r5 << 8
8x v01 = r4 unsigned>> 8
v11 = r5 & mask5
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
8x v10 = r7 << 8
8x v01 = r6 unsigned>> 8
v11 = r7 & mask5
r6 = v00 | v10
r7 = v01 | v11
# Store element 0 of r0..r7 back to the same eight offsets, via the scalar buf.
buf = r0[0]
mem64[ input_0 + 56 ] = buf
buf = r1[0]
mem64[ input_0 + 120 ] = buf
buf = r2[0]
mem64[ input_0 + 184 ] = buf
buf = r3[0]
mem64[ input_0 + 248 ] = buf
buf = r4[0]
mem64[ input_0 + 312 ] = buf
buf = r5[0]
mem64[ input_0 + 376 ] = buf
buf = r6[0]
mem64[ input_0 + 440 ] = buf
buf = r7[0]
mem64[ input_0 + 504 ] = buf
# Reload mask0..mask5 with the constants for the second pass (shifts 4/2/1).
# The MASK* symbols are defined outside this file; presumably the _0 / _1
# variants select the low / high half of each bit group — TODO confirm.
mask0 aligned= mem128[ MASK2_0 ]
mask1 aligned= mem128[ MASK2_1 ]
mask2 aligned= mem128[ MASK1_0 ]
mask3 aligned= mem128[ MASK1_1 ]
mask4 aligned= mem128[ MASK0_0 ]
mask5 aligned= mem128[ MASK0_1 ]
# Pass 2, step 0: the eight consecutive words at input_0 + 0 .. input_0 + 56.
# Unlike pass 1, both operands of each pair are masked first and then shifted
# with an in-place 64-bit-lane shift (2x), so the mask decides which bits move.
r0 = mem64[ input_0 + 0 ] x2
r1 = mem64[ input_0 + 8 ] x2
r2 = mem64[ input_0 + 16 ] x2
r3 = mem64[ input_0 + 24 ] x2
r4 = mem64[ input_0 + 32 ] x2
r5 = mem64[ input_0 + 40 ] x2
r6 = mem64[ input_0 + 48 ] x2
r7 = mem64[ input_0 + 56 ] x2
# Shift 4 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11
# Shift 2 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11
# Shift 1 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11
# Pack element 0 of two adjacent registers into t0 and store 16 bytes at a time,
# overwriting the same 64 bytes that were loaded.
t0 = r0[0]r1[0]
mem128[ input_0 + 0 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 16 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 32 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 48 ] = t0
# Pass 2, step 1: the eight consecutive words at input_0 + 64 .. input_0 + 120.
# Same operation sequence as every other pass-2 step; only the offsets differ.
r0 = mem64[ input_0 + 64 ] x2
r1 = mem64[ input_0 + 72 ] x2
r2 = mem64[ input_0 + 80 ] x2
r3 = mem64[ input_0 + 88 ] x2
r4 = mem64[ input_0 + 96 ] x2
r5 = mem64[ input_0 + 104 ] x2
r6 = mem64[ input_0 + 112 ] x2
r7 = mem64[ input_0 + 120 ] x2
# Shift 4 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11
# Shift 2 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11
# Shift 1 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11
# Pack element 0 of two adjacent registers into t0 and store 16 bytes at a time,
# overwriting the same 64 bytes that were loaded.
t0 = r0[0]r1[0]
mem128[ input_0 + 64 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 80 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 96 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 112 ] = t0
# Pass 2, step 2: the eight consecutive words at input_0 + 128 .. input_0 + 184.
# Same operation sequence as every other pass-2 step; only the offsets differ.
r0 = mem64[ input_0 + 128 ] x2
r1 = mem64[ input_0 + 136 ] x2
r2 = mem64[ input_0 + 144 ] x2
r3 = mem64[ input_0 + 152 ] x2
r4 = mem64[ input_0 + 160 ] x2
r5 = mem64[ input_0 + 168 ] x2
r6 = mem64[ input_0 + 176 ] x2
r7 = mem64[ input_0 + 184 ] x2
# Shift 4 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11
# Shift 2 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11
# Shift 1 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11
# Pack element 0 of two adjacent registers into t0 and store 16 bytes at a time,
# overwriting the same 64 bytes that were loaded.
t0 = r0[0]r1[0]
mem128[ input_0 + 128 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 144 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 160 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 176 ] = t0
# Pass 2, step 3: the eight consecutive words at input_0 + 192 .. input_0 + 248.
# Same operation sequence as every other pass-2 step; only the offsets differ.
r0 = mem64[ input_0 + 192 ] x2
r1 = mem64[ input_0 + 200 ] x2
r2 = mem64[ input_0 + 208 ] x2
r3 = mem64[ input_0 + 216 ] x2
r4 = mem64[ input_0 + 224 ] x2
r5 = mem64[ input_0 + 232 ] x2
r6 = mem64[ input_0 + 240 ] x2
r7 = mem64[ input_0 + 248 ] x2
# Shift 4 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11
# Shift 2 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11
# Shift 1 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11
# Pack element 0 of two adjacent registers into t0 and store 16 bytes at a time,
# overwriting the same 64 bytes that were loaded.
t0 = r0[0]r1[0]
mem128[ input_0 + 192 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 208 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 224 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 240 ] = t0
# Pass 2, step 4: the eight consecutive words at input_0 + 256 .. input_0 + 312.
# Same operation sequence as every other pass-2 step; only the offsets differ.
r0 = mem64[ input_0 + 256 ] x2
r1 = mem64[ input_0 + 264 ] x2
r2 = mem64[ input_0 + 272 ] x2
r3 = mem64[ input_0 + 280 ] x2
r4 = mem64[ input_0 + 288 ] x2
r5 = mem64[ input_0 + 296 ] x2
r6 = mem64[ input_0 + 304 ] x2
r7 = mem64[ input_0 + 312 ] x2
# Shift 4 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11
# Shift 2 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11
# Shift 1 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11
# Pack element 0 of two adjacent registers into t0 and store 16 bytes at a time,
# overwriting the same 64 bytes that were loaded.
t0 = r0[0]r1[0]
mem128[ input_0 + 256 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 272 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 288 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 304 ] = t0
# Pass 2, step 5: the eight consecutive words at input_0 + 320 .. input_0 + 376.
# Same operation sequence as every other pass-2 step; only the offsets differ.
r0 = mem64[ input_0 + 320 ] x2
r1 = mem64[ input_0 + 328 ] x2
r2 = mem64[ input_0 + 336 ] x2
r3 = mem64[ input_0 + 344 ] x2
r4 = mem64[ input_0 + 352 ] x2
r5 = mem64[ input_0 + 360 ] x2
r6 = mem64[ input_0 + 368 ] x2
r7 = mem64[ input_0 + 376 ] x2
# Shift 4 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11
# Shift 2 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11
# Shift 1 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11
# Pack element 0 of two adjacent registers into t0 and store 16 bytes at a time,
# overwriting the same 64 bytes that were loaded.
t0 = r0[0]r1[0]
mem128[ input_0 + 320 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 336 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 352 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 368 ] = t0
# Pass 2, step 6: the eight consecutive words at input_0 + 384 .. input_0 + 440.
# Same operation sequence as every other pass-2 step; only the offsets differ.
r0 = mem64[ input_0 + 384 ] x2
r1 = mem64[ input_0 + 392 ] x2
r2 = mem64[ input_0 + 400 ] x2
r3 = mem64[ input_0 + 408 ] x2
r4 = mem64[ input_0 + 416 ] x2
r5 = mem64[ input_0 + 424 ] x2
r6 = mem64[ input_0 + 432 ] x2
r7 = mem64[ input_0 + 440 ] x2
# Shift 4 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11
# Shift 2 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11
# Shift 1 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11
# Pack element 0 of two adjacent registers into t0 and store 16 bytes at a time,
# overwriting the same 64 bytes that were loaded.
t0 = r0[0]r1[0]
mem128[ input_0 + 384 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 400 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 416 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 432 ] = t0
# Pass 2, step 7 (last): the eight consecutive words at input_0 + 448 .. input_0 + 504.
# Same operation sequence as every other pass-2 step; only the offsets differ.
r0 = mem64[ input_0 + 448 ] x2
r1 = mem64[ input_0 + 456 ] x2
r2 = mem64[ input_0 + 464 ] x2
r3 = mem64[ input_0 + 472 ] x2
r4 = mem64[ input_0 + 480 ] x2
r5 = mem64[ input_0 + 488 ] x2
r6 = mem64[ input_0 + 496 ] x2
r7 = mem64[ input_0 + 504 ] x2
# Shift 4 with mask0/mask1: pairs (r0,r4) (r1,r5) (r2,r6) (r3,r7).
v00 = r0 & mask0
v10 = r4 & mask0
2x v10 <<= 4
v01 = r0 & mask1
v11 = r4 & mask1
2x v01 unsigned>>= 4
r0 = v00 | v10
r4 = v01 | v11
v00 = r1 & mask0
v10 = r5 & mask0
2x v10 <<= 4
v01 = r1 & mask1
v11 = r5 & mask1
2x v01 unsigned>>= 4
r1 = v00 | v10
r5 = v01 | v11
v00 = r2 & mask0
v10 = r6 & mask0
2x v10 <<= 4
v01 = r2 & mask1
v11 = r6 & mask1
2x v01 unsigned>>= 4
r2 = v00 | v10
r6 = v01 | v11
v00 = r3 & mask0
v10 = r7 & mask0
2x v10 <<= 4
v01 = r3 & mask1
v11 = r7 & mask1
2x v01 unsigned>>= 4
r3 = v00 | v10
r7 = v01 | v11
# Shift 2 with mask2/mask3: pairs (r0,r2) (r1,r3) (r4,r6) (r5,r7).
v00 = r0 & mask2
v10 = r2 & mask2
2x v10 <<= 2
v01 = r0 & mask3
v11 = r2 & mask3
2x v01 unsigned>>= 2
r0 = v00 | v10
r2 = v01 | v11
v00 = r1 & mask2
v10 = r3 & mask2
2x v10 <<= 2
v01 = r1 & mask3
v11 = r3 & mask3
2x v01 unsigned>>= 2
r1 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask2
v10 = r6 & mask2
2x v10 <<= 2
v01 = r4 & mask3
v11 = r6 & mask3
2x v01 unsigned>>= 2
r4 = v00 | v10
r6 = v01 | v11
v00 = r5 & mask2
v10 = r7 & mask2
2x v10 <<= 2
v01 = r5 & mask3
v11 = r7 & mask3
2x v01 unsigned>>= 2
r5 = v00 | v10
r7 = v01 | v11
# Shift 1 with mask4/mask5: pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7).
v00 = r0 & mask4
v10 = r1 & mask4
2x v10 <<= 1
v01 = r0 & mask5
v11 = r1 & mask5
2x v01 unsigned>>= 1
r0 = v00 | v10
r1 = v01 | v11
v00 = r2 & mask4
v10 = r3 & mask4
2x v10 <<= 1
v01 = r2 & mask5
v11 = r3 & mask5
2x v01 unsigned>>= 1
r2 = v00 | v10
r3 = v01 | v11
v00 = r4 & mask4
v10 = r5 & mask4
2x v10 <<= 1
v01 = r4 & mask5
v11 = r5 & mask5
2x v01 unsigned>>= 1
r4 = v00 | v10
r5 = v01 | v11
v00 = r6 & mask4
v10 = r7 & mask4
2x v10 <<= 1
v01 = r6 & mask5
v11 = r7 & mask5
2x v01 unsigned>>= 1
r6 = v00 | v10
r7 = v01 | v11
# Pack element 0 of two adjacent registers into t0 and store 16 bytes at a time,
# overwriting the same 64 bytes that were loaded.
t0 = r0[0]r1[0]
mem128[ input_0 + 448 ] = t0
t0 = r2[0]r3[0]
mem128[ input_0 + 464 ] = t0
t0 = r4[0]r5[0]
mem128[ input_0 + 480 ] = t0
t0 = r6[0]r7[0]
mem128[ input_0 + 496 ] = t0
# All 512 bytes at input_0 have now been rewritten by both passes.
return