-rw-r--r-- 31045 libmceliece-20240812/crypto_kem/460896/avx/transpose_64x128_sp_asm.q raw
# Register declarations for transpose_64x128_sp_asm; every variable is a reg128.
# x0..x7: the eight 128-bit words of the group currently being processed.
reg128 x0
reg128 x1
reg128 x2
reg128 x3
reg128 x4
reg128 x5
reg128 x6
reg128 x7
# t0, t1: declared here but not referenced by any statement in the visible code.
reg128 t0
reg128 t1
# v00, v01, v10, v11: temporaries for the masked/shifted pieces of a word pair
# before they are OR-ed back together.
reg128 v00
reg128 v01
reg128 v10
reg128 v11
# mask0..mask5: three pairs of masks, loaded from memory constants (once before
# each of the two passes of the routine).
reg128 mask0
reg128 mask1
reg128 mask2
reg128 mask3
reg128 mask4
reg128 mask5
# Entry point of the routine. The code below reads and rewrites, in place, the
# 64 128-bit words at input_0 + 0 .. input_0 + 1008 (1024 bytes in total).
# NOTE(review): input_0 is not declared in this file; presumably it is the
# predefined name for the first function argument -- confirm.
enter transpose_64x128_sp_asm
# Masks for the first pass. MASK5_*, MASK4_* and MASK3_* are constants defined
# outside this file; their values are not visible here.
# mask0/mask1 are used with the shift-by-32 step, mask2/mask3 with the
# shift-by-16 step and mask4/mask5 with the shift-by-8 step.
# NOTE(review): "aligned=" presumably requests an aligned 128-bit load, which
# would require the constants to be 16-byte aligned -- confirm.
mask0 aligned= mem128[ MASK5_0 ]
mask1 aligned= mem128[ MASK5_1 ]
mask2 aligned= mem128[ MASK4_0 ]
mask3 aligned= mem128[ MASK4_1 ]
mask4 aligned= mem128[ MASK3_0 ]
mask5 aligned= mem128[ MASK3_1 ]
# First pass, group with base offset 0: load the eight 128-bit words at
# input_0 + 0 + 128*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK5_0/1, MASK4_0/1, MASK3_0/1.
x0 = mem128[ input_0 + 0 ]
x1 = mem128[ input_0 + 128 ]
x2 = mem128[ input_0 + 256 ]
x3 = mem128[ input_0 + 384 ]
x4 = mem128[ input_0 + 512 ]
x5 = mem128[ input_0 + 640 ]
x6 = mem128[ input_0 + 768 ]
x7 = mem128[ input_0 + 896 ]
# Step 1 (shift 32, mask0/mask1): combine x_i with x_(i+4) for i = 0..3:
#   x_i     <- (x_i & mask0) | (x_(i+4) << 32)
#   x_(i+4) <- (x_i >> 32)   | (x_(i+4) & mask1)    (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
2x v10 = x4 << 32
2x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
2x v10 = x5 << 32
2x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
2x v10 = x6 << 32
2x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
2x v10 = x7 << 32
2x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 16, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
# NOTE(review): "4x" presumably means per 32-bit lane -- confirm.
v00 = x0 & mask2
4x v10 = x2 << 16
4x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
4x v10 = x3 << 16
4x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
4x v10 = x6 << 16
4x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
4x v10 = x7 << 16
4x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 8, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
# NOTE(review): "8x" presumably means per 16-bit lane -- confirm.
v00 = x0 & mask4
8x v10 = x1 << 8
8x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
8x v10 = x3 << 8
8x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
8x v10 = x5 << 8
8x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
8x v10 = x7 << 8
8x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 0 ] = x0
mem128[ input_0 + 128 ] = x1
mem128[ input_0 + 256 ] = x2
mem128[ input_0 + 384 ] = x3
mem128[ input_0 + 512 ] = x4
mem128[ input_0 + 640 ] = x5
mem128[ input_0 + 768 ] = x6
mem128[ input_0 + 896 ] = x7
# First pass, group with base offset 16: load the eight 128-bit words at
# input_0 + 16 + 128*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK5_0/1, MASK4_0/1, MASK3_0/1.
x0 = mem128[ input_0 + 16 ]
x1 = mem128[ input_0 + 144 ]
x2 = mem128[ input_0 + 272 ]
x3 = mem128[ input_0 + 400 ]
x4 = mem128[ input_0 + 528 ]
x5 = mem128[ input_0 + 656 ]
x6 = mem128[ input_0 + 784 ]
x7 = mem128[ input_0 + 912 ]
# Step 1 (shift 32, mask0/mask1): combine x_i with x_(i+4) for i = 0..3:
#   x_i     <- (x_i & mask0) | (x_(i+4) << 32)
#   x_(i+4) <- (x_i >> 32)   | (x_(i+4) & mask1)    (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
2x v10 = x4 << 32
2x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
2x v10 = x5 << 32
2x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
2x v10 = x6 << 32
2x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
2x v10 = x7 << 32
2x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 16, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
# NOTE(review): "4x" presumably means per 32-bit lane -- confirm.
v00 = x0 & mask2
4x v10 = x2 << 16
4x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
4x v10 = x3 << 16
4x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
4x v10 = x6 << 16
4x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
4x v10 = x7 << 16
4x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 8, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
# NOTE(review): "8x" presumably means per 16-bit lane -- confirm.
v00 = x0 & mask4
8x v10 = x1 << 8
8x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
8x v10 = x3 << 8
8x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
8x v10 = x5 << 8
8x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
8x v10 = x7 << 8
8x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 16 ] = x0
mem128[ input_0 + 144 ] = x1
mem128[ input_0 + 272 ] = x2
mem128[ input_0 + 400 ] = x3
mem128[ input_0 + 528 ] = x4
mem128[ input_0 + 656 ] = x5
mem128[ input_0 + 784 ] = x6
mem128[ input_0 + 912 ] = x7
# First pass, group with base offset 32: load the eight 128-bit words at
# input_0 + 32 + 128*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK5_0/1, MASK4_0/1, MASK3_0/1.
x0 = mem128[ input_0 + 32 ]
x1 = mem128[ input_0 + 160 ]
x2 = mem128[ input_0 + 288 ]
x3 = mem128[ input_0 + 416 ]
x4 = mem128[ input_0 + 544 ]
x5 = mem128[ input_0 + 672 ]
x6 = mem128[ input_0 + 800 ]
x7 = mem128[ input_0 + 928 ]
# Step 1 (shift 32, mask0/mask1): combine x_i with x_(i+4) for i = 0..3:
#   x_i     <- (x_i & mask0) | (x_(i+4) << 32)
#   x_(i+4) <- (x_i >> 32)   | (x_(i+4) & mask1)    (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
2x v10 = x4 << 32
2x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
2x v10 = x5 << 32
2x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
2x v10 = x6 << 32
2x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
2x v10 = x7 << 32
2x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 16, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
# NOTE(review): "4x" presumably means per 32-bit lane -- confirm.
v00 = x0 & mask2
4x v10 = x2 << 16
4x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
4x v10 = x3 << 16
4x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
4x v10 = x6 << 16
4x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
4x v10 = x7 << 16
4x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 8, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
# NOTE(review): "8x" presumably means per 16-bit lane -- confirm.
v00 = x0 & mask4
8x v10 = x1 << 8
8x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
8x v10 = x3 << 8
8x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
8x v10 = x5 << 8
8x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
8x v10 = x7 << 8
8x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 32 ] = x0
mem128[ input_0 + 160 ] = x1
mem128[ input_0 + 288 ] = x2
mem128[ input_0 + 416 ] = x3
mem128[ input_0 + 544 ] = x4
mem128[ input_0 + 672 ] = x5
mem128[ input_0 + 800 ] = x6
mem128[ input_0 + 928 ] = x7
# First pass, group with base offset 48: load the eight 128-bit words at
# input_0 + 48 + 128*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK5_0/1, MASK4_0/1, MASK3_0/1.
x0 = mem128[ input_0 + 48 ]
x1 = mem128[ input_0 + 176 ]
x2 = mem128[ input_0 + 304 ]
x3 = mem128[ input_0 + 432 ]
x4 = mem128[ input_0 + 560 ]
x5 = mem128[ input_0 + 688 ]
x6 = mem128[ input_0 + 816 ]
x7 = mem128[ input_0 + 944 ]
# Step 1 (shift 32, mask0/mask1): combine x_i with x_(i+4) for i = 0..3:
#   x_i     <- (x_i & mask0) | (x_(i+4) << 32)
#   x_(i+4) <- (x_i >> 32)   | (x_(i+4) & mask1)    (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
2x v10 = x4 << 32
2x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
2x v10 = x5 << 32
2x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
2x v10 = x6 << 32
2x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
2x v10 = x7 << 32
2x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 16, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
# NOTE(review): "4x" presumably means per 32-bit lane -- confirm.
v00 = x0 & mask2
4x v10 = x2 << 16
4x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
4x v10 = x3 << 16
4x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
4x v10 = x6 << 16
4x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
4x v10 = x7 << 16
4x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 8, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
# NOTE(review): "8x" presumably means per 16-bit lane -- confirm.
v00 = x0 & mask4
8x v10 = x1 << 8
8x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
8x v10 = x3 << 8
8x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
8x v10 = x5 << 8
8x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
8x v10 = x7 << 8
8x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 48 ] = x0
mem128[ input_0 + 176 ] = x1
mem128[ input_0 + 304 ] = x2
mem128[ input_0 + 432 ] = x3
mem128[ input_0 + 560 ] = x4
mem128[ input_0 + 688 ] = x5
mem128[ input_0 + 816 ] = x6
mem128[ input_0 + 944 ] = x7
# First pass, group with base offset 64: load the eight 128-bit words at
# input_0 + 64 + 128*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK5_0/1, MASK4_0/1, MASK3_0/1.
x0 = mem128[ input_0 + 64 ]
x1 = mem128[ input_0 + 192 ]
x2 = mem128[ input_0 + 320 ]
x3 = mem128[ input_0 + 448 ]
x4 = mem128[ input_0 + 576 ]
x5 = mem128[ input_0 + 704 ]
x6 = mem128[ input_0 + 832 ]
x7 = mem128[ input_0 + 960 ]
# Step 1 (shift 32, mask0/mask1): combine x_i with x_(i+4) for i = 0..3:
#   x_i     <- (x_i & mask0) | (x_(i+4) << 32)
#   x_(i+4) <- (x_i >> 32)   | (x_(i+4) & mask1)    (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
2x v10 = x4 << 32
2x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
2x v10 = x5 << 32
2x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
2x v10 = x6 << 32
2x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
2x v10 = x7 << 32
2x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 16, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
# NOTE(review): "4x" presumably means per 32-bit lane -- confirm.
v00 = x0 & mask2
4x v10 = x2 << 16
4x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
4x v10 = x3 << 16
4x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
4x v10 = x6 << 16
4x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
4x v10 = x7 << 16
4x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 8, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
# NOTE(review): "8x" presumably means per 16-bit lane -- confirm.
v00 = x0 & mask4
8x v10 = x1 << 8
8x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
8x v10 = x3 << 8
8x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
8x v10 = x5 << 8
8x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
8x v10 = x7 << 8
8x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 64 ] = x0
mem128[ input_0 + 192 ] = x1
mem128[ input_0 + 320 ] = x2
mem128[ input_0 + 448 ] = x3
mem128[ input_0 + 576 ] = x4
mem128[ input_0 + 704 ] = x5
mem128[ input_0 + 832 ] = x6
mem128[ input_0 + 960 ] = x7
# First pass, group with base offset 80: load the eight 128-bit words at
# input_0 + 80 + 128*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK5_0/1, MASK4_0/1, MASK3_0/1.
x0 = mem128[ input_0 + 80 ]
x1 = mem128[ input_0 + 208 ]
x2 = mem128[ input_0 + 336 ]
x3 = mem128[ input_0 + 464 ]
x4 = mem128[ input_0 + 592 ]
x5 = mem128[ input_0 + 720 ]
x6 = mem128[ input_0 + 848 ]
x7 = mem128[ input_0 + 976 ]
# Step 1 (shift 32, mask0/mask1): combine x_i with x_(i+4) for i = 0..3:
#   x_i     <- (x_i & mask0) | (x_(i+4) << 32)
#   x_(i+4) <- (x_i >> 32)   | (x_(i+4) & mask1)    (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
2x v10 = x4 << 32
2x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
2x v10 = x5 << 32
2x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
2x v10 = x6 << 32
2x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
2x v10 = x7 << 32
2x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 16, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
# NOTE(review): "4x" presumably means per 32-bit lane -- confirm.
v00 = x0 & mask2
4x v10 = x2 << 16
4x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
4x v10 = x3 << 16
4x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
4x v10 = x6 << 16
4x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
4x v10 = x7 << 16
4x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 8, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
# NOTE(review): "8x" presumably means per 16-bit lane -- confirm.
v00 = x0 & mask4
8x v10 = x1 << 8
8x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
8x v10 = x3 << 8
8x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
8x v10 = x5 << 8
8x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
8x v10 = x7 << 8
8x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 80 ] = x0
mem128[ input_0 + 208 ] = x1
mem128[ input_0 + 336 ] = x2
mem128[ input_0 + 464 ] = x3
mem128[ input_0 + 592 ] = x4
mem128[ input_0 + 720 ] = x5
mem128[ input_0 + 848 ] = x6
mem128[ input_0 + 976 ] = x7
# First pass, group with base offset 96: load the eight 128-bit words at
# input_0 + 96 + 128*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK5_0/1, MASK4_0/1, MASK3_0/1.
x0 = mem128[ input_0 + 96 ]
x1 = mem128[ input_0 + 224 ]
x2 = mem128[ input_0 + 352 ]
x3 = mem128[ input_0 + 480 ]
x4 = mem128[ input_0 + 608 ]
x5 = mem128[ input_0 + 736 ]
x6 = mem128[ input_0 + 864 ]
x7 = mem128[ input_0 + 992 ]
# Step 1 (shift 32, mask0/mask1): combine x_i with x_(i+4) for i = 0..3:
#   x_i     <- (x_i & mask0) | (x_(i+4) << 32)
#   x_(i+4) <- (x_i >> 32)   | (x_(i+4) & mask1)    (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
2x v10 = x4 << 32
2x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
2x v10 = x5 << 32
2x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
2x v10 = x6 << 32
2x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
2x v10 = x7 << 32
2x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 16, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
# NOTE(review): "4x" presumably means per 32-bit lane -- confirm.
v00 = x0 & mask2
4x v10 = x2 << 16
4x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
4x v10 = x3 << 16
4x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
4x v10 = x6 << 16
4x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
4x v10 = x7 << 16
4x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 8, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
# NOTE(review): "8x" presumably means per 16-bit lane -- confirm.
v00 = x0 & mask4
8x v10 = x1 << 8
8x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
8x v10 = x3 << 8
8x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
8x v10 = x5 << 8
8x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
8x v10 = x7 << 8
8x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 96 ] = x0
mem128[ input_0 + 224 ] = x1
mem128[ input_0 + 352 ] = x2
mem128[ input_0 + 480 ] = x3
mem128[ input_0 + 608 ] = x4
mem128[ input_0 + 736 ] = x5
mem128[ input_0 + 864 ] = x6
mem128[ input_0 + 992 ] = x7
# First pass, group with base offset 112 (last group of this pass): load the
# eight 128-bit words at input_0 + 112 + 128*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK5_0/1, MASK4_0/1, MASK3_0/1.
x0 = mem128[ input_0 + 112 ]
x1 = mem128[ input_0 + 240 ]
x2 = mem128[ input_0 + 368 ]
x3 = mem128[ input_0 + 496 ]
x4 = mem128[ input_0 + 624 ]
x5 = mem128[ input_0 + 752 ]
x6 = mem128[ input_0 + 880 ]
x7 = mem128[ input_0 + 1008 ]
# Step 1 (shift 32, mask0/mask1): combine x_i with x_(i+4) for i = 0..3:
#   x_i     <- (x_i & mask0) | (x_(i+4) << 32)
#   x_(i+4) <- (x_i >> 32)   | (x_(i+4) & mask1)    (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
2x v10 = x4 << 32
2x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
2x v10 = x5 << 32
2x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
2x v10 = x6 << 32
2x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
2x v10 = x7 << 32
2x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 16, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
# NOTE(review): "4x" presumably means per 32-bit lane -- confirm.
v00 = x0 & mask2
4x v10 = x2 << 16
4x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
4x v10 = x3 << 16
4x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
4x v10 = x6 << 16
4x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
4x v10 = x7 << 16
4x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 8, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
# NOTE(review): "8x" presumably means per 16-bit lane -- confirm.
v00 = x0 & mask4
8x v10 = x1 << 8
8x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
8x v10 = x3 << 8
8x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
8x v10 = x5 << 8
8x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
8x v10 = x7 << 8
8x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 112 ] = x0
mem128[ input_0 + 240 ] = x1
mem128[ input_0 + 368 ] = x2
mem128[ input_0 + 496 ] = x3
mem128[ input_0 + 624 ] = x4
mem128[ input_0 + 752 ] = x5
mem128[ input_0 + 880 ] = x6
mem128[ input_0 + 1008 ] = x7
# Reload mask0..mask5 for the second pass, overwriting the first-pass masks.
# MASK2_*, MASK1_* and MASK0_* are constants defined outside this file; their
# values are not visible here.
# mask0/mask1 are used with the shift-by-4 step, mask2/mask3 with the
# shift-by-2 step and mask4/mask5 with the shift-by-1 step.
mask0 aligned= mem128[ MASK2_0 ]
mask1 aligned= mem128[ MASK2_1 ]
mask2 aligned= mem128[ MASK1_0 ]
mask3 aligned= mem128[ MASK1_1 ]
mask4 aligned= mem128[ MASK0_0 ]
mask5 aligned= mem128[ MASK0_1 ]
# Second pass, group with base offset 0: load the eight consecutive 128-bit
# words at input_0 + 0 + 16*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK2_0/1, MASK1_0/1, MASK0_0/1.
x0 = mem128[ input_0 + 0 ]
x1 = mem128[ input_0 + 16 ]
x2 = mem128[ input_0 + 32 ]
x3 = mem128[ input_0 + 48 ]
x4 = mem128[ input_0 + 64 ]
x5 = mem128[ input_0 + 80 ]
x6 = mem128[ input_0 + 96 ]
x7 = mem128[ input_0 + 112 ]
# Step 1 (shift 4, mask0/mask1): combine x_i with x_(i+4) for i = 0..3.
# Both operands are masked first and the shift is applied to the masked value:
#   x_i     <- (x_i & mask0)        | ((x_(i+4) & mask0) << 4)
#   x_(i+4) <- ((x_i & mask1) >> 4) | (x_(i+4) & mask1)   (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
v10 = x4 & mask0
2x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
2x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
2x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
2x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
2x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
2x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
2x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
2x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 2, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
v00 = x0 & mask2
v10 = x2 & mask2
2x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
2x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
2x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
2x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
2x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
2x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
2x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
2x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 1, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
v00 = x0 & mask4
v10 = x1 & mask4
2x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
2x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
2x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
2x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
2x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
2x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
2x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
2x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 0 ] = x0
mem128[ input_0 + 16 ] = x1
mem128[ input_0 + 32 ] = x2
mem128[ input_0 + 48 ] = x3
mem128[ input_0 + 64 ] = x4
mem128[ input_0 + 80 ] = x5
mem128[ input_0 + 96 ] = x6
mem128[ input_0 + 112 ] = x7
# Second pass, group with base offset 128: load the eight consecutive 128-bit
# words at input_0 + 128 + 16*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK2_0/1, MASK1_0/1, MASK0_0/1.
x0 = mem128[ input_0 + 128 ]
x1 = mem128[ input_0 + 144 ]
x2 = mem128[ input_0 + 160 ]
x3 = mem128[ input_0 + 176 ]
x4 = mem128[ input_0 + 192 ]
x5 = mem128[ input_0 + 208 ]
x6 = mem128[ input_0 + 224 ]
x7 = mem128[ input_0 + 240 ]
# Step 1 (shift 4, mask0/mask1): combine x_i with x_(i+4) for i = 0..3.
# Both operands are masked first and the shift is applied to the masked value:
#   x_i     <- (x_i & mask0)        | ((x_(i+4) & mask0) << 4)
#   x_(i+4) <- ((x_i & mask1) >> 4) | (x_(i+4) & mask1)   (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
v10 = x4 & mask0
2x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
2x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
2x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
2x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
2x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
2x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
2x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
2x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 2, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
v00 = x0 & mask2
v10 = x2 & mask2
2x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
2x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
2x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
2x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
2x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
2x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
2x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
2x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 1, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
v00 = x0 & mask4
v10 = x1 & mask4
2x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
2x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
2x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
2x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
2x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
2x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
2x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
2x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 128 ] = x0
mem128[ input_0 + 144 ] = x1
mem128[ input_0 + 160 ] = x2
mem128[ input_0 + 176 ] = x3
mem128[ input_0 + 192 ] = x4
mem128[ input_0 + 208 ] = x5
mem128[ input_0 + 224 ] = x6
mem128[ input_0 + 240 ] = x7
# Second pass, group with base offset 256: load the eight consecutive 128-bit
# words at input_0 + 256 + 16*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK2_0/1, MASK1_0/1, MASK0_0/1.
x0 = mem128[ input_0 + 256 ]
x1 = mem128[ input_0 + 272 ]
x2 = mem128[ input_0 + 288 ]
x3 = mem128[ input_0 + 304 ]
x4 = mem128[ input_0 + 320 ]
x5 = mem128[ input_0 + 336 ]
x6 = mem128[ input_0 + 352 ]
x7 = mem128[ input_0 + 368 ]
# Step 1 (shift 4, mask0/mask1): combine x_i with x_(i+4) for i = 0..3.
# Both operands are masked first and the shift is applied to the masked value:
#   x_i     <- (x_i & mask0)        | ((x_(i+4) & mask0) << 4)
#   x_(i+4) <- ((x_i & mask1) >> 4) | (x_(i+4) & mask1)   (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
v10 = x4 & mask0
2x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
2x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
2x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
2x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
2x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
2x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
2x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
2x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 2, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
v00 = x0 & mask2
v10 = x2 & mask2
2x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
2x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
2x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
2x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
2x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
2x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
2x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
2x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 1, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
v00 = x0 & mask4
v10 = x1 & mask4
2x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
2x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
2x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
2x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
2x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
2x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
2x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
2x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 256 ] = x0
mem128[ input_0 + 272 ] = x1
mem128[ input_0 + 288 ] = x2
mem128[ input_0 + 304 ] = x3
mem128[ input_0 + 320 ] = x4
mem128[ input_0 + 336 ] = x5
mem128[ input_0 + 352 ] = x6
mem128[ input_0 + 368 ] = x7
# Second pass, group with base offset 384: load the eight consecutive 128-bit
# words at input_0 + 384 + 16*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK2_0/1, MASK1_0/1, MASK0_0/1.
x0 = mem128[ input_0 + 384 ]
x1 = mem128[ input_0 + 400 ]
x2 = mem128[ input_0 + 416 ]
x3 = mem128[ input_0 + 432 ]
x4 = mem128[ input_0 + 448 ]
x5 = mem128[ input_0 + 464 ]
x6 = mem128[ input_0 + 480 ]
x7 = mem128[ input_0 + 496 ]
# Step 1 (shift 4, mask0/mask1): combine x_i with x_(i+4) for i = 0..3.
# Both operands are masked first and the shift is applied to the masked value:
#   x_i     <- (x_i & mask0)        | ((x_(i+4) & mask0) << 4)
#   x_(i+4) <- ((x_i & mask1) >> 4) | (x_(i+4) & mask1)   (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
v10 = x4 & mask0
2x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
2x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
2x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
2x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
2x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
2x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
2x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
2x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 2, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
v00 = x0 & mask2
v10 = x2 & mask2
2x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
2x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
2x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
2x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
2x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
2x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
2x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
2x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 1, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
v00 = x0 & mask4
v10 = x1 & mask4
2x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
2x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
2x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
2x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
2x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
2x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
2x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
2x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 384 ] = x0
mem128[ input_0 + 400 ] = x1
mem128[ input_0 + 416 ] = x2
mem128[ input_0 + 432 ] = x3
mem128[ input_0 + 448 ] = x4
mem128[ input_0 + 464 ] = x5
mem128[ input_0 + 480 ] = x6
mem128[ input_0 + 496 ] = x7
# Second pass, group with base offset 512: load the eight consecutive 128-bit
# words at input_0 + 512 + 16*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK2_0/1, MASK1_0/1, MASK0_0/1.
x0 = mem128[ input_0 + 512 ]
x1 = mem128[ input_0 + 528 ]
x2 = mem128[ input_0 + 544 ]
x3 = mem128[ input_0 + 560 ]
x4 = mem128[ input_0 + 576 ]
x5 = mem128[ input_0 + 592 ]
x6 = mem128[ input_0 + 608 ]
x7 = mem128[ input_0 + 624 ]
# Step 1 (shift 4, mask0/mask1): combine x_i with x_(i+4) for i = 0..3.
# Both operands are masked first and the shift is applied to the masked value:
#   x_i     <- (x_i & mask0)        | ((x_(i+4) & mask0) << 4)
#   x_(i+4) <- ((x_i & mask1) >> 4) | (x_(i+4) & mask1)   (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
v10 = x4 & mask0
2x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
2x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
2x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
2x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
2x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
2x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
2x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
2x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 2, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
v00 = x0 & mask2
v10 = x2 & mask2
2x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
2x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
2x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
2x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
2x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
2x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
2x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
2x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 1, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
v00 = x0 & mask4
v10 = x1 & mask4
2x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
2x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
2x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
2x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
2x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
2x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
2x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
2x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 512 ] = x0
mem128[ input_0 + 528 ] = x1
mem128[ input_0 + 544 ] = x2
mem128[ input_0 + 560 ] = x3
mem128[ input_0 + 576 ] = x4
mem128[ input_0 + 592 ] = x5
mem128[ input_0 + 608 ] = x6
mem128[ input_0 + 624 ] = x7
# Second pass, group with base offset 640: load the eight consecutive 128-bit
# words at input_0 + 640 + 16*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK2_0/1, MASK1_0/1, MASK0_0/1.
x0 = mem128[ input_0 + 640 ]
x1 = mem128[ input_0 + 656 ]
x2 = mem128[ input_0 + 672 ]
x3 = mem128[ input_0 + 688 ]
x4 = mem128[ input_0 + 704 ]
x5 = mem128[ input_0 + 720 ]
x6 = mem128[ input_0 + 736 ]
x7 = mem128[ input_0 + 752 ]
# Step 1 (shift 4, mask0/mask1): combine x_i with x_(i+4) for i = 0..3.
# Both operands are masked first and the shift is applied to the masked value:
#   x_i     <- (x_i & mask0)        | ((x_(i+4) & mask0) << 4)
#   x_(i+4) <- ((x_i & mask1) >> 4) | (x_(i+4) & mask1)   (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
v10 = x4 & mask0
2x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
2x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
2x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
2x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
2x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
2x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
2x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
2x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 2, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
v00 = x0 & mask2
v10 = x2 & mask2
2x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
2x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
2x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
2x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
2x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
2x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
2x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
2x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 1, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
v00 = x0 & mask4
v10 = x1 & mask4
2x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
2x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
2x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
2x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
2x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
2x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
2x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
2x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 640 ] = x0
mem128[ input_0 + 656 ] = x1
mem128[ input_0 + 672 ] = x2
mem128[ input_0 + 688 ] = x3
mem128[ input_0 + 704 ] = x4
mem128[ input_0 + 720 ] = x5
mem128[ input_0 + 736 ] = x6
mem128[ input_0 + 752 ] = x7
# Second pass, group with base offset 768: load the eight consecutive 128-bit
# words at input_0 + 768 + 16*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK2_0/1, MASK1_0/1, MASK0_0/1.
x0 = mem128[ input_0 + 768 ]
x1 = mem128[ input_0 + 784 ]
x2 = mem128[ input_0 + 800 ]
x3 = mem128[ input_0 + 816 ]
x4 = mem128[ input_0 + 832 ]
x5 = mem128[ input_0 + 848 ]
x6 = mem128[ input_0 + 864 ]
x7 = mem128[ input_0 + 880 ]
# Step 1 (shift 4, mask0/mask1): combine x_i with x_(i+4) for i = 0..3.
# Both operands are masked first and the shift is applied to the masked value:
#   x_i     <- (x_i & mask0)        | ((x_(i+4) & mask0) << 4)
#   x_(i+4) <- ((x_i & mask1) >> 4) | (x_(i+4) & mask1)   (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
v10 = x4 & mask0
2x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
2x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
2x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
2x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
2x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
2x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
2x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
2x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 2, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
v00 = x0 & mask2
v10 = x2 & mask2
2x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
2x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
2x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
2x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
2x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
2x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
2x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
2x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 1, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
v00 = x0 & mask4
v10 = x1 & mask4
2x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
2x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
2x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
2x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
2x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
2x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
2x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
2x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 768 ] = x0
mem128[ input_0 + 784 ] = x1
mem128[ input_0 + 800 ] = x2
mem128[ input_0 + 816 ] = x3
mem128[ input_0 + 832 ] = x4
mem128[ input_0 + 848 ] = x5
mem128[ input_0 + 864 ] = x6
mem128[ input_0 + 880 ] = x7
# Second pass, group with base offset 896 (last group): load the eight
# consecutive 128-bit words at input_0 + 896 + 16*i (i = 0..7) into x0..x7.
# mask0..mask5 hold the values loaded from MASK2_0/1, MASK1_0/1, MASK0_0/1.
x0 = mem128[ input_0 + 896 ]
x1 = mem128[ input_0 + 912 ]
x2 = mem128[ input_0 + 928 ]
x3 = mem128[ input_0 + 944 ]
x4 = mem128[ input_0 + 960 ]
x5 = mem128[ input_0 + 976 ]
x6 = mem128[ input_0 + 992 ]
x7 = mem128[ input_0 + 1008 ]
# Step 1 (shift 4, mask0/mask1): combine x_i with x_(i+4) for i = 0..3.
# Both operands are masked first and the shift is applied to the masked value:
#   x_i     <- (x_i & mask0)        | ((x_(i+4) & mask0) << 4)
#   x_(i+4) <- ((x_i & mask1) >> 4) | (x_(i+4) & mask1)   (right-hand x_i is the old value)
# NOTE(review): the "2x" prefix presumably makes the shift act on each 64-bit
# lane separately -- confirm against the qhasm machine description.
v00 = x0 & mask0
v10 = x4 & mask0
2x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
2x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
2x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
2x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
2x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
2x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
2x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
2x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Step 2 (shift 2, mask2/mask3): the same combine on the pairs
# (x0,x2), (x1,x3), (x4,x6), (x5,x7).
v00 = x0 & mask2
v10 = x2 & mask2
2x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
2x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
2x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
2x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
2x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
2x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
2x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
2x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Step 3 (shift 1, mask4/mask5): the same combine on the pairs
# (x0,x1), (x2,x3), (x4,x5), (x6,x7).
v00 = x0 & mask4
v10 = x1 & mask4
2x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
2x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
2x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
2x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
2x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
2x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
2x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
2x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write x0..x7 back to the same eight addresses they were loaded from.
mem128[ input_0 + 896 ] = x0
mem128[ input_0 + 912 ] = x1
mem128[ input_0 + 928 ] = x2
mem128[ input_0 + 944 ] = x3
mem128[ input_0 + 960 ] = x4
mem128[ input_0 + 976 ] = x5
mem128[ input_0 + 992 ] = x6
mem128[ input_0 + 1008 ] = x7
# All 64 words have now been rewritten in place by both passes; no value is
# returned by this statement.
return