-rw-r--r-- 31245 libmceliece-20240812/crypto_kem/348864/avx/transpose_64x256_sp_asm.q raw
# 256-bit vector register variables used by transpose_64x256_sp_asm.

# x0..x7: working set of eight 256-bit vectors. Each unrolled group in the
# routine loads them from input_0, transforms them, and stores them back.
reg256 x0
reg256 x1
reg256 x2
reg256 x3
reg256 x4
reg256 x5
reg256 x6
reg256 x7
# t0, t1: declared but never referenced in the code visible in this file view.
reg256 t0
reg256 t1
# v00, v10, v01, v11: temporaries of a single exchange step between a
# register pair (xa, xb):
#   v00 = part of xa that stays in xa     v10 = part of xb moved into xa
#   v01 = part of xa moved into xb        v11 = part of xb that stays in xb
reg256 v00
reg256 v01
reg256 v10
reg256 v11
# mask0..mask5: three (first, second) mask pairs, one pair per exchange
# stage. They are loaded once per pass from the MASK*_* constants.
reg256 mask0
reg256 mask1
reg256 mask2
reg256 mask3
reg256 mask4
reg256 mask5
# transpose_64x256_sp_asm
#
# In-place rearrangement of the 2048-byte buffer addressed by input_0, viewed
# as 64 vectors of 256 bits at byte offsets 0, 32, ..., 2016. Every vector is
# loaded, transformed and written back to the address it was loaded from.
# Judging by the routine name the overall effect is a bit-matrix transpose;
# the exact bit movement depends on the MASK*_* constants, which are defined
# outside this file view.
# NOTE(review): input_0 is not declared in the lines visible here; presumably
# it is the first argument register -- confirm against the full file.
#
# The code is fully unrolled and runs in two passes of eight groups each.
# Every group handles eight vectors x0..x7 with three exchange stages, pairing
# registers at distance 4, then 2, then 1:
#   distance 4: (x0,x4) (x1,x5) (x2,x6) (x3,x7)
#   distance 2: (x0,x2) (x1,x3) (x4,x6) (x5,x7)
#   distance 1: (x0,x1) (x2,x3) (x4,x5) (x6,x7)
# For a pair (xa, xb) with shift s and masks (m_lo, m_hi):
#   pass 1:  xa' = (xa & m_lo) | (xb << s)
#            xb' = (xa >> s)   | (xb & m_hi)
#   pass 2:  xa' = (xa & m_lo) | ((xb & m_lo) << s)
#            xb' = ((xa & m_hi) >> s) | (xb & m_hi)
# The "4x" / "8x" / "16x" prefixes give the number of lanes per 256-bit
# register that the shift acts on (64-, 32- and 16-bit lanes respectively).
# NOTE(review): presumably m_lo keeps the low s bits and m_hi the high s bits
# of every 2s-bit field, making each step a swap -- verify against the
# MASK*_* definitions.
enter transpose_64x256_sp_asm
# Pass 1 masks (aligned loads): shift 32 uses mask0/mask1, shift 16 uses
# mask2/mask3, shift 8 uses mask4/mask5.
mask0 aligned= mem256[ MASK5_0 ]
mask1 aligned= mem256[ MASK5_1 ]
mask2 aligned= mem256[ MASK4_0 ]
mask3 aligned= mem256[ MASK4_1 ]
mask4 aligned= mem256[ MASK3_0 ]
mask5 aligned= mem256[ MASK3_1 ]
# ---- Pass 1, group 0: vectors at byte offsets 0 + 256*k, k = 0..7 ----
x0 = mem256[ input_0 + 0 ]
x1 = mem256[ input_0 + 256 ]
x2 = mem256[ input_0 + 512 ]
x3 = mem256[ input_0 + 768 ]
x4 = mem256[ input_0 + 1024 ]
x5 = mem256[ input_0 + 1280 ]
x6 = mem256[ input_0 + 1536 ]
x7 = mem256[ input_0 + 1792 ]
# Distance-4 exchange: shift 32 in 4 lanes, mask0/mask1.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 16 in 8 lanes, mask2/mask3.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 8 in 16 lanes, mask4/mask5.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write group 0 back to the offsets it was loaded from.
mem256[ input_0 + 0 ] = x0
mem256[ input_0 + 256 ] = x1
mem256[ input_0 + 512 ] = x2
mem256[ input_0 + 768 ] = x3
mem256[ input_0 + 1024 ] = x4
mem256[ input_0 + 1280 ] = x5
mem256[ input_0 + 1536 ] = x6
mem256[ input_0 + 1792 ] = x7
# ---- Pass 1, group 1: vectors at byte offsets 32 + 256*k, k = 0..7 ----
x0 = mem256[ input_0 + 32 ]
x1 = mem256[ input_0 + 288 ]
x2 = mem256[ input_0 + 544 ]
x3 = mem256[ input_0 + 800 ]
x4 = mem256[ input_0 + 1056 ]
x5 = mem256[ input_0 + 1312 ]
x6 = mem256[ input_0 + 1568 ]
x7 = mem256[ input_0 + 1824 ]
# Distance-4 exchange: shift 32 in 4 lanes, mask0/mask1.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 16 in 8 lanes, mask2/mask3.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 8 in 16 lanes, mask4/mask5.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write group 1 back in place.
mem256[ input_0 + 32 ] = x0
mem256[ input_0 + 288 ] = x1
mem256[ input_0 + 544 ] = x2
mem256[ input_0 + 800 ] = x3
mem256[ input_0 + 1056 ] = x4
mem256[ input_0 + 1312 ] = x5
mem256[ input_0 + 1568 ] = x6
mem256[ input_0 + 1824 ] = x7
# ---- Pass 1, group 2: vectors at byte offsets 64 + 256*k, k = 0..7 ----
x0 = mem256[ input_0 + 64 ]
x1 = mem256[ input_0 + 320 ]
x2 = mem256[ input_0 + 576 ]
x3 = mem256[ input_0 + 832 ]
x4 = mem256[ input_0 + 1088 ]
x5 = mem256[ input_0 + 1344 ]
x6 = mem256[ input_0 + 1600 ]
x7 = mem256[ input_0 + 1856 ]
# Distance-4 exchange: shift 32 in 4 lanes, mask0/mask1.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 16 in 8 lanes, mask2/mask3.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 8 in 16 lanes, mask4/mask5.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write group 2 back in place.
mem256[ input_0 + 64 ] = x0
mem256[ input_0 + 320 ] = x1
mem256[ input_0 + 576 ] = x2
mem256[ input_0 + 832 ] = x3
mem256[ input_0 + 1088 ] = x4
mem256[ input_0 + 1344 ] = x5
mem256[ input_0 + 1600 ] = x6
mem256[ input_0 + 1856 ] = x7
# ---- Pass 1, group 3: vectors at byte offsets 96 + 256*k, k = 0..7 ----
x0 = mem256[ input_0 + 96 ]
x1 = mem256[ input_0 + 352 ]
x2 = mem256[ input_0 + 608 ]
x3 = mem256[ input_0 + 864 ]
x4 = mem256[ input_0 + 1120 ]
x5 = mem256[ input_0 + 1376 ]
x6 = mem256[ input_0 + 1632 ]
x7 = mem256[ input_0 + 1888 ]
# Distance-4 exchange: shift 32 in 4 lanes, mask0/mask1.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 16 in 8 lanes, mask2/mask3.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 8 in 16 lanes, mask4/mask5.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write group 3 back in place.
mem256[ input_0 + 96 ] = x0
mem256[ input_0 + 352 ] = x1
mem256[ input_0 + 608 ] = x2
mem256[ input_0 + 864 ] = x3
mem256[ input_0 + 1120 ] = x4
mem256[ input_0 + 1376 ] = x5
mem256[ input_0 + 1632 ] = x6
mem256[ input_0 + 1888 ] = x7
# ---- Pass 1, group 4: vectors at byte offsets 128 + 256*k, k = 0..7 ----
x0 = mem256[ input_0 + 128 ]
x1 = mem256[ input_0 + 384 ]
x2 = mem256[ input_0 + 640 ]
x3 = mem256[ input_0 + 896 ]
x4 = mem256[ input_0 + 1152 ]
x5 = mem256[ input_0 + 1408 ]
x6 = mem256[ input_0 + 1664 ]
x7 = mem256[ input_0 + 1920 ]
# Distance-4 exchange: shift 32 in 4 lanes, mask0/mask1.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 16 in 8 lanes, mask2/mask3.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 8 in 16 lanes, mask4/mask5.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write group 4 back in place.
mem256[ input_0 + 128 ] = x0
mem256[ input_0 + 384 ] = x1
mem256[ input_0 + 640 ] = x2
mem256[ input_0 + 896 ] = x3
mem256[ input_0 + 1152 ] = x4
mem256[ input_0 + 1408 ] = x5
mem256[ input_0 + 1664 ] = x6
mem256[ input_0 + 1920 ] = x7
# ---- Pass 1, group 5: vectors at byte offsets 160 + 256*k, k = 0..7 ----
x0 = mem256[ input_0 + 160 ]
x1 = mem256[ input_0 + 416 ]
x2 = mem256[ input_0 + 672 ]
x3 = mem256[ input_0 + 928 ]
x4 = mem256[ input_0 + 1184 ]
x5 = mem256[ input_0 + 1440 ]
x6 = mem256[ input_0 + 1696 ]
x7 = mem256[ input_0 + 1952 ]
# Distance-4 exchange: shift 32 in 4 lanes, mask0/mask1.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 16 in 8 lanes, mask2/mask3.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 8 in 16 lanes, mask4/mask5.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write group 5 back in place.
mem256[ input_0 + 160 ] = x0
mem256[ input_0 + 416 ] = x1
mem256[ input_0 + 672 ] = x2
mem256[ input_0 + 928 ] = x3
mem256[ input_0 + 1184 ] = x4
mem256[ input_0 + 1440 ] = x5
mem256[ input_0 + 1696 ] = x6
mem256[ input_0 + 1952 ] = x7
# ---- Pass 1, group 6: vectors at byte offsets 192 + 256*k, k = 0..7 ----
x0 = mem256[ input_0 + 192 ]
x1 = mem256[ input_0 + 448 ]
x2 = mem256[ input_0 + 704 ]
x3 = mem256[ input_0 + 960 ]
x4 = mem256[ input_0 + 1216 ]
x5 = mem256[ input_0 + 1472 ]
x6 = mem256[ input_0 + 1728 ]
x7 = mem256[ input_0 + 1984 ]
# Distance-4 exchange: shift 32 in 4 lanes, mask0/mask1.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 16 in 8 lanes, mask2/mask3.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 8 in 16 lanes, mask4/mask5.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write group 6 back in place.
mem256[ input_0 + 192 ] = x0
mem256[ input_0 + 448 ] = x1
mem256[ input_0 + 704 ] = x2
mem256[ input_0 + 960 ] = x3
mem256[ input_0 + 1216 ] = x4
mem256[ input_0 + 1472 ] = x5
mem256[ input_0 + 1728 ] = x6
mem256[ input_0 + 1984 ] = x7
# ---- Pass 1, group 7: vectors at byte offsets 224 + 256*k, k = 0..7 ----
x0 = mem256[ input_0 + 224 ]
x1 = mem256[ input_0 + 480 ]
x2 = mem256[ input_0 + 736 ]
x3 = mem256[ input_0 + 992 ]
x4 = mem256[ input_0 + 1248 ]
x5 = mem256[ input_0 + 1504 ]
x6 = mem256[ input_0 + 1760 ]
x7 = mem256[ input_0 + 2016 ]
# Distance-4 exchange: shift 32 in 4 lanes, mask0/mask1.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 16 in 8 lanes, mask2/mask3.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 8 in 16 lanes, mask4/mask5.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11
# Write group 7 back in place.
mem256[ input_0 + 224 ] = x0
mem256[ input_0 + 480 ] = x1
mem256[ input_0 + 736 ] = x2
mem256[ input_0 + 992 ] = x3
mem256[ input_0 + 1248 ] = x4
mem256[ input_0 + 1504 ] = x5
mem256[ input_0 + 1760 ] = x6
mem256[ input_0 + 2016 ] = x7
# ==== Pass 2 ====
# Reload the mask registers for the finer stages: shift 4 uses mask0/mask1,
# shift 2 uses mask2/mask3, shift 1 uses mask4/mask5. All pass-2 shifts act
# on 4 lanes, and the moving part is masked BEFORE it is shifted.
mask0 aligned= mem256[ MASK2_0 ]
mask1 aligned= mem256[ MASK2_1 ]
mask2 aligned= mem256[ MASK1_0 ]
mask3 aligned= mem256[ MASK1_1 ]
mask4 aligned= mem256[ MASK0_0 ]
mask5 aligned= mem256[ MASK0_1 ]
# ---- Pass 2, group 0: eight consecutive vectors at byte offsets 0..224 ----
x0 = mem256[ input_0 + 0 ]
x1 = mem256[ input_0 + 32 ]
x2 = mem256[ input_0 + 64 ]
x3 = mem256[ input_0 + 96 ]
x4 = mem256[ input_0 + 128 ]
x5 = mem256[ input_0 + 160 ]
x6 = mem256[ input_0 + 192 ]
x7 = mem256[ input_0 + 224 ]
# Distance-4 exchange: shift 4, mask0/mask1.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 2, mask2/mask3.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 1, mask4/mask5.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write group 0 back in place.
mem256[ input_0 + 0 ] = x0
mem256[ input_0 + 32 ] = x1
mem256[ input_0 + 64 ] = x2
mem256[ input_0 + 96 ] = x3
mem256[ input_0 + 128 ] = x4
mem256[ input_0 + 160 ] = x5
mem256[ input_0 + 192 ] = x6
mem256[ input_0 + 224 ] = x7
# ---- Pass 2, group 1: eight consecutive vectors at byte offsets 256..480 ----
x0 = mem256[ input_0 + 256 ]
x1 = mem256[ input_0 + 288 ]
x2 = mem256[ input_0 + 320 ]
x3 = mem256[ input_0 + 352 ]
x4 = mem256[ input_0 + 384 ]
x5 = mem256[ input_0 + 416 ]
x6 = mem256[ input_0 + 448 ]
x7 = mem256[ input_0 + 480 ]
# Distance-4 exchange: shift 4, mask0/mask1.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 2, mask2/mask3.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 1, mask4/mask5.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write group 1 back in place.
mem256[ input_0 + 256 ] = x0
mem256[ input_0 + 288 ] = x1
mem256[ input_0 + 320 ] = x2
mem256[ input_0 + 352 ] = x3
mem256[ input_0 + 384 ] = x4
mem256[ input_0 + 416 ] = x5
mem256[ input_0 + 448 ] = x6
mem256[ input_0 + 480 ] = x7
# ---- Pass 2, group 2: eight consecutive vectors at byte offsets 512..736 ----
x0 = mem256[ input_0 + 512 ]
x1 = mem256[ input_0 + 544 ]
x2 = mem256[ input_0 + 576 ]
x3 = mem256[ input_0 + 608 ]
x4 = mem256[ input_0 + 640 ]
x5 = mem256[ input_0 + 672 ]
x6 = mem256[ input_0 + 704 ]
x7 = mem256[ input_0 + 736 ]
# Distance-4 exchange: shift 4, mask0/mask1.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 2, mask2/mask3.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 1, mask4/mask5.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write group 2 back in place.
mem256[ input_0 + 512 ] = x0
mem256[ input_0 + 544 ] = x1
mem256[ input_0 + 576 ] = x2
mem256[ input_0 + 608 ] = x3
mem256[ input_0 + 640 ] = x4
mem256[ input_0 + 672 ] = x5
mem256[ input_0 + 704 ] = x6
mem256[ input_0 + 736 ] = x7
# ---- Pass 2, group 3: eight consecutive vectors at byte offsets 768..992 ----
x0 = mem256[ input_0 + 768 ]
x1 = mem256[ input_0 + 800 ]
x2 = mem256[ input_0 + 832 ]
x3 = mem256[ input_0 + 864 ]
x4 = mem256[ input_0 + 896 ]
x5 = mem256[ input_0 + 928 ]
x6 = mem256[ input_0 + 960 ]
x7 = mem256[ input_0 + 992 ]
# Distance-4 exchange: shift 4, mask0/mask1.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 2, mask2/mask3.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 1, mask4/mask5.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write group 3 back in place.
mem256[ input_0 + 768 ] = x0
mem256[ input_0 + 800 ] = x1
mem256[ input_0 + 832 ] = x2
mem256[ input_0 + 864 ] = x3
mem256[ input_0 + 896 ] = x4
mem256[ input_0 + 928 ] = x5
mem256[ input_0 + 960 ] = x6
mem256[ input_0 + 992 ] = x7
# ---- Pass 2, group 4: eight consecutive vectors at byte offsets 1024..1248 ----
x0 = mem256[ input_0 + 1024 ]
x1 = mem256[ input_0 + 1056 ]
x2 = mem256[ input_0 + 1088 ]
x3 = mem256[ input_0 + 1120 ]
x4 = mem256[ input_0 + 1152 ]
x5 = mem256[ input_0 + 1184 ]
x6 = mem256[ input_0 + 1216 ]
x7 = mem256[ input_0 + 1248 ]
# Distance-4 exchange: shift 4, mask0/mask1.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 2, mask2/mask3.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 1, mask4/mask5.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write group 4 back in place.
mem256[ input_0 + 1024 ] = x0
mem256[ input_0 + 1056 ] = x1
mem256[ input_0 + 1088 ] = x2
mem256[ input_0 + 1120 ] = x3
mem256[ input_0 + 1152 ] = x4
mem256[ input_0 + 1184 ] = x5
mem256[ input_0 + 1216 ] = x6
mem256[ input_0 + 1248 ] = x7
# ---- Pass 2, group 5: eight consecutive vectors at byte offsets 1280..1504 ----
x0 = mem256[ input_0 + 1280 ]
x1 = mem256[ input_0 + 1312 ]
x2 = mem256[ input_0 + 1344 ]
x3 = mem256[ input_0 + 1376 ]
x4 = mem256[ input_0 + 1408 ]
x5 = mem256[ input_0 + 1440 ]
x6 = mem256[ input_0 + 1472 ]
x7 = mem256[ input_0 + 1504 ]
# Distance-4 exchange: shift 4, mask0/mask1.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 2, mask2/mask3.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 1, mask4/mask5.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write group 5 back in place.
mem256[ input_0 + 1280 ] = x0
mem256[ input_0 + 1312 ] = x1
mem256[ input_0 + 1344 ] = x2
mem256[ input_0 + 1376 ] = x3
mem256[ input_0 + 1408 ] = x4
mem256[ input_0 + 1440 ] = x5
mem256[ input_0 + 1472 ] = x6
mem256[ input_0 + 1504 ] = x7
# ---- Pass 2, group 6: eight consecutive vectors at byte offsets 1536..1760 ----
x0 = mem256[ input_0 + 1536 ]
x1 = mem256[ input_0 + 1568 ]
x2 = mem256[ input_0 + 1600 ]
x3 = mem256[ input_0 + 1632 ]
x4 = mem256[ input_0 + 1664 ]
x5 = mem256[ input_0 + 1696 ]
x6 = mem256[ input_0 + 1728 ]
x7 = mem256[ input_0 + 1760 ]
# Distance-4 exchange: shift 4, mask0/mask1.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 2, mask2/mask3.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 1, mask4/mask5.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write group 6 back in place.
mem256[ input_0 + 1536 ] = x0
mem256[ input_0 + 1568 ] = x1
mem256[ input_0 + 1600 ] = x2
mem256[ input_0 + 1632 ] = x3
mem256[ input_0 + 1664 ] = x4
mem256[ input_0 + 1696 ] = x5
mem256[ input_0 + 1728 ] = x6
mem256[ input_0 + 1760 ] = x7
# ---- Pass 2, group 7: eight consecutive vectors at byte offsets 1792..2016 ----
x0 = mem256[ input_0 + 1792 ]
x1 = mem256[ input_0 + 1824 ]
x2 = mem256[ input_0 + 1856 ]
x3 = mem256[ input_0 + 1888 ]
x4 = mem256[ input_0 + 1920 ]
x5 = mem256[ input_0 + 1952 ]
x6 = mem256[ input_0 + 1984 ]
x7 = mem256[ input_0 + 2016 ]
# Distance-4 exchange: shift 4, mask0/mask1.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11
# Distance-2 exchange: shift 2, mask2/mask3.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11
# Distance-1 exchange: shift 1, mask4/mask5.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11
# Write group 7 back in place; all 64 vectors have now been processed by both passes.
mem256[ input_0 + 1792 ] = x0
mem256[ input_0 + 1824 ] = x1
mem256[ input_0 + 1856 ] = x2
mem256[ input_0 + 1888 ] = x3
mem256[ input_0 + 1920 ] = x4
mem256[ input_0 + 1952 ] = x5
mem256[ input_0 + 1984 ] = x6
mem256[ input_0 + 2016 ] = x7
return