# transpose_64x256_sp_asm
#
# Straight-line, fully unrolled routine that rewrites, in place, the 2048 bytes
# at input_0 + 0 .. input_0 + 2047, viewed here as 64 vectors of 256 bits
# (vector i lives at byte offset 32*i).
#
# Structure visible in this file:
#   Pass 1: for each of 8 start offsets (0, 32, ..., 224) load the 8 vectors
#           that are 256 bytes apart, run three exchange layers with shift
#           distances 32, 16 and 8, and store them back to the same addresses.
#   Pass 2: for each of 8 start offsets (0, 256, ..., 1792) load 8 consecutive
#           vectors, run three exchange layers with shift distances 4, 2 and 1,
#           and store them back to the same addresses.
#
# NOTE(review): this has the shape of the usual log-step bit-matrix transpose
# (exchange distances 32,16,8,4,2,1), applied independently to each 64-bit
# lane -- confirm against the C reference and the caller.
# NOTE(review): the MASKn_0 / MASKn_1 constants are defined outside this file;
# presumably they select the low / high half of each 2^(n+1)-bit field --
# verify against their definition.
# NOTE(review): the 4x / 8x / 16x prefixes presumably select 64- / 32- / 16-bit
# lanes for the shift -- confirm against the qhasm machine description.

# x0..x7: the eight vectors being processed; t0, t1 are declared but not used
# by any statement below.
reg256 x0
reg256 x1
reg256 x2
reg256 x3
reg256 x4
reg256 x5
reg256 x6
reg256 x7
reg256 t0
reg256 t1

# Temporaries for one exchange step: v00/v10 are OR-ed into the first
# operand, v01/v11 into the second.
reg256 v00
reg256 v01
reg256 v10
reg256 v11

# Three mask pairs, one pair per exchange layer of the current pass.
reg256 mask0
reg256 mask1
reg256 mask2
reg256 mask3
reg256 mask4
reg256 mask5

enter transpose_64x256_sp_asm

# ---------------------------------------------------------------------------
# Pass 1 masks: MASK5 pair (shift 32), MASK4 pair (shift 16), MASK3 pair
# (shift 8).
# ---------------------------------------------------------------------------
mask0 aligned= mem256[ MASK5_0 ]
mask1 aligned= mem256[ MASK5_1 ]
mask2 aligned= mem256[ MASK4_0 ]
mask3 aligned= mem256[ MASK4_1 ]
mask4 aligned= mem256[ MASK3_0 ]
mask5 aligned= mem256[ MASK3_1 ]

# --- Pass 1, group 0: vectors at offsets 0 + 256*k, k = 0..7 ---
x0 = mem256[ input_0 + 0 ]
x1 = mem256[ input_0 + 256 ]
x2 = mem256[ input_0 + 512 ]
x3 = mem256[ input_0 + 768 ]
x4 = mem256[ input_0 + 1024 ]
x5 = mem256[ input_0 + 1280 ]
x6 = mem256[ input_0 + 1536 ]
x7 = mem256[ input_0 + 1792 ]

# Layer A: exchange between (xk, xk+4), shift distance 32.
# Only one operand of each OR is masked; the other is produced by the lane
# shift alone. v01 must be computed before x0 is overwritten.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11

# Layer B: exchange between (xk, xk+2), shift distance 16.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11

# Layer C: exchange between (xk, xk+1), shift distance 8.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11

# Store group 0 back to the addresses it was loaded from.
mem256[ input_0 + 0 ] = x0
mem256[ input_0 + 256 ] = x1
mem256[ input_0 + 512 ] = x2
mem256[ input_0 + 768 ] = x3
mem256[ input_0 + 1024 ] = x4
mem256[ input_0 + 1280 ] = x5
mem256[ input_0 + 1536 ] = x6
mem256[ input_0 + 1792 ] = x7

# --- Pass 1, group 1: vectors at offsets 32 + 256*k, k = 0..7 ---
x0 = mem256[ input_0 + 32 ]
x1 = mem256[ input_0 + 288 ]
x2 = mem256[ input_0 + 544 ]
x3 = mem256[ input_0 + 800 ]
x4 = mem256[ input_0 + 1056 ]
x5 = mem256[ input_0 + 1312 ]
x6 = mem256[ input_0 + 1568 ]
x7 = mem256[ input_0 + 1824 ]

# Layer A: pairs (xk, xk+4), shift 32.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11

# Layer B: pairs (xk, xk+2), shift 16.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11

# Layer C: pairs (xk, xk+1), shift 8.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11

# Store group 1.
mem256[ input_0 + 32 ] = x0
mem256[ input_0 + 288 ] = x1
mem256[ input_0 + 544 ] = x2
mem256[ input_0 + 800 ] = x3
mem256[ input_0 + 1056 ] = x4
mem256[ input_0 + 1312 ] = x5
mem256[ input_0 + 1568 ] = x6
mem256[ input_0 + 1824 ] = x7

# --- Pass 1, group 2: vectors at offsets 64 + 256*k, k = 0..7 ---
x0 = mem256[ input_0 + 64 ]
x1 = mem256[ input_0 + 320 ]
x2 = mem256[ input_0 + 576 ]
x3 = mem256[ input_0 + 832 ]
x4 = mem256[ input_0 + 1088 ]
x5 = mem256[ input_0 + 1344 ]
x6 = mem256[ input_0 + 1600 ]
x7 = mem256[ input_0 + 1856 ]

# Layer A: pairs (xk, xk+4), shift 32.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11

# Layer B: pairs (xk, xk+2), shift 16.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11

# Layer C: pairs (xk, xk+1), shift 8.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11

# Store group 2.
mem256[ input_0 + 64 ] = x0
mem256[ input_0 + 320 ] = x1
mem256[ input_0 + 576 ] = x2
mem256[ input_0 + 832 ] = x3
mem256[ input_0 + 1088 ] = x4
mem256[ input_0 + 1344 ] = x5
mem256[ input_0 + 1600 ] = x6
mem256[ input_0 + 1856 ] = x7

# --- Pass 1, group 3: vectors at offsets 96 + 256*k, k = 0..7 ---
x0 = mem256[ input_0 + 96 ]
x1 = mem256[ input_0 + 352 ]
x2 = mem256[ input_0 + 608 ]
x3 = mem256[ input_0 + 864 ]
x4 = mem256[ input_0 + 1120 ]
x5 = mem256[ input_0 + 1376 ]
x6 = mem256[ input_0 + 1632 ]
x7 = mem256[ input_0 + 1888 ]

# Layer A: pairs (xk, xk+4), shift 32.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11

# Layer B: pairs (xk, xk+2), shift 16.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11

# Layer C: pairs (xk, xk+1), shift 8.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11

# Store group 3.
mem256[ input_0 + 96 ] = x0
mem256[ input_0 + 352 ] = x1
mem256[ input_0 + 608 ] = x2
mem256[ input_0 + 864 ] = x3
mem256[ input_0 + 1120 ] = x4
mem256[ input_0 + 1376 ] = x5
mem256[ input_0 + 1632 ] = x6
mem256[ input_0 + 1888 ] = x7

# --- Pass 1, group 4: vectors at offsets 128 + 256*k, k = 0..7 ---
x0 = mem256[ input_0 + 128 ]
x1 = mem256[ input_0 + 384 ]
x2 = mem256[ input_0 + 640 ]
x3 = mem256[ input_0 + 896 ]
x4 = mem256[ input_0 + 1152 ]
x5 = mem256[ input_0 + 1408 ]
x6 = mem256[ input_0 + 1664 ]
x7 = mem256[ input_0 + 1920 ]

# Layer A: pairs (xk, xk+4), shift 32.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11

# Layer B: pairs (xk, xk+2), shift 16.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11

# Layer C: pairs (xk, xk+1), shift 8.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11

# Store group 4.
mem256[ input_0 + 128 ] = x0
mem256[ input_0 + 384 ] = x1
mem256[ input_0 + 640 ] = x2
mem256[ input_0 + 896 ] = x3
mem256[ input_0 + 1152 ] = x4
mem256[ input_0 + 1408 ] = x5
mem256[ input_0 + 1664 ] = x6
mem256[ input_0 + 1920 ] = x7

# --- Pass 1, group 5: vectors at offsets 160 + 256*k, k = 0..7 ---
x0 = mem256[ input_0 + 160 ]
x1 = mem256[ input_0 + 416 ]
x2 = mem256[ input_0 + 672 ]
x3 = mem256[ input_0 + 928 ]
x4 = mem256[ input_0 + 1184 ]
x5 = mem256[ input_0 + 1440 ]
x6 = mem256[ input_0 + 1696 ]
x7 = mem256[ input_0 + 1952 ]

# Layer A: pairs (xk, xk+4), shift 32.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11

# Layer B: pairs (xk, xk+2), shift 16.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11

# Layer C: pairs (xk, xk+1), shift 8.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11

# Store group 5.
mem256[ input_0 + 160 ] = x0
mem256[ input_0 + 416 ] = x1
mem256[ input_0 + 672 ] = x2
mem256[ input_0 + 928 ] = x3
mem256[ input_0 + 1184 ] = x4
mem256[ input_0 + 1440 ] = x5
mem256[ input_0 + 1696 ] = x6
mem256[ input_0 + 1952 ] = x7

# --- Pass 1, group 6: vectors at offsets 192 + 256*k, k = 0..7 ---
x0 = mem256[ input_0 + 192 ]
x1 = mem256[ input_0 + 448 ]
x2 = mem256[ input_0 + 704 ]
x3 = mem256[ input_0 + 960 ]
x4 = mem256[ input_0 + 1216 ]
x5 = mem256[ input_0 + 1472 ]
x6 = mem256[ input_0 + 1728 ]
x7 = mem256[ input_0 + 1984 ]

# Layer A: pairs (xk, xk+4), shift 32.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11

# Layer B: pairs (xk, xk+2), shift 16.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11

# Layer C: pairs (xk, xk+1), shift 8.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11

# Store group 6.
mem256[ input_0 + 192 ] = x0
mem256[ input_0 + 448 ] = x1
mem256[ input_0 + 704 ] = x2
mem256[ input_0 + 960 ] = x3
mem256[ input_0 + 1216 ] = x4
mem256[ input_0 + 1472 ] = x5
mem256[ input_0 + 1728 ] = x6
mem256[ input_0 + 1984 ] = x7

# --- Pass 1, group 7: vectors at offsets 224 + 256*k, k = 0..7 ---
x0 = mem256[ input_0 + 224 ]
x1 = mem256[ input_0 + 480 ]
x2 = mem256[ input_0 + 736 ]
x3 = mem256[ input_0 + 992 ]
x4 = mem256[ input_0 + 1248 ]
x5 = mem256[ input_0 + 1504 ]
x6 = mem256[ input_0 + 1760 ]
x7 = mem256[ input_0 + 2016 ]

# Layer A: pairs (xk, xk+4), shift 32.
v00 = x0 & mask0
4x v10 = x4 << 32
4x v01 = x0 unsigned>> 32
v11 = x4 & mask1
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
4x v10 = x5 << 32
4x v01 = x1 unsigned>> 32
v11 = x5 & mask1
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
4x v10 = x6 << 32
4x v01 = x2 unsigned>> 32
v11 = x6 & mask1
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
4x v10 = x7 << 32
4x v01 = x3 unsigned>> 32
v11 = x7 & mask1
x3 = v00 | v10
x7 = v01 | v11

# Layer B: pairs (xk, xk+2), shift 16.
v00 = x0 & mask2
8x v10 = x2 << 16
8x v01 = x0 unsigned>> 16
v11 = x2 & mask3
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
8x v10 = x3 << 16
8x v01 = x1 unsigned>> 16
v11 = x3 & mask3
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
8x v10 = x6 << 16
8x v01 = x4 unsigned>> 16
v11 = x6 & mask3
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
8x v10 = x7 << 16
8x v01 = x5 unsigned>> 16
v11 = x7 & mask3
x5 = v00 | v10
x7 = v01 | v11

# Layer C: pairs (xk, xk+1), shift 8.
v00 = x0 & mask4
16x v10 = x1 << 8
16x v01 = x0 unsigned>> 8
v11 = x1 & mask5
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
16x v10 = x3 << 8
16x v01 = x2 unsigned>> 8
v11 = x3 & mask5
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
16x v10 = x5 << 8
16x v01 = x4 unsigned>> 8
v11 = x5 & mask5
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
16x v10 = x7 << 8
16x v01 = x6 unsigned>> 8
v11 = x7 & mask5
x6 = v00 | v10
x7 = v01 | v11

# Store group 7.
mem256[ input_0 + 224 ] = x0
mem256[ input_0 + 480 ] = x1
mem256[ input_0 + 736 ] = x2
mem256[ input_0 + 992 ] = x3
mem256[ input_0 + 1248 ] = x4
mem256[ input_0 + 1504 ] = x5
mem256[ input_0 + 1760 ] = x6
mem256[ input_0 + 2016 ] = x7

# ---------------------------------------------------------------------------
# Pass 2 masks: MASK2 pair (shift 4), MASK1 pair (shift 2), MASK0 pair
# (shift 1). The mask registers are reused, so pass 1 masks are gone from
# here on.
# ---------------------------------------------------------------------------
mask0 aligned= mem256[ MASK2_0 ]
mask1 aligned= mem256[ MASK2_1 ]
mask2 aligned= mem256[ MASK1_0 ]
mask3 aligned= mem256[ MASK1_1 ]
mask4 aligned= mem256[ MASK0_0 ]
mask5 aligned= mem256[ MASK0_1 ]

# --- Pass 2, group 0: 8 consecutive vectors at offsets 0 .. 224 ---
x0 = mem256[ input_0 + 0 ]
x1 = mem256[ input_0 + 32 ]
x2 = mem256[ input_0 + 64 ]
x3 = mem256[ input_0 + 96 ]
x4 = mem256[ input_0 + 128 ]
x5 = mem256[ input_0 + 160 ]
x6 = mem256[ input_0 + 192 ]
x7 = mem256[ input_0 + 224 ]

# Layer D: exchange between (xk, xk+4), shift distance 4.
# Unlike pass 1, all four temporaries are masked and the shift is applied
# afterwards with a 4x shift-assign; presumably because the fields are
# narrower than the shifted lane, so the shift alone would not discard the
# unwanted bits -- TODO confirm.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11

# Layer E: exchange between (xk, xk+2), shift distance 2.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11

# Layer F: exchange between (xk, xk+1), shift distance 1.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11

# Store group 0 back to the addresses it was loaded from.
mem256[ input_0 + 0 ] = x0
mem256[ input_0 + 32 ] = x1
mem256[ input_0 + 64 ] = x2
mem256[ input_0 + 96 ] = x3
mem256[ input_0 + 128 ] = x4
mem256[ input_0 + 160 ] = x5
mem256[ input_0 + 192 ] = x6
mem256[ input_0 + 224 ] = x7

# --- Pass 2, group 1: 8 consecutive vectors at offsets 256 .. 480 ---
x0 = mem256[ input_0 + 256 ]
x1 = mem256[ input_0 + 288 ]
x2 = mem256[ input_0 + 320 ]
x3 = mem256[ input_0 + 352 ]
x4 = mem256[ input_0 + 384 ]
x5 = mem256[ input_0 + 416 ]
x6 = mem256[ input_0 + 448 ]
x7 = mem256[ input_0 + 480 ]

# Layer D: pairs (xk, xk+4), shift 4.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11

# Layer E: pairs (xk, xk+2), shift 2.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11

# Layer F: pairs (xk, xk+1), shift 1.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11

# Store group 1.
mem256[ input_0 + 256 ] = x0
mem256[ input_0 + 288 ] = x1
mem256[ input_0 + 320 ] = x2
mem256[ input_0 + 352 ] = x3
mem256[ input_0 + 384 ] = x4
mem256[ input_0 + 416 ] = x5
mem256[ input_0 + 448 ] = x6
mem256[ input_0 + 480 ] = x7

# --- Pass 2, group 2: 8 consecutive vectors at offsets 512 .. 736 ---
x0 = mem256[ input_0 + 512 ]
x1 = mem256[ input_0 + 544 ]
x2 = mem256[ input_0 + 576 ]
x3 = mem256[ input_0 + 608 ]
x4 = mem256[ input_0 + 640 ]
x5 = mem256[ input_0 + 672 ]
x6 = mem256[ input_0 + 704 ]
x7 = mem256[ input_0 + 736 ]

# Layer D: pairs (xk, xk+4), shift 4.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11

# Layer E: pairs (xk, xk+2), shift 2.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11

# Layer F: pairs (xk, xk+1), shift 1.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11

# Store group 2.
mem256[ input_0 + 512 ] = x0
mem256[ input_0 + 544 ] = x1
mem256[ input_0 + 576 ] = x2
mem256[ input_0 + 608 ] = x3
mem256[ input_0 + 640 ] = x4
mem256[ input_0 + 672 ] = x5
mem256[ input_0 + 704 ] = x6
mem256[ input_0 + 736 ] = x7

# --- Pass 2, group 3: 8 consecutive vectors at offsets 768 .. 992 ---
x0 = mem256[ input_0 + 768 ]
x1 = mem256[ input_0 + 800 ]
x2 = mem256[ input_0 + 832 ]
x3 = mem256[ input_0 + 864 ]
x4 = mem256[ input_0 + 896 ]
x5 = mem256[ input_0 + 928 ]
x6 = mem256[ input_0 + 960 ]
x7 = mem256[ input_0 + 992 ]

# Layer D: pairs (xk, xk+4), shift 4.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11

# Layer E: pairs (xk, xk+2), shift 2.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11

# Layer F: pairs (xk, xk+1), shift 1.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11

# Store group 3.
mem256[ input_0 + 768 ] = x0
mem256[ input_0 + 800 ] = x1
mem256[ input_0 + 832 ] = x2
mem256[ input_0 + 864 ] = x3
mem256[ input_0 + 896 ] = x4
mem256[ input_0 + 928 ] = x5
mem256[ input_0 + 960 ] = x6
mem256[ input_0 + 992 ] = x7

# --- Pass 2, group 4: 8 consecutive vectors at offsets 1024 .. 1248 ---
x0 = mem256[ input_0 + 1024 ]
x1 = mem256[ input_0 + 1056 ]
x2 = mem256[ input_0 + 1088 ]
x3 = mem256[ input_0 + 1120 ]
x4 = mem256[ input_0 + 1152 ]
x5 = mem256[ input_0 + 1184 ]
x6 = mem256[ input_0 + 1216 ]
x7 = mem256[ input_0 + 1248 ]

# Layer D: pairs (xk, xk+4), shift 4.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11

# Layer E: pairs (xk, xk+2), shift 2.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11

# Layer F: pairs (xk, xk+1), shift 1.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11

# Store group 4.
mem256[ input_0 + 1024 ] = x0
mem256[ input_0 + 1056 ] = x1
mem256[ input_0 + 1088 ] = x2
mem256[ input_0 + 1120 ] = x3
mem256[ input_0 + 1152 ] = x4
mem256[ input_0 + 1184 ] = x5
mem256[ input_0 + 1216 ] = x6
mem256[ input_0 + 1248 ] = x7

# --- Pass 2, group 5: 8 consecutive vectors at offsets 1280 .. 1504 ---
x0 = mem256[ input_0 + 1280 ]
x1 = mem256[ input_0 + 1312 ]
x2 = mem256[ input_0 + 1344 ]
x3 = mem256[ input_0 + 1376 ]
x4 = mem256[ input_0 + 1408 ]
x5 = mem256[ input_0 + 1440 ]
x6 = mem256[ input_0 + 1472 ]
x7 = mem256[ input_0 + 1504 ]

# Layer D: pairs (xk, xk+4), shift 4.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11

# Layer E: pairs (xk, xk+2), shift 2.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11

# Layer F: pairs (xk, xk+1), shift 1.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11

# Store group 5.
mem256[ input_0 + 1280 ] = x0
mem256[ input_0 + 1312 ] = x1
mem256[ input_0 + 1344 ] = x2
mem256[ input_0 + 1376 ] = x3
mem256[ input_0 + 1408 ] = x4
mem256[ input_0 + 1440 ] = x5
mem256[ input_0 + 1472 ] = x6
mem256[ input_0 + 1504 ] = x7

# --- Pass 2, group 6: 8 consecutive vectors at offsets 1536 .. 1760 ---
x0 = mem256[ input_0 + 1536 ]
x1 = mem256[ input_0 + 1568 ]
x2 = mem256[ input_0 + 1600 ]
x3 = mem256[ input_0 + 1632 ]
x4 = mem256[ input_0 + 1664 ]
x5 = mem256[ input_0 + 1696 ]
x6 = mem256[ input_0 + 1728 ]
x7 = mem256[ input_0 + 1760 ]

# Layer D: pairs (xk, xk+4), shift 4.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11

# Layer E: pairs (xk, xk+2), shift 2.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11

# Layer F: pairs (xk, xk+1), shift 1.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11

# Store group 6.
mem256[ input_0 + 1536 ] = x0
mem256[ input_0 + 1568 ] = x1
mem256[ input_0 + 1600 ] = x2
mem256[ input_0 + 1632 ] = x3
mem256[ input_0 + 1664 ] = x4
mem256[ input_0 + 1696 ] = x5
mem256[ input_0 + 1728 ] = x6
mem256[ input_0 + 1760 ] = x7

# --- Pass 2, group 7: 8 consecutive vectors at offsets 1792 .. 2016 ---
x0 = mem256[ input_0 + 1792 ]
x1 = mem256[ input_0 + 1824 ]
x2 = mem256[ input_0 + 1856 ]
x3 = mem256[ input_0 + 1888 ]
x4 = mem256[ input_0 + 1920 ]
x5 = mem256[ input_0 + 1952 ]
x6 = mem256[ input_0 + 1984 ]
x7 = mem256[ input_0 + 2016 ]

# Layer D: pairs (xk, xk+4), shift 4.
v00 = x0 & mask0
v10 = x4 & mask0
4x v10 <<= 4
v01 = x0 & mask1
v11 = x4 & mask1
4x v01 unsigned>>= 4
x0 = v00 | v10
x4 = v01 | v11
v00 = x1 & mask0
v10 = x5 & mask0
4x v10 <<= 4
v01 = x1 & mask1
v11 = x5 & mask1
4x v01 unsigned>>= 4
x1 = v00 | v10
x5 = v01 | v11
v00 = x2 & mask0
v10 = x6 & mask0
4x v10 <<= 4
v01 = x2 & mask1
v11 = x6 & mask1
4x v01 unsigned>>= 4
x2 = v00 | v10
x6 = v01 | v11
v00 = x3 & mask0
v10 = x7 & mask0
4x v10 <<= 4
v01 = x3 & mask1
v11 = x7 & mask1
4x v01 unsigned>>= 4
x3 = v00 | v10
x7 = v01 | v11

# Layer E: pairs (xk, xk+2), shift 2.
v00 = x0 & mask2
v10 = x2 & mask2
4x v10 <<= 2
v01 = x0 & mask3
v11 = x2 & mask3
4x v01 unsigned>>= 2
x0 = v00 | v10
x2 = v01 | v11
v00 = x1 & mask2
v10 = x3 & mask2
4x v10 <<= 2
v01 = x1 & mask3
v11 = x3 & mask3
4x v01 unsigned>>= 2
x1 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask2
v10 = x6 & mask2
4x v10 <<= 2
v01 = x4 & mask3
v11 = x6 & mask3
4x v01 unsigned>>= 2
x4 = v00 | v10
x6 = v01 | v11
v00 = x5 & mask2
v10 = x7 & mask2
4x v10 <<= 2
v01 = x5 & mask3
v11 = x7 & mask3
4x v01 unsigned>>= 2
x5 = v00 | v10
x7 = v01 | v11

# Layer F: pairs (xk, xk+1), shift 1.
v00 = x0 & mask4
v10 = x1 & mask4
4x v10 <<= 1
v01 = x0 & mask5
v11 = x1 & mask5
4x v01 unsigned>>= 1
x0 = v00 | v10
x1 = v01 | v11
v00 = x2 & mask4
v10 = x3 & mask4
4x v10 <<= 1
v01 = x2 & mask5
v11 = x3 & mask5
4x v01 unsigned>>= 1
x2 = v00 | v10
x3 = v01 | v11
v00 = x4 & mask4
v10 = x5 & mask4
4x v10 <<= 1
v01 = x4 & mask5
v11 = x5 & mask5
4x v01 unsigned>>= 1
x4 = v00 | v10
x5 = v01 | v11
v00 = x6 & mask4
v10 = x7 & mask4
4x v10 <<= 1
v01 = x6 & mask5
v11 = x7 & mask5
4x v01 unsigned>>= 1
x6 = v00 | v10
x7 = v01 | v11

# Store group 7.
mem256[ input_0 + 1792 ] = x0
mem256[ input_0 + 1824 ] = x1
mem256[ input_0 + 1856 ] = x2
mem256[ input_0 + 1888 ] = x3
mem256[ input_0 + 1920 ] = x4
mem256[ input_0 + 1952 ] = x5
mem256[ input_0 + 1984 ] = x6
mem256[ input_0 + 2016 ] = x7

# All 64 vectors have now been rewritten in place by both passes.
return