- __m128i T00, T01;
-
- T00 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset + 4]);
- in00[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset + 4]);
- in01[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset + 4]);
- in02[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset + 4]);
- in03[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset + 4]);
- in04[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset + 4]);
- in05[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset + 4]);
- in06[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset + 4]);
- in07[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset + 4]);
- in08[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset + 4]);
- in09[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset + 4]);
- in10[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset + 4]);
- in11[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset + 4]);
- in12[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset + 4]);
- in13[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset + 4]);
- in14[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset + 4]);
- in15[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset + 4]);
- in16[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset + 4]);
- in17[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset + 4]);
- in18[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset + 4]);
- in19[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset + 4]);
- in20[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset + 4]);
- in21[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset + 4]);
- in22[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset + 4]);
- in23[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset + 4]);
- in24[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset + 4]);
- in25[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset + 4]);
- in26[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset + 4]);
- in27[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset + 4]);
- in28[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset + 4]);
- in29[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset + 4]);
- in30[i] = _mm_packs_epi32(T00, T01);
-
- T00 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset + 4]);
- in31[i] = _mm_packs_epi32(T00, T01);
+ in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
+ in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
+ in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
+ in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
+ in04[i] = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
+ in05[i] = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
+ in06[i] = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
+ in07[i] = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
+ in08[i] = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
+ in09[i] = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
+ in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
+ in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
+ in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
+ in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
+ in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
+ in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
+ in16[i] = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
+ in17[i] = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
+ in18[i] = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
+ in19[i] = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
+ in20[i] = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
+ in21[i] = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
+ in22[i] = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
+ in23[i] = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
+ in24[i] = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
+ in25[i] = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
+ in26[i] = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
+ in27[i] = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
+ in28[i] = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
+ in29[i] = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
+ in30[i] = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
+ in31[i] = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);