- _mm_storeu_si128((__m128i*)&dst[0 * 16 + i], T40);
- _mm_storeu_si128((__m128i*)&dst[8 * 16 + i], T41);
+ T40 = _mm_packs_epi32(T40, T40);
+ T41 = _mm_packs_epi32(T41, T41);
+ _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
+ _mm_storel_epi64((__m128i*)&dst[8 * 16 + i], T41);