42 #ifndef LSTSSE_LEVENSHTEIN_SSE_HPP 43 #define LSTSSE_LEVENSHTEIN_SSE_HPP 53 #include <tmmintrin.h> 56 #include <smmintrin.h> 59 #include <immintrin.h> 77 template<
typename Iterator1,
typename Iterator2>
78 std::size_t
levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd);
91 template<
typename Container1,
typename Container2>
92 std::size_t
levenshtein(
const Container1& a,
const Container2& b);
104 template <
typename T, std::
size_t N = 16>
108 #include <mm_malloc.h> 109 template <
typename T, std::
size_t N>
112 typedef T value_type;
113 typedef std::size_t size_type;
114 typedef std::ptrdiff_t difference_type;
117 typedef const T* const_pointer;
119 typedef T& reference;
120 typedef const T& const_reference;
124 template <
typename T2>
129 inline pointer address (reference r) {
133 inline const_pointer address (const_reference r)
const {
// Allocate raw, N-byte-aligned storage for n objects of value_type using
// _mm_malloc. 8*N extra bytes are over-allocated and the returned pointer is
// offset by 4*N, leaving 4*N slack bytes on each side of the caller-visible
// region — presumably so the SIMD kernels' vector loads/stores may run
// slightly past the logical ends of the buffer (TODO confirm against the
// performSSE/performSSE_AVX2 access patterns). deallocate() must (and does)
// subtract the same 4*N offset before freeing.
// NOTE(review): lines 138-139 of the original file are missing from this
// extraction.
137 inline pointer allocate (size_type n) {
140 return reinterpret_cast<pointer
>(
reinterpret_cast<char*
>(_mm_malloc(n *
sizeof(value_type) + 8 * N, N)) + 4*N);
// Release storage obtained from allocate(): undo the 4*N byte offset that
// allocate() applied before handing the original block back to _mm_free.
// The size argument is unused (the allocation size is not needed by
// _mm_free), hence the unnamed parameter.
143 inline void deallocate (pointer p, size_type) {
144 _mm_free(reinterpret_cast<char*>(p) - 4*N);
147 inline void construct (pointer p,
const value_type& value) {
148 new (p) value_type (value);
151 inline void destroy (pointer p) {
// Largest element count this allocator can nominally serve:
// SIZE_MAX / sizeof(value_type). size_type(-1) is the idiomatic
// pre-C++11 spelling of the maximum value of the unsigned size_type.
155 constexpr
inline size_type max_size () const noexcept {
156 return size_type (-1) /
sizeof (value_type);
159 template <
typename T2>
165 return !(*
this == other);
172 return other.usesMMAlloc;
175 static constexpr
bool usesMMAlloc =
true;
179 template <
typename T>
182 static constexpr
bool usesMMAlloc =
false;
183 template <
typename T2>
192 constexpr std::size_t alignment = 1;
// Scalar fallback: computes one cell of the Levenshtein recurrence along an
// anti-diagonal. diag is the diagonal being filled in place; diag2 holds an
// earlier diagonal (presumably the one two steps back — confirm against
// levenshteinDiagonal's swap logic). The visible code takes the cheaper of
// the two deletion/insertion candidates (diag2[i], diag2[i-1]) and compares
// it against the substitution base diag[i-1]; otherwise the cell is
// diag[i-1] plus the mismatch cost of a[i-1] vs b[j-1].
// NOTE(review): lines 240-242 of the original file (the body of the
// if-branch, and likely an else) are missing from this extraction, so the
// full recurrence cannot be confirmed from this view.
233 template<
typename Vec1,
typename Vec2,
typename Iterator1,
typename Iterator2>
235 static inline void perform(
const Iterator1& a,
const Iterator2& b,
236 std::size_t& i, std::size_t j, std::size_t bLen, Vec1& diag,
const Vec2& diag2)
238 std::uint32_t min = std::min(diag2[i], diag2[i-1]);
239 if (min < diag[i-1]) {
243 diag[i] = diag[i-1] + (a[i-1] != b[j-1]);
// SSE2 fallback for SSE4.1's _mm_min_epi32: lane-wise signed 32-bit minimum.
//
// `compare` holds an all-ones mask in every lane where a > b. The minimum is
// assembled by taking b from those lanes and a from the remaining ones.
//
// BUG FIX: _mm_andnot_si128(x, y) computes (~x) & y. Selecting the lanes of
// `a` where a <= b therefore requires _mm_andnot_si128(compare, a); the
// original _mm_andnot_si128(a, compare) computed (~a) & compare, which
// produces a wrong result in every lane where a <= b.
inline __m128i _mm_min_epi32 (__m128i a, __m128i b) {
  __m128i compare = _mm_cmpgt_epi32(a, b);
  __m128i aIsSmaller = _mm_andnot_si128(compare, a); // a where a <= b
  __m128i bIsSmaller = _mm_and_si128 (b, compare);   // b where a >  b
  return _mm_or_si128(aIsSmaller, bIsSmaller);
}
// Shifts two 8-lane vectors against each other by seven 32-bit lanes:
// returns [ b[7], a[0], a[1], ..., a[6] ].
//
// _mm256_blend_epi32 with mask 0x80 keeps lanes 0-6 of `a` and takes lane 7
// from `b`; the cross-lane permute then rotates that combined vector so
// b[7] lands in lane 0 and a[0..6] shift up by one lane. This mirrors what
// _mm_alignr_epi8(hi, lo, 12) does for the 128-bit SSE path.
__m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) {
  const __m256i laneRotation = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
  return _mm256_permutevar8x32_epi32(_mm256_blend_epi32(a, b, 0x80),
                                     laneRotation);
}
294 std::size_t& i, std::size_t j, std::size_t bLen,
295 std::uint32_t* diag,
const std::uint32_t* diag2)
299 if (i >= 32 && bLen - j >= 32) {
300 performSSE_AVX2(a, b, i, j, bLen, diag, diag2);
306 if (i >= 16 && bLen - j >= 16) {
307 performSSE(a, b, i, j, bLen, diag, diag2);
313 ::perform(a, b, i, j, bLen, diag, diag2);
317 static inline void performSSE(
const T* a,
const T* b,
318 std::size_t& i, std::size_t j, std::size_t bLen,
319 std::uint32_t* diag,
const std::uint32_t* diag2)
321 const __m128i one128_epi32 = _mm_set1_epi32(1);
322 const __m128i reversedIdentity128_epi8 = _mm_setr_epi8(
328 const __m128i reversedIdentity128_epi16 = _mm_setr_epi8(
334 const __m128i blockwiseReversed128_epi8_4 = _mm_setr_epi8(
340 const __m128i blockwiseReversed128_epi16_4 = _mm_setr_epi8(
347 __m128i substitutionCost32[4];
353 if (
sizeof(T) <= 2) {
354 __m128i substitutionCost16LX, substitutionCost16HX;
356 if (
sizeof(T) == 1) {
357 __m128i a_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-16]));
358 __m128i b_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1]));
359 a_ = _mm_shuffle_epi8(a_, reversedIdentity128_epi8);
373 __m128i substitutionCost8 = _mm_cmpeq_epi8(a_, b_);
374 __m128i substitutionCost8X = _mm_shuffle_epi8(substitutionCost8, blockwiseReversed128_epi8_4);
376 substitutionCost16LX = _mm_unpacklo_epi8(substitutionCost8X, substitutionCost8X);
377 substitutionCost16HX = _mm_unpackhi_epi8(substitutionCost8X, substitutionCost8X);
379 __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-8]));
380 __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-16]));
381 __m128i b0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1]));
382 __m128i b1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j+7]));
383 a0 = _mm_shuffle_epi8(a0, reversedIdentity128_epi16);
384 a1 = _mm_shuffle_epi8(a1, reversedIdentity128_epi16);
385 __m128i substitutionCost16L = _mm_cmpeq_epi16(a0, b0);
386 __m128i substitutionCost16H = _mm_cmpeq_epi16(a1, b1);
387 substitutionCost16LX = _mm_shuffle_epi8(substitutionCost16L, blockwiseReversed128_epi16_4);
388 substitutionCost16HX = _mm_shuffle_epi8(substitutionCost16H, blockwiseReversed128_epi16_4);
391 substitutionCost32[0] = _mm_unpacklo_epi16(substitutionCost16LX, substitutionCost16LX);
392 substitutionCost32[1] = _mm_unpackhi_epi16(substitutionCost16LX, substitutionCost16LX);
393 substitutionCost32[2] = _mm_unpacklo_epi16(substitutionCost16HX, substitutionCost16HX);
394 substitutionCost32[3] = _mm_unpackhi_epi16(substitutionCost16HX, substitutionCost16HX);
396 assert(
sizeof(T) == 4);
398 for (k = 0; k < 4; ++k) {
399 __m128i a_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-4-k*4]));
400 __m128i b_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1+k*4]));
401 b_ = _mm_shuffle_epi32(b_, 0x1b);
402 substitutionCost32[k] = _mm_cmpeq_epi32(a_, b_);
406 __m128i diag_[5], diag2_[5];
407 for (k = 0; k < 5; ++k) {
408 diag_ [k] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&diag [i-3-k*4]));
410 for (k = 0; k < 5; ++k) {
411 diag2_[k] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&diag2[i-3-k*4]));
417 for (k = 0; k < 5; ++k) {
418 std::uint32_t diag0 = _mm_extract_epi32(diag_[k], 0);
419 std::uint32_t diag1 = _mm_extract_epi32(diag_[k], 1);
420 std::uint32_t diag2 = _mm_extract_epi32(diag_[k], 2);
421 std::uint32_t diag3 = _mm_extract_epi32(diag_[k], 3);
422 assert(diag0 == diag[i-k*4-3]);
423 assert(diag1 == diag[i-k*4-2]);
424 assert(diag2 == diag[i-k*4-1]);
425 assert(diag3 == diag[i-k*4-0]);
427 for (k = 0; k < 5; ++k) {
428 std::uint32_t diag20 = _mm_extract_epi32(diag2_[k], 0);
429 std::uint32_t diag21 = _mm_extract_epi32(diag2_[k], 1);
430 std::uint32_t diag22 = _mm_extract_epi32(diag2_[k], 2);
431 std::uint32_t diag23 = _mm_extract_epi32(diag2_[k], 3);
432 assert(diag20 == diag2[i-k*4-3]);
433 assert(diag21 == diag2[i-k*4-2]);
434 assert(diag22 == diag2[i-k*4-1]);
435 assert(diag23 == diag2[i-k*4-0]);
438 for (k = 0; k < 4; ++k) {
439 int sc0 = _mm_extract_epi32(substitutionCost32[k], 0);
440 int sc1 = _mm_extract_epi32(substitutionCost32[k], 1);
441 int sc2 = _mm_extract_epi32(substitutionCost32[k], 2);
442 int sc3 = _mm_extract_epi32(substitutionCost32[k], 3);
443 assert(sc0 == ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0));
444 assert(sc1 == ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0));
445 assert(sc2 == ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0));
446 assert(sc3 == ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0));
448 #endif // LSTSSE_DEBUG 463 for (k = 0; k < 4; ++k) {
464 __m128i diag2_i_m1 = _mm_alignr_epi8(diag2_[k], diag2_[k+1], 12);
465 __m128i diag_i_m1 = _mm_alignr_epi8(diag_ [k], diag_ [k+1], 12);
467 __m128i result3 = _mm_add_epi32(diag_i_m1, substitutionCost32[k]);
468 __m128i min = _mm_min_epi32(_mm_min_epi32(diag2_i_m1, diag2_[k]), result3);
469 min = _mm_add_epi32(min, one128_epi32);
472 std::uint32_t diag_i_m10 = _mm_extract_epi32(diag_i_m1, 0);
473 std::uint32_t diag_i_m11 = _mm_extract_epi32(diag_i_m1, 1);
474 std::uint32_t diag_i_m12 = _mm_extract_epi32(diag_i_m1, 2);
475 std::uint32_t diag_i_m13 = _mm_extract_epi32(diag_i_m1, 3);
476 assert(diag_i_m10 == diag[i-k*4-4]);
477 assert(diag_i_m11 == diag[i-k*4-3]);
478 assert(diag_i_m12 == diag[i-k*4-2]);
479 assert(diag_i_m13 == diag[i-k*4-1]);
481 std::uint32_t diag2_i_m10 = _mm_extract_epi32(diag2_i_m1, 0);
482 std::uint32_t diag2_i_m11 = _mm_extract_epi32(diag2_i_m1, 1);
483 std::uint32_t diag2_i_m12 = _mm_extract_epi32(diag2_i_m1, 2);
484 std::uint32_t diag2_i_m13 = _mm_extract_epi32(diag2_i_m1, 3);
485 assert(diag2_i_m10 == diag2[i-k*4-4]);
486 assert(diag2_i_m11 == diag2[i-k*4-3]);
487 assert(diag2_i_m12 == diag2[i-k*4-2]);
488 assert(diag2_i_m13 == diag2[i-k*4-1]);
490 std::uint32_t result30 = _mm_extract_epi32(result3, 0);
491 std::uint32_t result31 = _mm_extract_epi32(result3, 1);
492 std::uint32_t result32 = _mm_extract_epi32(result3, 2);
493 std::uint32_t result33 = _mm_extract_epi32(result3, 3);
495 assert(result30 == diag[i-1-k*4-3] + ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0));
496 assert(result31 == diag[i-1-k*4-2] + ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0));
497 assert(result32 == diag[i-1-k*4-1] + ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0));
498 assert(result33 == diag[i-1-k*4-0] + ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0));
499 #endif // LSTSSE_DEBUG 501 _mm_storeu_si128(reinterpret_cast<__m128i*>(&diag[i-k*4-3]), min);
511 static inline void performSSE_AVX2(
const T* a,
const T* b,
512 std::size_t& i, std::size_t j, std::size_t bLen,
513 std::uint32_t* diag,
const std::uint32_t* diag2)
515 const __m256i one256_epi32 = _mm256_set1_epi32(1);
516 const __m256i reversedIdentity256_epi8 = _mm256_setr_epi8(
527 const __m256i blockwiseReversed256_epi8_8 = _mm256_setr_epi8(
538 const __m256i blockwiseReversed256_epi16_8 = _mm256_setr_epi8(
549 __m256i substitutionCost32[4];
552 if (
sizeof(T) <= 2) {
553 if (
sizeof(T) == 1) {
554 __m256i a_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-32]));
555 __m256i b_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1]));
556 a_ = _mm256_shuffle_epi8(a_, reversedIdentity256_epi8);
557 a_ = _mm256_permute2x128_si256(a_, a_, 1);
559 __m256i substitutionCost8 = _mm256_cmpeq_epi8(a_, b_);
560 __m256i substitutionCost8X = _mm256_shuffle_epi8(substitutionCost8, blockwiseReversed256_epi8_8);
561 __m128i sc8Xlo = _mm256_extracti128_si256(substitutionCost8X, 0);
562 __m128i sc8Xhi = _mm256_extracti128_si256(substitutionCost8X, 1);
563 substitutionCost32[0] = _mm256_cvtepi8_epi32(sc8Xlo);
564 substitutionCost32[1] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xlo, 8));
565 substitutionCost32[2] = _mm256_cvtepi8_epi32(sc8Xhi);
566 substitutionCost32[3] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xhi, 8));
568 __m256i a0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-16]));
569 __m256i a1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-32]));
570 __m256i b0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1]));
571 __m256i b1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j+15]));
572 a0 = _mm256_permute2x128_si256(a0, a0, 1);
573 a1 = _mm256_permute2x128_si256(a1, a1, 1);
574 b0 = _mm256_shuffle_epi8(b0, blockwiseReversed256_epi16_8);
575 b1 = _mm256_shuffle_epi8(b1, blockwiseReversed256_epi16_8);
576 __m256i substitutionCost16L = _mm256_cmpeq_epi16(a0, b0);
577 __m256i substitutionCost16H = _mm256_cmpeq_epi16(a1, b1);
578 __m128i sc16Llo = _mm256_extracti128_si256(substitutionCost16L, 0);
579 __m128i sc16Lhi = _mm256_extracti128_si256(substitutionCost16L, 1);
580 __m128i sc16Hlo = _mm256_extracti128_si256(substitutionCost16H, 0);
581 __m128i sc16Hhi = _mm256_extracti128_si256(substitutionCost16H, 1);
582 substitutionCost32[0] = _mm256_cvtepi16_epi32(sc16Llo);
583 substitutionCost32[1] = _mm256_cvtepi16_epi32(sc16Lhi);
584 substitutionCost32[2] = _mm256_cvtepi16_epi32(sc16Hlo);
585 substitutionCost32[3] = _mm256_cvtepi16_epi32(sc16Hhi);
588 assert(
sizeof(T) == 4);
590 for (k = 0; k < 4; ++k) {
591 __m256i a_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-8-k*8]));
592 __m256i b_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1+k*8]));
593 b_ = _mm256_shuffle_epi32(b_, 0x1b);
594 b_ = _mm256_permute2x128_si256(b_, b_, 1);
595 substitutionCost32[k] = _mm256_cmpeq_epi32(a_, b_);
599 __m256i diag_[5], diag2_[5];
600 for (k = 0; k < 5; ++k) {
601 diag_ [k] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&diag [i-7-k*8]));
603 for (k = 0; k < 5; ++k) {
604 diag2_[k] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&diag2[i-7-k*8]));
608 for (k = 0; k < 5; ++k) {
609 std::uint32_t diag0 = _mm256_extract_epi32(diag_[k], 0);
610 std::uint32_t diag1 = _mm256_extract_epi32(diag_[k], 1);
611 std::uint32_t diag2 = _mm256_extract_epi32(diag_[k], 2);
612 std::uint32_t diag3 = _mm256_extract_epi32(diag_[k], 3);
613 std::uint32_t diag4 = _mm256_extract_epi32(diag_[k], 4);
614 std::uint32_t diag5 = _mm256_extract_epi32(diag_[k], 5);
615 std::uint32_t diag6 = _mm256_extract_epi32(diag_[k], 6);
616 std::uint32_t diag7 = _mm256_extract_epi32(diag_[k], 7);
617 assert(diag0 == diag[i-k*8-7]);
618 assert(diag1 == diag[i-k*8-6]);
619 assert(diag2 == diag[i-k*8-5]);
620 assert(diag3 == diag[i-k*8-4]);
621 assert(diag4 == diag[i-k*8-3]);
622 assert(diag5 == diag[i-k*8-2]);
623 assert(diag6 == diag[i-k*8-1]);
624 assert(diag7 == diag[i-k*8-0]);
626 for (k = 0; k < 5; ++k) {
627 std::uint32_t diag20 = _mm256_extract_epi32(diag2_[k], 0);
628 std::uint32_t diag21 = _mm256_extract_epi32(diag2_[k], 1);
629 std::uint32_t diag22 = _mm256_extract_epi32(diag2_[k], 2);
630 std::uint32_t diag23 = _mm256_extract_epi32(diag2_[k], 3);
631 std::uint32_t diag24 = _mm256_extract_epi32(diag2_[k], 4);
632 std::uint32_t diag25 = _mm256_extract_epi32(diag2_[k], 5);
633 std::uint32_t diag26 = _mm256_extract_epi32(diag2_[k], 6);
634 std::uint32_t diag27 = _mm256_extract_epi32(diag2_[k], 7);
635 assert(diag20 == diag2[i-k*8-7]);
636 assert(diag21 == diag2[i-k*8-6]);
637 assert(diag22 == diag2[i-k*8-5]);
638 assert(diag23 == diag2[i-k*8-4]);
639 assert(diag24 == diag2[i-k*8-3]);
640 assert(diag25 == diag2[i-k*8-2]);
641 assert(diag26 == diag2[i-k*8-1]);
642 assert(diag27 == diag2[i-k*8-0]);
645 for (k = 0; k < 4; ++k) {
646 int sc0 = _mm256_extract_epi32(substitutionCost32[k], 0);
647 int sc1 = _mm256_extract_epi32(substitutionCost32[k], 1);
648 int sc2 = _mm256_extract_epi32(substitutionCost32[k], 2);
649 int sc3 = _mm256_extract_epi32(substitutionCost32[k], 3);
650 int sc4 = _mm256_extract_epi32(substitutionCost32[k], 4);
651 int sc5 = _mm256_extract_epi32(substitutionCost32[k], 5);
652 int sc6 = _mm256_extract_epi32(substitutionCost32[k], 6);
653 int sc7 = _mm256_extract_epi32(substitutionCost32[k], 7);
654 assert(sc0 == ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0));
655 assert(sc1 == ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0));
656 assert(sc2 == ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0));
657 assert(sc3 == ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0));
658 assert(sc4 == ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0));
659 assert(sc5 == ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0));
660 assert(sc6 == ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0));
661 assert(sc7 == ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0));
663 #endif // LSTSSE_DEBUG 665 for (k = 0; k < 4; ++k) {
666 __m256i diag2_i_m1 = _mm256_alignr_epi32_7(diag2_[k], diag2_[k+1]);
667 __m256i diag_i_m1 = _mm256_alignr_epi32_7(diag_ [k], diag_ [k+1]);
669 __m256i result3 = _mm256_add_epi32(diag_i_m1, substitutionCost32[k]);
670 __m256i min = _mm256_min_epi32(_mm256_min_epi32(diag2_i_m1, diag2_[k]), result3);
671 min = _mm256_add_epi32(min, one256_epi32);
674 std::uint32_t diag_i_m10 = _mm256_extract_epi32(diag_i_m1, 0);
675 std::uint32_t diag_i_m11 = _mm256_extract_epi32(diag_i_m1, 1);
676 std::uint32_t diag_i_m12 = _mm256_extract_epi32(diag_i_m1, 2);
677 std::uint32_t diag_i_m13 = _mm256_extract_epi32(diag_i_m1, 3);
678 std::uint32_t diag_i_m14 = _mm256_extract_epi32(diag_i_m1, 4);
679 std::uint32_t diag_i_m15 = _mm256_extract_epi32(diag_i_m1, 5);
680 std::uint32_t diag_i_m16 = _mm256_extract_epi32(diag_i_m1, 6);
681 std::uint32_t diag_i_m17 = _mm256_extract_epi32(diag_i_m1, 7);
682 assert(diag_i_m10 == diag[i-k*8-8]);
683 assert(diag_i_m11 == diag[i-k*8-7]);
684 assert(diag_i_m12 == diag[i-k*8-6]);
685 assert(diag_i_m13 == diag[i-k*8-5]);
686 assert(diag_i_m14 == diag[i-k*8-4]);
687 assert(diag_i_m15 == diag[i-k*8-3]);
688 assert(diag_i_m16 == diag[i-k*8-2]);
689 assert(diag_i_m17 == diag[i-k*8-1]);
691 std::uint32_t diag2_i_m10 = _mm256_extract_epi32(diag2_i_m1, 0);
692 std::uint32_t diag2_i_m11 = _mm256_extract_epi32(diag2_i_m1, 1);
693 std::uint32_t diag2_i_m12 = _mm256_extract_epi32(diag2_i_m1, 2);
694 std::uint32_t diag2_i_m13 = _mm256_extract_epi32(diag2_i_m1, 3);
695 std::uint32_t diag2_i_m14 = _mm256_extract_epi32(diag2_i_m1, 4);
696 std::uint32_t diag2_i_m15 = _mm256_extract_epi32(diag2_i_m1, 5);
697 std::uint32_t diag2_i_m16 = _mm256_extract_epi32(diag2_i_m1, 6);
698 std::uint32_t diag2_i_m17 = _mm256_extract_epi32(diag2_i_m1, 7);
699 assert(diag2_i_m10 == diag2[i-k*8-8]);
700 assert(diag2_i_m11 == diag2[i-k*8-7]);
701 assert(diag2_i_m12 == diag2[i-k*8-6]);
702 assert(diag2_i_m13 == diag2[i-k*8-5]);
703 assert(diag2_i_m14 == diag2[i-k*8-4]);
704 assert(diag2_i_m15 == diag2[i-k*8-3]);
705 assert(diag2_i_m16 == diag2[i-k*8-2]);
706 assert(diag2_i_m17 == diag2[i-k*8-1]);
708 std::uint32_t result30 = _mm256_extract_epi32(result3, 0);
709 std::uint32_t result31 = _mm256_extract_epi32(result3, 1);
710 std::uint32_t result32 = _mm256_extract_epi32(result3, 2);
711 std::uint32_t result33 = _mm256_extract_epi32(result3, 3);
712 std::uint32_t result34 = _mm256_extract_epi32(result3, 4);
713 std::uint32_t result35 = _mm256_extract_epi32(result3, 5);
714 std::uint32_t result36 = _mm256_extract_epi32(result3, 6);
715 std::uint32_t result37 = _mm256_extract_epi32(result3, 7);
717 assert(result30 == diag[i-1-k*8-7] + ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0));
718 assert(result31 == diag[i-1-k*8-6] + ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0));
719 assert(result32 == diag[i-1-k*8-5] + ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0));
720 assert(result33 == diag[i-1-k*8-4] + ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0));
721 assert(result34 == diag[i-1-k*8-3] + ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0));
722 assert(result35 == diag[i-1-k*8-2] + ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0));
723 assert(result36 == diag[i-1-k*8-1] + ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0));
724 assert(result37 == diag[i-1-k*8-0] + ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0));
725 #endif // LSTSSE_DEBUG 727 _mm256_storeu_si256(reinterpret_cast<__m256i*>(&diag[i-k*8-7]), min);
745 template<
typename Vec1,
typename Vec2,
typename Iterator1,
typename Iterator2>
752 template<
typename Alloc1,
typename Alloc2,
typename T>
754 static inline void perform(
const T* a,
const T* b,
755 std::size_t& i, std::size_t j, std::size_t bLen,
756 std::vector<std::uint32_t, Alloc1>& diag,
757 const std::vector<std::uint32_t, Alloc2>& diag2) {
767 template<
typename Alloc1,
typename Alloc2,
typename T>
768 struct LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
769 : std::conditional<std::is_scalar<T>::value && (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4),
770 LevenshteinIterationSIMDWrap<Alloc1, Alloc2, T>,
771 LevenshteinIterationBase<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
778 template<
typename Alloc1,
typename Alloc2,
typename T>
780 :
LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
786 template<
typename T,
typename Iterator1,
typename Iterator2>
788 const std::size_t aLen = aEnd - a;
789 const std::size_t bLen = bEnd - b;
792 assert(aLen <= bLen);
795 std::vector<T, Alloc> diag (aLen + 1, T(0));
796 std::vector<T, Alloc> diag2 (aLen + 1, T(0));
802 assert(k <= aLen + bLen);
804 std::size_t startRow = k > bLen ? k - bLen : 1;
805 std::size_t endRow = k > aLen ? aLen : k - 1;
807 assert(endRow >= startRow || k == 1);
809 for (i = endRow; i >= startRow; ) {
817 ::perform(a, b, i, j, bLen, diag, diag2);
826 if (k == aLen + bLen) {
827 assert(startRow == endRow);
828 return diag[startRow];
832 std::swap(diag, diag2);
843 template<
typename T,
typename Iterator1,
typename Iterator2>
847 std::size_t i = 0, j = 0;
850 for (Iterator1 it = a; it != aEnd; ++it) {
856 for (; b != bEnd; ++b) {
861 for (Iterator1 it = a; it != aEnd; ++it, ++i) {
862 T tmp2 = *b == *it ? tmp : tmp + 1;
864 ret = arr[i] = tmp > ret ? tmp2 > ret ? ret + 1 : tmp2 : tmp2 > tmp ? tmp + 1 : tmp2;
875 template<
typename Iterator1,
typename Iterator2>
876 std::size_t
levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd,
877 std::random_access_iterator_tag, std::random_access_iterator_tag) {
878 if (aEnd - a > bEnd - b) {
883 while (a < aEnd && a[0] == b[0])
886 while (a < aEnd && aEnd[-1] == bEnd[-1])
889 std::size_t aLen = aEnd - a;
890 std::size_t bLen = bEnd - b;
897 return bLen - (std::find(b, bEnd, *a) == bEnd ? 0 : 1);
900 if (aLen + bLen <= std::numeric_limits<std::uint32_t>::max())
901 return levenshteinDiagonal<std::uint32_t>(a, aEnd, b, bEnd);
903 return levenshteinDiagonal<std::size_t>(a, aEnd, b, bEnd);
910 template<
typename Iterator1,
typename Iterator2>
911 std::size_t
levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd,
912 std::bidirectional_iterator_tag, std::bidirectional_iterator_tag) {
914 while (a != aEnd && b != bEnd && *a == *b)
917 while (a != aEnd && b != bEnd && *std::prev(aEnd) == *std::prev(bEnd))
921 return std::distance(b, bEnd);
925 return std::distance(a, aEnd);
928 if (std::next(a) == aEnd) {
929 std::size_t ret = 0, found = 0;
930 for (; b != bEnd; ++b, ++ret)
936 if (std::next(b) == bEnd) {
937 std::size_t ret = 0, found = 0;
938 for (; a != aEnd; ++a, ++ret)
944 return levenshteinRowBased<std::size_t>(a, aEnd, b, bEnd);
952 template<
typename U>
static auto test(U* a) -> decltype(a->data() + a->size(), char(0));
953 template<
typename U>
static auto test(
const U* a) ->
char2;
955 static constexpr
bool value =
sizeof(test(static_cast<T*>(
nullptr))) == 1;
958 template<
bool useDataAndSize>
966 template<
typename Container1,
typename Container2>
967 static inline std::size_t
calc(
const Container1& a,
const Container2& b) {
968 return levenshtein(a.data(), a.data() + a.size(), b.data(), b.data() + b.size());
977 template<
typename Container1,
typename Container2>
978 static inline std::size_t
calc(
const Container1& a,
const Container2& b) {
979 return levenshtein(std::begin(a), std::end(a), std::begin(b), std::end(b));
983 template<
typename Iterator1,
typename Iterator2>
984 std::size_t
levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) {
986 typename std::iterator_traits<Iterator1>::iterator_category(),
987 typename std::iterator_traits<Iterator2>::iterator_category());
990 template<
typename Container1,
typename Container2>
991 std::size_t
levenshtein(
const Container1& a,
const Container2& b) {
Definition: levenshtein-sse.hpp:951
Definition: levenshtein-sse.hpp:62
Definition: levenshtein-sse.hpp:753
Definition: levenshtein-sse.hpp:949
AlignmentAllocator< T2, 1 > other
Definition: levenshtein-sse.hpp:185
Definition: levenshtein-sse.hpp:291
std::enable_if_t< one_observer_ptr< T, W >::value, bool > operator!=(const T &p1, const W &p2)
Definition: ObserverPtr.h:281
static void perform(const Iterator1 &a, const Iterator2 &b, std::size_t &i, std::size_t j, std::size_t bLen, Vec1 &diag, const Vec2 &diag2)
Definition: levenshtein-sse.hpp:235
static std::size_t calc(const Container1 &a, const Container2 &b)
Definition: levenshtein-sse.hpp:978
Definition: levenshtein-sse.hpp:105
Definition: levenshtein-sse.hpp:234
Definition: levenshtein-sse.hpp:746
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd)
Definition: levenshtein-sse.hpp:984
std::enable_if_t< std::is_pointer< W >::value, bool > operator==(const observer_ptr< T > &p1, const W &p2)
Definition: ObserverPtr.h:264
T levenshteinRowBased(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd)
Definition: levenshtein-sse.hpp:844
T levenshteinDiagonal(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd)
Definition: levenshtein-sse.hpp:787
static void perform(const T *a, const T *b, std::size_t &i, std::size_t j, std::size_t bLen, std::vector< std::uint32_t, Alloc1 > &diag, const std::vector< std::uint32_t, Alloc2 > &diag2)
Definition: levenshtein-sse.hpp:754
static std::size_t calc(const Container1 &a, const Container2 &b)
Definition: levenshtein-sse.hpp:967
static void performSIMD(const T *a, const T *b, std::size_t &i, std::size_t j, std::size_t bLen, std::uint32_t *diag, const std::uint32_t *diag2)
Definition: levenshtein-sse.hpp:293
constexpr std::size_t alignment
Definition: levenshtein-sse.hpp:192
Definition: levenshtein-sse.hpp:959