// levenshtein-sse.hpp — SIMD-accelerated Levenshtein (edit) distance.
// NOTE(review): this chunk is a garbled extraction; the original file's
// line numbers (42, 43, 53, ...) are embedded in the text and many lines
// are missing. Comments below annotate the visible fragments only.
42 #ifndef LSTSSE_LEVENSHTEIN_SSE_HPP 43 #define LSTSSE_LEVENSHTEIN_SSE_HPP 53 #include <tmmintrin.h> 56 #include <smmintrin.h> 59 #include <immintrin.h> 77 template<
typename Iterator1,
typename Iterator2>
// Forward declaration: Levenshtein distance between [a, aEnd) and [b, bEnd).
78 std::size_t
levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd);
91 template<
typename Container1,
typename Container2>
// Forward declaration: convenience overload taking whole containers.
92 std::size_t
levenshtein(
const Container1& a,
const Container2& b);
// AlignedAllocator<T, N>: STL-style allocator returning N-byte-aligned
// memory via _mm_malloc (fragments; the class head itself is not visible
// in this extraction).
104 template <
typename T, std::
size_t N = 16>
108 #include <mm_malloc.h> 109 template <
typename T, std::
size_t N>
// Standard allocator member typedefs.
112 typedef T value_type;
113 typedef std::size_t size_type;
114 typedef std::ptrdiff_t difference_type;
117 typedef const T* const_pointer;
119 typedef T& reference;
120 typedef const T& const_reference;
124 template <
typename T2>
// address(): returns the address of r (bodies not visible here).
129 inline pointer address (reference r) {
133 inline const_pointer address (const_reference r)
const {
// allocate(): over-allocates 8*N extra bytes and returns a pointer 4*N
// bytes into the buffer — presumably padding so SIMD loads may read
// slightly before/after the logical array without faulting. TODO confirm.
137 inline pointer allocate (size_type n) {
140 return reinterpret_cast<pointer
>(
reinterpret_cast<char*
>(_mm_malloc(n *
sizeof(value_type) + 8 * N, N)) + 4*N);
// deallocate(): undoes the 4*N offset applied in allocate() before freeing.
143 inline void deallocate (pointer p, size_type) {
144 _mm_free(reinterpret_cast<char*>(p) - 4*N);
// construct(): placement-new copy construction at p.
147 inline void construct (pointer p,
const value_type& value) {
148 new (p) value_type (value);
151 inline void destroy (pointer p) {
// max_size(): largest representable element count for this allocator.
155 constexpr
inline size_type max_size () const noexcept {
156 return size_type (-1) /
sizeof (value_type);
159 template <
typename T2>
// operator!= defined in terms of operator== (the == body is not visible).
165 return !(*
this == other);
172 return other.usesMMAlloc;
// Tag: this allocator hands out _mm_malloc/_mm_free memory.
175 static constexpr
bool usesMMAlloc =
true;
179 template <
typename T>
// Fallback trait for other allocators (e.g. std::allocator): no _mm_malloc.
182 static constexpr
bool usesMMAlloc =
false;
// NOTE(review): alignment == 1 suggests this is the no-SIMD configuration
// branch of a preprocessor conditional that is lost here — TODO confirm.
188 constexpr std::size_t alignment = 1;
// LevenshteinIterationBase::perform — scalar DP step for one cell i of the
// current anti-diagonal:
//   diag[i] <- min(diag2[i], diag2[i-1], diag[i-1] + mismatch) (+1 for
// the insertion/deletion paths). The branch taken when `min < diag[i-1]`
// is not visible in this extraction — presumably `diag[i] = min + 1`.
229 template<
typename Vec1,
typename Vec2,
typename Iterator1,
typename Iterator2>
231 static inline void perform(
const Iterator1& a,
const Iterator2& b,
232 std::size_t& i, std::size_t j, std::size_t bLen, Vec1& diag,
const Vec2& diag2)
// Cheapest of the two neighbours on the previous diagonal (ins/del paths).
234 std::uint32_t min = std::min(diag2[i], diag2[i-1]);
235 if (min < diag[i-1]) {
// Substitution path: two-back diagonal cell plus mismatch cost (0 or 1).
239 diag[i] = diag[i-1] + (a[i-1] != b[j-1]);
// SSE2 fallback for the SSE4.1 _mm_min_epi32 intrinsic: per-lane signed
// 32-bit minimum of a and b.
inline __m128i _mm_min_epi32 (__m128i a, __m128i b) {
  // All-ones in every lane where a > b (signed compare).
  __m128i compare = _mm_cmpgt_epi32(a, b);
  // a is the minimum where the mask is clear: (~compare) & a.
  // BUGFIX: _mm_andnot_si128 negates its FIRST operand, so the mask must
  // come first — the previous (a, compare) order computed (~a) & compare,
  // which is not a lane select and returned garbage, not the minimum.
  __m128i aIsSmaller = _mm_andnot_si128(compare, a);
  // b is the minimum where the mask is set: b & compare.
  __m128i bIsSmaller = _mm_and_si128 (b, compare);
  return _mm_or_si128(aIsSmaller, bIsSmaller);
}
// _mm256_alignr_epi32_7(a, b): lane-crossing 32-bit "alignr by 7".
// Produces [b[7], a[0], a[1], ..., a[6]] — the 256-bit counterpart of
// _mm_alignr_epi8(hi, lo, 12) on 32-bit lanes. (Closing brace is not
// visible in this extraction.)
273 __m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) {
// Index vector rotating all eight 32-bit lanes left by one position.
274 const __m256i rotl1_256_epi32 = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
// Blend mask 0x80: take lane 7 from b, lanes 0-6 from a.
275 __m256i combined = _mm256_blend_epi32(a, b, 0x80);
276 return _mm256_permutevar8x32_epi32(combined, rotl1_256_epi32);
// performSIMD — width dispatcher for one batch of anti-diagonal cells:
// AVX2 kernel for >= 32 remaining cells, SSE kernel for >= 16, otherwise
// the scalar base step. (Function head and the surrounding #ifdef guards
// are not visible in this extraction.)
290 std::size_t& i, std::size_t j, std::size_t bLen,
291 std::uint32_t* diag,
const std::uint32_t* diag2)
295 if (i >= 32 && bLen - j >= 32) {
296 performSSE_AVX2(a, b, i, j, bLen, diag, diag2);
302 if (i >= 16 && bLen - j >= 16) {
303 performSSE(a, b, i, j, bLen, diag, diag2);
// Scalar fallback for the remaining (< 16) cells.
309 ::perform(a, b, i, j, bLen, diag, diag2);
// performSSE — computes 16 cells of the current anti-diagonal at once with
// 128-bit SSE (SSSE3 shuffles; 32-bit min via _mm_min_epi32). `i` is taken
// by reference; the (not visible) epilogue presumably advances it past the
// 16 processed cells — TODO confirm against the full source.
313 static inline void performSSE(
const T* a,
const T* b,
314 std::size_t& i, std::size_t j, std::size_t bLen,
315 std::uint32_t* diag,
const std::uint32_t* diag2)
317 const __m128i one128_epi32 = _mm_set1_epi32(1);
// Shuffle-control constants; their byte argument lists are lost in this
// extraction (original lines 319-342).
318 const __m128i reversedIdentity128_epi8 = _mm_setr_epi8(
324 const __m128i reversedIdentity128_epi16 = _mm_setr_epi8(
330 const __m128i blockwiseReversed128_epi8_4 = _mm_setr_epi8(
336 const __m128i blockwiseReversed128_epi16_4 = _mm_setr_epi8(
// Per-lane substitution costs: all-ones (-1) on match, 0 on mismatch
// (see the debug asserts below), widened to 32 bits in four 4-lane groups.
343 __m128i substitutionCost32[4];
349 if (
sizeof(T) <= 2) {
350 __m128i substitutionCost16LX, substitutionCost16HX;
352 if (
sizeof(T) == 1) {
// 8-bit elements: a single 16-byte compare covers all 16 cells.
353 __m128i a_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-16]));
354 __m128i b_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1]));
355 a_ = _mm_shuffle_epi8(a_, reversedIdentity128_epi8);
369 __m128i substitutionCost8 = _mm_cmpeq_epi8(a_, b_);
370 __m128i substitutionCost8X = _mm_shuffle_epi8(substitutionCost8, blockwiseReversed128_epi8_4);
// Duplicate each byte to widen the 8-bit masks to 16 bits.
372 substitutionCost16LX = _mm_unpacklo_epi8(substitutionCost8X, substitutionCost8X);
373 substitutionCost16HX = _mm_unpackhi_epi8(substitutionCost8X, substitutionCost8X);
// 16-bit elements: two compares of 8 lanes each.
375 __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-8]));
376 __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-16]));
377 __m128i b0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1]));
378 __m128i b1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j+7]));
379 a0 = _mm_shuffle_epi8(a0, reversedIdentity128_epi16);
380 a1 = _mm_shuffle_epi8(a1, reversedIdentity128_epi16);
381 __m128i substitutionCost16L = _mm_cmpeq_epi16(a0, b0);
382 __m128i substitutionCost16H = _mm_cmpeq_epi16(a1, b1);
383 substitutionCost16LX = _mm_shuffle_epi8(substitutionCost16L, blockwiseReversed128_epi16_4);
384 substitutionCost16HX = _mm_shuffle_epi8(substitutionCost16H, blockwiseReversed128_epi16_4);
// Duplicate each 16-bit mask to widen to the 32-bit lanes the DP uses.
387 substitutionCost32[0] = _mm_unpacklo_epi16(substitutionCost16LX, substitutionCost16LX);
388 substitutionCost32[1] = _mm_unpackhi_epi16(substitutionCost16LX, substitutionCost16LX);
389 substitutionCost32[2] = _mm_unpacklo_epi16(substitutionCost16HX, substitutionCost16HX);
390 substitutionCost32[3] = _mm_unpackhi_epi16(substitutionCost16HX, substitutionCost16HX);
// 32-bit elements: compare four 4-lane groups directly.
392 assert(
sizeof(T) == 4);
394 for (k = 0; k < 4; ++k) {
395 __m128i a_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-4-k*4]));
396 __m128i b_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1+k*4]));
// 0x1b reverses the four 32-bit lanes of b_.
397 b_ = _mm_shuffle_epi32(b_, 0x1b);
398 substitutionCost32[k] = _mm_cmpeq_epi32(a_, b_);
// Load five overlapping 4-lane windows of each diagonal so that adjacent
// windows can be combined with _mm_alignr_epi8 below.
402 __m128i diag_[5], diag2_[5];
403 for (k = 0; k < 5; ++k) {
404 diag_ [k] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&diag [i-3-k*4]));
406 for (k = 0; k < 5; ++k) {
407 diag2_[k] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&diag2[i-3-k*4]));
// Debug-only lane-layout checks (the opening #ifdef LSTSSE_DEBUG is lost
// in this extraction; the matching #endif appears below).
413 for (k = 0; k < 5; ++k) {
414 std::uint32_t diag0 = _mm_extract_epi32(diag_[k], 0);
415 std::uint32_t diag1 = _mm_extract_epi32(diag_[k], 1);
416 std::uint32_t diag2 = _mm_extract_epi32(diag_[k], 2);
417 std::uint32_t diag3 = _mm_extract_epi32(diag_[k], 3);
418 assert(diag0 == diag[i-k*4-3]);
419 assert(diag1 == diag[i-k*4-2]);
420 assert(diag2 == diag[i-k*4-1]);
421 assert(diag3 == diag[i-k*4-0]);
423 for (k = 0; k < 5; ++k) {
424 std::uint32_t diag20 = _mm_extract_epi32(diag2_[k], 0);
425 std::uint32_t diag21 = _mm_extract_epi32(diag2_[k], 1);
426 std::uint32_t diag22 = _mm_extract_epi32(diag2_[k], 2);
427 std::uint32_t diag23 = _mm_extract_epi32(diag2_[k], 3);
428 assert(diag20 == diag2[i-k*4-3]);
429 assert(diag21 == diag2[i-k*4-2]);
430 assert(diag22 == diag2[i-k*4-1]);
431 assert(diag23 == diag2[i-k*4-0]);
434 for (k = 0; k < 4; ++k) {
435 int sc0 = _mm_extract_epi32(substitutionCost32[k], 0);
436 int sc1 = _mm_extract_epi32(substitutionCost32[k], 1);
437 int sc2 = _mm_extract_epi32(substitutionCost32[k], 2);
438 int sc3 = _mm_extract_epi32(substitutionCost32[k], 3);
439 assert(sc0 == ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0));
440 assert(sc1 == ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0));
441 assert(sc2 == ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0));
442 assert(sc3 == ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0));
// Main vectorized DP: for each 4-lane group, build the shifted (i-1)
// views with alignr, take the three-way min, add 1, and store.
// Note: the cost mask is -1 on match, so adding it and then +1 yields
// diag[i-1] for a match and diag[i-1]+1 for a mismatch.
444 #endif // LSTSSE_DEBUG 459 for (k = 0; k < 4; ++k) {
460 __m128i diag2_i_m1 = _mm_alignr_epi8(diag2_[k], diag2_[k+1], 12);
461 __m128i diag_i_m1 = _mm_alignr_epi8(diag_ [k], diag_ [k+1], 12);
463 __m128i result3 = _mm_add_epi32(diag_i_m1, substitutionCost32[k]);
464 __m128i min = _mm_min_epi32(_mm_min_epi32(diag2_i_m1, diag2_[k]), result3);
465 min = _mm_add_epi32(min, one128_epi32);
// Debug-only checks of the shifted views and the substitution result
// (again, the opening #ifdef is lost in this extraction).
468 std::uint32_t diag_i_m10 = _mm_extract_epi32(diag_i_m1, 0);
469 std::uint32_t diag_i_m11 = _mm_extract_epi32(diag_i_m1, 1);
470 std::uint32_t diag_i_m12 = _mm_extract_epi32(diag_i_m1, 2);
471 std::uint32_t diag_i_m13 = _mm_extract_epi32(diag_i_m1, 3);
472 assert(diag_i_m10 == diag[i-k*4-4]);
473 assert(diag_i_m11 == diag[i-k*4-3]);
474 assert(diag_i_m12 == diag[i-k*4-2]);
475 assert(diag_i_m13 == diag[i-k*4-1]);
477 std::uint32_t diag2_i_m10 = _mm_extract_epi32(diag2_i_m1, 0);
478 std::uint32_t diag2_i_m11 = _mm_extract_epi32(diag2_i_m1, 1);
479 std::uint32_t diag2_i_m12 = _mm_extract_epi32(diag2_i_m1, 2);
480 std::uint32_t diag2_i_m13 = _mm_extract_epi32(diag2_i_m1, 3);
481 assert(diag2_i_m10 == diag2[i-k*4-4]);
482 assert(diag2_i_m11 == diag2[i-k*4-3]);
483 assert(diag2_i_m12 == diag2[i-k*4-2]);
484 assert(diag2_i_m13 == diag2[i-k*4-1]);
486 std::uint32_t result30 = _mm_extract_epi32(result3, 0);
487 std::uint32_t result31 = _mm_extract_epi32(result3, 1);
488 std::uint32_t result32 = _mm_extract_epi32(result3, 2);
489 std::uint32_t result33 = _mm_extract_epi32(result3, 3);
491 assert(result30 == diag[i-1-k*4-3] + ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0));
492 assert(result31 == diag[i-1-k*4-2] + ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0));
493 assert(result32 == diag[i-1-k*4-1] + ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0));
494 assert(result33 == diag[i-1-k*4-0] + ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0));
// Write the four finished cells back into the current diagonal.
495 #endif // LSTSSE_DEBUG 497 _mm_storeu_si128(reinterpret_cast<__m128i*>(&diag[i-k*4-3]), min);
// performSSE_AVX2 — 256-bit variant of performSSE: computes 32 cells of
// the current anti-diagonal per call. Same structure: build per-lane
// substitution-cost masks, load five overlapping windows of each
// diagonal, then a three-way min per 8-lane group. `i` is taken by
// reference; the (not visible) epilogue presumably advances it — TODO
// confirm against the full source.
507 static inline void performSSE_AVX2(
const T* a,
const T* b,
508 std::size_t& i, std::size_t j, std::size_t bLen,
509 std::uint32_t* diag,
const std::uint32_t* diag2)
511 const __m256i one256_epi32 = _mm256_set1_epi32(1);
// Shuffle-control constants; their byte argument lists are lost in this
// extraction (original lines 513-544).
512 const __m256i reversedIdentity256_epi8 = _mm256_setr_epi8(
523 const __m256i blockwiseReversed256_epi8_8 = _mm256_setr_epi8(
534 const __m256i blockwiseReversed256_epi16_8 = _mm256_setr_epi8(
// Substitution-cost masks: -1 on match, 0 on mismatch (see asserts),
// widened to 32 bits in four 8-lane groups.
545 __m256i substitutionCost32[4];
548 if (
sizeof(T) <= 2) {
549 if (
sizeof(T) == 1) {
// 8-bit elements: one 32-byte compare covers all 32 cells.
550 __m256i a_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-32]));
551 __m256i b_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1]));
552 a_ = _mm256_shuffle_epi8(a_, reversedIdentity256_epi8);
// _mm256_shuffle_epi8 cannot cross 128-bit lanes; swap the halves here.
553 a_ = _mm256_permute2x128_si256(a_, a_, 1);
555 __m256i substitutionCost8 = _mm256_cmpeq_epi8(a_, b_);
556 __m256i substitutionCost8X = _mm256_shuffle_epi8(substitutionCost8, blockwiseReversed256_epi8_8);
557 __m128i sc8Xlo = _mm256_extracti128_si256(substitutionCost8X, 0);
558 __m128i sc8Xhi = _mm256_extracti128_si256(substitutionCost8X, 1);
// Sign-extend the 8-bit masks to 32 bits, 8 lanes at a time.
559 substitutionCost32[0] = _mm256_cvtepi8_epi32(sc8Xlo);
560 substitutionCost32[1] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xlo, 8));
561 substitutionCost32[2] = _mm256_cvtepi8_epi32(sc8Xhi);
562 substitutionCost32[3] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xhi, 8));
// 16-bit elements: two 16-lane compares.
564 __m256i a0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-16]));
565 __m256i a1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-32]));
566 __m256i b0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1]));
567 __m256i b1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j+15]));
568 a0 = _mm256_permute2x128_si256(a0, a0, 1);
569 a1 = _mm256_permute2x128_si256(a1, a1, 1);
570 b0 = _mm256_shuffle_epi8(b0, blockwiseReversed256_epi16_8);
571 b1 = _mm256_shuffle_epi8(b1, blockwiseReversed256_epi16_8);
572 __m256i substitutionCost16L = _mm256_cmpeq_epi16(a0, b0);
573 __m256i substitutionCost16H = _mm256_cmpeq_epi16(a1, b1);
574 __m128i sc16Llo = _mm256_extracti128_si256(substitutionCost16L, 0);
575 __m128i sc16Lhi = _mm256_extracti128_si256(substitutionCost16L, 1);
576 __m128i sc16Hlo = _mm256_extracti128_si256(substitutionCost16H, 0);
577 __m128i sc16Hhi = _mm256_extracti128_si256(substitutionCost16H, 1);
// Sign-extend the 16-bit masks to 32 bits.
578 substitutionCost32[0] = _mm256_cvtepi16_epi32(sc16Llo);
579 substitutionCost32[1] = _mm256_cvtepi16_epi32(sc16Lhi);
580 substitutionCost32[2] = _mm256_cvtepi16_epi32(sc16Hlo);
581 substitutionCost32[3] = _mm256_cvtepi16_epi32(sc16Hhi);
// 32-bit elements: compare four 8-lane groups directly.
584 assert(
sizeof(T) == 4);
586 for (k = 0; k < 4; ++k) {
587 __m256i a_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-8-k*8]));
588 __m256i b_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1+k*8]));
// Reverse within each 128-bit lane, then swap lanes = full 8-lane reverse.
589 b_ = _mm256_shuffle_epi32(b_, 0x1b);
590 b_ = _mm256_permute2x128_si256(b_, b_, 1);
591 substitutionCost32[k] = _mm256_cmpeq_epi32(a_, b_);
// Five overlapping 8-lane windows of each diagonal for the shifted views.
595 __m256i diag_[5], diag2_[5];
596 for (k = 0; k < 5; ++k) {
597 diag_ [k] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&diag [i-7-k*8]));
599 for (k = 0; k < 5; ++k) {
600 diag2_[k] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&diag2[i-7-k*8]));
// Debug-only lane-layout checks (the opening #ifdef LSTSSE_DEBUG is lost
// in this extraction; the matching #endif appears below).
604 for (k = 0; k < 5; ++k) {
605 std::uint32_t diag0 = _mm256_extract_epi32(diag_[k], 0);
606 std::uint32_t diag1 = _mm256_extract_epi32(diag_[k], 1);
607 std::uint32_t diag2 = _mm256_extract_epi32(diag_[k], 2);
608 std::uint32_t diag3 = _mm256_extract_epi32(diag_[k], 3);
609 std::uint32_t diag4 = _mm256_extract_epi32(diag_[k], 4);
610 std::uint32_t diag5 = _mm256_extract_epi32(diag_[k], 5);
611 std::uint32_t diag6 = _mm256_extract_epi32(diag_[k], 6);
612 std::uint32_t diag7 = _mm256_extract_epi32(diag_[k], 7);
613 assert(diag0 == diag[i-k*8-7]);
614 assert(diag1 == diag[i-k*8-6]);
615 assert(diag2 == diag[i-k*8-5]);
616 assert(diag3 == diag[i-k*8-4]);
617 assert(diag4 == diag[i-k*8-3]);
618 assert(diag5 == diag[i-k*8-2]);
619 assert(diag6 == diag[i-k*8-1]);
620 assert(diag7 == diag[i-k*8-0]);
622 for (k = 0; k < 5; ++k) {
623 std::uint32_t diag20 = _mm256_extract_epi32(diag2_[k], 0);
624 std::uint32_t diag21 = _mm256_extract_epi32(diag2_[k], 1);
625 std::uint32_t diag22 = _mm256_extract_epi32(diag2_[k], 2);
626 std::uint32_t diag23 = _mm256_extract_epi32(diag2_[k], 3);
627 std::uint32_t diag24 = _mm256_extract_epi32(diag2_[k], 4);
628 std::uint32_t diag25 = _mm256_extract_epi32(diag2_[k], 5);
629 std::uint32_t diag26 = _mm256_extract_epi32(diag2_[k], 6);
630 std::uint32_t diag27 = _mm256_extract_epi32(diag2_[k], 7);
631 assert(diag20 == diag2[i-k*8-7]);
632 assert(diag21 == diag2[i-k*8-6]);
633 assert(diag22 == diag2[i-k*8-5]);
634 assert(diag23 == diag2[i-k*8-4]);
635 assert(diag24 == diag2[i-k*8-3]);
636 assert(diag25 == diag2[i-k*8-2]);
637 assert(diag26 == diag2[i-k*8-1]);
638 assert(diag27 == diag2[i-k*8-0]);
641 for (k = 0; k < 4; ++k) {
642 int sc0 = _mm256_extract_epi32(substitutionCost32[k], 0);
643 int sc1 = _mm256_extract_epi32(substitutionCost32[k], 1);
644 int sc2 = _mm256_extract_epi32(substitutionCost32[k], 2);
645 int sc3 = _mm256_extract_epi32(substitutionCost32[k], 3);
646 int sc4 = _mm256_extract_epi32(substitutionCost32[k], 4);
647 int sc5 = _mm256_extract_epi32(substitutionCost32[k], 5);
648 int sc6 = _mm256_extract_epi32(substitutionCost32[k], 6);
649 int sc7 = _mm256_extract_epi32(substitutionCost32[k], 7);
650 assert(sc0 == ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0));
651 assert(sc1 == ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0));
652 assert(sc2 == ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0));
653 assert(sc3 == ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0));
654 assert(sc4 == ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0));
655 assert(sc5 == ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0));
656 assert(sc6 == ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0));
657 assert(sc7 == ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0));
// Main vectorized DP per 8-lane group: shifted (i-1) views via the
// lane-crossing alignr helper, three-way min, +1, store.
659 #endif // LSTSSE_DEBUG 661 for (k = 0; k < 4; ++k) {
662 __m256i diag2_i_m1 = _mm256_alignr_epi32_7(diag2_[k], diag2_[k+1]);
663 __m256i diag_i_m1 = _mm256_alignr_epi32_7(diag_ [k], diag_ [k+1]);
665 __m256i result3 = _mm256_add_epi32(diag_i_m1, substitutionCost32[k]);
666 __m256i min = _mm256_min_epi32(_mm256_min_epi32(diag2_i_m1, diag2_[k]), result3);
667 min = _mm256_add_epi32(min, one256_epi32);
// Debug-only checks of the shifted views and substitution result.
670 std::uint32_t diag_i_m10 = _mm256_extract_epi32(diag_i_m1, 0);
671 std::uint32_t diag_i_m11 = _mm256_extract_epi32(diag_i_m1, 1);
672 std::uint32_t diag_i_m12 = _mm256_extract_epi32(diag_i_m1, 2);
673 std::uint32_t diag_i_m13 = _mm256_extract_epi32(diag_i_m1, 3);
674 std::uint32_t diag_i_m14 = _mm256_extract_epi32(diag_i_m1, 4);
675 std::uint32_t diag_i_m15 = _mm256_extract_epi32(diag_i_m1, 5);
676 std::uint32_t diag_i_m16 = _mm256_extract_epi32(diag_i_m1, 6);
677 std::uint32_t diag_i_m17 = _mm256_extract_epi32(diag_i_m1, 7);
678 assert(diag_i_m10 == diag[i-k*8-8]);
679 assert(diag_i_m11 == diag[i-k*8-7]);
680 assert(diag_i_m12 == diag[i-k*8-6]);
681 assert(diag_i_m13 == diag[i-k*8-5]);
682 assert(diag_i_m14 == diag[i-k*8-4]);
683 assert(diag_i_m15 == diag[i-k*8-3]);
684 assert(diag_i_m16 == diag[i-k*8-2]);
685 assert(diag_i_m17 == diag[i-k*8-1]);
687 std::uint32_t diag2_i_m10 = _mm256_extract_epi32(diag2_i_m1, 0);
688 std::uint32_t diag2_i_m11 = _mm256_extract_epi32(diag2_i_m1, 1);
689 std::uint32_t diag2_i_m12 = _mm256_extract_epi32(diag2_i_m1, 2);
690 std::uint32_t diag2_i_m13 = _mm256_extract_epi32(diag2_i_m1, 3);
691 std::uint32_t diag2_i_m14 = _mm256_extract_epi32(diag2_i_m1, 4);
692 std::uint32_t diag2_i_m15 = _mm256_extract_epi32(diag2_i_m1, 5);
693 std::uint32_t diag2_i_m16 = _mm256_extract_epi32(diag2_i_m1, 6);
694 std::uint32_t diag2_i_m17 = _mm256_extract_epi32(diag2_i_m1, 7);
695 assert(diag2_i_m10 == diag2[i-k*8-8]);
696 assert(diag2_i_m11 == diag2[i-k*8-7]);
697 assert(diag2_i_m12 == diag2[i-k*8-6]);
698 assert(diag2_i_m13 == diag2[i-k*8-5]);
699 assert(diag2_i_m14 == diag2[i-k*8-4]);
700 assert(diag2_i_m15 == diag2[i-k*8-3]);
701 assert(diag2_i_m16 == diag2[i-k*8-2]);
702 assert(diag2_i_m17 == diag2[i-k*8-1]);
704 std::uint32_t result30 = _mm256_extract_epi32(result3, 0);
705 std::uint32_t result31 = _mm256_extract_epi32(result3, 1);
706 std::uint32_t result32 = _mm256_extract_epi32(result3, 2);
707 std::uint32_t result33 = _mm256_extract_epi32(result3, 3);
708 std::uint32_t result34 = _mm256_extract_epi32(result3, 4);
709 std::uint32_t result35 = _mm256_extract_epi32(result3, 5);
710 std::uint32_t result36 = _mm256_extract_epi32(result3, 6);
711 std::uint32_t result37 = _mm256_extract_epi32(result3, 7);
713 assert(result30 == diag[i-1-k*8-7] + ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0));
714 assert(result31 == diag[i-1-k*8-6] + ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0));
715 assert(result32 == diag[i-1-k*8-5] + ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0));
716 assert(result33 == diag[i-1-k*8-4] + ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0));
717 assert(result34 == diag[i-1-k*8-3] + ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0));
718 assert(result35 == diag[i-1-k*8-2] + ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0));
719 assert(result36 == diag[i-1-k*8-1] + ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0));
720 assert(result37 == diag[i-1-k*8-0] + ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0));
// Write the eight finished cells back into the current diagonal.
721 #endif // LSTSSE_DEBUG 723 _mm256_storeu_si256(reinterpret_cast<__m256i*>(&diag[i-k*8-7]), min);
// SIMD-selection plumbing: LevenshteinIterationSIMDWrap forwards the DP
// step to the SIMD kernels. LevenshteinIteration is specialized so the
// SIMD path is taken only for std::vector<std::uint32_t> diagonals with
// raw const T* iterators where T is a scalar of size 1, 2 or 4 bytes;
// everything else falls back to LevenshteinIterationBase.
741 template<
typename Vec1,
typename Vec2,
typename Iterator1,
typename Iterator2>
748 template<
typename Alloc1,
typename Alloc2,
typename T>
// Wrapper perform(): unwraps the vectors to raw pointers for the SIMD
// kernels (the forwarding call itself is not visible in this extraction).
750 static inline void perform(
const T* a,
const T* b,
751 std::size_t& i, std::size_t j, std::size_t bLen,
752 std::vector<std::uint32_t, Alloc1>& diag,
753 const std::vector<std::uint32_t, Alloc2>& diag2) {
763 template<
typename Alloc1,
typename Alloc2,
typename T>
// Compile-time choice between the SIMD wrapper and the scalar base.
764 struct LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
765 : std::conditional<std::is_scalar<T>::value && (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4),
766 LevenshteinIterationSIMDWrap<Alloc1, Alloc2, T>,
767 LevenshteinIterationBase<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
774 template<
typename Alloc1,
typename Alloc2,
typename T>
// Non-const T* iterators reuse the const T* specialization.
776 :
LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
// levenshteinDiagonal — anti-diagonal dynamic-programming driver. Walks
// diagonals k = 1 .. aLen+bLen of the DP matrix with two rolling diagonal
// buffers and returns the final cell. (The loop head over k and the
// initialization of i/j are not visible in this extraction.)
782 template<
typename T,
typename Iterator1,
typename Iterator2>
784 const std::size_t aLen = aEnd - a;
785 const std::size_t bLen = bEnd - b;
// Caller guarantees a is the shorter (or equal) sequence.
788 assert(aLen <= bLen);
// Two DP diagonals of aLen+1 cells each.
791 std::vector<T, Alloc> diag (aLen + 1, T(0));
792 std::vector<T, Alloc> diag2 (aLen + 1, T(0));
798 assert(k <= aLen + bLen);
// Range of matrix rows intersected by diagonal k.
800 std::size_t startRow = k > bLen ? k - bLen : 1;
801 std::size_t endRow = k > aLen ? aLen : k - 1;
803 assert(endRow >= startRow || k == 1);
// Walk the diagonal top-down; perform() advances i itself (hence no
// decrement in the for-header).
805 for (i = endRow; i >= startRow; ) {
813 ::perform(a, b, i, j, bLen, diag, diag2);
// The last diagonal contains exactly one cell: the edit distance.
822 if (k == aLen + bLen) {
823 assert(startRow == endRow);
824 return diag[startRow];
// Rotate the diagonal buffers for the next k.
828 std::swap(diag, diag2);
// levenshteinRowBased — classic row-by-row DP with O(aLen) memory, used
// for iterators without random access. (The declarations of `arr`, `tmp`
// and `ret` are not visible in this extraction.)
839 template<
typename T,
typename Iterator1,
typename Iterator2>
843 std::size_t i = 0, j = 0;
846 for (Iterator1 it = a; it != aEnd; ++it) {
// One outer iteration per element of b; the inner loop updates the row.
852 for (; b != bEnd; ++b) {
857 for (Iterator1 it = a; it != aEnd; ++it, ++i) {
// Substitution candidate: diagonal value, +1 unless the elements match.
858 T tmp2 = *b == *it ? tmp : tmp + 1;
// New cell = min(substitution, deletion, insertion), written back to arr.
860 ret = arr[i] = tmp > ret ? tmp2 > ret ? ret + 1 : tmp2 : tmp2 > tmp ? tmp + 1 : tmp2;
// levenshtein (random-access overload): trims the common prefix and
// suffix, ensures the first range is the shorter one, handles trivial
// sizes, then runs the diagonal DP with the narrowest sufficient cell
// type. (Several branch bodies are not visible in this extraction.)
871 template<
typename Iterator1,
typename Iterator2>
872 std::size_t
levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd,
873 std::random_access_iterator_tag, std::random_access_iterator_tag) {
// Swap the ranges so a is never the longer one (swap body not visible).
874 if (aEnd - a > bEnd - b) {
// Strip the matching prefix ...
879 while (a < aEnd && a[0] == b[0])
// ... and the matching suffix.
882 while (a < aEnd && aEnd[-1] == bEnd[-1])
885 std::size_t aLen = aEnd - a;
886 std::size_t bLen = bEnd - b;
// aLen == 1: distance is bLen, minus one if the single element occurs
// anywhere in b (one substitution saved).
893 return bLen - (std::find(b, bEnd, *a) == bEnd ? 0 : 1);
// 32-bit DP cells suffice iff the largest possible distance fits.
896 if (aLen + bLen <= std::numeric_limits<std::uint32_t>::max())
897 return levenshteinDiagonal<std::uint32_t>(a, aEnd, b, bEnd);
899 return levenshteinDiagonal<std::size_t>(a, aEnd, b, bEnd);
// levenshtein (bidirectional overload): the same prefix/suffix trimming
// using only bidirectional-iterator operations, single-element shortcuts,
// then the row-based DP (no random access available for the diagonal DP).
906 template<
typename Iterator1,
typename Iterator2>
907 std::size_t
levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd,
908 std::bidirectional_iterator_tag, std::bidirectional_iterator_tag) {
// Strip the matching prefix ...
910 while (a != aEnd && b != bEnd && *a == *b)
// ... and the matching suffix.
913 while (a != aEnd && b != bEnd && *std::prev(aEnd) == *std::prev(bEnd))
// One side empty: the distance is the length of the other side.
917 return std::distance(b, bEnd);
921 return std::distance(a, aEnd);
// Single-element a: scan b for an occurrence (loop body not visible).
924 if (std::next(a) == aEnd) {
925 std::size_t ret = 0, found = 0;
926 for (; b != bEnd; ++b, ++ret)
// Symmetric single-element case for b.
932 if (std::next(b) == bEnd) {
933 std::size_t ret = 0, found = 0;
934 for (; a != aEnd; ++a, ++ret)
940 return levenshteinRowBased<std::size_t>(a, aEnd, b, bEnd);
// Detection idiom: `value` is true iff T exposes .data() and .size()
// (expression SFINAE on `a->data() + a->size()`; the char vs `char2`
// sizeof trick — char2 is presumably a two-byte type declared in a line
// not visible here).
948 template<
typename U>
static auto test(U* a) -> decltype(a->data() + a->size(), char(0));
949 template<
typename U>
static auto test(
const U* a) ->
char2;
951 static constexpr
bool value =
sizeof(test(static_cast<T*>(
nullptr))) == 1;
// calc(): two strategies selected by the detector above. Contiguous
// containers (data()/size() available) go through raw pointers — which
// enables the SIMD specialization — everything else through iterators.
954 template<
bool useDataAndSize>
962 template<
typename Container1,
typename Container2>
// Contiguous path: call the pointer-based iterator overload.
963 static inline std::size_t
calc(
const Container1& a,
const Container2& b) {
964 return levenshtein(a.data(), a.data() + a.size(), b.data(), b.data() + b.size());
973 template<
typename Container1,
typename Container2>
// Generic fallback: std::begin/std::end iterators.
974 static inline std::size_t
calc(
const Container1& a,
const Container2& b) {
975 return levenshtein(std::begin(a), std::end(a), std::begin(b), std::end(b));
// Public entry points. The iterator overload tag-dispatches on the two
// iterator categories (random-access vs bidirectional overloads above);
// the container overload's body is not visible in this extraction but
// presumably routes through the calc() strategy selection — TODO confirm.
979 template<
typename Iterator1,
typename Iterator2>
980 std::size_t
levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) {
982 typename std::iterator_traits<Iterator1>::iterator_category(),
983 typename std::iterator_traits<Iterator2>::iterator_category());
986 template<
typename Container1,
typename Container2>
987 std::size_t
levenshtein(
const Container1& a,
const Container2& b) {
Definition: levenshtein-sse.hpp:947
Definition: levenshtein-sse.hpp:62
Definition: levenshtein-sse.hpp:749
Definition: levenshtein-sse.hpp:945
Definition: levenshtein-sse.hpp:287
std::enable_if_t< one_observer_ptr< T, W >::value, bool > operator!=(const T &p1, const W &p2)
Definition: ObserverPtr.h:281
static void perform(const Iterator1 &a, const Iterator2 &b, std::size_t &i, std::size_t j, std::size_t bLen, Vec1 &diag, const Vec2 &diag2)
Definition: levenshtein-sse.hpp:231
static std::size_t calc(const Container1 &a, const Container2 &b)
Definition: levenshtein-sse.hpp:974
Definition: levenshtein-sse.hpp:105
Definition: levenshtein-sse.hpp:230
Definition: levenshtein-sse.hpp:742
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd)
Definition: levenshtein-sse.hpp:980
std::enable_if_t< std::is_pointer< W >::value, bool > operator==(const observer_ptr< T > &p1, const W &p2)
Definition: ObserverPtr.h:264
T levenshteinRowBased(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd)
Definition: levenshtein-sse.hpp:840
T levenshteinDiagonal(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd)
Definition: levenshtein-sse.hpp:783
static void perform(const T *a, const T *b, std::size_t &i, std::size_t j, std::size_t bLen, std::vector< std::uint32_t, Alloc1 > &diag, const std::vector< std::uint32_t, Alloc2 > &diag2)
Definition: levenshtein-sse.hpp:750
static std::size_t calc(const Container1 &a, const Container2 &b)
Definition: levenshtein-sse.hpp:963
static void performSIMD(const T *a, const T *b, std::size_t &i, std::size_t j, std::size_t bLen, std::uint32_t *diag, const std::uint32_t *diag2)
Definition: levenshtein-sse.hpp:289
constexpr std::size_t alignment
Definition: levenshtein-sse.hpp:188
Definition: levenshtein-sse.hpp:955