// levenshtein-sse: SIMD-accelerated Levenshtein (edit) distance.
// NOTE(review): this file is a partial/garbled extraction — the leading
// numbers on many lines are original source line numbers fused into the
// text, and interior lines are missing throughout.
// Includes: SSSE3 (tmmintrin), SSE4.1 (smmintrin), AVX (immintrin);
// presumably guarded by feature-test #ifdefs not visible here — TODO confirm.
42 #ifndef LSTSSE_LEVENSHTEIN_SSE_HPP 43 #define LSTSSE_LEVENSHTEIN_SSE_HPP 51 #include <tmmintrin.h> 54 #include <smmintrin.h> 57 #include <immintrin.h> 75 template<
typename Iterator1,
typename Iterator2>
// Forward declaration: distance between the ranges [a, aEnd) and [b, bEnd).
76 std::size_t
levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd);
89 template<
typename Container1,
typename Container2>
// Forward declaration: container convenience overload (delegates to the
// iterator form; see the definitions at the end of the file).
90 std::size_t
levenshtein(
const Container1& a,
const Container2& b);
// AlignedAllocator<T, N>: STL-style allocator returning N-byte-aligned
// storage via _mm_malloc/_mm_free, so vectors of DP cells can be loaded
// with aligned/unaligned SIMD loads that may read slightly past the ends.
// NOTE(review): class head, rebind, and several bodies are missing from
// this extraction; comments describe only the visible lines.
102 template <
typename T, std::
size_t N = 16>
106 #include <mm_malloc.h> 107 template <
typename T, std::
size_t N>
110 typedef T value_type;
111 typedef std::size_t size_type;
112 typedef std::ptrdiff_t difference_type;
115 typedef const T* const_pointer;
117 typedef T& reference;
118 typedef const T& const_reference;
122 template <
typename T2>
127 inline pointer address (reference r) {
131 inline const_pointer address (const_reference r)
const {
// Allocates n*sizeof(T) plus 8*N slack bytes, then returns a pointer
// 4*N bytes into the buffer — i.e. 4*N bytes of padding before AND after
// the usable region, so SIMD code may over-read past either end safely.
135 inline pointer allocate (size_type n) {
138 return reinterpret_cast<pointer
>(
reinterpret_cast<char*
>(_mm_malloc(n *
sizeof(value_type) + 8 * N, N)) + 4*N);
// Must undo the 4*N offset before handing the pointer back to _mm_free.
141 inline void deallocate (pointer p, size_type) {
142 _mm_free(reinterpret_cast<char*>(p) - 4*N);
// Placement-new copy construction (pre-C++17 allocator interface).
145 inline void construct (pointer p,
const value_type& value) {
146 new (p) value_type (value);
149 inline void destroy (pointer p) {
153 constexpr
inline size_type max_size () const noexcept {
154 return size_type (-1) /
sizeof (value_type);
157 template <
typename T2>
// operator!= defined in terms of operator== (body of == not visible).
163 return !(*
this == other);
170 return other.usesMMAlloc;
// Trait flag: AlignedAllocator-backed storage has the over-read padding.
173 static constexpr
bool usesMMAlloc =
true;
// Generic allocators (e.g. std::allocator) do not — presumably used to
// decide whether the SIMD kernels are safe; TODO confirm against callers.
177 template <
typename T>
180 static constexpr
bool usesMMAlloc =
false;
186 constexpr std::size_t alignment = 1;
// LevenshteinIterationBase::perform — scalar DP step for one cell on the
// current anti-diagonal: diag[i] is computed from its three neighbours
// (diag2[i], diag2[i-1], diag[i-1]) in the classic Levenshtein recurrence.
// NOTE(review): the body of the `if` (original lines 234-236, presumably
// the min+1 branch and the decrement of i) is missing from this extraction.
227 template<
typename Vec1,
typename Vec2,
typename Iterator1,
typename Iterator2>
229 static inline void perform(
const Iterator1& a,
const Iterator2& b,
230 std::size_t& i, std::size_t j, std::size_t bLen, Vec1& diag,
const Vec2& diag2)
232 std::uint32_t min = std::min(diag2[i], diag2[i-1]);
233 if (min < diag[i-1]) {
// Substitution path: previous diagonal cell plus 0/1 mismatch cost.
237 diag[i] = diag[i-1] + (a[i-1] != b[j-1]);
// SSE2 fallback for the SSE4.1 intrinsic _mm_min_epi32: per-lane signed
// 32-bit minimum of a and b.
//
// BUGFIX: the original computed aIsSmaller as _mm_andnot_si128(a, compare),
// which is (~a & compare) — garbage. The intended lane-select is
// (a & ~compare), i.e. _mm_andnot_si128(compare, a): keep a's lanes where
// a <= b, and b's lanes where a > b, then OR the two masked halves.
inline __m128i _mm_min_epi32 (__m128i a, __m128i b) {
  __m128i compare = _mm_cmpgt_epi32(a, b);           // all-ones where a > b
  __m128i aIsSmaller = _mm_andnot_si128(compare, a); // a where a <= b
  __m128i bIsSmaller = _mm_and_si128 (b, compare);   // b where a > b
  return _mm_or_si128(aIsSmaller, bIsSmaller);
}
// Cross-lane 32-bit "alignr by 7" for AVX2: returns
// [ b[7], a[0], a[1], a[2], a[3], a[4], a[5], a[6] ].
// _mm256_alignr_epi8 works within 128-bit lanes only, so this emulates a
// full 256-bit shift: blend lane 7 of b into a (imm 0x80 selects only the
// top element from b), then rotate all 8 dwords right by one position.
// NOTE(review): presumably inside an #ifdef __AVX2__ guard not visible here.
271 __m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) {
272 const __m256i rotl1_256_epi32 = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
273 __m256i combined = _mm256_blend_epi32(a, b, 0x80);
274 return _mm256_permutevar8x32_epi32(combined, rotl1_256_epi32);
// performSIMD dispatcher (head of signature missing from this extraction):
// picks the widest kernel that still fits in the remaining diagonal run —
// 32 cells for AVX2, 16 for SSE — and falls back to the scalar base class
// perform() for the tail. `i` is a reference: each kernel presumably
// decrements it by the number of cells it processed — TODO confirm.
288 std::size_t& i, std::size_t j, std::size_t bLen,
289 std::uint32_t* diag,
const std::uint32_t* diag2)
293 if (i >= 32 && bLen - j >= 32) {
294 performSSE_AVX2(a, b, i, j, bLen, diag, diag2);
300 if (i >= 16 && bLen - j >= 16) {
301 performSSE(a, b, i, j, bLen, diag, diag2);
// Scalar fallback for fewer than 16 remaining cells.
307 ::perform(a, b, i, j, bLen, diag, diag2);
// performSSE: computes 16 anti-diagonal DP cells per call using 128-bit
// vectors (4 x 4 uint32 lanes). Character comparisons are done at the
// element width of T (8/16/32-bit) and widened to 32-bit masks (0 or -1);
// adding the -1 mask implements "diag + (match ? -1 : 0)" so the unified
// +1 at the end yields cost 0 for matches and 1 otherwise.
// NOTE(review): this extraction is missing the shuffle-constant initializer
// bytes, the #ifdef LSTSSE_DEBUG openers (only the #endif lines survive),
// and most closing braces — comments describe only the visible lines.
311 static inline void performSSE(
const T* a,
const T* b,
312 std::size_t& i, std::size_t j, std::size_t bLen,
313 std::uint32_t* diag,
const std::uint32_t* diag2)
315 const __m128i one128_epi32 = _mm_set1_epi32(1);
// Byte-shuffle masks for reversing a[] so lanes line up with b[]
// (initializer byte lists missing from this extraction).
316 const __m128i reversedIdentity128_epi8 = _mm_setr_epi8(
322 const __m128i reversedIdentity128_epi16 = _mm_setr_epi8(
328 const __m128i blockwiseReversed128_epi8_4 = _mm_setr_epi8(
334 const __m128i blockwiseReversed128_epi16_4 = _mm_setr_epi8(
// 16 comparison results as four vectors of four 32-bit lanes each.
341 __m128i substitutionCost32[4];
347 if (
sizeof(T) <= 2) {
348 __m128i substitutionCost16LX, substitutionCost16HX;
350 if (
sizeof(T) == 1) {
// 8-bit elements: one 16-byte load of each string, reverse a, compare,
// then widen the byte masks to 16- and finally 32-bit lanes.
351 __m128i a_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-16]));
352 __m128i b_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1]));
353 a_ = _mm_shuffle_epi8(a_, reversedIdentity128_epi8);
367 __m128i substitutionCost8 = _mm_cmpeq_epi8(a_, b_);
368 __m128i substitutionCost8X = _mm_shuffle_epi8(substitutionCost8, blockwiseReversed128_epi8_4);
370 substitutionCost16LX = _mm_unpacklo_epi8(substitutionCost8X, substitutionCost8X);
371 substitutionCost16HX = _mm_unpackhi_epi8(substitutionCost8X, substitutionCost8X);
// 16-bit elements: two loads per string (8 elements each).
373 __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-8]));
374 __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-16]));
375 __m128i b0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1]));
376 __m128i b1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j+7]));
377 a0 = _mm_shuffle_epi8(a0, reversedIdentity128_epi16);
378 a1 = _mm_shuffle_epi8(a1, reversedIdentity128_epi16);
379 __m128i substitutionCost16L = _mm_cmpeq_epi16(a0, b0);
380 __m128i substitutionCost16H = _mm_cmpeq_epi16(a1, b1);
381 substitutionCost16LX = _mm_shuffle_epi8(substitutionCost16L, blockwiseReversed128_epi16_4);
382 substitutionCost16HX = _mm_shuffle_epi8(substitutionCost16H, blockwiseReversed128_epi16_4);
// Widen the four 16-bit mask groups to the four 32-bit mask vectors.
385 substitutionCost32[0] = _mm_unpacklo_epi16(substitutionCost16LX, substitutionCost16LX);
386 substitutionCost32[1] = _mm_unpackhi_epi16(substitutionCost16LX, substitutionCost16LX);
387 substitutionCost32[2] = _mm_unpacklo_epi16(substitutionCost16HX, substitutionCost16HX);
388 substitutionCost32[3] = _mm_unpackhi_epi16(substitutionCost16HX, substitutionCost16HX);
// 32-bit elements: compare directly, reversing b per 4-lane group (0x1b
// is the lane permutation 3,2,1,0).
390 assert(
sizeof(T) == 4);
392 for (k = 0; k < 4; ++k) {
393 __m128i a_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-4-k*4]));
394 __m128i b_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1+k*4]));
395 b_ = _mm_shuffle_epi32(b_, 0x1b);
396 substitutionCost32[k] = _mm_cmpeq_epi32(a_, b_);
// Load 5 overlapping vectors of each diagonal so that the shifted-by-one
// neighbour (cell i-1) can be formed with _mm_alignr_epi8 below.
400 __m128i diag_[5], diag2_[5];
401 for (k = 0; k < 5; ++k) {
402 diag_ [k] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&diag [i-3-k*4]));
404 for (k = 0; k < 5; ++k) {
405 diag2_[k] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&diag2[i-3-k*4]));
// ---- Debug-build verification of the loads (inside #ifdef LSTSSE_DEBUG;
// the opening #ifdef is missing from this extraction). ----
411 for (k = 0; k < 5; ++k) {
412 std::uint32_t diag0 = _mm_extract_epi32(diag_[k], 0);
413 std::uint32_t diag1 = _mm_extract_epi32(diag_[k], 1);
414 std::uint32_t diag2 = _mm_extract_epi32(diag_[k], 2);
415 std::uint32_t diag3 = _mm_extract_epi32(diag_[k], 3);
416 assert(diag0 == diag[i-k*4-3]);
417 assert(diag1 == diag[i-k*4-2]);
418 assert(diag2 == diag[i-k*4-1]);
419 assert(diag3 == diag[i-k*4-0]);
421 for (k = 0; k < 5; ++k) {
422 std::uint32_t diag20 = _mm_extract_epi32(diag2_[k], 0);
423 std::uint32_t diag21 = _mm_extract_epi32(diag2_[k], 1);
424 std::uint32_t diag22 = _mm_extract_epi32(diag2_[k], 2);
425 std::uint32_t diag23 = _mm_extract_epi32(diag2_[k], 3);
426 assert(diag20 == diag2[i-k*4-3]);
427 assert(diag21 == diag2[i-k*4-2]);
428 assert(diag22 == diag2[i-k*4-1]);
429 assert(diag23 == diag2[i-k*4-0]);
432 for (k = 0; k < 4; ++k) {
433 int sc0 = _mm_extract_epi32(substitutionCost32[k], 0);
434 int sc1 = _mm_extract_epi32(substitutionCost32[k], 1);
435 int sc2 = _mm_extract_epi32(substitutionCost32[k], 2);
436 int sc3 = _mm_extract_epi32(substitutionCost32[k], 3);
437 assert(sc0 == ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0));
438 assert(sc1 == ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0));
439 assert(sc2 == ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0));
440 assert(sc3 == ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0));
// ---- Main combine loop: for each group of 4 cells, take the min of
// insert (diag2[i-1]), delete (diag2[i]), substitute (diag[i-1]+cost-1),
// add 1, and store back into diag. ----
442 #endif // LSTSSE_DEBUG 457 for (k = 0; k < 4; ++k) {
458 __m128i diag2_i_m1 = _mm_alignr_epi8(diag2_[k], diag2_[k+1], 12);
459 __m128i diag_i_m1 = _mm_alignr_epi8(diag_ [k], diag_ [k+1], 12);
461 __m128i result3 = _mm_add_epi32(diag_i_m1, substitutionCost32[k]);
462 __m128i min = _mm_min_epi32(_mm_min_epi32(diag2_i_m1, diag2_[k]), result3);
463 min = _mm_add_epi32(min, one128_epi32);
// Debug verification of the shifted vectors and substitution results.
466 std::uint32_t diag_i_m10 = _mm_extract_epi32(diag_i_m1, 0);
467 std::uint32_t diag_i_m11 = _mm_extract_epi32(diag_i_m1, 1);
468 std::uint32_t diag_i_m12 = _mm_extract_epi32(diag_i_m1, 2);
469 std::uint32_t diag_i_m13 = _mm_extract_epi32(diag_i_m1, 3);
470 assert(diag_i_m10 == diag[i-k*4-4]);
471 assert(diag_i_m11 == diag[i-k*4-3]);
472 assert(diag_i_m12 == diag[i-k*4-2]);
473 assert(diag_i_m13 == diag[i-k*4-1]);
475 std::uint32_t diag2_i_m10 = _mm_extract_epi32(diag2_i_m1, 0);
476 std::uint32_t diag2_i_m11 = _mm_extract_epi32(diag2_i_m1, 1);
477 std::uint32_t diag2_i_m12 = _mm_extract_epi32(diag2_i_m1, 2);
478 std::uint32_t diag2_i_m13 = _mm_extract_epi32(diag2_i_m1, 3);
479 assert(diag2_i_m10 == diag2[i-k*4-4]);
480 assert(diag2_i_m11 == diag2[i-k*4-3]);
481 assert(diag2_i_m12 == diag2[i-k*4-2]);
482 assert(diag2_i_m13 == diag2[i-k*4-1]);
484 std::uint32_t result30 = _mm_extract_epi32(result3, 0);
485 std::uint32_t result31 = _mm_extract_epi32(result3, 1);
486 std::uint32_t result32 = _mm_extract_epi32(result3, 2);
487 std::uint32_t result33 = _mm_extract_epi32(result3, 3);
489 assert(result30 == diag[i-1-k*4-3] + ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0));
490 assert(result31 == diag[i-1-k*4-2] + ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0));
491 assert(result32 == diag[i-1-k*4-1] + ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0));
492 assert(result33 == diag[i-1-k*4-0] + ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0));
493 #endif // LSTSSE_DEBUG 495 _mm_storeu_si128(reinterpret_cast<__m128i*>(&diag[i-k*4-3]), min);
// performSSE_AVX2: same algorithm as performSSE but 32 anti-diagonal cells
// per call using 256-bit vectors (4 x 8 uint32 lanes). Comparison masks
// (0 / -1) at element width T are widened to 32-bit lanes; mask + min + 1
// yields the Levenshtein recurrence per lane.
// NOTE(review): as with performSSE, constant initializers, the #ifdef
// LSTSSE_DEBUG openers, and closing braces are missing from this
// extraction; comments describe only the visible lines.
505 static inline void performSSE_AVX2(
const T* a,
const T* b,
506 std::size_t& i, std::size_t j, std::size_t bLen,
507 std::uint32_t* diag,
const std::uint32_t* diag2)
509 const __m256i one256_epi32 = _mm256_set1_epi32(1);
// Shuffle masks (initializer byte lists missing from this extraction).
510 const __m256i reversedIdentity256_epi8 = _mm256_setr_epi8(
521 const __m256i blockwiseReversed256_epi8_8 = _mm256_setr_epi8(
532 const __m256i blockwiseReversed256_epi16_8 = _mm256_setr_epi8(
// 32 comparison results as four vectors of eight 32-bit lanes each.
543 __m256i substitutionCost32[4];
546 if (
sizeof(T) <= 2) {
547 if (
sizeof(T) == 1) {
// 8-bit elements: reverse a (shuffle within lanes + permute2x128 to swap
// the 128-bit halves, since _mm256_shuffle_epi8 cannot cross lanes).
548 __m256i a_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-32]));
549 __m256i b_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1]));
550 a_ = _mm256_shuffle_epi8(a_, reversedIdentity256_epi8);
551 a_ = _mm256_permute2x128_si256(a_, a_, 1);
553 __m256i substitutionCost8 = _mm256_cmpeq_epi8(a_, b_);
554 __m256i substitutionCost8X = _mm256_shuffle_epi8(substitutionCost8, blockwiseReversed256_epi8_8);
// Widen byte masks to 32-bit lanes via sign extension.
555 __m128i sc8Xlo = _mm256_extracti128_si256(substitutionCost8X, 0);
556 __m128i sc8Xhi = _mm256_extracti128_si256(substitutionCost8X, 1);
557 substitutionCost32[0] = _mm256_cvtepi8_epi32(sc8Xlo);
558 substitutionCost32[1] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xlo, 8));
559 substitutionCost32[2] = _mm256_cvtepi8_epi32(sc8Xhi);
560 substitutionCost32[3] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xhi, 8));
// 16-bit elements: two 256-bit loads per string (16 elements each);
// here b is block-reversed instead of a — TODO confirm against the
// missing shuffle constants.
562 __m256i a0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-16]));
563 __m256i a1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-32]));
564 __m256i b0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1]));
565 __m256i b1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j+15]));
566 a0 = _mm256_permute2x128_si256(a0, a0, 1);
567 a1 = _mm256_permute2x128_si256(a1, a1, 1);
568 b0 = _mm256_shuffle_epi8(b0, blockwiseReversed256_epi16_8);
569 b1 = _mm256_shuffle_epi8(b1, blockwiseReversed256_epi16_8);
570 __m256i substitutionCost16L = _mm256_cmpeq_epi16(a0, b0);
571 __m256i substitutionCost16H = _mm256_cmpeq_epi16(a1, b1);
572 __m128i sc16Llo = _mm256_extracti128_si256(substitutionCost16L, 0);
573 __m128i sc16Lhi = _mm256_extracti128_si256(substitutionCost16L, 1);
574 __m128i sc16Hlo = _mm256_extracti128_si256(substitutionCost16H, 0);
575 __m128i sc16Hhi = _mm256_extracti128_si256(substitutionCost16H, 1);
576 substitutionCost32[0] = _mm256_cvtepi16_epi32(sc16Llo);
577 substitutionCost32[1] = _mm256_cvtepi16_epi32(sc16Lhi);
578 substitutionCost32[2] = _mm256_cvtepi16_epi32(sc16Hlo);
579 substitutionCost32[3] = _mm256_cvtepi16_epi32(sc16Hhi);
// 32-bit elements: reverse b per 8-lane group (0x1b within lanes, then
// swap the 128-bit halves) and compare directly.
582 assert(
sizeof(T) == 4);
584 for (k = 0; k < 4; ++k) {
585 __m256i a_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-8-k*8]));
586 __m256i b_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1+k*8]));
587 b_ = _mm256_shuffle_epi32(b_, 0x1b);
588 b_ = _mm256_permute2x128_si256(b_, b_, 1);
589 substitutionCost32[k] = _mm256_cmpeq_epi32(a_, b_);
// Load 5 overlapping vectors of each diagonal so the i-1 neighbour can be
// formed with _mm256_alignr_epi32_7 below.
593 __m256i diag_[5], diag2_[5];
594 for (k = 0; k < 5; ++k) {
595 diag_ [k] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&diag [i-7-k*8]));
597 for (k = 0; k < 5; ++k) {
598 diag2_[k] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&diag2[i-7-k*8]));
// ---- Debug-build verification of the loads (inside #ifdef LSTSSE_DEBUG;
// opening #ifdef missing from this extraction). ----
602 for (k = 0; k < 5; ++k) {
603 std::uint32_t diag0 = _mm256_extract_epi32(diag_[k], 0);
604 std::uint32_t diag1 = _mm256_extract_epi32(diag_[k], 1);
605 std::uint32_t diag2 = _mm256_extract_epi32(diag_[k], 2);
606 std::uint32_t diag3 = _mm256_extract_epi32(diag_[k], 3);
607 std::uint32_t diag4 = _mm256_extract_epi32(diag_[k], 4);
608 std::uint32_t diag5 = _mm256_extract_epi32(diag_[k], 5);
609 std::uint32_t diag6 = _mm256_extract_epi32(diag_[k], 6);
610 std::uint32_t diag7 = _mm256_extract_epi32(diag_[k], 7);
611 assert(diag0 == diag[i-k*8-7]);
612 assert(diag1 == diag[i-k*8-6]);
613 assert(diag2 == diag[i-k*8-5]);
614 assert(diag3 == diag[i-k*8-4]);
615 assert(diag4 == diag[i-k*8-3]);
616 assert(diag5 == diag[i-k*8-2]);
617 assert(diag6 == diag[i-k*8-1]);
618 assert(diag7 == diag[i-k*8-0]);
620 for (k = 0; k < 5; ++k) {
621 std::uint32_t diag20 = _mm256_extract_epi32(diag2_[k], 0);
622 std::uint32_t diag21 = _mm256_extract_epi32(diag2_[k], 1);
623 std::uint32_t diag22 = _mm256_extract_epi32(diag2_[k], 2);
624 std::uint32_t diag23 = _mm256_extract_epi32(diag2_[k], 3);
625 std::uint32_t diag24 = _mm256_extract_epi32(diag2_[k], 4);
626 std::uint32_t diag25 = _mm256_extract_epi32(diag2_[k], 5);
627 std::uint32_t diag26 = _mm256_extract_epi32(diag2_[k], 6);
628 std::uint32_t diag27 = _mm256_extract_epi32(diag2_[k], 7);
629 assert(diag20 == diag2[i-k*8-7]);
630 assert(diag21 == diag2[i-k*8-6]);
631 assert(diag22 == diag2[i-k*8-5]);
632 assert(diag23 == diag2[i-k*8-4]);
633 assert(diag24 == diag2[i-k*8-3]);
634 assert(diag25 == diag2[i-k*8-2]);
635 assert(diag26 == diag2[i-k*8-1]);
636 assert(diag27 == diag2[i-k*8-0]);
639 for (k = 0; k < 4; ++k) {
640 int sc0 = _mm256_extract_epi32(substitutionCost32[k], 0);
641 int sc1 = _mm256_extract_epi32(substitutionCost32[k], 1);
642 int sc2 = _mm256_extract_epi32(substitutionCost32[k], 2);
643 int sc3 = _mm256_extract_epi32(substitutionCost32[k], 3);
644 int sc4 = _mm256_extract_epi32(substitutionCost32[k], 4);
645 int sc5 = _mm256_extract_epi32(substitutionCost32[k], 5);
646 int sc6 = _mm256_extract_epi32(substitutionCost32[k], 6);
647 int sc7 = _mm256_extract_epi32(substitutionCost32[k], 7);
648 assert(sc0 == ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0));
649 assert(sc1 == ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0));
650 assert(sc2 == ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0));
651 assert(sc3 == ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0));
652 assert(sc4 == ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0));
653 assert(sc5 == ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0));
654 assert(sc6 == ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0));
655 assert(sc7 == ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0));
// ---- Main combine loop: min(insert, delete, substitute) + 1 per lane,
// stored back into diag. ----
657 #endif // LSTSSE_DEBUG 659 for (k = 0; k < 4; ++k) {
660 __m256i diag2_i_m1 = _mm256_alignr_epi32_7(diag2_[k], diag2_[k+1]);
661 __m256i diag_i_m1 = _mm256_alignr_epi32_7(diag_ [k], diag_ [k+1]);
663 __m256i result3 = _mm256_add_epi32(diag_i_m1, substitutionCost32[k]);
664 __m256i min = _mm256_min_epi32(_mm256_min_epi32(diag2_i_m1, diag2_[k]), result3);
665 min = _mm256_add_epi32(min, one256_epi32);
// Debug verification of the shifted vectors and substitution results.
668 std::uint32_t diag_i_m10 = _mm256_extract_epi32(diag_i_m1, 0);
669 std::uint32_t diag_i_m11 = _mm256_extract_epi32(diag_i_m1, 1);
670 std::uint32_t diag_i_m12 = _mm256_extract_epi32(diag_i_m1, 2);
671 std::uint32_t diag_i_m13 = _mm256_extract_epi32(diag_i_m1, 3);
672 std::uint32_t diag_i_m14 = _mm256_extract_epi32(diag_i_m1, 4);
673 std::uint32_t diag_i_m15 = _mm256_extract_epi32(diag_i_m1, 5);
674 std::uint32_t diag_i_m16 = _mm256_extract_epi32(diag_i_m1, 6);
675 std::uint32_t diag_i_m17 = _mm256_extract_epi32(diag_i_m1, 7);
676 assert(diag_i_m10 == diag[i-k*8-8]);
677 assert(diag_i_m11 == diag[i-k*8-7]);
678 assert(diag_i_m12 == diag[i-k*8-6]);
679 assert(diag_i_m13 == diag[i-k*8-5]);
680 assert(diag_i_m14 == diag[i-k*8-4]);
681 assert(diag_i_m15 == diag[i-k*8-3]);
682 assert(diag_i_m16 == diag[i-k*8-2]);
683 assert(diag_i_m17 == diag[i-k*8-1]);
685 std::uint32_t diag2_i_m10 = _mm256_extract_epi32(diag2_i_m1, 0);
686 std::uint32_t diag2_i_m11 = _mm256_extract_epi32(diag2_i_m1, 1);
687 std::uint32_t diag2_i_m12 = _mm256_extract_epi32(diag2_i_m1, 2);
688 std::uint32_t diag2_i_m13 = _mm256_extract_epi32(diag2_i_m1, 3);
689 std::uint32_t diag2_i_m14 = _mm256_extract_epi32(diag2_i_m1, 4);
690 std::uint32_t diag2_i_m15 = _mm256_extract_epi32(diag2_i_m1, 5);
691 std::uint32_t diag2_i_m16 = _mm256_extract_epi32(diag2_i_m1, 6);
692 std::uint32_t diag2_i_m17 = _mm256_extract_epi32(diag2_i_m1, 7);
693 assert(diag2_i_m10 == diag2[i-k*8-8]);
694 assert(diag2_i_m11 == diag2[i-k*8-7]);
695 assert(diag2_i_m12 == diag2[i-k*8-6]);
696 assert(diag2_i_m13 == diag2[i-k*8-5]);
697 assert(diag2_i_m14 == diag2[i-k*8-4]);
698 assert(diag2_i_m15 == diag2[i-k*8-3]);
699 assert(diag2_i_m16 == diag2[i-k*8-2]);
700 assert(diag2_i_m17 == diag2[i-k*8-1]);
702 std::uint32_t result30 = _mm256_extract_epi32(result3, 0);
703 std::uint32_t result31 = _mm256_extract_epi32(result3, 1);
704 std::uint32_t result32 = _mm256_extract_epi32(result3, 2);
705 std::uint32_t result33 = _mm256_extract_epi32(result3, 3);
706 std::uint32_t result34 = _mm256_extract_epi32(result3, 4);
707 std::uint32_t result35 = _mm256_extract_epi32(result3, 5);
708 std::uint32_t result36 = _mm256_extract_epi32(result3, 6);
709 std::uint32_t result37 = _mm256_extract_epi32(result3, 7);
711 assert(result30 == diag[i-1-k*8-7] + ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0));
712 assert(result31 == diag[i-1-k*8-6] + ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0));
713 assert(result32 == diag[i-1-k*8-5] + ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0));
714 assert(result33 == diag[i-1-k*8-4] + ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0));
715 assert(result34 == diag[i-1-k*8-3] + ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0));
716 assert(result35 == diag[i-1-k*8-2] + ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0));
717 assert(result36 == diag[i-1-k*8-1] + ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0));
718 assert(result37 == diag[i-1-k*8-0] + ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0));
719 #endif // LSTSSE_DEBUG 721 _mm256_storeu_si256(reinterpret_cast<__m256i*>(&diag[i-k*8-7]), min);
// Template machinery selecting the SIMD kernel when it is safe:
// - primary LevenshteinIteration template (head/body largely missing here);
// - a wrapper whose perform() forwards raw uint32 pointers into the SIMD
//   dispatcher;
// - a specialization for std::vector<std::uint32_t>/const T* that picks the
//   SIMD wrap only for scalar T of width 1, 2, or 4 bytes, otherwise the
//   scalar base implementation.
739 template<
typename Vec1,
typename Vec2,
typename Iterator1,
typename Iterator2>
746 template<
typename Alloc1,
typename Alloc2,
typename T>
// Forwards vector storage as raw pointers to the SIMD dispatcher.
748 static inline void perform(
const T* a,
const T* b,
749 std::size_t& i, std::size_t j, std::size_t bLen,
750 std::vector<std::uint32_t, Alloc1>& diag,
751 const std::vector<std::uint32_t, Alloc2>& diag2) {
// Compile-time kernel selection via std::conditional.
761 template<
typename Alloc1,
typename Alloc2,
typename T>
762 struct LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
763 : std::conditional<std::is_scalar<T>::value && (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4),
764 LevenshteinIterationSIMDWrap<Alloc1, Alloc2, T>,
765 LevenshteinIterationBase<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
// Further specialization (head missing) inheriting the const T* case —
// presumably to strip non-const pointer iterators; TODO confirm.
772 template<
typename Alloc1,
typename Alloc2,
typename T>
774 :
LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
// levenshteinDiagonal<T>: anti-diagonal DP over two rolling buffers
// (diag = diagonal being written, diag2 = diagonal two steps back),
// enabling the SIMD per-diagonal kernels. T is the cell type (uint32 or
// size_t, chosen by the caller based on total length).
// NOTE(review): the function head, the k loop, and several bodies are
// missing from this extraction.
780 template<
typename T,
typename Iterator1,
typename Iterator2>
782 const std::size_t aLen = aEnd - a;
783 const std::size_t bLen = bEnd - b;
// Caller guarantees a is the shorter string.
786 assert(aLen <= bLen);
789 std::vector<T, Alloc> diag (aLen + 1, T(0));
790 std::vector<T, Alloc> diag2 (aLen + 1, T(0));
796 assert(k <= aLen + bLen);
// Rows of the DP matrix intersected by anti-diagonal k.
798 std::size_t startRow = k > bLen ? k - bLen : 1;
799 std::size_t endRow = k > aLen ? aLen : k - 1;
801 assert(endRow >= startRow || k == 1);
// i is decremented inside perform() (scalar: by 1; SIMD: by 16/32).
803 for (i = endRow; i >= startRow; ) {
811 ::perform(a, b, i, j, bLen, diag, diag2);
// Last diagonal holds exactly one cell: the final distance.
820 if (k == aLen + bLen) {
821 assert(startRow == endRow);
822 return diag[startRow];
// Rotate the rolling buffers for the next diagonal.
826 std::swap(diag, diag2);
// levenshteinRowBased<T>: classic single-row DP fallback for iterators that
// are not random-access. Processes b column-by-column, keeping one row
// (arr) of the DP matrix.
// NOTE(review): function head, arr/tmp/ret setup, and loop bodies are
// partially missing from this extraction.
837 template<
typename T,
typename Iterator1,
typename Iterator2>
841 std::size_t i = 0, j = 0;
// Initialization pass over a (body missing — presumably fills arr).
844 for (Iterator1 it = a; it != aEnd; ++it) {
850 for (; b != bEnd; ++b) {
855 for (Iterator1 it = a; it != aEnd; ++it, ++i) {
// Substitution cost for the current cell.
856 T tmp2 = *b == *it ? tmp : tmp + 1;
// Branchless-style three-way min of delete/insert/substitute paths.
858 ret = arr[i] = tmp > ret ? tmp2 > ret ? ret + 1 : tmp2 : tmp2 > tmp ? tmp + 1 : tmp2;
// Tag-dispatched levenshtein for random-access iterators: trims common
// prefix/suffix, normalizes so a is the shorter range, short-circuits the
// trivial sizes, then runs the diagonal (SIMD-capable) algorithm with the
// narrowest sufficient cell type.
// NOTE(review): the swap branch body and the aLen==0/1 guards are missing
// from this extraction.
869 template<
typename Iterator1,
typename Iterator2>
870 std::size_t
levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd,
871 std::random_access_iterator_tag, std::random_access_iterator_tag) {
// Ensure a is the shorter range (swap body missing).
872 if (aEnd - a > bEnd - b) {
// Strip common prefix and suffix — they never contribute to the distance.
877 while (a < aEnd && a[0] == b[0])
880 while (a < aEnd && aEnd[-1] == bEnd[-1])
883 std::size_t aLen = aEnd - a;
884 std::size_t bLen = bEnd - b;
// Single-character a: distance is bLen, minus one if that char occurs in b.
891 return bLen - (std::find(b, bEnd, *a) == bEnd ? 0 : 1);
// uint32 cells suffice whenever the answer fits; otherwise use size_t.
894 if (aLen + bLen <= std::numeric_limits<std::uint32_t>::max())
895 return levenshteinDiagonal<std::uint32_t>(a, aEnd, b, bEnd);
897 return levenshteinDiagonal<std::size_t>(a, aEnd, b, bEnd);
// Tag-dispatched levenshtein for (non-random-access) bidirectional
// iterators: trims common prefix/suffix, handles empty and single-element
// ranges, then falls back to the row-based scalar algorithm.
// NOTE(review): several guard conditions and loop bodies are missing from
// this extraction.
904 template<
typename Iterator1,
typename Iterator2>
905 std::size_t
levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd,
906 std::bidirectional_iterator_tag, std::bidirectional_iterator_tag) {
// Strip common prefix and suffix.
908 while (a != aEnd && b != bEnd && *a == *b)
911 while (a != aEnd && b != bEnd && *std::prev(aEnd) == *std::prev(bEnd))
// One range empty: distance is the other range's length.
915 return std::distance(b, bEnd);
919 return std::distance(a, aEnd);
// Single-element a: scan b once (loop body missing — presumably sets
// `found` when *a occurs in b).
922 if (std::next(a) == aEnd) {
923 std::size_t ret = 0, found = 0;
924 for (; b != bEnd; ++b, ++ret)
// Symmetric single-element-b case.
930 if (std::next(b) == bEnd) {
931 std::size_t ret = 0, found = 0;
932 for (; a != aEnd; ++a, ++ret)
938 return levenshteinRowBased<std::size_t>(a, aEnd, b, bEnd);
// SFINAE detector: `value` is true iff T has usable .data() and .size()
// members (the first overload, returning char, is viable only when the
// decltype expression compiles; the fallback returns the larger `char2`
// type — its definition is not visible in this extraction).
946 template<
typename U>
static auto test(U* a) -> decltype(a->data() + a->size(), char(0));
947 template<
typename U>
static auto test(
const U* a) ->
char2;
// sizeof discriminates which overload was chosen.
949 static constexpr
bool value =
sizeof(test(static_cast<T*>(
nullptr))) == 1;
// LevenshteinContainer<useDataAndSize>::calc — two specializations:
// the true case uses contiguous data()/size() pointers (enabling the
// SIMD const T* path), the false case falls back to std::begin/std::end.
952 template<
bool useDataAndSize>
960 template<
typename Container1,
typename Container2>
// Contiguous-storage path: raw pointers into the containers.
961 static inline std::size_t
calc(
const Container1& a,
const Container2& b) {
962 return levenshtein(a.data(), a.data() + a.size(), b.data(), b.data() + b.size());
971 template<
typename Container1,
typename Container2>
// Generic path for containers without data()/size().
972 static inline std::size_t
calc(
const Container1& a,
const Container2& b) {
973 return levenshtein(std::begin(a), std::end(a), std::begin(b), std::end(b));
// Public entry points.
// Iterator overload: dispatches on the iterator categories of both ranges
// so random-access input takes the diagonal/SIMD path (body's call head is
// missing from this extraction — only the tag arguments survive).
977 template<
typename Iterator1,
typename Iterator2>
978 std::size_t
levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) {
980 typename std::iterator_traits<Iterator1>::iterator_category(),
981 typename std::iterator_traits<Iterator2>::iterator_category());
// Container overload: presumably delegates to LevenshteinContainer<...>
// selected via the data()/size() detector above (body missing).
984 template<
typename Container1,
typename Container2>
985 std::size_t
levenshtein(
const Container1& a,
const Container2& b) {
Definition: levenshtein-sse.hpp:945
Definition: levenshtein-sse.hpp:60
Definition: levenshtein-sse.hpp:747
Definition: levenshtein-sse.hpp:943
Definition: levenshtein-sse.hpp:285
std::enable_if_t< one_observer_ptr< T, W >::value, bool > operator!=(const T &p1, const W &p2)
Definition: ObserverPtr.h:281
static void perform(const Iterator1 &a, const Iterator2 &b, std::size_t &i, std::size_t j, std::size_t bLen, Vec1 &diag, const Vec2 &diag2)
Definition: levenshtein-sse.hpp:229
static std::size_t calc(const Container1 &a, const Container2 &b)
Definition: levenshtein-sse.hpp:972
Definition: levenshtein-sse.hpp:103
Definition: levenshtein-sse.hpp:228
Definition: levenshtein-sse.hpp:740
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd)
Definition: levenshtein-sse.hpp:978
std::enable_if_t< std::is_pointer< W >::value, bool > operator==(const observer_ptr< T > &p1, const W &p2)
Definition: ObserverPtr.h:264
T levenshteinRowBased(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd)
Definition: levenshtein-sse.hpp:838
T levenshteinDiagonal(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd)
Definition: levenshtein-sse.hpp:781
static void perform(const T *a, const T *b, std::size_t &i, std::size_t j, std::size_t bLen, std::vector< std::uint32_t, Alloc1 > &diag, const std::vector< std::uint32_t, Alloc2 > &diag2)
Definition: levenshtein-sse.hpp:748
static std::size_t calc(const Container1 &a, const Container2 &b)
Definition: levenshtein-sse.hpp:961
static void performSIMD(const T *a, const T *b, std::size_t &i, std::size_t j, std::size_t bLen, std::uint32_t *diag, const std::uint32_t *diag2)
Definition: levenshtein-sse.hpp:287
constexpr std::size_t alignment
Definition: levenshtein-sse.hpp:186
Definition: levenshtein-sse.hpp:953