EventAnalysis  1.2.2
levenshtein-sse.hpp
Go to the documentation of this file.
1 /*
2  * levenshtein-sse.hpp
3  *
4  * Created on: Sep 15, 2020
5  * Added by: Nicola Mori
6  */
7 
8 /* The code has been obtained from https://github.com/addaleax/levenshtein-sse
9  *
10  * This file contains the original code contained in levenshtein-sse.hpp as
11  * obtained from github plus the license. The code is guarded against clang-format
12  * to keep the original indentation and ease any comparison with the original
13  * version.
14  */
15 
40 // clang-format off
41 
42 #ifndef LSTSSE_LEVENSHTEIN_SSE_HPP
43 #define LSTSSE_LEVENSHTEIN_SSE_HPP
44 
45 #include <algorithm>
46 #include <limits>
47 #include <vector>
48 #include <iterator>
49 #include <limits>
50 #include <cstdint>
51 #include <cassert>
52 #ifdef __SSSE3__
53 #include <tmmintrin.h>
54 #endif
55 #ifdef __SSE4_1__
56 #include <smmintrin.h>
57 #endif
58 #ifdef __AVX2__
59 #include <immintrin.h>
60 #endif
61 
62 namespace levenshteinSSE {
63 
// Computes the Levenshtein (edit) distance between the ranges [a, aEnd)
// and [b, bEnd). Dispatches on the iterator categories below.
template<typename Iterator1, typename Iterator2>
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd);

// Convenience overload for whole containers; picks the pointer-based fast
// path when both containers expose .data()/.size().
template<typename Container1, typename Container2>
std::size_t levenshtein(const Container1& a, const Container2& b);
93 
104 template <typename T, std::size_t N = 16>
106 
#ifdef __SSE2__
#include <mm_malloc.h>
// Minimal C++03/11-style STL allocator returning N-byte-aligned storage via
// _mm_malloc, as needed for aligned SSE/AVX access to the DP diagonals.
//
// NOTE(review): allocate() deliberately over-allocates 4*N bytes of slack on
// BOTH sides of the returned pointer; the SIMD kernels in this file read and
// write slightly past the logical ends of the diagonal buffers, and this
// padding keeps those accesses inside the allocation.
template <typename T, std::size_t N>
class AlignmentAllocator {
public:
 typedef T value_type;
 typedef std::size_t size_type;
 typedef std::ptrdiff_t difference_type;

 typedef T* pointer;
 typedef const T* const_pointer;

 typedef T& reference;
 typedef const T& const_reference;

 constexpr inline AlignmentAllocator () noexcept { }

 // Converting copy constructor (stateless, so nothing to copy).
 template <typename T2>
 constexpr inline AlignmentAllocator (const AlignmentAllocator<T2, N> &) noexcept { }

 inline ~AlignmentAllocator () noexcept { }

 inline pointer address (reference r) {
 return &r;
 }

 inline const_pointer address (const_reference r) const {
 return &r;
 }

 inline pointer allocate (size_type n) {
 // this allocator is special in that it leaves 4*N bytes before and after
 // the allocated area for garbage reads/writes
 return reinterpret_cast<pointer>(reinterpret_cast<char*>(_mm_malloc(n * sizeof(value_type) + 8 * N, N)) + 4*N);
 }

 inline void deallocate (pointer p, size_type) {
 // Undo the 4*N offset applied in allocate() before freeing.
 _mm_free(reinterpret_cast<char*>(p) - 4*N);
 }

 inline void construct (pointer p, const value_type& value) {
 new (p) value_type (value);
 }

 inline void destroy (pointer p) {
 p->~value_type ();
 }

 constexpr inline size_type max_size () const noexcept {
 return size_type (-1) / sizeof (value_type);
 }

 template <typename T2>
 struct rebind {
 typedef AlignmentAllocator<T2, N> other;
 };

 constexpr bool operator!=(const AlignmentAllocator<T,N>& other) const {
 return !(*this == other);
 }

 // Returns true if and only if storage allocated from *this
 // can be deallocated from other, and vice versa.
 // Always returns true for stateless allocators.
 constexpr bool operator==(const AlignmentAllocator<T,N>& other) const {
 return other.usesMMAlloc;
 }

 static constexpr bool usesMMAlloc = true;
};
#endif
178 
// N == 1 means "no alignment requirement": fall back to the plain standard
// allocator (used when no SIMD instruction set is available).
template <typename T>
class AlignmentAllocator<T, 1> : public std::allocator<T> {
public:
 static constexpr bool usesMMAlloc = false;
};
184 
// Buffer alignment used for the DP diagonals: 16 bytes when the SSE kernels
// are compiled in, otherwise no special alignment.
#ifdef __SSSE3__
constexpr std::size_t alignment = 16;
#else
constexpr std::size_t alignment = 1;
#endif
190 
// NOTE(review): the `struct LevenshteinIterationBase {` header (original line
// 230) was lost in extraction; this template parameter list and the method
// below belong to it.
template<typename Vec1, typename Vec2, typename Iterator1, typename Iterator2>
// Scalar update of one DP cell on the current anti-diagonal:
//   diag[i] = min(diag2[i], diag2[i-1], diag[i-1] + cost) + (0 or 1)
// where cost is 0 when a[i-1] == b[j-1]. Decrements i by the one cell handled.
static inline void perform(const Iterator1& a, const Iterator2& b,
 std::size_t& i, std::size_t j, std::size_t bLen, Vec1& diag, const Vec2& diag2)
{
 // min of the two "previous diagonal" neighbors (insertion/deletion paths).
 std::uint32_t min = std::min(diag2[i], diag2[i-1]);
 if (min < diag[i-1]) {
 diag[i] = min + 1;
 }
 else {
 // substitution/match path: +1 only when the characters differ.
 diag[i] = diag[i-1] + (a[i-1] != b[j-1]);
 }
 --i;
}
};
244 
256 #ifdef __SSSE3__
257 #ifndef __SSE4_1__
// Straightforward _mm_min_epi32 polyfill using only SSE2: per 32-bit lane,
// select a where a <= b and b where b < a.
inline __m128i _mm_min_epi32 (__m128i a, __m128i b) {
 __m128i compare = _mm_cmpgt_epi32(a, b); // all-ones lanes where a > b
 // BUG FIX: _mm_andnot_si128 negates its FIRST operand: (~x) & y. The
 // previous call _mm_andnot_si128(a, compare) computed (~a) & (a > b),
 // which is garbage — the mask, not a, must be complemented.
 __m128i aIsSmaller = _mm_andnot_si128(compare, a); // a where a <= b
 __m128i bIsSmaller = _mm_and_si128 (b, compare);   // b where a >  b
 return _mm_or_si128(aIsSmaller, bIsSmaller);
}
265 #endif // __SSE4_1__
266 #endif // __SSSE3__
267 
#ifdef __AVX2__
// provide an AVX2 version of _mm256_alignr_epi32(a, b, 7)
// (i.e. move a one epi32 to the left and write the highest
// bytes of b to the lowest of a)
/*#ifndef __AVX512F__*/
__m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) {
 // rotate-left-by-one permutation of the 8 epi32 lanes
 const __m256i rotl1_256_epi32 = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
 // take lanes 0..6 from a and lane 7 from b (blend mask 0x80 = lane 7)
 __m256i combined = _mm256_blend_epi32(a, b, 0x80);
 // after rotation: result[0] = b[7], result[k] = a[k-1] for k = 1..7
 return _mm256_permutevar8x32_epi32(combined, rotl1_256_epi32);
}
/*
#else
__m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) {
 return _mm256_alignr_epi32(a, b, 7);
}
#endif*/
#endif
285 
// NOTE(review): the `struct LevenshteinIterationSIMD {` header (original line
// 287) was lost in extraction; this template parameter list belongs to it.
template<typename T>
/* Decide which implementation is acceptable: use the widest SIMD kernel whose
   stride (32 cells for AVX2, 16 for SSSE3) still fits into the remaining
   segment of the anti-diagonal, otherwise fall back to the scalar step. */
static inline void performSIMD(const T* a, const T* b,
 std::size_t& i, std::size_t j, std::size_t bLen,
 std::uint32_t* diag, const std::uint32_t* diag2)
{

#ifdef __AVX2__
 if (i >= 32 && bLen - j >= 32) {
 performSSE_AVX2(a, b, i, j, bLen, diag, diag2);
 return;
 }
#endif

#ifdef __SSSE3__
 if (i >= 16 && bLen - j >= 16) {
 performSSE(a, b, i, j, bLen, diag, diag2);
 return;
 }
#endif

 // NOTE(review): the qualifier on this call (original line 308,
 // `LevenshteinIterationBase<...>`) was lost in extraction.
 ::perform(a, b, i, j, bLen, diag, diag2);
}
311 
#ifdef __SSSE3__
// SSSE3 kernel: advances the anti-diagonal by 16 DP cells per call.
// Loads 16 characters of a (backwards) and b (forwards), builds per-cell
// substitution masks (0xffffffff where the characters match, else 0),
// then evaluates the Levenshtein recurrence for 4 lanes x 4 vectors.
static inline void performSSE(const T* a, const T* b,
 std::size_t& i, std::size_t j, std::size_t bLen,
 std::uint32_t* diag, const std::uint32_t* diag2)
{
 const __m128i one128_epi32 = _mm_set1_epi32(1);
 // full 16-byte reversal
 const __m128i reversedIdentity128_epi8 = _mm_setr_epi8(
 15, 14, 13, 12,
 11, 10, 9, 8,
 7, 6, 5, 4,
 3, 2, 1, 0
 );
 // reversal of the 8 epi16 lanes (byte order within each lane preserved)
 const __m128i reversedIdentity128_epi16 = _mm_setr_epi8(
 14, 15, 12, 13,
 10, 11, 8, 9,
 6, 7, 4, 5,
 2, 3, 0, 1
 );
 // reverse each 4-byte block independently
 const __m128i blockwiseReversed128_epi8_4 = _mm_setr_epi8(
 3, 2, 1, 0,
 7, 6, 5, 4,
 11, 10, 9, 8,
 15, 14, 13, 12
 );
 // reverse each group of 4 epi16 lanes independently
 const __m128i blockwiseReversed128_epi16_4 = _mm_setr_epi8(
 6, 7, 4, 5,
 2, 3, 0, 1,
 14, 15, 12, 13,
 10, 11, 8, 9
 );

 __m128i substitutionCost32[4];
 std::size_t k;

 // We support 1, 2, and 4 byte objects for SSE comparison.
 // We always process 16 entries at once, so we may need multiple fetches
 // depending on object size.
 if (sizeof(T) <= 2) {
 __m128i substitutionCost16LX, substitutionCost16HX;

 if (sizeof(T) == 1) {
 __m128i a_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-16]));
 __m128i b_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1]));
 a_ = _mm_shuffle_epi8(a_, reversedIdentity128_epi8);

 // int substitutionCost = a[i-1] == b[j-1] ? -1 : 0;

 // diag/diag2 will contain the following entries from the diagonal:
 // index [0]0 [0]1 [0]2 [0]3 [1]0 [1]1 [1]2 [1]3 ... [4]0 [4]1 [4]2 [4]3
 // diag+i -3 -2 -1 0 -7 -6 -5 -4 ... -19 -18 -17 -16

 // substitutionCost8 contains the comparisons for the following indices in a and b:
 // index 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
 // a+i -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16
 // b+j -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
 // in substitutionCost8X we shuffle this so that it matches up with diag/diag2
 // (barring the offset induces by comparing i-1 against j-1)
 __m128i substitutionCost8 = _mm_cmpeq_epi8(a_, b_);
 __m128i substitutionCost8X = _mm_shuffle_epi8(substitutionCost8, blockwiseReversed128_epi8_4);
 // unpack with self so that 00 -> 00 00 and ff -> ff ff
 substitutionCost16LX = _mm_unpacklo_epi8(substitutionCost8X, substitutionCost8X);
 substitutionCost16HX = _mm_unpackhi_epi8(substitutionCost8X, substitutionCost8X);
 } else {
 // 2-byte objects: two 8-lane fetches per side cover the 16 cells.
 __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-8]));
 __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-16]));
 __m128i b0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1]));
 __m128i b1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j+7]));
 a0 = _mm_shuffle_epi8(a0, reversedIdentity128_epi16);
 a1 = _mm_shuffle_epi8(a1, reversedIdentity128_epi16);
 __m128i substitutionCost16L = _mm_cmpeq_epi16(a0, b0);
 __m128i substitutionCost16H = _mm_cmpeq_epi16(a1, b1);
 substitutionCost16LX = _mm_shuffle_epi8(substitutionCost16L, blockwiseReversed128_epi16_4);
 substitutionCost16HX = _mm_shuffle_epi8(substitutionCost16H, blockwiseReversed128_epi16_4);
 }

 // widen the 16-bit masks to 32 bits by unpacking with themselves
 substitutionCost32[0] = _mm_unpacklo_epi16(substitutionCost16LX, substitutionCost16LX);
 substitutionCost32[1] = _mm_unpackhi_epi16(substitutionCost16LX, substitutionCost16LX);
 substitutionCost32[2] = _mm_unpacklo_epi16(substitutionCost16HX, substitutionCost16HX);
 substitutionCost32[3] = _mm_unpackhi_epi16(substitutionCost16HX, substitutionCost16HX);
 } else {
 assert(sizeof(T) == 4);

 // 4-byte objects: four fetches of 4 lanes each.
 for (k = 0; k < 4; ++k) {
 __m128i a_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-4-k*4]));
 __m128i b_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1+k*4]));
 b_ = _mm_shuffle_epi32(b_, 0x1b); // simple reverse
 substitutionCost32[k] = _mm_cmpeq_epi32(a_, b_);
 }
 }

 // 5 vectors cover cells i..i-19 so each group k has its i-1 neighbor in k+1.
 __m128i diag_[5], diag2_[5];
 for (k = 0; k < 5; ++k) {
 diag_ [k] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&diag [i-3-k*4]));
 }
 for (k = 0; k < 5; ++k) {
 diag2_[k] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&diag2[i-3-k*4]));
 }

 // There's a ton of debug stuff down here. You can use it for reference
 // or, of course, compile with -DLSTSSE_DEBUG if you're feeling funny.
#ifdef LSTSSE_DEBUG
 for (k = 0; k < 5; ++k) {
 std::uint32_t diag0 = _mm_extract_epi32(diag_[k], 0);
 std::uint32_t diag1 = _mm_extract_epi32(diag_[k], 1);
 std::uint32_t diag2 = _mm_extract_epi32(diag_[k], 2);
 std::uint32_t diag3 = _mm_extract_epi32(diag_[k], 3);
 assert(diag0 == diag[i-k*4-3]);
 assert(diag1 == diag[i-k*4-2]);
 assert(diag2 == diag[i-k*4-1]);
 assert(diag3 == diag[i-k*4-0]);
 }
 for (k = 0; k < 5; ++k) {
 std::uint32_t diag20 = _mm_extract_epi32(diag2_[k], 0);
 std::uint32_t diag21 = _mm_extract_epi32(diag2_[k], 1);
 std::uint32_t diag22 = _mm_extract_epi32(diag2_[k], 2);
 std::uint32_t diag23 = _mm_extract_epi32(diag2_[k], 3);
 assert(diag20 == diag2[i-k*4-3]);
 assert(diag21 == diag2[i-k*4-2]);
 assert(diag22 == diag2[i-k*4-1]);
 assert(diag23 == diag2[i-k*4-0]);
 }

 for (k = 0; k < 4; ++k) {
 int sc0 = _mm_extract_epi32(substitutionCost32[k], 0);
 int sc1 = _mm_extract_epi32(substitutionCost32[k], 1);
 int sc2 = _mm_extract_epi32(substitutionCost32[k], 2);
 int sc3 = _mm_extract_epi32(substitutionCost32[k], 3);
 assert(sc0 == ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0));
 assert(sc1 == ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0));
 assert(sc2 == ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0));
 assert(sc3 == ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0));
 }
#endif // LSTSSE_DEBUG

 // reminders:
 // the arrays (diag, diag2, substitutionCost32) correspond to i:
 // index [0]0 [0]1 [0]2 [0]3 [1]0 [1]1 [1]2 [1]3 ... [4]0 [4]1 [4]2 [4]3
 // i+ -3 -2 -1 0 -7 -6 -5 -4 ... -19 -18 -17 -16
 // so when we need to access ...[i-1], we need to align the entries
 // in *reverse* order

 // diag[i] = min(
 // diag2[i-1],
 // diag2[i],
 // diag[i-1] + substitutionCost
 // ) + 1;
 //
 for (k = 0; k < 4; ++k) {
 // shift by 12 bytes to realize the [i-1] offset across vector boundaries
 __m128i diag2_i_m1 = _mm_alignr_epi8(diag2_[k], diag2_[k+1], 12);
 __m128i diag_i_m1 = _mm_alignr_epi8(diag_ [k], diag_ [k+1], 12);

 // mask is -1 on match, so this is diag[i-1] + (cost - 1); the +1 below
 // restores the recurrence's uniform "+1".
 __m128i result3 = _mm_add_epi32(diag_i_m1, substitutionCost32[k]);
 __m128i min = _mm_min_epi32(_mm_min_epi32(diag2_i_m1, diag2_[k]), result3);
 min = _mm_add_epi32(min, one128_epi32);

#ifdef LSTSSE_DEBUG
 std::uint32_t diag_i_m10 = _mm_extract_epi32(diag_i_m1, 0);
 std::uint32_t diag_i_m11 = _mm_extract_epi32(diag_i_m1, 1);
 std::uint32_t diag_i_m12 = _mm_extract_epi32(diag_i_m1, 2);
 std::uint32_t diag_i_m13 = _mm_extract_epi32(diag_i_m1, 3);
 assert(diag_i_m10 == diag[i-k*4-4]);
 assert(diag_i_m11 == diag[i-k*4-3]);
 assert(diag_i_m12 == diag[i-k*4-2]);
 assert(diag_i_m13 == diag[i-k*4-1]);

 std::uint32_t diag2_i_m10 = _mm_extract_epi32(diag2_i_m1, 0);
 std::uint32_t diag2_i_m11 = _mm_extract_epi32(diag2_i_m1, 1);
 std::uint32_t diag2_i_m12 = _mm_extract_epi32(diag2_i_m1, 2);
 std::uint32_t diag2_i_m13 = _mm_extract_epi32(diag2_i_m1, 3);
 assert(diag2_i_m10 == diag2[i-k*4-4]);
 assert(diag2_i_m11 == diag2[i-k*4-3]);
 assert(diag2_i_m12 == diag2[i-k*4-2]);
 assert(diag2_i_m13 == diag2[i-k*4-1]);

 std::uint32_t result30 = _mm_extract_epi32(result3, 0);
 std::uint32_t result31 = _mm_extract_epi32(result3, 1);
 std::uint32_t result32 = _mm_extract_epi32(result3, 2);
 std::uint32_t result33 = _mm_extract_epi32(result3, 3);

 assert(result30 == diag[i-1-k*4-3] + ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0));
 assert(result31 == diag[i-1-k*4-2] + ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0));
 assert(result32 == diag[i-1-k*4-1] + ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0));
 assert(result33 == diag[i-1-k*4-0] + ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0));
#endif // LSTSSE_DEBUG

 _mm_storeu_si128(reinterpret_cast<__m128i*>(&diag[i-k*4-3]), min);
 }

 // We just handled 16 entries at once. Yay!
 i -= 16;
}
#endif // __SSSE3__
504 
505 
#ifdef __AVX2__
// AVX2 kernel: same structure as performSSE, but advances the anti-diagonal
// by 32 DP cells per call using 256-bit vectors (4 groups of 8 lanes).
static inline void performSSE_AVX2(const T* a, const T* b,
 std::size_t& i, std::size_t j, std::size_t bLen,
 std::uint32_t* diag, const std::uint32_t* diag2)
{
 const __m256i one256_epi32 = _mm256_set1_epi32(1);
 // byte reversal within each 128-bit half (a cross-lane permute below
 // completes the full 32-byte reversal)
 const __m256i reversedIdentity256_epi8 = _mm256_setr_epi8(
 15, 14, 13, 12,
 11, 10, 9, 8,
 7, 6, 5, 4,
 3, 2, 1, 0,
 15, 14, 13, 12,
 11, 10, 9, 8,
 7, 6, 5, 4,
 3, 2, 1, 0
 );

 // reverse each 8-byte block independently
 const __m256i blockwiseReversed256_epi8_8 = _mm256_setr_epi8(
 7, 6, 5, 4,
 3, 2, 1, 0,
 15, 14, 13, 12,
 11, 10, 9, 8,
 7, 6, 5, 4,
 3, 2, 1, 0,
 15, 14, 13, 12,
 11, 10, 9, 8
 );

 // reverse each group of 8 epi16 lanes independently
 const __m256i blockwiseReversed256_epi16_8 = _mm256_setr_epi8(
 14, 15, 12, 13,
 10, 11, 8, 9,
 6, 7, 4, 5,
 2, 3, 0, 1,
 14, 15, 12, 13,
 10, 11, 8, 9,
 6, 7, 4, 5,
 2, 3, 0, 1
 );

 __m256i substitutionCost32[4];
 std::size_t k;

 // 1, 2 or 4 byte objects; 32 cells per call, so fetch count varies.
 if (sizeof(T) <= 2) {
 if (sizeof(T) == 1) {
 __m256i a_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-32]));
 __m256i b_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1]));
 a_ = _mm256_shuffle_epi8(a_, reversedIdentity256_epi8);
 a_ = _mm256_permute2x128_si256(a_, a_, 1); // swap hi/lo

 __m256i substitutionCost8 = _mm256_cmpeq_epi8(a_, b_);
 __m256i substitutionCost8X = _mm256_shuffle_epi8(substitutionCost8, blockwiseReversed256_epi8_8);
 // sign-extend the byte masks to 32 bits, 8 at a time
 __m128i sc8Xlo = _mm256_extracti128_si256(substitutionCost8X, 0);
 __m128i sc8Xhi = _mm256_extracti128_si256(substitutionCost8X, 1);
 substitutionCost32[0] = _mm256_cvtepi8_epi32(sc8Xlo);
 substitutionCost32[1] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xlo, 8));
 substitutionCost32[2] = _mm256_cvtepi8_epi32(sc8Xhi);
 substitutionCost32[3] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xhi, 8));
 } else {
 // 2-byte objects: two 16-lane fetches per side cover the 32 cells.
 __m256i a0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-16]));
 __m256i a1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-32]));
 __m256i b0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1]));
 __m256i b1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j+15]));
 a0 = _mm256_permute2x128_si256(a0, a0, 1); // swap hi/lo
 a1 = _mm256_permute2x128_si256(a1, a1, 1); // swap hi/lo
 b0 = _mm256_shuffle_epi8(b0, blockwiseReversed256_epi16_8);
 b1 = _mm256_shuffle_epi8(b1, blockwiseReversed256_epi16_8);
 __m256i substitutionCost16L = _mm256_cmpeq_epi16(a0, b0);
 __m256i substitutionCost16H = _mm256_cmpeq_epi16(a1, b1);
 __m128i sc16Llo = _mm256_extracti128_si256(substitutionCost16L, 0);
 __m128i sc16Lhi = _mm256_extracti128_si256(substitutionCost16L, 1);
 __m128i sc16Hlo = _mm256_extracti128_si256(substitutionCost16H, 0);
 __m128i sc16Hhi = _mm256_extracti128_si256(substitutionCost16H, 1);
 substitutionCost32[0] = _mm256_cvtepi16_epi32(sc16Llo);
 substitutionCost32[1] = _mm256_cvtepi16_epi32(sc16Lhi);
 substitutionCost32[2] = _mm256_cvtepi16_epi32(sc16Hlo);
 substitutionCost32[3] = _mm256_cvtepi16_epi32(sc16Hhi);
 }
 } else {
 assert(sizeof(T) == 4);

 // 4-byte objects: four fetches of 8 lanes each.
 for (k = 0; k < 4; ++k) {
 __m256i a_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-8-k*8]));
 __m256i b_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1+k*8]));
 b_ = _mm256_shuffle_epi32(b_, 0x1b); // simple reverse
 b_ = _mm256_permute2x128_si256(b_, b_, 1); // swap hi/lo
 substitutionCost32[k] = _mm256_cmpeq_epi32(a_, b_);
 }
 }

 // 5 vectors cover cells i..i-39 so each group k has its i-1 neighbor in k+1.
 __m256i diag_[5], diag2_[5];
 for (k = 0; k < 5; ++k) {
 diag_ [k] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&diag [i-7-k*8]));
 }
 for (k = 0; k < 5; ++k) {
 diag2_[k] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&diag2[i-7-k*8]));
 }

#ifdef LSTSSE_DEBUG
 for (k = 0; k < 5; ++k) {
 std::uint32_t diag0 = _mm256_extract_epi32(diag_[k], 0);
 std::uint32_t diag1 = _mm256_extract_epi32(diag_[k], 1);
 std::uint32_t diag2 = _mm256_extract_epi32(diag_[k], 2);
 std::uint32_t diag3 = _mm256_extract_epi32(diag_[k], 3);
 std::uint32_t diag4 = _mm256_extract_epi32(diag_[k], 4);
 std::uint32_t diag5 = _mm256_extract_epi32(diag_[k], 5);
 std::uint32_t diag6 = _mm256_extract_epi32(diag_[k], 6);
 std::uint32_t diag7 = _mm256_extract_epi32(diag_[k], 7);
 assert(diag0 == diag[i-k*8-7]);
 assert(diag1 == diag[i-k*8-6]);
 assert(diag2 == diag[i-k*8-5]);
 assert(diag3 == diag[i-k*8-4]);
 assert(diag4 == diag[i-k*8-3]);
 assert(diag5 == diag[i-k*8-2]);
 assert(diag6 == diag[i-k*8-1]);
 assert(diag7 == diag[i-k*8-0]);
 }
 for (k = 0; k < 5; ++k) {
 std::uint32_t diag20 = _mm256_extract_epi32(diag2_[k], 0);
 std::uint32_t diag21 = _mm256_extract_epi32(diag2_[k], 1);
 std::uint32_t diag22 = _mm256_extract_epi32(diag2_[k], 2);
 std::uint32_t diag23 = _mm256_extract_epi32(diag2_[k], 3);
 std::uint32_t diag24 = _mm256_extract_epi32(diag2_[k], 4);
 std::uint32_t diag25 = _mm256_extract_epi32(diag2_[k], 5);
 std::uint32_t diag26 = _mm256_extract_epi32(diag2_[k], 6);
 std::uint32_t diag27 = _mm256_extract_epi32(diag2_[k], 7);
 assert(diag20 == diag2[i-k*8-7]);
 assert(diag21 == diag2[i-k*8-6]);
 assert(diag22 == diag2[i-k*8-5]);
 assert(diag23 == diag2[i-k*8-4]);
 assert(diag24 == diag2[i-k*8-3]);
 assert(diag25 == diag2[i-k*8-2]);
 assert(diag26 == diag2[i-k*8-1]);
 assert(diag27 == diag2[i-k*8-0]);
 }

 for (k = 0; k < 4; ++k) {
 int sc0 = _mm256_extract_epi32(substitutionCost32[k], 0);
 int sc1 = _mm256_extract_epi32(substitutionCost32[k], 1);
 int sc2 = _mm256_extract_epi32(substitutionCost32[k], 2);
 int sc3 = _mm256_extract_epi32(substitutionCost32[k], 3);
 int sc4 = _mm256_extract_epi32(substitutionCost32[k], 4);
 int sc5 = _mm256_extract_epi32(substitutionCost32[k], 5);
 int sc6 = _mm256_extract_epi32(substitutionCost32[k], 6);
 int sc7 = _mm256_extract_epi32(substitutionCost32[k], 7);
 assert(sc0 == ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0));
 assert(sc1 == ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0));
 assert(sc2 == ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0));
 assert(sc3 == ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0));
 assert(sc4 == ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0));
 assert(sc5 == ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0));
 assert(sc6 == ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0));
 assert(sc7 == ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0));
 }
#endif // LSTSSE_DEBUG

 // Same recurrence as performSSE: diag[i] = min(diag2[i-1], diag2[i],
 // diag[i-1] + cost) + 1, with the [i-1] neighbor built by alignr.
 for (k = 0; k < 4; ++k) {
 __m256i diag2_i_m1 = _mm256_alignr_epi32_7(diag2_[k], diag2_[k+1]);
 __m256i diag_i_m1 = _mm256_alignr_epi32_7(diag_ [k], diag_ [k+1]);

 // mask is -1 on match, so the uniform +1 below restores the recurrence
 __m256i result3 = _mm256_add_epi32(diag_i_m1, substitutionCost32[k]);
 __m256i min = _mm256_min_epi32(_mm256_min_epi32(diag2_i_m1, diag2_[k]), result3);
 min = _mm256_add_epi32(min, one256_epi32);

#ifdef LSTSSE_DEBUG
 std::uint32_t diag_i_m10 = _mm256_extract_epi32(diag_i_m1, 0);
 std::uint32_t diag_i_m11 = _mm256_extract_epi32(diag_i_m1, 1);
 std::uint32_t diag_i_m12 = _mm256_extract_epi32(diag_i_m1, 2);
 std::uint32_t diag_i_m13 = _mm256_extract_epi32(diag_i_m1, 3);
 std::uint32_t diag_i_m14 = _mm256_extract_epi32(diag_i_m1, 4);
 std::uint32_t diag_i_m15 = _mm256_extract_epi32(diag_i_m1, 5);
 std::uint32_t diag_i_m16 = _mm256_extract_epi32(diag_i_m1, 6);
 std::uint32_t diag_i_m17 = _mm256_extract_epi32(diag_i_m1, 7);
 assert(diag_i_m10 == diag[i-k*8-8]);
 assert(diag_i_m11 == diag[i-k*8-7]);
 assert(diag_i_m12 == diag[i-k*8-6]);
 assert(diag_i_m13 == diag[i-k*8-5]);
 assert(diag_i_m14 == diag[i-k*8-4]);
 assert(diag_i_m15 == diag[i-k*8-3]);
 assert(diag_i_m16 == diag[i-k*8-2]);
 assert(diag_i_m17 == diag[i-k*8-1]);

 std::uint32_t diag2_i_m10 = _mm256_extract_epi32(diag2_i_m1, 0);
 std::uint32_t diag2_i_m11 = _mm256_extract_epi32(diag2_i_m1, 1);
 std::uint32_t diag2_i_m12 = _mm256_extract_epi32(diag2_i_m1, 2);
 std::uint32_t diag2_i_m13 = _mm256_extract_epi32(diag2_i_m1, 3);
 std::uint32_t diag2_i_m14 = _mm256_extract_epi32(diag2_i_m1, 4);
 std::uint32_t diag2_i_m15 = _mm256_extract_epi32(diag2_i_m1, 5);
 std::uint32_t diag2_i_m16 = _mm256_extract_epi32(diag2_i_m1, 6);
 std::uint32_t diag2_i_m17 = _mm256_extract_epi32(diag2_i_m1, 7);
 assert(diag2_i_m10 == diag2[i-k*8-8]);
 assert(diag2_i_m11 == diag2[i-k*8-7]);
 assert(diag2_i_m12 == diag2[i-k*8-6]);
 assert(diag2_i_m13 == diag2[i-k*8-5]);
 assert(diag2_i_m14 == diag2[i-k*8-4]);
 assert(diag2_i_m15 == diag2[i-k*8-3]);
 assert(diag2_i_m16 == diag2[i-k*8-2]);
 assert(diag2_i_m17 == diag2[i-k*8-1]);

 std::uint32_t result30 = _mm256_extract_epi32(result3, 0);
 std::uint32_t result31 = _mm256_extract_epi32(result3, 1);
 std::uint32_t result32 = _mm256_extract_epi32(result3, 2);
 std::uint32_t result33 = _mm256_extract_epi32(result3, 3);
 std::uint32_t result34 = _mm256_extract_epi32(result3, 4);
 std::uint32_t result35 = _mm256_extract_epi32(result3, 5);
 std::uint32_t result36 = _mm256_extract_epi32(result3, 6);
 std::uint32_t result37 = _mm256_extract_epi32(result3, 7);

 assert(result30 == diag[i-1-k*8-7] + ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0));
 assert(result31 == diag[i-1-k*8-6] + ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0));
 assert(result32 == diag[i-1-k*8-5] + ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0));
 assert(result33 == diag[i-1-k*8-4] + ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0));
 assert(result34 == diag[i-1-k*8-3] + ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0));
 assert(result35 == diag[i-1-k*8-2] + ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0));
 assert(result36 == diag[i-1-k*8-1] + ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0));
 assert(result37 == diag[i-1-k*8-0] + ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0));
#endif // LSTSSE_DEBUG

 _mm256_storeu_si256(reinterpret_cast<__m256i*>(&diag[i-k*8-7]), min);
 }

 // We just handled 32 entries at once. Yay!
 i -= 32;
}
#endif // __AVX2__
730 
731 };
732 
// Customization point for one DP-cell update; defaults to the generic scalar
// implementation. Specialized below for SIMD-eligible vector/pointer combos.
template<typename Vec1, typename Vec2, typename Iterator1, typename Iterator2>
struct LevenshteinIteration : LevenshteinIterationBase<Vec1, Vec2, Iterator1, Iterator2> {
};
744 
// NOTE(review): the `struct LevenshteinIterationSIMDWrap {` header (original
// line 749) was lost in extraction; the method below belongs to it.
// Adapter: unwraps the diagonal vectors to raw pointers and forwards to the
// width-dispatching SIMD entry point.
template<typename Alloc1, typename Alloc2, typename T>
static inline void perform(const T* a, const T* b,
 std::size_t& i, std::size_t j, std::size_t bLen,
 std::vector<std::uint32_t, Alloc1>& diag,
 const std::vector<std::uint32_t, Alloc2>& diag2) {
 return LevenshteinIterationSIMD<T>::performSIMD(a, b, i, j, bLen, diag.data(), diag2.data());
}
};
757 
// Route uint32 diagonal vectors + scalar-pointer iterators to the SIMD
// implementation when T is a 1-, 2- or 4-byte scalar; otherwise fall back to
// the generic per-cell base implementation.
template<typename Alloc1, typename Alloc2, typename T>
struct LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
 : std::conditional<std::is_scalar<T>::value && (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4),
 LevenshteinIterationSIMDWrap<Alloc1, Alloc2, T>,
 LevenshteinIterationBase<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
 >::type
{ };
770 
// Non-const pointer iterators delegate to the const-pointer specialization.
template<typename Alloc1, typename Alloc2, typename T>
struct LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, T*, T*>
 : LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
{ };
778 
782 template<typename T, typename Iterator1, typename Iterator2>
783 T levenshteinDiagonal(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) {
784  const std::size_t aLen = aEnd - a;
785  const std::size_t bLen = bEnd - b;
786 
787  assert(0 < aLen);
788  assert(aLen <= bLen);
789 
790  typedef AlignmentAllocator<T, alignment> Alloc;
791  std::vector<T, Alloc> diag (aLen + 1, T(0));
792  std::vector<T, Alloc> diag2 (aLen + 1, T(0));
793 
794  std::size_t i, j, k;
795 
796  k = 0;
797  for (k = 1; ; ++k) {
798  assert(k <= aLen + bLen);
799 
800  std::size_t startRow = k > bLen ? k - bLen : 1;
801  std::size_t endRow = k > aLen ? aLen : k - 1;
802 
803  assert(endRow >= startRow || k == 1);
804 
805  for (i = endRow; i >= startRow; ) {
806  assert(i < k);
807  j = k - i;
808 
809  assert(bLen >= j);
810  assert(aLen >= i);
811 
812  LevenshteinIteration<std::vector<T, Alloc>, std::vector<T, Alloc>, Iterator1, Iterator2>
813  ::perform(a, b, i, j, bLen, diag, diag2);
814  }
815 
816  diag[0] = k;
817 
818  if (k <= aLen) {
819  diag[k] = k;
820  }
821 
822  if (k == aLen + bLen) {
823  assert(startRow == endRow);
824  return diag[startRow];
825  }
826 
827  // switch buffers
828  std::swap(diag, diag2);
829  }
830 
831  assert(0);
832 }
833 
/*
 * Row-based Levenshtein DP fallback for non-random-access iterators.
 *
 * Keeps a single DP row: after processing a prefix of b, row[idx] is the
 * edit distance between the first idx+1 elements of a and that prefix.
 * Returns the distance between [a, aEnd) and [b, bEnd); by construction it
 * yields T(0) when b is empty (callers handle empty inputs beforehand).
 */
template<typename T, typename Iterator1, typename Iterator2>
T levenshteinRowBased(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) {
 // First row: transforming the empty b-prefix into a[0..idx] costs idx+1.
 std::vector<T> row;
 T seeded(0);
 for (Iterator1 it = a; it != aEnd; ++it) {
 row.push_back(++seeded);
 }
 row.shrink_to_fit();

 T result(0);
 std::size_t col = 0;
 for (; b != bEnd; ++b) {
 T diagonal = static_cast<T>(col++); // cell up-left of the current one
 result = static_cast<T>(col); // left cell: empty a-prefix vs b-prefix
 std::size_t idx = 0;

 for (Iterator1 it = a; it != aEnd; ++it, ++idx) {
 // Substitution/match path from the diagonal cell.
 T viaSubst = (*b == *it) ? diagonal : static_cast<T>(diagonal + 1);
 diagonal = row[idx]; // cell above becomes next iteration's diagonal
 // min(above + 1, left + 1, diagonal + cost)
 result = row[idx] = std::min(static_cast<T>(std::min(diagonal, result) + 1), viaSubst);
 }
 }

 return result;
}
866 
871 template<typename Iterator1, typename Iterator2>
872 std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd,
873  std::random_access_iterator_tag, std::random_access_iterator_tag) {
874  if (aEnd - a > bEnd - b) {
875  return levenshtein(b, bEnd, a, aEnd);
876  }
877 
878  // skip common prefixes and suffixes
879  while (a < aEnd && a[0] == b[0])
880  ++a, ++b;
881 
882  while (a < aEnd && aEnd[-1] == bEnd[-1])
883  --aEnd, --bEnd;
884 
885  std::size_t aLen = aEnd - a;
886  std::size_t bLen = bEnd - b;
887 
888  if (aLen == 0) {
889  return bLen;
890  }
891 
892  if (aLen == 1) {
893  return bLen - (std::find(b, bEnd, *a) == bEnd ? 0 : 1);
894  }
895 
896  if (aLen + bLen <= std::numeric_limits<std::uint32_t>::max())
897  return levenshteinDiagonal<std::uint32_t>(a, aEnd, b, bEnd);
898 
899  return levenshteinDiagonal<std::size_t>(a, aEnd, b, bEnd);
900 }
901 
906 template<typename Iterator1, typename Iterator2>
907 std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd,
908  std::bidirectional_iterator_tag, std::bidirectional_iterator_tag) {
909  // skip common prefixes and suffixes
910  while (a != aEnd && b != bEnd && *a == *b)
911  ++a, ++b;
912 
913  while (a != aEnd && b != bEnd && *std::prev(aEnd) == *std::prev(bEnd))
914  --aEnd, --bEnd;
915 
916  if (a == aEnd) {
917  return std::distance(b, bEnd);
918  }
919 
920  if (b == bEnd) {
921  return std::distance(a, aEnd);
922  }
923 
924  if (std::next(a) == aEnd) {
925  std::size_t ret = 0, found = 0;
926  for (; b != bEnd; ++b, ++ret)
927  if (*b == *a)
928  found = 1;
929  return ret - found;
930  }
931 
932  if (std::next(b) == bEnd) {
933  std::size_t ret = 0, found = 0;
934  for (; a != aEnd; ++a, ++ret)
935  if (*b == *a)
936  found = 1;
937  return ret - found;
938  }
939 
940  return levenshteinRowBased<std::size_t>(a, aEnd, b, bEnd);
941 }
942 
// SFINAE checker for .data() and .size()
template<typename T>
// NOTE(review): the `class hasDataAndSize {` header (original line 945) was
// lost in extraction; only the class body survives below.
private:
 struct char2 { char _[2]; };
 // Preferred overload: valid only when a->data() + a->size() compiles;
 // returns char (size 1).
 template<typename U> static auto test(U* a) -> decltype(a->data() + a->size(), char(0));
 // Fallback overload returns a 2-byte type.
 template<typename U> static auto test(const U* a) -> char2;
public:
 // true iff T exposes usable .data() and .size()
 static constexpr bool value = sizeof(test(static_cast<T*>(nullptr))) == 1;
};
953 
954 template<bool useDataAndSize>
956 
// Container exposes .data()/.size(): hand contiguous pointers to the iterator
// overload so the pointer-based SIMD specialization can apply.
template<>
struct LevenshteinContainer<true> {
template<typename Container1, typename Container2>
static inline std::size_t calc(const Container1& a, const Container2& b) {
 return levenshtein(a.data(), a.data() + a.size(), b.data(), b.data() + b.size());
}
};
967 
// Generic container fallback: use std::begin/std::end iterators.
template<>
struct LevenshteinContainer<false> {
template<typename Container1, typename Container2>
static inline std::size_t calc(const Container1& a, const Container2& b) {
 return levenshtein(std::begin(a), std::end(a), std::begin(b), std::end(b));
}
};
978 
// Public iterator-pair entry point: tag-dispatches on the iterator categories
// (random-access pairs take the diagonal/SIMD path, weaker categories the
// row-based fallback).
template<typename Iterator1, typename Iterator2>
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) {
 return levenshtein(a, aEnd, b, bEnd,
 typename std::iterator_traits<Iterator1>::iterator_category(),
 typename std::iterator_traits<Iterator2>::iterator_category());
}
985 
986 template<typename Container1, typename Container2>
987 std::size_t levenshtein(const Container1& a, const Container2& b) {
990 }
991 }
992 
993 #endif
994 
995 // clang-format on
Definition: levenshtein-sse.hpp:947
Definition: levenshtein-sse.hpp:62
Definition: levenshtein-sse.hpp:749
Definition: levenshtein-sse.hpp:945
Definition: levenshtein-sse.hpp:287
std::enable_if_t< one_observer_ptr< T, W >::value, bool > operator!=(const T &p1, const W &p2)
Definition: ObserverPtr.h:281
static void perform(const Iterator1 &a, const Iterator2 &b, std::size_t &i, std::size_t j, std::size_t bLen, Vec1 &diag, const Vec2 &diag2)
Definition: levenshtein-sse.hpp:231
static std::size_t calc(const Container1 &a, const Container2 &b)
Definition: levenshtein-sse.hpp:974
Definition: levenshtein-sse.hpp:105
Definition: levenshtein-sse.hpp:230
Definition: levenshtein-sse.hpp:742
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd)
Definition: levenshtein-sse.hpp:980
std::enable_if_t< std::is_pointer< W >::value, bool > operator==(const observer_ptr< T > &p1, const W &p2)
Definition: ObserverPtr.h:264
T levenshteinRowBased(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd)
Definition: levenshtein-sse.hpp:840
T levenshteinDiagonal(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd)
Definition: levenshtein-sse.hpp:783
static void perform(const T *a, const T *b, std::size_t &i, std::size_t j, std::size_t bLen, std::vector< std::uint32_t, Alloc1 > &diag, const std::vector< std::uint32_t, Alloc2 > &diag2)
Definition: levenshtein-sse.hpp:750
static std::size_t calc(const Container1 &a, const Container2 &b)
Definition: levenshtein-sse.hpp:963
static void performSIMD(const T *a, const T *b, std::size_t &i, std::size_t j, std::size_t bLen, std::uint32_t *diag, const std::uint32_t *diag2)
Definition: levenshtein-sse.hpp:289
constexpr std::size_t alignment
Definition: levenshtein-sse.hpp:188
Definition: levenshtein-sse.hpp:955