levenshtein-sse.hpp
/*
 * levenshtein-sse.hpp
 *
 * Created on: Sep 15, 2020
 * Added by: Nicola Mori
 */

/* The code has been obtained from https://github.com/addaleax/levenshtein-sse
 *
 * This file contains the original code of levenshtein-sse.hpp as obtained
 * from GitHub, plus its license. The code is guarded against clang-format
 * to keep the original indentation and to ease comparison with the original
 * version.
 */

/* (license text from the original repository omitted in this listing) */

// clang-format off

#ifndef LSTSSE_LEVENSHTEIN_SSE_HPP
#define LSTSSE_LEVENSHTEIN_SSE_HPP

#include <algorithm>
#include <limits>
#include <vector>
#include <iterator>
#include <type_traits>
#include <cstdint>
#include <cassert>
#ifdef __SSSE3__
#include <tmmintrin.h>
#endif
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
#ifdef __AVX2__
#include <immintrin.h>
#endif

namespace levenshteinSSE {

template<typename Iterator1, typename Iterator2>
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd);

template<typename Container1, typename Container2>
std::size_t levenshtein(const Container1& a, const Container2& b);

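/* Example (added for illustration, not part of the original header): a
 * minimal usage sketch of both public overloads. "kitten" -> "sitting"
 * requires 3 edits (2 substitutions, 1 insertion).
 *
 *   std::string a = "kitten", b = "sitting";
 *   // container overload
 *   assert(levenshteinSSE::levenshtein(a, b) == 3);
 *   // iterator overload
 *   assert(levenshteinSSE::levenshtein(a.begin(), a.end(),
 *                                      b.begin(), b.end()) == 3);
 */
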
template <typename T, std::size_t N = 16>
class AlignmentAllocator;

#ifdef __SSE2__
#include <mm_malloc.h>
template <typename T, std::size_t N>
class AlignmentAllocator {
public:
  typedef T value_type;
  typedef std::size_t size_type;
  typedef std::ptrdiff_t difference_type;

  typedef T* pointer;
  typedef const T* const_pointer;

  typedef T& reference;
  typedef const T& const_reference;

  constexpr inline AlignmentAllocator () noexcept { }

  template <typename T2>
  constexpr inline AlignmentAllocator (const AlignmentAllocator<T2, N> &) noexcept { }

  inline ~AlignmentAllocator () noexcept { }

  inline pointer address (reference r) {
    return &r;
  }

  inline const_pointer address (const_reference r) const {
    return &r;
  }

  inline pointer allocate (size_type n) {
    // this allocator is special in that it leaves 4*N bytes before and after
    // the allocated area for garbage reads/writes
    return reinterpret_cast<pointer>(reinterpret_cast<char*>(_mm_malloc(n * sizeof(value_type) + 8 * N, N)) + 4*N);
  }

  inline void deallocate (pointer p, size_type) {
    _mm_free(reinterpret_cast<char*>(p) - 4*N);
  }

  inline void construct (pointer p, const value_type& value) {
    new (p) value_type (value);
  }

  inline void destroy (pointer p) {
    p->~value_type ();
  }

  constexpr inline size_type max_size () const noexcept {
    return size_type (-1) / sizeof (value_type);
  }

  template <typename T2>
  struct rebind {
    typedef AlignmentAllocator<T2, N> other;
  };

  constexpr bool operator!=(const AlignmentAllocator<T,N>& other) const {
    return !(*this == other);
  }

  // Returns true if and only if storage allocated from *this
  // can be deallocated from other, and vice versa.
  // Always returns true for stateless allocators.
  constexpr bool operator==(const AlignmentAllocator<T,N>& other) const {
    return other.usesMMAlloc;
  }

  static constexpr bool usesMMAlloc = true;
};
#endif

template <typename T>
class AlignmentAllocator<T, 1> : public std::allocator<T> {
public:
  static constexpr bool usesMMAlloc = false;
  template <typename T2>
  struct rebind {
    typedef AlignmentAllocator<T2, 1> other;
  };
};

#ifdef __SSSE3__
constexpr std::size_t alignment = 16;
#else
constexpr std::size_t alignment = 1;
#endif

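/* Example (illustration only): the allocator plugs into standard containers.
 * With SSE2 available, buffers come from _mm_malloc with N-byte alignment
 * plus 4*N = 64 bytes of slack on either side (for N = 16), so the SIMD
 * kernels below may read slightly past the logical bounds without faulting;
 * with alignment == 1 it degrades to plain std::allocator.
 *
 *   std::vector<std::uint32_t,
 *               levenshteinSSE::AlignmentAllocator<std::uint32_t, 16>> buf(100);
 *   // buf.data() is 16-byte aligned when the SSE2 allocator is active
 */
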
template<typename Vec1, typename Vec2, typename Iterator1, typename Iterator2>
struct LevenshteinIterationBase {
static inline void perform(const Iterator1& a, const Iterator2& b,
  std::size_t& i, std::size_t j, std::size_t bLen, Vec1& diag, const Vec2& diag2)
{
  std::uint32_t min = std::min(diag2[i], diag2[i-1]);
  if (min < diag[i-1]) {
    diag[i] = min + 1;
  }
  else {
    diag[i] = diag[i-1] + (a[i-1] != b[j-1]);
  }
  --i;
}
};

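/* Explanatory sketch (added for illustration, not original code): one cell of
 * the classic DP over the full matrix D computes
 *
 *   D[i][j] = min(D[i-1][j] + 1,         // deletion
 *                 D[i][j-1] + 1,         // insertion
 *                 D[i-1][j-1] + cost);   // substitution, cost in {0,1}
 *
 * perform() evaluates the same cell on anti-diagonals: for k = i + j, diag2
 * holds diagonal k-1 (so diag2[i-1] == D[i-1][j] and diag2[i] == D[i][j-1]),
 * while diag[i-1], not yet overwritten because i runs downwards, still holds
 * D[i-1][j-1] from diagonal k-2. The branch is equivalent to the three-way
 * min because cost <= 1: if min(diag2[i], diag2[i-1]) < diag[i-1], then
 * min + 1 <= diag[i-1] + cost; otherwise diag[i-1] + cost <= min + 1.
 */
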
#ifdef __SSSE3__
#ifndef __SSE4_1__
// straightforward _mm_min_epi32 polyfill using only SSE2
inline __m128i _mm_min_epi32 (__m128i a, __m128i b) {
  __m128i compare = _mm_cmpgt_epi32(a, b);
  __m128i aIsSmaller = _mm_andnot_si128(compare, a);
  __m128i bIsSmaller = _mm_and_si128 (b, compare);
  return _mm_or_si128(aIsSmaller, bIsSmaller);
}
#endif // __SSE4_1__
#endif // __SSSE3__

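/* Sketch of what the polyfill computes, lane by lane (illustration only):
 *
 *   compare    = (a > b) ? 0xFFFFFFFF : 0   // per 32-bit lane
 *   aIsSmaller = a & ~compare               // keep a where a <= b
 *   bIsSmaller = b &  compare               // keep b where a >  b
 *   result     = aIsSmaller | bIsSmaller    // = min(a, b) per lane
 */
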
#ifdef __AVX2__
// provide an AVX2 version of _mm256_alignr_epi32(a, b, 7)
// (i.e. move a one epi32 to the left and write the highest
// bytes of b to the lowest of a)
/*#ifndef __AVX512F__*/
__m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) {
  const __m256i rotl1_256_epi32 = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
  __m256i combined = _mm256_blend_epi32(a, b, 0x80);
  return _mm256_permutevar8x32_epi32(combined, rotl1_256_epi32);
}
/*
#else
__m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) {
  return _mm256_alignr_epi32(a, b, 7);
}
#endif*/
#endif

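/* Lane-level sketch (illustration only): with a = [a0..a7] and b = [b0..b7]
 * (epi32 lanes, low to high), blending with mask 0x80 gives [a0..a6, b7];
 * rotating left by one lane then yields
 *
 *   _mm256_alignr_epi32_7(a, b) == [b7, a0, a1, a2, a3, a4, a5, a6]
 *
 * i.e. a shifted up by one 32-bit lane, with the highest lane of b filling
 * the gap -- exactly what the diagonal update needs to read entry i-1.
 */
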
template<typename T>
struct LevenshteinIterationSIMD {
/* Decide which implementation is acceptable */
static inline void performSIMD(const T* a, const T* b,
  std::size_t& i, std::size_t j, std::size_t bLen,
  std::uint32_t* diag, const std::uint32_t* diag2)
{

#ifdef __AVX2__
  if (i >= 32 && bLen - j >= 32) {
    performSSE_AVX2(a, b, i, j, bLen, diag, diag2);
    return;
  }
#endif

#ifdef __SSSE3__
  if (i >= 16 && bLen - j >= 16) {
    performSSE(a, b, i, j, bLen, diag, diag2);
    return;
  }
#endif

  LevenshteinIterationBase<std::uint32_t*, const std::uint32_t*, const T*, const T*>
    ::perform(a, b, i, j, bLen, diag, diag2);
}

#ifdef __SSSE3__
static inline void performSSE(const T* a, const T* b,
  std::size_t& i, std::size_t j, std::size_t bLen,
  std::uint32_t* diag, const std::uint32_t* diag2)
{
  const __m128i one128_epi32 = _mm_set1_epi32(1);
  const __m128i reversedIdentity128_epi8 = _mm_setr_epi8(
    15, 14, 13, 12,
    11, 10, 9, 8,
    7, 6, 5, 4,
    3, 2, 1, 0
  );
  const __m128i reversedIdentity128_epi16 = _mm_setr_epi8(
    14, 15, 12, 13,
    10, 11, 8, 9,
    6, 7, 4, 5,
    2, 3, 0, 1
  );
  const __m128i blockwiseReversed128_epi8_4 = _mm_setr_epi8(
    3, 2, 1, 0,
    7, 6, 5, 4,
    11, 10, 9, 8,
    15, 14, 13, 12
  );
  const __m128i blockwiseReversed128_epi16_4 = _mm_setr_epi8(
    6, 7, 4, 5,
    2, 3, 0, 1,
    14, 15, 12, 13,
    10, 11, 8, 9
  );

  __m128i substitutionCost32[4];
  std::size_t k;

  // We support 1, 2, and 4 byte objects for SSE comparison.
  // We always process 16 entries at once, so we may need multiple fetches
  // depending on object size.
  if (sizeof(T) <= 2) {
    __m128i substitutionCost16LX, substitutionCost16HX;

    if (sizeof(T) == 1) {
      __m128i a_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-16]));
      __m128i b_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1]));
      a_ = _mm_shuffle_epi8(a_, reversedIdentity128_epi8);

      // int substitutionCost = a[i-1] == b[j-1] ? -1 : 0;

      // diag/diag2 will contain the following entries from the diagonal:
      // index  [0]0 [0]1 [0]2 [0]3 [1]0 [1]1 [1]2 [1]3 ... [4]0 [4]1 [4]2 [4]3
      // diag+i   -3   -2   -1    0   -7   -6   -5   -4 ...  -19  -18  -17  -16

      // substitutionCost8 contains the comparisons for the following indices in a and b:
      // index  0  1  2  3  4  5  6  7  8   9  10  11  12  13  14  15
      // a+i   -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16
      // b+j   -1  0  1  2  3  4  5  6  7   8   9  10  11  12  13  14
      // in substitutionCost8X we shuffle this so that it matches up with diag/diag2
      // (barring the offset induced by comparing i-1 against j-1)
      __m128i substitutionCost8 = _mm_cmpeq_epi8(a_, b_);
      __m128i substitutionCost8X = _mm_shuffle_epi8(substitutionCost8, blockwiseReversed128_epi8_4);
      // unpack with self so that 00 -> 00 00 and ff -> ff ff
      substitutionCost16LX = _mm_unpacklo_epi8(substitutionCost8X, substitutionCost8X);
      substitutionCost16HX = _mm_unpackhi_epi8(substitutionCost8X, substitutionCost8X);
    } else {
      __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-8]));
      __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-16]));
      __m128i b0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1]));
      __m128i b1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j+7]));
      a0 = _mm_shuffle_epi8(a0, reversedIdentity128_epi16);
      a1 = _mm_shuffle_epi8(a1, reversedIdentity128_epi16);
      __m128i substitutionCost16L = _mm_cmpeq_epi16(a0, b0);
      __m128i substitutionCost16H = _mm_cmpeq_epi16(a1, b1);
      substitutionCost16LX = _mm_shuffle_epi8(substitutionCost16L, blockwiseReversed128_epi16_4);
      substitutionCost16HX = _mm_shuffle_epi8(substitutionCost16H, blockwiseReversed128_epi16_4);
    }

    substitutionCost32[0] = _mm_unpacklo_epi16(substitutionCost16LX, substitutionCost16LX);
    substitutionCost32[1] = _mm_unpackhi_epi16(substitutionCost16LX, substitutionCost16LX);
    substitutionCost32[2] = _mm_unpacklo_epi16(substitutionCost16HX, substitutionCost16HX);
    substitutionCost32[3] = _mm_unpackhi_epi16(substitutionCost16HX, substitutionCost16HX);
  } else {
    assert(sizeof(T) == 4);

    for (k = 0; k < 4; ++k) {
      __m128i a_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-4-k*4]));
      __m128i b_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1+k*4]));
      b_ = _mm_shuffle_epi32(b_, 0x1b); // simple reverse
      substitutionCost32[k] = _mm_cmpeq_epi32(a_, b_);
    }
  }

  __m128i diag_[5], diag2_[5];
  for (k = 0; k < 5; ++k) {
    diag_ [k] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&diag [i-3-k*4]));
  }
  for (k = 0; k < 5; ++k) {
    diag2_[k] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&diag2[i-3-k*4]));
  }

  // There's a ton of debug stuff down here. You can use it for reference
  // or, of course, compile with -DLSTSSE_DEBUG if you're feeling funny.
#ifdef LSTSSE_DEBUG
  for (k = 0; k < 5; ++k) {
    std::uint32_t diag0 = _mm_extract_epi32(diag_[k], 0);
    std::uint32_t diag1 = _mm_extract_epi32(diag_[k], 1);
    std::uint32_t diag2 = _mm_extract_epi32(diag_[k], 2);
    std::uint32_t diag3 = _mm_extract_epi32(diag_[k], 3);
    assert(diag0 == diag[i-k*4-3]);
    assert(diag1 == diag[i-k*4-2]);
    assert(diag2 == diag[i-k*4-1]);
    assert(diag3 == diag[i-k*4-0]);
  }
  for (k = 0; k < 5; ++k) {
    std::uint32_t diag20 = _mm_extract_epi32(diag2_[k], 0);
    std::uint32_t diag21 = _mm_extract_epi32(diag2_[k], 1);
    std::uint32_t diag22 = _mm_extract_epi32(diag2_[k], 2);
    std::uint32_t diag23 = _mm_extract_epi32(diag2_[k], 3);
    assert(diag20 == diag2[i-k*4-3]);
    assert(diag21 == diag2[i-k*4-2]);
    assert(diag22 == diag2[i-k*4-1]);
    assert(diag23 == diag2[i-k*4-0]);
  }

  for (k = 0; k < 4; ++k) {
    int sc0 = _mm_extract_epi32(substitutionCost32[k], 0);
    int sc1 = _mm_extract_epi32(substitutionCost32[k], 1);
    int sc2 = _mm_extract_epi32(substitutionCost32[k], 2);
    int sc3 = _mm_extract_epi32(substitutionCost32[k], 3);
    assert(sc0 == ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0));
    assert(sc1 == ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0));
    assert(sc2 == ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0));
    assert(sc3 == ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0));
  }
#endif // LSTSSE_DEBUG

  // reminders:
  // the arrays (diag, diag2, substitutionCost32) correspond to i:
  // index  [0]0 [0]1 [0]2 [0]3 [1]0 [1]1 [1]2 [1]3 ... [4]0 [4]1 [4]2 [4]3
  // i+       -3   -2   -1    0   -7   -6   -5   -4 ...  -19  -18  -17  -16
  // so when we need to access ...[i-1], we need to align the entries
  // in *reverse* order

  // diag[i] = min(
  //   diag2[i-1],
  //   diag2[i],
  //   diag[i-1] + substitutionCost
  // ) + 1;
  //
  for (k = 0; k < 4; ++k) {
    __m128i diag2_i_m1 = _mm_alignr_epi8(diag2_[k], diag2_[k+1], 12);
    __m128i diag_i_m1  = _mm_alignr_epi8(diag_ [k], diag_ [k+1], 12);

    __m128i result3 = _mm_add_epi32(diag_i_m1, substitutionCost32[k]);
    __m128i min = _mm_min_epi32(_mm_min_epi32(diag2_i_m1, diag2_[k]), result3);
    min = _mm_add_epi32(min, one128_epi32);

#ifdef LSTSSE_DEBUG
    std::uint32_t diag_i_m10 = _mm_extract_epi32(diag_i_m1, 0);
    std::uint32_t diag_i_m11 = _mm_extract_epi32(diag_i_m1, 1);
    std::uint32_t diag_i_m12 = _mm_extract_epi32(diag_i_m1, 2);
    std::uint32_t diag_i_m13 = _mm_extract_epi32(diag_i_m1, 3);
    assert(diag_i_m10 == diag[i-k*4-4]);
    assert(diag_i_m11 == diag[i-k*4-3]);
    assert(diag_i_m12 == diag[i-k*4-2]);
    assert(diag_i_m13 == diag[i-k*4-1]);

    std::uint32_t diag2_i_m10 = _mm_extract_epi32(diag2_i_m1, 0);
    std::uint32_t diag2_i_m11 = _mm_extract_epi32(diag2_i_m1, 1);
    std::uint32_t diag2_i_m12 = _mm_extract_epi32(diag2_i_m1, 2);
    std::uint32_t diag2_i_m13 = _mm_extract_epi32(diag2_i_m1, 3);
    assert(diag2_i_m10 == diag2[i-k*4-4]);
    assert(diag2_i_m11 == diag2[i-k*4-3]);
    assert(diag2_i_m12 == diag2[i-k*4-2]);
    assert(diag2_i_m13 == diag2[i-k*4-1]);

    std::uint32_t result30 = _mm_extract_epi32(result3, 0);
    std::uint32_t result31 = _mm_extract_epi32(result3, 1);
    std::uint32_t result32 = _mm_extract_epi32(result3, 2);
    std::uint32_t result33 = _mm_extract_epi32(result3, 3);

    assert(result30 == diag[i-1-k*4-3] + ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0));
    assert(result31 == diag[i-1-k*4-2] + ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0));
    assert(result32 == diag[i-1-k*4-1] + ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0));
    assert(result33 == diag[i-1-k*4-0] + ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0));
#endif // LSTSSE_DEBUG

    _mm_storeu_si128(reinterpret_cast<__m128i*>(&diag[i-k*4-3]), min);
  }

  // We just handled 16 entries at once. Yay!
  i -= 16;
}
#endif // __SSSE3__


#ifdef __AVX2__
static inline void performSSE_AVX2(const T* a, const T* b,
  std::size_t& i, std::size_t j, std::size_t bLen,
  std::uint32_t* diag, const std::uint32_t* diag2)
{
  const __m256i one256_epi32 = _mm256_set1_epi32(1);
  const __m256i reversedIdentity256_epi8 = _mm256_setr_epi8(
    15, 14, 13, 12,
    11, 10, 9, 8,
    7, 6, 5, 4,
    3, 2, 1, 0,
    15, 14, 13, 12,
    11, 10, 9, 8,
    7, 6, 5, 4,
    3, 2, 1, 0
  );

  const __m256i blockwiseReversed256_epi8_8 = _mm256_setr_epi8(
    7, 6, 5, 4,
    3, 2, 1, 0,
    15, 14, 13, 12,
    11, 10, 9, 8,
    7, 6, 5, 4,
    3, 2, 1, 0,
    15, 14, 13, 12,
    11, 10, 9, 8
  );

  const __m256i blockwiseReversed256_epi16_8 = _mm256_setr_epi8(
    14, 15, 12, 13,
    10, 11, 8, 9,
    6, 7, 4, 5,
    2, 3, 0, 1,
    14, 15, 12, 13,
    10, 11, 8, 9,
    6, 7, 4, 5,
    2, 3, 0, 1
  );

  __m256i substitutionCost32[4];
  std::size_t k;

  if (sizeof(T) <= 2) {
    if (sizeof(T) == 1) {
      __m256i a_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-32]));
      __m256i b_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1]));
      a_ = _mm256_shuffle_epi8(a_, reversedIdentity256_epi8);
      a_ = _mm256_permute2x128_si256(a_, a_, 1); // swap hi/lo

      __m256i substitutionCost8 = _mm256_cmpeq_epi8(a_, b_);
      __m256i substitutionCost8X = _mm256_shuffle_epi8(substitutionCost8, blockwiseReversed256_epi8_8);
      __m128i sc8Xlo = _mm256_extracti128_si256(substitutionCost8X, 0);
      __m128i sc8Xhi = _mm256_extracti128_si256(substitutionCost8X, 1);
      substitutionCost32[0] = _mm256_cvtepi8_epi32(sc8Xlo);
      substitutionCost32[1] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xlo, 8));
      substitutionCost32[2] = _mm256_cvtepi8_epi32(sc8Xhi);
      substitutionCost32[3] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xhi, 8));
    } else {
      __m256i a0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-16]));
      __m256i a1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-32]));
      __m256i b0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1]));
      __m256i b1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j+15]));
      a0 = _mm256_permute2x128_si256(a0, a0, 1); // swap hi/lo
      a1 = _mm256_permute2x128_si256(a1, a1, 1); // swap hi/lo
      b0 = _mm256_shuffle_epi8(b0, blockwiseReversed256_epi16_8);
      b1 = _mm256_shuffle_epi8(b1, blockwiseReversed256_epi16_8);
      __m256i substitutionCost16L = _mm256_cmpeq_epi16(a0, b0);
      __m256i substitutionCost16H = _mm256_cmpeq_epi16(a1, b1);
      __m128i sc16Llo = _mm256_extracti128_si256(substitutionCost16L, 0);
      __m128i sc16Lhi = _mm256_extracti128_si256(substitutionCost16L, 1);
      __m128i sc16Hlo = _mm256_extracti128_si256(substitutionCost16H, 0);
      __m128i sc16Hhi = _mm256_extracti128_si256(substitutionCost16H, 1);
      substitutionCost32[0] = _mm256_cvtepi16_epi32(sc16Llo);
      substitutionCost32[1] = _mm256_cvtepi16_epi32(sc16Lhi);
      substitutionCost32[2] = _mm256_cvtepi16_epi32(sc16Hlo);
      substitutionCost32[3] = _mm256_cvtepi16_epi32(sc16Hhi);
    }
  } else {
    assert(sizeof(T) == 4);

    for (k = 0; k < 4; ++k) {
      __m256i a_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-8-k*8]));
      __m256i b_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1+k*8]));
      b_ = _mm256_shuffle_epi32(b_, 0x1b); // simple reverse
      b_ = _mm256_permute2x128_si256(b_, b_, 1); // swap hi/lo
      substitutionCost32[k] = _mm256_cmpeq_epi32(a_, b_);
    }
  }

  __m256i diag_[5], diag2_[5];
  for (k = 0; k < 5; ++k) {
    diag_ [k] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&diag [i-7-k*8]));
  }
  for (k = 0; k < 5; ++k) {
    diag2_[k] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&diag2[i-7-k*8]));
  }

#ifdef LSTSSE_DEBUG
  for (k = 0; k < 5; ++k) {
    std::uint32_t diag0 = _mm256_extract_epi32(diag_[k], 0);
    std::uint32_t diag1 = _mm256_extract_epi32(diag_[k], 1);
    std::uint32_t diag2 = _mm256_extract_epi32(diag_[k], 2);
    std::uint32_t diag3 = _mm256_extract_epi32(diag_[k], 3);
    std::uint32_t diag4 = _mm256_extract_epi32(diag_[k], 4);
    std::uint32_t diag5 = _mm256_extract_epi32(diag_[k], 5);
    std::uint32_t diag6 = _mm256_extract_epi32(diag_[k], 6);
    std::uint32_t diag7 = _mm256_extract_epi32(diag_[k], 7);
    assert(diag0 == diag[i-k*8-7]);
    assert(diag1 == diag[i-k*8-6]);
    assert(diag2 == diag[i-k*8-5]);
    assert(diag3 == diag[i-k*8-4]);
    assert(diag4 == diag[i-k*8-3]);
    assert(diag5 == diag[i-k*8-2]);
    assert(diag6 == diag[i-k*8-1]);
    assert(diag7 == diag[i-k*8-0]);
  }
  for (k = 0; k < 5; ++k) {
    std::uint32_t diag20 = _mm256_extract_epi32(diag2_[k], 0);
    std::uint32_t diag21 = _mm256_extract_epi32(diag2_[k], 1);
    std::uint32_t diag22 = _mm256_extract_epi32(diag2_[k], 2);
    std::uint32_t diag23 = _mm256_extract_epi32(diag2_[k], 3);
    std::uint32_t diag24 = _mm256_extract_epi32(diag2_[k], 4);
    std::uint32_t diag25 = _mm256_extract_epi32(diag2_[k], 5);
    std::uint32_t diag26 = _mm256_extract_epi32(diag2_[k], 6);
    std::uint32_t diag27 = _mm256_extract_epi32(diag2_[k], 7);
    assert(diag20 == diag2[i-k*8-7]);
    assert(diag21 == diag2[i-k*8-6]);
    assert(diag22 == diag2[i-k*8-5]);
    assert(diag23 == diag2[i-k*8-4]);
    assert(diag24 == diag2[i-k*8-3]);
    assert(diag25 == diag2[i-k*8-2]);
    assert(diag26 == diag2[i-k*8-1]);
    assert(diag27 == diag2[i-k*8-0]);
  }

  for (k = 0; k < 4; ++k) {
    int sc0 = _mm256_extract_epi32(substitutionCost32[k], 0);
    int sc1 = _mm256_extract_epi32(substitutionCost32[k], 1);
    int sc2 = _mm256_extract_epi32(substitutionCost32[k], 2);
    int sc3 = _mm256_extract_epi32(substitutionCost32[k], 3);
    int sc4 = _mm256_extract_epi32(substitutionCost32[k], 4);
    int sc5 = _mm256_extract_epi32(substitutionCost32[k], 5);
    int sc6 = _mm256_extract_epi32(substitutionCost32[k], 6);
    int sc7 = _mm256_extract_epi32(substitutionCost32[k], 7);
    assert(sc0 == ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0));
    assert(sc1 == ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0));
    assert(sc2 == ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0));
    assert(sc3 == ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0));
    assert(sc4 == ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0));
    assert(sc5 == ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0));
    assert(sc6 == ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0));
    assert(sc7 == ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0));
  }
#endif // LSTSSE_DEBUG

  for (k = 0; k < 4; ++k) {
    __m256i diag2_i_m1 = _mm256_alignr_epi32_7(diag2_[k], diag2_[k+1]);
    __m256i diag_i_m1  = _mm256_alignr_epi32_7(diag_ [k], diag_ [k+1]);

    __m256i result3 = _mm256_add_epi32(diag_i_m1, substitutionCost32[k]);
    __m256i min = _mm256_min_epi32(_mm256_min_epi32(diag2_i_m1, diag2_[k]), result3);
    min = _mm256_add_epi32(min, one256_epi32);

#ifdef LSTSSE_DEBUG
    std::uint32_t diag_i_m10 = _mm256_extract_epi32(diag_i_m1, 0);
    std::uint32_t diag_i_m11 = _mm256_extract_epi32(diag_i_m1, 1);
    std::uint32_t diag_i_m12 = _mm256_extract_epi32(diag_i_m1, 2);
    std::uint32_t diag_i_m13 = _mm256_extract_epi32(diag_i_m1, 3);
    std::uint32_t diag_i_m14 = _mm256_extract_epi32(diag_i_m1, 4);
    std::uint32_t diag_i_m15 = _mm256_extract_epi32(diag_i_m1, 5);
    std::uint32_t diag_i_m16 = _mm256_extract_epi32(diag_i_m1, 6);
    std::uint32_t diag_i_m17 = _mm256_extract_epi32(diag_i_m1, 7);
    assert(diag_i_m10 == diag[i-k*8-8]);
    assert(diag_i_m11 == diag[i-k*8-7]);
    assert(diag_i_m12 == diag[i-k*8-6]);
    assert(diag_i_m13 == diag[i-k*8-5]);
    assert(diag_i_m14 == diag[i-k*8-4]);
    assert(diag_i_m15 == diag[i-k*8-3]);
    assert(diag_i_m16 == diag[i-k*8-2]);
    assert(diag_i_m17 == diag[i-k*8-1]);

    std::uint32_t diag2_i_m10 = _mm256_extract_epi32(diag2_i_m1, 0);
    std::uint32_t diag2_i_m11 = _mm256_extract_epi32(diag2_i_m1, 1);
    std::uint32_t diag2_i_m12 = _mm256_extract_epi32(diag2_i_m1, 2);
    std::uint32_t diag2_i_m13 = _mm256_extract_epi32(diag2_i_m1, 3);
    std::uint32_t diag2_i_m14 = _mm256_extract_epi32(diag2_i_m1, 4);
    std::uint32_t diag2_i_m15 = _mm256_extract_epi32(diag2_i_m1, 5);
    std::uint32_t diag2_i_m16 = _mm256_extract_epi32(diag2_i_m1, 6);
    std::uint32_t diag2_i_m17 = _mm256_extract_epi32(diag2_i_m1, 7);
    assert(diag2_i_m10 == diag2[i-k*8-8]);
    assert(diag2_i_m11 == diag2[i-k*8-7]);
    assert(diag2_i_m12 == diag2[i-k*8-6]);
    assert(diag2_i_m13 == diag2[i-k*8-5]);
    assert(diag2_i_m14 == diag2[i-k*8-4]);
    assert(diag2_i_m15 == diag2[i-k*8-3]);
    assert(diag2_i_m16 == diag2[i-k*8-2]);
    assert(diag2_i_m17 == diag2[i-k*8-1]);

    std::uint32_t result30 = _mm256_extract_epi32(result3, 0);
    std::uint32_t result31 = _mm256_extract_epi32(result3, 1);
    std::uint32_t result32 = _mm256_extract_epi32(result3, 2);
    std::uint32_t result33 = _mm256_extract_epi32(result3, 3);
    std::uint32_t result34 = _mm256_extract_epi32(result3, 4);
    std::uint32_t result35 = _mm256_extract_epi32(result3, 5);
    std::uint32_t result36 = _mm256_extract_epi32(result3, 6);
    std::uint32_t result37 = _mm256_extract_epi32(result3, 7);

    assert(result30 == diag[i-1-k*8-7] + ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0));
    assert(result31 == diag[i-1-k*8-6] + ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0));
    assert(result32 == diag[i-1-k*8-5] + ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0));
    assert(result33 == diag[i-1-k*8-4] + ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0));
    assert(result34 == diag[i-1-k*8-3] + ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0));
    assert(result35 == diag[i-1-k*8-2] + ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0));
    assert(result36 == diag[i-1-k*8-1] + ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0));
    assert(result37 == diag[i-1-k*8-0] + ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0));
#endif // LSTSSE_DEBUG

    _mm256_storeu_si256(reinterpret_cast<__m256i*>(&diag[i-k*8-7]), min);
  }

  // We just handled 32 entries at once. Yay!
  i -= 32;
}
#endif // __AVX2__

};

template<typename Vec1, typename Vec2, typename Iterator1, typename Iterator2>
struct LevenshteinIteration : LevenshteinIterationBase<Vec1, Vec2, Iterator1, Iterator2> {
};

template<typename Alloc1, typename Alloc2, typename T>
struct LevenshteinIterationSIMDWrap {
static inline void perform(const T* a, const T* b,
  std::size_t& i, std::size_t j, std::size_t bLen,
  std::vector<std::uint32_t, Alloc1>& diag,
  const std::vector<std::uint32_t, Alloc2>& diag2) {
  return LevenshteinIterationSIMD<T>::performSIMD(a, b, i, j, bLen, diag.data(), diag2.data());
}
};

template<typename Alloc1, typename Alloc2, typename T>
struct LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
  : std::conditional<std::is_scalar<T>::value && (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4),
      LevenshteinIterationSIMDWrap<Alloc1, Alloc2, T>,
      LevenshteinIterationBase<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
    >::type
{ };

template<typename Alloc1, typename Alloc2, typename T>
struct LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, T*, T*>
  : LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
{ };

template<typename T, typename Iterator1, typename Iterator2>
T levenshteinDiagonal(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) {
  const std::size_t aLen = aEnd - a;
  const std::size_t bLen = bEnd - b;

  assert(0 < aLen);
  assert(aLen <= bLen);

  typedef AlignmentAllocator<T, alignment> Alloc;
  std::vector<T, Alloc> diag  (aLen + 1, T(0));
  std::vector<T, Alloc> diag2 (aLen + 1, T(0));

  std::size_t i, j, k;

  k = 0;
  for (k = 1; ; ++k) {
    assert(k <= aLen + bLen);

    std::size_t startRow = k > bLen ? k - bLen : 1;
    std::size_t endRow = k > aLen ? aLen : k - 1;

    assert(endRow >= startRow || k == 1);

    for (i = endRow; i >= startRow; ) {
      assert(i < k);
      j = k - i;

      assert(bLen >= j);
      assert(aLen >= i);

      LevenshteinIteration<std::vector<T, Alloc>, std::vector<T, Alloc>, Iterator1, Iterator2>
        ::perform(a, b, i, j, bLen, diag, diag2);
    }

    diag[0] = k;

    if (k <= aLen) {
      diag[k] = k;
    }

    if (k == aLen + bLen) {
      assert(startRow == endRow);
      return diag[startRow];
    }

    // switch buffers
    std::swap(diag, diag2);
  }

  assert(0);
}

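/* Worked example (illustration only): for aLen = 2, bLen = 3 the loop visits
 * the anti-diagonals k = i + j as
 *
 *   k = 1: no inner cells; boundary values diag[0] = diag[1] = 1
 *   k = 2: rows 1..1 -> cell (1,1)
 *   k = 3: rows 1..2 -> cells (2,1), (1,2)
 *   k = 4: rows 1..2 -> cells (2,2), (1,3)
 *   k = 5: rows 2..2 -> cell (2,3); k == aLen + bLen, return diag[2]
 *
 * Cells on one anti-diagonal depend only on the two previous diagonals,
 * which is what makes updating 16/32 cells at once with SIMD legal.
 */
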
template<typename T, typename Iterator1, typename Iterator2>
T levenshteinRowBased(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) {
  std::vector<T> arr;

  std::size_t i = 0, j = 0;
  T ret(0);

  for (Iterator1 it = a; it != aEnd; ++it) {
    arr.push_back(++i);
  }

  arr.shrink_to_fit();

  for (; b != bEnd; ++b) {
    T tmp = j++;
    ret = j;
    i = 0;

    for (Iterator1 it = a; it != aEnd; ++it, ++i) {
      T tmp2 = *b == *it ? tmp : tmp + 1;
      tmp = arr[i];
      ret = arr[i] = tmp > ret ? tmp2 > ret ? ret + 1 : tmp2 : tmp2 > tmp ? tmp + 1 : tmp2;
    }
  }

  return ret;
}

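/* Note (illustration only): this is the textbook two-row formulation, kept
 * as the fallback for non-random-access iterators. For example,
 * levenshteinRowBased<std::size_t> over "flaw" and "lawn" yields 2
 * (delete 'f', append 'n').
 */
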
template<typename Iterator1, typename Iterator2>
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd,
  std::random_access_iterator_tag, std::random_access_iterator_tag) {
  if (aEnd - a > bEnd - b) {
    return levenshtein(b, bEnd, a, aEnd);
  }

  // skip common prefixes and suffixes
  while (a < aEnd && a[0] == b[0])
    ++a, ++b;

  while (a < aEnd && aEnd[-1] == bEnd[-1])
    --aEnd, --bEnd;

  std::size_t aLen = aEnd - a;
  std::size_t bLen = bEnd - b;

  if (aLen == 0) {
    return bLen;
  }

  if (aLen == 1) {
    return bLen - (std::find(b, bEnd, *a) == bEnd ? 0 : 1);
  }

  if (aLen + bLen <= std::numeric_limits<std::uint32_t>::max())
    return levenshteinDiagonal<std::uint32_t>(a, aEnd, b, bEnd);

  return levenshteinDiagonal<std::size_t>(a, aEnd, b, bEnd);
}

template<typename Iterator1, typename Iterator2>
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd,
  std::bidirectional_iterator_tag, std::bidirectional_iterator_tag) {
  // skip common prefixes and suffixes
  while (a != aEnd && b != bEnd && *a == *b)
    ++a, ++b;

  while (a != aEnd && b != bEnd && *std::prev(aEnd) == *std::prev(bEnd))
    --aEnd, --bEnd;

  if (a == aEnd) {
    return std::distance(b, bEnd);
  }

  if (b == bEnd) {
    return std::distance(a, aEnd);
  }

  if (std::next(a) == aEnd) {
    std::size_t ret = 0, found = 0;
    for (; b != bEnd; ++b, ++ret)
      if (*b == *a)
        found = 1;
    return ret - found;
  }

  if (std::next(b) == bEnd) {
    std::size_t ret = 0, found = 0;
    for (; a != aEnd; ++a, ++ret)
      if (*b == *a)
        found = 1;
    return ret - found;
  }

  return levenshteinRowBased<std::size_t>(a, aEnd, b, bEnd);
}

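/* Example (illustration only): bidirectional iterators such as std::list's
 * end up here via tag dispatch.
 *
 *   std::list<char> x = {'a','b','c'};
 *   std::list<char> y = {'a','c'};
 *   // the common prefix 'a' and suffix 'c' are trimmed first; distance is 1
 *   assert(levenshteinSSE::levenshtein(x.begin(), x.end(),
 *                                      y.begin(), y.end()) == 1);
 */
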
// SFINAE checker for .data() and .size()
template<typename T>
class hasDataAndSize {
private:
  struct char2 { char _[2]; };
  template<typename U> static auto test(U* a) -> decltype(a->data() + a->size(), char(0));
  template<typename U> static auto test(const U* a) -> char2;
public:
  static constexpr bool value = sizeof(test(static_cast<T*>(nullptr))) == 1;
};

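/* Example (illustration only; the class name above is reconstructed, since
 * the Doxygen listing dropped it): the checker distinguishes contiguous
 * containers from node-based ones at compile time, e.g.
 *
 *   static_assert(hasDataAndSize<std::string>::value, "has .data()/.size()");
 *   static_assert(!hasDataAndSize<std::list<char>>::value, "no .data()");
 */
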
template<bool useDataAndSize>
struct LevenshteinContainer {};

template<>
struct LevenshteinContainer<true> {
template<typename Container1, typename Container2>
static inline std::size_t calc(const Container1& a, const Container2& b) {
  return levenshtein(a.data(), a.data() + a.size(), b.data(), b.data() + b.size());
}
};

template<>
struct LevenshteinContainer<false> {
template<typename Container1, typename Container2>
static inline std::size_t calc(const Container1& a, const Container2& b) {
  return levenshtein(std::begin(a), std::end(a), std::begin(b), std::end(b));
}
};

template<typename Iterator1, typename Iterator2>
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) {
  return levenshtein(a, aEnd, b, bEnd,
    typename std::iterator_traits<Iterator1>::iterator_category(),
    typename std::iterator_traits<Iterator2>::iterator_category());
}

template<typename Container1, typename Container2>
std::size_t levenshtein(const Container1& a, const Container2& b) {
  return LevenshteinContainer<hasDataAndSize<Container1>::value &&
    hasDataAndSize<Container2>::value>::calc(a, b);
}
}

#endif

// clang-format on
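
/* Complete usage sketch (added for illustration; the file name and compiler
 * flag are assumptions, not part of the original header):
 *
 *   // g++ -O2 -mavx2 example.cpp
 *   #include "levenshtein-sse.hpp"
 *   #include <iostream>
 *   #include <string>
 *
 *   int main() {
 *     std::string a = "example", b = "samples";
 *     std::cout << levenshteinSSE::levenshtein(a, b) << '\n';  // prints 3
 *   }
 */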