/*
 * levenshtein-sse.hpp
 *
 * Created on: Sep 15, 2020
 * Added by: Nicola Mori
 */

/* The code has been obtained from https://github.com/addaleax/levenshtein-sse
 *
 * This file contains the original code of levenshtein-sse.hpp as obtained
 * from GitHub, plus its license. The code is guarded against clang-format
 * to keep the original indentation and to ease comparison with the original
 * version.
 */

// clang-format off

#ifndef LSTSSE_LEVENSHTEIN_SSE_HPP
#define LSTSSE_LEVENSHTEIN_SSE_HPP

#include <algorithm>
#include <vector>
#include <iterator>
#include <limits>       // std::numeric_limits, used below
#include <type_traits>  // std::conditional, std::is_scalar, used below
#include <cstdint>
#include <cassert>
#ifdef __SSSE3__
#include <tmmintrin.h>
#endif
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
#ifdef __AVX2__
#include <immintrin.h>
#endif
namespace levenshteinSSE {

template<typename Iterator1, typename Iterator2>
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd);

template<typename Container1, typename Container2>
std::size_t levenshtein(const Container1& a, const Container2& b);
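
/* Usage sketch for the two entry points above (illustrative only; the
 * variables are placeholders):
 *
 *   std::string s1 = "kitten", s2 = "sitting";
 *   std::size_t d1 = levenshteinSSE::levenshtein(s1, s2);   // container overload, d1 == 3
 *   std::size_t d2 = levenshteinSSE::levenshtein(s1.begin(), s1.end(),
 *                                                s2.begin(), s2.end()); // iterator overload
 */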

template <typename T, std::size_t N = 16>
class AlignmentAllocator;

#ifdef __SSE2__
#include <mm_malloc.h>
template <typename T, std::size_t N>
class AlignmentAllocator {
public:
  typedef T value_type;
  typedef std::size_t size_type;
  typedef std::ptrdiff_t difference_type;

  typedef T* pointer;
  typedef const T* const_pointer;

  typedef T& reference;
  typedef const T& const_reference;

  constexpr inline AlignmentAllocator () noexcept { }

  template <typename T2>
  constexpr inline AlignmentAllocator (const AlignmentAllocator<T2, N> &) noexcept { }

  inline ~AlignmentAllocator () noexcept { }

  inline pointer address (reference r) {
    return &r;
  }

  inline const_pointer address (const_reference r) const {
    return &r;
  }

  inline pointer allocate (size_type n) {
    // this allocator is special in that it leaves 4*N bytes before and after
    // the allocated area for garbage reads/writes
    return reinterpret_cast<pointer>(reinterpret_cast<char*>(_mm_malloc(n * sizeof(value_type) + 8 * N, N)) + 4*N);
  }

  inline void deallocate (pointer p, size_type) {
    _mm_free(reinterpret_cast<char*>(p) - 4*N);
  }

  inline void construct (pointer p, const value_type& value) {
    new (p) value_type (value);
  }

  inline void destroy (pointer p) {
    p->~value_type ();
  }

  constexpr inline size_type max_size () const noexcept {
    return size_type (-1) / sizeof (value_type);
  }

  template <typename T2>
  struct rebind {
    typedef AlignmentAllocator<T2, N> other;
  };

  constexpr bool operator!=(const AlignmentAllocator<T,N>& other) const {
    return !(*this == other);
  }

  // Returns true if and only if storage allocated from *this
  // can be deallocated from other, and vice versa.
  // Always returns true for stateless allocators.
  constexpr bool operator==(const AlignmentAllocator<T,N>& other) const {
    return other.usesMMAlloc;
  }

  static constexpr bool usesMMAlloc = true;
};
#endif

template <typename T>
class AlignmentAllocator<T, 1> : public std::allocator<T> {
public:
  static constexpr bool usesMMAlloc = false;
};

#ifdef __SSSE3__
constexpr std::size_t alignment = 16;
#else
constexpr std::size_t alignment = 1;
#endif
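
/* Minimal sketch of how the allocator and `alignment` are combined below
 * (see levenshteinDiagonal):
 *
 *   typedef AlignmentAllocator<std::uint32_t, alignment> Alloc;
 *   std::vector<std::uint32_t, Alloc> diag(len + 1, 0);
 *
 * The 4*N guard bytes on each side let the SIMD kernels load and store
 * 16/32-byte vectors slightly past the logical ends of the buffer without
 * leaving the allocation.
 */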

template<typename Vec1, typename Vec2, typename Iterator1, typename Iterator2>
struct LevenshteinIterationBase {
static inline void perform(const Iterator1& a, const Iterator2& b,
  std::size_t& i, std::size_t j, std::size_t bLen, Vec1& diag, const Vec2& diag2)
{
  std::uint32_t min = std::min(diag2[i], diag2[i-1]);
  if (min < diag[i-1]) {
    diag[i] = min + 1;
  }
  else {
    diag[i] = diag[i-1] + (a[i-1] != b[j-1]);
  }
  --i;
}
};
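
/* Explanatory note: the branch in perform() is equivalent to the textbook
 * recurrence
 *
 *   diag[i] = min(diag2[i]   + 1,                    // deletion
 *                 diag2[i-1] + 1,                    // insertion
 *                 diag[i-1]  + (a[i-1] != b[j-1]));  // substitution / match
 *
 * If min(diag2[i], diag2[i-1]) < diag[i-1], that minimum plus one cannot lose
 * against diag[i-1] + cost; otherwise diag[i-1] + cost (cost <= 1) is at most
 * min(diag2[i], diag2[i-1]) + 1.
 */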

#ifdef __SSSE3__
#ifndef __SSE4_1__
// straightforward _mm_min_epi32 polyfill using only SSE2
inline __m128i _mm_min_epi32 (__m128i a, __m128i b) {
  __m128i compare = _mm_cmpgt_epi32(a, b);
  // _mm_andnot_si128(x, y) computes ~x & y, so the mask goes first:
  // keep a where a <= b, keep b where a > b
  __m128i aIsSmaller = _mm_andnot_si128(compare, a);
  __m128i bIsSmaller = _mm_and_si128 (b, compare);
  return _mm_or_si128(aIsSmaller, bIsSmaller);
}
#endif // __SSE4_1__
#endif // __SSSE3__

#ifdef __AVX2__
// provide an AVX2 version of _mm256_alignr_epi32(a, b, 7)
// (i.e. shift a one epi32 lane to the left and write the highest
// lane of b to the lowest lane of the result)
/*#ifndef __AVX512F__*/
__m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) {
  const __m256i rotl1_256_epi32 = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
  __m256i combined = _mm256_blend_epi32(a, b, 0x80);
  return _mm256_permutevar8x32_epi32(combined, rotl1_256_epi32);
}
/*
#else
__m256i _mm256_alignr_epi32_7(__m256i a, __m256i b) {
  return _mm256_alignr_epi32(a, b, 7);
}
#endif*/
#endif
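
/* Lane-level effect of _mm256_alignr_epi32_7 (illustrative; lanes listed
 * from lowest to highest):
 *
 *   a = [a0 a1 a2 a3 a4 a5 a6 a7],  b = [b0 b1 b2 b3 b4 b5 b6 b7]
 *   blend(a, b, 0x80)             -> [a0 a1 a2 a3 a4 a5 a6 b7]
 *   permute by (7,0,1,2,3,4,5,6)  -> [b7 a0 a1 a2 a3 a4 a5 a6]
 */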

template<typename T>
struct LevenshteinIterationSIMD {
/* Decide which implementation is acceptable */
static inline void performSIMD(const T* a, const T* b,
  std::size_t& i, std::size_t j, std::size_t bLen,
  std::uint32_t* diag, const std::uint32_t* diag2)
{

#ifdef __AVX2__
  if (i >= 32 && bLen - j >= 32) {
    performSSE_AVX2(a, b, i, j, bLen, diag, diag2);
    return;
  }
#endif

#ifdef __SSSE3__
  if (i >= 16 && bLen - j >= 16) {
    performSSE(a, b, i, j, bLen, diag, diag2);
    return;
  }
#endif

  LevenshteinIterationBase<std::uint32_t*, const std::uint32_t*, const T*, const T*>
    ::perform(a, b, i, j, bLen, diag, diag2);
}

#ifdef __SSSE3__
static inline void performSSE(const T* a, const T* b,
  std::size_t& i, std::size_t j, std::size_t bLen,
  std::uint32_t* diag, const std::uint32_t* diag2)
{
  const __m128i one128_epi32 = _mm_set1_epi32(1);
  const __m128i reversedIdentity128_epi8 = _mm_setr_epi8(
    15, 14, 13, 12,
    11, 10,  9,  8,
     7,  6,  5,  4,
     3,  2,  1,  0
  );
  const __m128i reversedIdentity128_epi16 = _mm_setr_epi8(
    14, 15, 12, 13,
    10, 11,  8,  9,
     6,  7,  4,  5,
     2,  3,  0,  1
  );
  const __m128i blockwiseReversed128_epi8_4 = _mm_setr_epi8(
     3,  2,  1,  0,
     7,  6,  5,  4,
    11, 10,  9,  8,
    15, 14, 13, 12
  );
  const __m128i blockwiseReversed128_epi16_4 = _mm_setr_epi8(
     6,  7,  4,  5,
     2,  3,  0,  1,
    14, 15, 12, 13,
    10, 11,  8,  9
  );

  __m128i substitutionCost32[4];
  std::size_t k;

  // We support 1, 2, and 4 byte objects for SSE comparison.
  // We always process 16 entries at once, so we may need multiple fetches
  // depending on object size.
  if (sizeof(T) <= 2) {
    __m128i substitutionCost16LX, substitutionCost16HX;

    if (sizeof(T) == 1) {
      __m128i a_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-16]));
      __m128i b_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1]));
      a_ = _mm_shuffle_epi8(a_, reversedIdentity128_epi8);

      // int substitutionCost = a[i-1] == b[j-1] ? -1 : 0;

      // diag/diag2 will contain the following entries from the diagonal:
      // index   [0]0 [0]1 [0]2 [0]3 [1]0 [1]1 [1]2 [1]3 ... [4]0 [4]1 [4]2 [4]3
      // diag+i    -3   -2   -1    0   -7   -6   -5   -4 ...  -19  -18  -17  -16

      // substitutionCost8 contains the comparisons for the following indices in a and b:
      // index   0   1   2   3   4   5   6   7   8    9   10   11   12   13   14   15
      // a+i    -1  -2  -3  -4  -5  -6  -7  -8  -9  -10  -11  -12  -13  -14  -15  -16
      // b+j    -1   0   1   2   3   4   5   6   7    8    9   10   11   12   13   14
      // in substitutionCost8X we shuffle this so that it matches up with diag/diag2
      // (barring the offset induced by comparing i-1 against j-1)
      __m128i substitutionCost8 = _mm_cmpeq_epi8(a_, b_);
      __m128i substitutionCost8X = _mm_shuffle_epi8(substitutionCost8, blockwiseReversed128_epi8_4);
      // unpack with self so that 00 -> 00 00 and ff -> ff ff
      substitutionCost16LX = _mm_unpacklo_epi8(substitutionCost8X, substitutionCost8X);
      substitutionCost16HX = _mm_unpackhi_epi8(substitutionCost8X, substitutionCost8X);
    } else {
      __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-8]));
      __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-16]));
      __m128i b0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1]));
      __m128i b1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j+7]));
      a0 = _mm_shuffle_epi8(a0, reversedIdentity128_epi16);
      a1 = _mm_shuffle_epi8(a1, reversedIdentity128_epi16);
      __m128i substitutionCost16L = _mm_cmpeq_epi16(a0, b0);
      __m128i substitutionCost16H = _mm_cmpeq_epi16(a1, b1);
      substitutionCost16LX = _mm_shuffle_epi8(substitutionCost16L, blockwiseReversed128_epi16_4);
      substitutionCost16HX = _mm_shuffle_epi8(substitutionCost16H, blockwiseReversed128_epi16_4);
    }

    substitutionCost32[0] = _mm_unpacklo_epi16(substitutionCost16LX, substitutionCost16LX);
    substitutionCost32[1] = _mm_unpackhi_epi16(substitutionCost16LX, substitutionCost16LX);
    substitutionCost32[2] = _mm_unpacklo_epi16(substitutionCost16HX, substitutionCost16HX);
    substitutionCost32[3] = _mm_unpackhi_epi16(substitutionCost16HX, substitutionCost16HX);
  } else {
    assert(sizeof(T) == 4);

    for (k = 0; k < 4; ++k) {
      __m128i a_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&a[i-4-k*4]));
      __m128i b_ = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&b[j-1+k*4]));
      b_ = _mm_shuffle_epi32(b_, 0x1b); // simple reverse
      substitutionCost32[k] = _mm_cmpeq_epi32(a_, b_);
    }
  }

  __m128i diag_[5], diag2_[5];
  for (k = 0; k < 5; ++k) {
    diag_ [k] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&diag [i-3-k*4]));
  }
  for (k = 0; k < 5; ++k) {
    diag2_[k] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&diag2[i-3-k*4]));
  }

  // There's a ton of debug stuff down here. You can use it for reference
  // or, of course, compile with -DLSTSSE_DEBUG if you're feeling funny.
#ifdef LSTSSE_DEBUG
  for (k = 0; k < 5; ++k) {
    std::uint32_t diag0 = _mm_extract_epi32(diag_[k], 0);
    std::uint32_t diag1 = _mm_extract_epi32(diag_[k], 1);
    std::uint32_t diag2 = _mm_extract_epi32(diag_[k], 2);
    std::uint32_t diag3 = _mm_extract_epi32(diag_[k], 3);
    assert(diag0 == diag[i-k*4-3]);
    assert(diag1 == diag[i-k*4-2]);
    assert(diag2 == diag[i-k*4-1]);
    assert(diag3 == diag[i-k*4-0]);
  }
  for (k = 0; k < 5; ++k) {
    std::uint32_t diag20 = _mm_extract_epi32(diag2_[k], 0);
    std::uint32_t diag21 = _mm_extract_epi32(diag2_[k], 1);
    std::uint32_t diag22 = _mm_extract_epi32(diag2_[k], 2);
    std::uint32_t diag23 = _mm_extract_epi32(diag2_[k], 3);
    assert(diag20 == diag2[i-k*4-3]);
    assert(diag21 == diag2[i-k*4-2]);
    assert(diag22 == diag2[i-k*4-1]);
    assert(diag23 == diag2[i-k*4-0]);
  }

  for (k = 0; k < 4; ++k) {
    int sc0 = _mm_extract_epi32(substitutionCost32[k], 0);
    int sc1 = _mm_extract_epi32(substitutionCost32[k], 1);
    int sc2 = _mm_extract_epi32(substitutionCost32[k], 2);
    int sc3 = _mm_extract_epi32(substitutionCost32[k], 3);
    assert(sc0 == ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0));
    assert(sc1 == ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0));
    assert(sc2 == ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0));
    assert(sc3 == ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0));
  }
#endif // LSTSSE_DEBUG

  // reminders:
  // the arrays (diag, diag2, substitutionCost32) correspond to i:
  // index   [0]0 [0]1 [0]2 [0]3 [1]0 [1]1 [1]2 [1]3 ... [4]0 [4]1 [4]2 [4]3
  // i+        -3   -2   -1    0   -7   -6   -5   -4 ...  -19  -18  -17  -16
  // so when we need to access ...[i-1], we need to align the entries
  // in *reverse* order

  // diag[i] = min(
  //   diag2[i-1],
  //   diag2[i],
  //   diag[i-1] + substitutionCost
  // ) + 1;
  //
  for (k = 0; k < 4; ++k) {
    __m128i diag2_i_m1 = _mm_alignr_epi8(diag2_[k], diag2_[k+1], 12);
    __m128i diag_i_m1  = _mm_alignr_epi8(diag_ [k], diag_ [k+1], 12);

    __m128i result3 = _mm_add_epi32(diag_i_m1, substitutionCost32[k]);
    __m128i min = _mm_min_epi32(_mm_min_epi32(diag2_i_m1, diag2_[k]), result3);
    min = _mm_add_epi32(min, one128_epi32);

#ifdef LSTSSE_DEBUG
    std::uint32_t diag_i_m10 = _mm_extract_epi32(diag_i_m1, 0);
    std::uint32_t diag_i_m11 = _mm_extract_epi32(diag_i_m1, 1);
    std::uint32_t diag_i_m12 = _mm_extract_epi32(diag_i_m1, 2);
    std::uint32_t diag_i_m13 = _mm_extract_epi32(diag_i_m1, 3);
    assert(diag_i_m10 == diag[i-k*4-4]);
    assert(diag_i_m11 == diag[i-k*4-3]);
    assert(diag_i_m12 == diag[i-k*4-2]);
    assert(diag_i_m13 == diag[i-k*4-1]);

    std::uint32_t diag2_i_m10 = _mm_extract_epi32(diag2_i_m1, 0);
    std::uint32_t diag2_i_m11 = _mm_extract_epi32(diag2_i_m1, 1);
    std::uint32_t diag2_i_m12 = _mm_extract_epi32(diag2_i_m1, 2);
    std::uint32_t diag2_i_m13 = _mm_extract_epi32(diag2_i_m1, 3);
    assert(diag2_i_m10 == diag2[i-k*4-4]);
    assert(diag2_i_m11 == diag2[i-k*4-3]);
    assert(diag2_i_m12 == diag2[i-k*4-2]);
    assert(diag2_i_m13 == diag2[i-k*4-1]);

    std::uint32_t result30 = _mm_extract_epi32(result3, 0);
    std::uint32_t result31 = _mm_extract_epi32(result3, 1);
    std::uint32_t result32 = _mm_extract_epi32(result3, 2);
    std::uint32_t result33 = _mm_extract_epi32(result3, 3);

    assert(result30 == diag[i-1-k*4-3] + ((a[i-1-k*4-3] == b[j-1+k*4+3]) ? -1 : 0));
    assert(result31 == diag[i-1-k*4-2] + ((a[i-1-k*4-2] == b[j-1+k*4+2]) ? -1 : 0));
    assert(result32 == diag[i-1-k*4-1] + ((a[i-1-k*4-1] == b[j-1+k*4+1]) ? -1 : 0));
    assert(result33 == diag[i-1-k*4-0] + ((a[i-1-k*4-0] == b[j-1+k*4+0]) ? -1 : 0));
#endif // LSTSSE_DEBUG

    _mm_storeu_si128(reinterpret_cast<__m128i*>(&diag[i-k*4-3]), min);
  }

  // We just handled 16 entries at once. Yay!
  i -= 16;
}
#endif // __SSSE3__


#ifdef __AVX2__
static inline void performSSE_AVX2(const T* a, const T* b,
  std::size_t& i, std::size_t j, std::size_t bLen,
  std::uint32_t* diag, const std::uint32_t* diag2)
{
  const __m256i one256_epi32 = _mm256_set1_epi32(1);
  const __m256i reversedIdentity256_epi8 = _mm256_setr_epi8(
    15, 14, 13, 12,
    11, 10,  9,  8,
     7,  6,  5,  4,
     3,  2,  1,  0,
    15, 14, 13, 12,
    11, 10,  9,  8,
     7,  6,  5,  4,
     3,  2,  1,  0
  );

  const __m256i blockwiseReversed256_epi8_8 = _mm256_setr_epi8(
     7,  6,  5,  4,
     3,  2,  1,  0,
    15, 14, 13, 12,
    11, 10,  9,  8,
     7,  6,  5,  4,
     3,  2,  1,  0,
    15, 14, 13, 12,
    11, 10,  9,  8
  );

  const __m256i blockwiseReversed256_epi16_8 = _mm256_setr_epi8(
    14, 15, 12, 13,
    10, 11,  8,  9,
     6,  7,  4,  5,
     2,  3,  0,  1,
    14, 15, 12, 13,
    10, 11,  8,  9,
     6,  7,  4,  5,
     2,  3,  0,  1
  );

  __m256i substitutionCost32[4];
  std::size_t k;

  if (sizeof(T) <= 2) {
    if (sizeof(T) == 1) {
      __m256i a_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-32]));
      __m256i b_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1]));
      a_ = _mm256_shuffle_epi8(a_, reversedIdentity256_epi8);
      a_ = _mm256_permute2x128_si256(a_, a_, 1); // swap hi/lo

      __m256i substitutionCost8 = _mm256_cmpeq_epi8(a_, b_);
      __m256i substitutionCost8X = _mm256_shuffle_epi8(substitutionCost8, blockwiseReversed256_epi8_8);
      __m128i sc8Xlo = _mm256_extracti128_si256(substitutionCost8X, 0);
      __m128i sc8Xhi = _mm256_extracti128_si256(substitutionCost8X, 1);
      substitutionCost32[0] = _mm256_cvtepi8_epi32(sc8Xlo);
      substitutionCost32[1] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xlo, 8));
      substitutionCost32[2] = _mm256_cvtepi8_epi32(sc8Xhi);
      substitutionCost32[3] = _mm256_cvtepi8_epi32(_mm_srli_si128(sc8Xhi, 8));
    } else {
      __m256i a0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-16]));
      __m256i a1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-32]));
      __m256i b0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1]));
      __m256i b1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j+15]));
      a0 = _mm256_permute2x128_si256(a0, a0, 1); // swap hi/lo
      a1 = _mm256_permute2x128_si256(a1, a1, 1); // swap hi/lo
      b0 = _mm256_shuffle_epi8(b0, blockwiseReversed256_epi16_8);
      b1 = _mm256_shuffle_epi8(b1, blockwiseReversed256_epi16_8);
      __m256i substitutionCost16L = _mm256_cmpeq_epi16(a0, b0);
      __m256i substitutionCost16H = _mm256_cmpeq_epi16(a1, b1);
      __m128i sc16Llo = _mm256_extracti128_si256(substitutionCost16L, 0);
      __m128i sc16Lhi = _mm256_extracti128_si256(substitutionCost16L, 1);
      __m128i sc16Hlo = _mm256_extracti128_si256(substitutionCost16H, 0);
      __m128i sc16Hhi = _mm256_extracti128_si256(substitutionCost16H, 1);
      substitutionCost32[0] = _mm256_cvtepi16_epi32(sc16Llo);
      substitutionCost32[1] = _mm256_cvtepi16_epi32(sc16Lhi);
      substitutionCost32[2] = _mm256_cvtepi16_epi32(sc16Hlo);
      substitutionCost32[3] = _mm256_cvtepi16_epi32(sc16Hhi);
    }
  } else {
    assert(sizeof(T) == 4);

    for (k = 0; k < 4; ++k) {
      __m256i a_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i-8-k*8]));
      __m256i b_ = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[j-1+k*8]));
      b_ = _mm256_shuffle_epi32(b_, 0x1b); // simple reverse
      b_ = _mm256_permute2x128_si256(b_, b_, 1); // swap hi/lo
      substitutionCost32[k] = _mm256_cmpeq_epi32(a_, b_);
    }
  }

  __m256i diag_[5], diag2_[5];
  for (k = 0; k < 5; ++k) {
    diag_ [k] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&diag [i-7-k*8]));
  }
  for (k = 0; k < 5; ++k) {
    diag2_[k] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&diag2[i-7-k*8]));
  }

#ifdef LSTSSE_DEBUG
  for (k = 0; k < 5; ++k) {
    std::uint32_t diag0 = _mm256_extract_epi32(diag_[k], 0);
    std::uint32_t diag1 = _mm256_extract_epi32(diag_[k], 1);
    std::uint32_t diag2 = _mm256_extract_epi32(diag_[k], 2);
    std::uint32_t diag3 = _mm256_extract_epi32(diag_[k], 3);
    std::uint32_t diag4 = _mm256_extract_epi32(diag_[k], 4);
    std::uint32_t diag5 = _mm256_extract_epi32(diag_[k], 5);
    std::uint32_t diag6 = _mm256_extract_epi32(diag_[k], 6);
    std::uint32_t diag7 = _mm256_extract_epi32(diag_[k], 7);
    assert(diag0 == diag[i-k*8-7]);
    assert(diag1 == diag[i-k*8-6]);
    assert(diag2 == diag[i-k*8-5]);
    assert(diag3 == diag[i-k*8-4]);
    assert(diag4 == diag[i-k*8-3]);
    assert(diag5 == diag[i-k*8-2]);
    assert(diag6 == diag[i-k*8-1]);
    assert(diag7 == diag[i-k*8-0]);
  }
  for (k = 0; k < 5; ++k) {
    std::uint32_t diag20 = _mm256_extract_epi32(diag2_[k], 0);
    std::uint32_t diag21 = _mm256_extract_epi32(diag2_[k], 1);
    std::uint32_t diag22 = _mm256_extract_epi32(diag2_[k], 2);
    std::uint32_t diag23 = _mm256_extract_epi32(diag2_[k], 3);
    std::uint32_t diag24 = _mm256_extract_epi32(diag2_[k], 4);
    std::uint32_t diag25 = _mm256_extract_epi32(diag2_[k], 5);
    std::uint32_t diag26 = _mm256_extract_epi32(diag2_[k], 6);
    std::uint32_t diag27 = _mm256_extract_epi32(diag2_[k], 7);
    assert(diag20 == diag2[i-k*8-7]);
    assert(diag21 == diag2[i-k*8-6]);
    assert(diag22 == diag2[i-k*8-5]);
    assert(diag23 == diag2[i-k*8-4]);
    assert(diag24 == diag2[i-k*8-3]);
    assert(diag25 == diag2[i-k*8-2]);
    assert(diag26 == diag2[i-k*8-1]);
    assert(diag27 == diag2[i-k*8-0]);
  }

  for (k = 0; k < 4; ++k) {
    int sc0 = _mm256_extract_epi32(substitutionCost32[k], 0);
    int sc1 = _mm256_extract_epi32(substitutionCost32[k], 1);
    int sc2 = _mm256_extract_epi32(substitutionCost32[k], 2);
    int sc3 = _mm256_extract_epi32(substitutionCost32[k], 3);
    int sc4 = _mm256_extract_epi32(substitutionCost32[k], 4);
    int sc5 = _mm256_extract_epi32(substitutionCost32[k], 5);
    int sc6 = _mm256_extract_epi32(substitutionCost32[k], 6);
    int sc7 = _mm256_extract_epi32(substitutionCost32[k], 7);
    assert(sc0 == ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0));
    assert(sc1 == ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0));
    assert(sc2 == ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0));
    assert(sc3 == ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0));
    assert(sc4 == ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0));
    assert(sc5 == ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0));
    assert(sc6 == ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0));
    assert(sc7 == ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0));
  }
#endif // LSTSSE_DEBUG

  for (k = 0; k < 4; ++k) {
    __m256i diag2_i_m1 = _mm256_alignr_epi32_7(diag2_[k], diag2_[k+1]);
    __m256i diag_i_m1  = _mm256_alignr_epi32_7(diag_ [k], diag_ [k+1]);

    __m256i result3 = _mm256_add_epi32(diag_i_m1, substitutionCost32[k]);
    __m256i min = _mm256_min_epi32(_mm256_min_epi32(diag2_i_m1, diag2_[k]), result3);
    min = _mm256_add_epi32(min, one256_epi32);

#ifdef LSTSSE_DEBUG
    std::uint32_t diag_i_m10 = _mm256_extract_epi32(diag_i_m1, 0);
    std::uint32_t diag_i_m11 = _mm256_extract_epi32(diag_i_m1, 1);
    std::uint32_t diag_i_m12 = _mm256_extract_epi32(diag_i_m1, 2);
    std::uint32_t diag_i_m13 = _mm256_extract_epi32(diag_i_m1, 3);
    std::uint32_t diag_i_m14 = _mm256_extract_epi32(diag_i_m1, 4);
    std::uint32_t diag_i_m15 = _mm256_extract_epi32(diag_i_m1, 5);
    std::uint32_t diag_i_m16 = _mm256_extract_epi32(diag_i_m1, 6);
    std::uint32_t diag_i_m17 = _mm256_extract_epi32(diag_i_m1, 7);
    assert(diag_i_m10 == diag[i-k*8-8]);
    assert(diag_i_m11 == diag[i-k*8-7]);
    assert(diag_i_m12 == diag[i-k*8-6]);
    assert(diag_i_m13 == diag[i-k*8-5]);
    assert(diag_i_m14 == diag[i-k*8-4]);
    assert(diag_i_m15 == diag[i-k*8-3]);
    assert(diag_i_m16 == diag[i-k*8-2]);
    assert(diag_i_m17 == diag[i-k*8-1]);

    std::uint32_t diag2_i_m10 = _mm256_extract_epi32(diag2_i_m1, 0);
    std::uint32_t diag2_i_m11 = _mm256_extract_epi32(diag2_i_m1, 1);
    std::uint32_t diag2_i_m12 = _mm256_extract_epi32(diag2_i_m1, 2);
    std::uint32_t diag2_i_m13 = _mm256_extract_epi32(diag2_i_m1, 3);
    std::uint32_t diag2_i_m14 = _mm256_extract_epi32(diag2_i_m1, 4);
    std::uint32_t diag2_i_m15 = _mm256_extract_epi32(diag2_i_m1, 5);
    std::uint32_t diag2_i_m16 = _mm256_extract_epi32(diag2_i_m1, 6);
    std::uint32_t diag2_i_m17 = _mm256_extract_epi32(diag2_i_m1, 7);
    assert(diag2_i_m10 == diag2[i-k*8-8]);
    assert(diag2_i_m11 == diag2[i-k*8-7]);
    assert(diag2_i_m12 == diag2[i-k*8-6]);
    assert(diag2_i_m13 == diag2[i-k*8-5]);
    assert(diag2_i_m14 == diag2[i-k*8-4]);
    assert(diag2_i_m15 == diag2[i-k*8-3]);
    assert(diag2_i_m16 == diag2[i-k*8-2]);
    assert(diag2_i_m17 == diag2[i-k*8-1]);

    std::uint32_t result30 = _mm256_extract_epi32(result3, 0);
    std::uint32_t result31 = _mm256_extract_epi32(result3, 1);
    std::uint32_t result32 = _mm256_extract_epi32(result3, 2);
    std::uint32_t result33 = _mm256_extract_epi32(result3, 3);
    std::uint32_t result34 = _mm256_extract_epi32(result3, 4);
    std::uint32_t result35 = _mm256_extract_epi32(result3, 5);
    std::uint32_t result36 = _mm256_extract_epi32(result3, 6);
    std::uint32_t result37 = _mm256_extract_epi32(result3, 7);

    assert(result30 == diag[i-1-k*8-7] + ((a[i-1-k*8-7] == b[j-1+k*8+7]) ? -1 : 0));
    assert(result31 == diag[i-1-k*8-6] + ((a[i-1-k*8-6] == b[j-1+k*8+6]) ? -1 : 0));
    assert(result32 == diag[i-1-k*8-5] + ((a[i-1-k*8-5] == b[j-1+k*8+5]) ? -1 : 0));
    assert(result33 == diag[i-1-k*8-4] + ((a[i-1-k*8-4] == b[j-1+k*8+4]) ? -1 : 0));
    assert(result34 == diag[i-1-k*8-3] + ((a[i-1-k*8-3] == b[j-1+k*8+3]) ? -1 : 0));
    assert(result35 == diag[i-1-k*8-2] + ((a[i-1-k*8-2] == b[j-1+k*8+2]) ? -1 : 0));
    assert(result36 == diag[i-1-k*8-1] + ((a[i-1-k*8-1] == b[j-1+k*8+1]) ? -1 : 0));
    assert(result37 == diag[i-1-k*8-0] + ((a[i-1-k*8-0] == b[j-1+k*8+0]) ? -1 : 0));
#endif // LSTSSE_DEBUG

    _mm256_storeu_si256(reinterpret_cast<__m256i*>(&diag[i-k*8-7]), min);
  }

  // We just handled 32 entries at once. Yay!
  i -= 32;
}
#endif // __AVX2__

};

template<typename Vec1, typename Vec2, typename Iterator1, typename Iterator2>
struct LevenshteinIteration : LevenshteinIterationBase<Vec1, Vec2, Iterator1, Iterator2> {
};

template<typename Alloc1, typename Alloc2, typename T>
struct LevenshteinIterationSIMDWrap {
static inline void perform(const T* a, const T* b,
  std::size_t& i, std::size_t j, std::size_t bLen,
  std::vector<std::uint32_t, Alloc1>& diag,
  const std::vector<std::uint32_t, Alloc2>& diag2) {
  return LevenshteinIterationSIMD<T>::performSIMD(a, b, i, j, bLen, diag.data(), diag2.data());
}
};

template<typename Alloc1, typename Alloc2, typename T>
struct LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
  : std::conditional<std::is_scalar<T>::value && (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4),
    LevenshteinIterationSIMDWrap<Alloc1, Alloc2, T>,
    LevenshteinIterationBase<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
  >::type
{ };

template<typename Alloc1, typename Alloc2, typename T>
struct LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, T*, T*>
  : LevenshteinIteration<std::vector<std::uint32_t, Alloc1>, std::vector<std::uint32_t, Alloc2>, const T*, const T*>
{ };

template<typename T, typename Iterator1, typename Iterator2>
T levenshteinDiagonal(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) {
  const std::size_t aLen = aEnd - a;
  const std::size_t bLen = bEnd - b;

  assert(0 < aLen);
  assert(aLen <= bLen);

  typedef AlignmentAllocator<T, alignment> Alloc;
  std::vector<T, Alloc> diag (aLen + 1, T(0));
  std::vector<T, Alloc> diag2(aLen + 1, T(0));

  std::size_t i, j, k;

  k = 0;
  for (k = 1; ; ++k) {
    assert(k <= aLen + bLen);

    std::size_t startRow = k > bLen ? k - bLen : 1;
    std::size_t endRow = k > aLen ? aLen : k - 1;

    assert(endRow >= startRow || k == 1);

    for (i = endRow; i >= startRow; ) {
      assert(i < k);
      j = k - i;

      assert(bLen >= j);
      assert(aLen >= i);

      LevenshteinIteration<std::vector<T, Alloc>, std::vector<T, Alloc>, Iterator1, Iterator2>
        ::perform(a, b, i, j, bLen, diag, diag2);
    }

    diag[0] = k;

    if (k <= aLen) {
      diag[k] = k;
    }

    if (k == aLen + bLen) {
      assert(startRow == endRow);
      return diag[startRow];
    }

    // switch buffers
    std::swap(diag, diag2);
  }

  assert(0);
}
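
/* Worked example of the traversal above (illustrative): anti-diagonal k
 * visits every DP cell (i, j) with i + j == k, with diag2 holding diagonal
 * k-2 and diag holding k-1 on entry and k on exit. For aLen = 2, bLen = 3
 * the cells touched per iteration are
 *
 *   k=2: (1,1)    k=3: (2,1) (1,2)    k=4: (2,2) (1,3)    k=5: (2,3)
 *
 * and the distance is returned from diag[startRow] once k == aLen + bLen.
 */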

template<typename T, typename Iterator1, typename Iterator2>
T levenshteinRowBased(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) {
  std::vector<T> arr;

  std::size_t i = 0, j = 0;
  T ret(0);

  for (Iterator1 it = a; it != aEnd; ++it) {
    arr.push_back(++i);
  }

  arr.shrink_to_fit();

  for (; b != bEnd; ++b) {
    T tmp = j++;
    ret = j;
    i = 0;

    for (Iterator1 it = a; it != aEnd; ++it, ++i) {
      T tmp2 = *b == *it ? tmp : tmp + 1;
      tmp = arr[i];
      // branchy evaluation of min(tmp + 1, ret + 1, tmp2)
      ret = arr[i] = tmp > ret ? tmp2 > ret ? ret + 1 : tmp2 : tmp2 > tmp ? tmp + 1 : tmp2;
    }
  }

  return ret;
}
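
/* Explanatory note: this is the classic single-row formulation. arr[] holds
 * the previous DP row, tmp the up-left (diagonal) cell, and ret the freshly
 * computed cell to the left, so each inner step evaluates
 *
 *   new[i] = min(old[i] + 1, new[i-1] + 1, old[i-1] + (*it != *b));
 */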

template<typename Iterator1, typename Iterator2>
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd,
    std::random_access_iterator_tag, std::random_access_iterator_tag) {
  if (aEnd - a > bEnd - b) {
    return levenshtein(b, bEnd, a, aEnd);
  }

  // skip common prefixes and suffixes
  while (a < aEnd && a[0] == b[0])
    ++a, ++b;

  while (a < aEnd && aEnd[-1] == bEnd[-1])
    --aEnd, --bEnd;

  std::size_t aLen = aEnd - a;
  std::size_t bLen = bEnd - b;

  if (aLen == 0) {
    return bLen;
  }

  if (aLen == 1) {
    return bLen - (std::find(b, bEnd, *a) == bEnd ? 0 : 1);
  }

  if (aLen + bLen <= std::numeric_limits<std::uint32_t>::max())
    return levenshteinDiagonal<std::uint32_t>(a, aEnd, b, bEnd);

  return levenshteinDiagonal<std::size_t>(a, aEnd, b, bEnd);
}

template<typename Iterator1, typename Iterator2>
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd,
    std::bidirectional_iterator_tag, std::bidirectional_iterator_tag) {
  // skip common prefixes and suffixes
  while (a != aEnd && b != bEnd && *a == *b)
    ++a, ++b;

  while (a != aEnd && b != bEnd && *std::prev(aEnd) == *std::prev(bEnd))
    --aEnd, --bEnd;

  if (a == aEnd) {
    return std::distance(b, bEnd);
  }

  if (b == bEnd) {
    return std::distance(a, aEnd);
  }

  if (std::next(a) == aEnd) {
    std::size_t ret = 0, found = 0;
    for (; b != bEnd; ++b, ++ret)
      if (*b == *a)
        found = 1;
    return ret - found;
  }

  if (std::next(b) == bEnd) {
    std::size_t ret = 0, found = 0;
    for (; a != aEnd; ++a, ++ret)
      if (*b == *a)
        found = 1;
    return ret - found;
  }

  return levenshteinRowBased<std::size_t>(a, aEnd, b, bEnd);
}

// SFINAE checker for .data() and .size()
template<typename T>
class hasDataAndSize {
private:
  struct char2 { char _[2]; };
  template<typename U> static auto test(U* a) -> decltype(a->data() + a->size(), char(0));
  template<typename U> static auto test(const U* a) -> char2;
public:
  static constexpr bool value = sizeof(test(static_cast<T*>(nullptr))) == 1;
};
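
/* Sketch of how the checker resolves (illustrative):
 *
 *   hasDataAndSize<std::string>::value     // true:  data() + size() is well-formed
 *   hasDataAndSize<std::list<char>>::value // false: no data(), so the first
 *                                          // overload drops out via SFINAE
 */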

template<bool useDataAndSize>
struct LevenshteinContainer;

template<>
struct LevenshteinContainer<true> {
template<typename Container1, typename Container2>
static inline std::size_t calc(const Container1& a, const Container2& b) {
  return levenshtein(a.data(), a.data() + a.size(), b.data(), b.data() + b.size());
}
};

template<>
struct LevenshteinContainer<false> {
template<typename Container1, typename Container2>
static inline std::size_t calc(const Container1& a, const Container2& b) {
  return levenshtein(std::begin(a), std::end(a), std::begin(b), std::end(b));
}
};

template<typename Iterator1, typename Iterator2>
std::size_t levenshtein(Iterator1 a, Iterator1 aEnd, Iterator2 b, Iterator2 bEnd) {
  return levenshtein(a, aEnd, b, bEnd,
    typename std::iterator_traits<Iterator1>::iterator_category(),
    typename std::iterator_traits<Iterator2>::iterator_category());
}

template<typename Container1, typename Container2>
std::size_t levenshtein(const Container1& a, const Container2& b) {
  return LevenshteinContainer<hasDataAndSize<Container1>::value
    && hasDataAndSize<Container2>::value>::calc(a, b);
}
} // namespace levenshteinSSE

#endif

// clang-format on