fix: edit_distance_approx

Prevent crash from buffer overflow and improve the approximation of the
Levenshtein distance without sacrificing much performance.

Fixes: #7412
This commit is contained in:
smokespren 2023-05-03 10:21:33 -04:00 committed by Gunter Labes
parent 60184ff1e7
commit ec40bd211d
2 changed files with 71 additions and 41 deletions

View file

@ -23,6 +23,10 @@
#include "log.hpp"
#include "gettext.hpp"
#include <algorithm>
#include <array>
#include <utility>
static lg::log_domain log_engine("engine");
#define ERR_NG LOG_STREAM(err, log_engine)
#define WRN_NG LOG_STREAM(warn, log_engine)
@ -357,41 +361,69 @@ std::string vngettext_impl(const char* domain,
return msg;
}
int edit_distance_approx(const std::string &str_1, const std::string &str_2)
[[nodiscard]] std::size_t edit_distance_approx(std::string_view str_1, std::string_view str_2) noexcept
{
int edit_distance = 0;
if(str_1.length() == 0) {
return str_2.length();
// First, trim prefixes
auto s1_first = str_1.begin();
auto s2_first = str_2.begin();
while(s1_first != str_1.end() && s2_first != str_2.end() && *s1_first == *s2_first) {
++s1_first;
++s2_first;
}
else if(str_2.length() == 0) {
return str_1.length();
// Then, trim suffixes
auto s1_size = static_cast<std::size_t>(str_1.end() - s1_first);
auto s2_size = static_cast<std::size_t>(str_2.end() - s2_first);
while(s1_size != 0 && s2_size != 0 && s1_first[s1_size - 1] == s2_first[s2_size - 1]) {
--s1_size;
--s2_size;
}
else {
int j = 0;
int len_max = std::max(str_1.length(), str_2.length());
for(int i = 0; i < len_max; i++) {
if(str_1[i] != str_2[j]) {
//SWAP
if(str_1[i+1] == str_2[j] && str_1[i] == str_2[j+1]) {
// No need to test the next letter
i++;j++;
}
//ADDITION
else if(str_1[i+1] == str_2[j]) {
j--;
}
//DELETION
else if(str_1[i] == str_2[j+1]) {
i--;
}
// CHANGE (no need to do anything, next letter MAY be successful).
edit_distance++;
if(edit_distance * 100 / std::min(str_1.length(), str_2.length()) > 33) {
break;
}
if(s1_size == 0) {
return s2_size;
}
if(s2_size == 0) {
return s1_size;
}
// Limit the relevant characters to no more than 15
s1_size = std::min(s1_size, std::size_t{15});
s2_size = std::min(s2_size, std::size_t{15});
if(s1_size < s2_size) {
std::swap(s1_first, s2_first);
std::swap(s1_size, s2_size);
}
// This is an 'optimal string alignment distance' algorithm
// (https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance)
// with some optimizations. Two variables are used to track the previous row instead of using another array.
// `up` handles deletion, `row[j]` handles insertion, and `upper_left` handles substitution.
// This is a single row of the matrix
std::array<std::size_t, 16> row{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
for(std::size_t i = 0; i != s1_size; ++i) {
auto upper_left = i;
row[0] = i + 1;
for(std::size_t j = 0; j != s2_size; ++j) {
const auto up = row[j + 1];
const bool transposed = i > 0 && j > 0 && s1_first[i] == s2_first[j - 1] && s1_first[i - 1] == s2_first[j];
if(s1_first[i] != s2_first[j] && !transposed) {
row[j + 1] = std::min({up, row[j], upper_left}) + 1;
} else {
row[j + 1] = upper_left;
}
j++;
// When moving to the next element of a row, the previous `up` element is now the `upper_left`
upper_left = up;
}
}
return edit_distance;
return row[s2_size];
}

View file

@ -21,7 +21,9 @@
#include "serialization/string_utils.hpp"
#include <cstddef>
#include <ctime>
#include <string_view>
class variable_set;
@ -131,17 +133,13 @@ std::string vngettext_impl(const char* domain,
vngettext_impl(GETTEXT_DOMAIN, msgid, msgid_plural, count, __VA_ARGS__)
/**
* Approximately calculates the distance between two strings
* @brief Calculate the approximate edit distance of two strings.
*
* Inspired in the Levenshtein distance, but made simpler
* to avoid using recursion and wasting resources.
* @param str_1 First string to compare.
* @param str_2 Second string to compare.
*
* The consequence is that the function gets "lost"
* after two consecutive differences.
* @returns A score indicating how different the two strings are--the lower the score, the more similar the strings are.
*
* @param str_1 First string to compare
* @param str_2 Second string to compare
* @note To avoid dynamic allocation, this function limits the number of characters that participate in the comparison.
*/
int edit_distance_approx(const std::string &str_1, const std::string &str_2);
[[nodiscard]] std::size_t edit_distance_approx(std::string_view str_1, std::string_view str_2) noexcept;