LibDiff: Add new API to generate hunks from two pieces of text

For now this is just a standard implementation of the longest
common subsequence (LCS) algorithm over the lines of the two texts,
except that it does not coalesce adjacent changed lines. This isn't
ideal, since we get a separate Hunk for every changed line, and it is
definitely something to improve in the future.
This commit is contained in:
Mustafa Quraish 2021-09-15 21:58:53 -04:00 committed by Brian Gianforcaro
parent 27f28998b1
commit 5e28da1aa4
Notes: sideshowbarker 2024-07-18 03:46:25 +09:00
3 changed files with 105 additions and 1 deletions

View file

@ -1,7 +1,8 @@
# Source file list (SOURCES) followed by the serenity_lib() call for LibDiff.
# NOTE(review): Hunks.cpp is listed twice below. This looks like a diff-view artifact
# (old position removed, new alphabetical position added) -- confirm against the real
# CMakeLists.txt before deduplicating.
set(SOURCES
Hunks.cpp
Format.cpp
Generator.cpp
Hunks.cpp
)
serenity_lib(LibDiff diff)

View file

@ -0,0 +1,88 @@
/*
* Copyright (c) 2021, Mustafa Quraish <mustafa@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "Generator.h"
namespace Diff {

// Computes a line-based diff between `old_text` and `new_text`.
//
// Both inputs are split with StringView::lines() and compared line by line using a
// longest-common-subsequence table. Every line that is not part of the common
// subsequence produces its own Hunk (no coalescing of adjacent changes yet):
//   - a removed line yields { i, j, { old_lines[i] }, {} }
//   - an added line yields   { i, j, {}, { new_lines[j] } }
// where `i` / `j` are the zero-based indices into the old / new line lists at the
// point of the change. The returned hunks are ordered from the start of the text
// to the end.
//
// The table has (old line count + 1) * (new line count + 1) cells, so both time and
// memory grow with the product of the two line counts.
Vector<Hunk> from_text(StringView const& old_text, StringView const& new_text)
{
    auto old_lines = old_text.lines();
    auto new_lines = new_text.lines();

    /**
     * This is a simple implementation of the Longest Common Subsequence algorithm (over
     * the lines of the text as opposed to the characters). A Dynamic programming approach
     * is used here.
     */

    enum class Direction {
        Up,       // Added a new line
        Left,     // Removed a line
        Diagonal, // Line remained the same
    };

    // A single cell in the DP-matrix. Cell (i, j) represents the longest common
    // sub-sequence of lines between old_lines[0 : i] and new_lines[0 : j].
    struct Cell {
        size_t length;
        Direction direction;
    };

    // The matrix is stored flat; `dp(i, j)` maps the 2D coordinate onto it.
    auto dp_matrix = Vector<Cell>();
    dp_matrix.resize((old_lines.size() + 1) * (new_lines.size() + 1));

    auto dp = [&dp_matrix, width = old_lines.size() + 1](size_t i, size_t j) -> Cell& {
        return dp_matrix[i + width * j];
    };

    // Initialize the first row and column. Besides the zero lengths, the directions
    // matter: once the traceback below reaches an edge of the table they steer it
    // along that edge, so that leading removals (j == 0) and leading additions
    // (i == 0) are still reported.
    for (size_t i = 0; i <= old_lines.size(); ++i)
        dp(i, 0) = { 0, Direction::Left };
    for (size_t j = 0; j <= new_lines.size(); ++j)
        dp(0, j) = { 0, Direction::Up };

    // Fill in the rest of the DP table
    for (size_t i = 1; i <= old_lines.size(); ++i) {
        for (size_t j = 1; j <= new_lines.size(); ++j) {
            if (old_lines[i - 1] == new_lines[j - 1]) {
                dp(i, j) = { dp(i - 1, j - 1).length + 1, Direction::Diagonal };
            } else {
                auto up = dp(i, j - 1).length;
                auto left = dp(i - 1, j).length;
                // Ties go to Left, i.e. they are recorded as a removal.
                if (up > left)
                    dp(i, j) = { up, Direction::Up };
                else
                    dp(i, j) = { left, Direction::Left };
            }
        }
    }

    Vector<Hunk> hunks;
    size_t i = old_lines.size();
    size_t j = new_lines.size();

    // Walk back from the bottom-right cell to (0, 0), emitting a hunk for every
    // non-diagonal step.
    //
    // The walk must continue until *both* indices are zero: stopping as soon as one
    // of them hits zero would silently drop every line added or removed before the
    // first common line (e.g. diffing an empty text against a non-empty one).
    // Underflow is not possible: Up cells only exist for j >= 1 on the path
    // (cell (0, 0) is never visited), Left cells only for i >= 1, and Diagonal
    // cells only for i, j >= 1.
    //
    // FIXME: This creates a hunk per line, very inefficient.
    while (i > 0 || j > 0) {
        switch (dp(i, j).direction) {
        case Direction::Up:
            --j;
            hunks.append({ i, j, {}, { new_lines[j] } });
            break;
        case Direction::Left:
            --i;
            hunks.append({ i, j, { old_lines[i] }, {} });
            break;
        case Direction::Diagonal:
            --i;
            --j;
            break;
        }
    }

    // The traceback produced the hunks back-to-front.
    hunks.reverse();
    return hunks;
}

}

View file

@ -0,0 +1,15 @@
/*
* Copyright (c) 2021, Mustafa Quraish <mustafa@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include "Hunks.h"
namespace Diff {
// Generates a list of hunks describing the line-level differences between
// `old_text` and `new_text`. Hunks are returned in order from the start of the
// text to the end; currently every changed line gets a hunk of its own.
Vector<Hunk> from_text(StringView const& old_text, StringView const& new_text);
}