LibDiff: Make parsing of unified hunks more robust

Parsing of the unified hunks now verifies that the expected number of
lines given by the unified location at the beginning of that hunk are
actually in that hunk. Furthermore, we no longer crash when given a
bogus unified range.

As a further benefit, this begins the scaffolding for a patch parser
which should assist us in parsing full patches - even when we are not
aware of the format that a patch has been written in.
This commit is contained in:
Shannon Booth 2023-07-02 19:19:22 +12:00 committed by Sam Atkins
parent b2d33c5689
commit ef45221c21
Notes: sideshowbarker 2024-07-17 01:23:08 +09:00
3 changed files with 102 additions and 72 deletions

View file

@ -8,6 +8,8 @@
namespace Diff {
class Parser;
struct Hunk;
struct HunkLocation;
struct Line;

View file

@ -10,45 +10,100 @@
namespace Diff {
ErrorOr<Vector<Hunk>> parse_hunks(StringView diff)
Optional<HunkLocation> Parser::consume_unified_location()
{
Vector<StringView> diff_lines = diff.split_view('\n');
if (diff_lines.is_empty())
return Vector<Hunk> {};
auto consume_range = [this](Range& range) {
if (!consume_line_number(range.start_line))
return false;
if (consume_specific(',')) {
if (!consume_line_number(range.number_of_lines))
return false;
} else {
range.number_of_lines = 1;
}
return true;
};
if (!consume_specific("@@ -"))
return {};
HunkLocation location;
if (!consume_range(location.old_range))
return {};
if (!consume_specific(" +"))
return {};
if (!consume_range(location.new_range))
return {};
if (!consume_specific(" @@"))
return {};
return location;
}
bool Parser::consume_line_number(size_t& number)
{
auto line = consume_while(is_ascii_digit);
auto maybe_number = line.to_uint<size_t>();
if (!maybe_number.has_value())
return false;
number = maybe_number.value();
return true;
}
ErrorOr<Vector<Hunk>> Parser::parse_hunks()
{
Vector<Hunk> hunks;
size_t line_index = 0;
HunkLocation current_location {};
while (!is_eof()) {
// Try an locate a hunk location in this hunk. It may be prefixed with information.
auto maybe_location = consume_unified_location();
consume_line();
// Skip to first hunk
while (diff_lines[line_index][0] != '@') {
++line_index;
}
while (line_index < diff_lines.size()) {
if (diff_lines[line_index][0] == '@') {
current_location = parse_hunk_location(diff_lines[line_index]);
++line_index;
if (!maybe_location.has_value())
continue;
}
Hunk hunk {};
hunk.location = current_location;
Hunk hunk { *maybe_location, {} };
while (line_index < diff_lines.size()) {
auto const& line = diff_lines[line_index];
auto old_lines_expected = hunk.location.old_range.number_of_lines;
auto new_lines_expected = hunk.location.new_range.number_of_lines;
char const operation = line[0];
if (operation != ' ' && operation != '+' && operation != '-')
break;
// We've found a location. Now parse out all of the expected content lines.
while (old_lines_expected != 0 || new_lines_expected != 0) {
StringView line = consume_line();
if (line.is_empty())
return Error::from_string_literal("Malformed empty content line in patch");
if (line[0] != ' ' && line[0] != '+' && line[0] != '-')
return Error::from_string_literal("Invaid operation in patch");
auto const operation = Line::operation_from_symbol(line[0]);
if (operation != Line::Operation::Removal) {
if (new_lines_expected == 0)
return Error::from_string_literal("Found more removal and context lines in patch than expected");
--new_lines_expected;
}
if (operation != Line::Operation::Addition) {
if (old_lines_expected == 0)
return Error::from_string_literal("Found more addition and context lines in patch than expected");
--old_lines_expected;
}
auto const content = line.substring_view(1, line.length() - 1);
TRY(hunk.lines.try_append(Line { Line::operation_from_symbol(operation), TRY(String::from_utf8(content)) }));
++line_index;
TRY(hunk.lines.try_append(Line { operation, TRY(String::from_utf8(content)) }));
}
TRY(hunks.try_append(hunk));
}
@ -63,48 +118,9 @@ ErrorOr<Vector<Hunk>> parse_hunks(StringView diff)
return hunks;
}
HunkLocation parse_hunk_location(StringView location_line)
ErrorOr<Vector<Hunk>> parse_hunks(StringView diff)
{
size_t char_index = 0;
auto parse_start_and_length_pair = [](StringView raw) {
auto maybe_index_of_separator = raw.find(',');
size_t start = 0;
size_t length = 0;
if (maybe_index_of_separator.has_value()) {
auto index_of_separator = maybe_index_of_separator.value();
start = raw.substring_view(0, index_of_separator).to_uint().value();
length = raw.substring_view(index_of_separator + 1, raw.length() - index_of_separator - 1).to_uint().value();
} else {
length = 1;
start = raw.to_uint().value();
}
return Range { start, length };
};
while (char_index < location_line.length() && location_line[char_index++] != '-') {
}
VERIFY(char_index < location_line.length());
size_t original_location_start_index = char_index;
while (char_index < location_line.length() && location_line[char_index++] != ' ') {
}
VERIFY(char_index < location_line.length() && location_line[char_index] == '+');
size_t original_location_end_index = char_index - 2;
size_t target_location_start_index = char_index + 1;
char_index += 1;
while (char_index < location_line.length() && location_line[char_index++] != ' ') {
}
VERIFY(char_index < location_line.length());
size_t target_location_end_index = char_index - 2;
auto old_range = parse_start_and_length_pair(location_line.substring_view(original_location_start_index, original_location_end_index - original_location_start_index + 1));
auto new_range = parse_start_and_length_pair(location_line.substring_view(target_location_start_index, target_location_end_index - target_location_start_index + 1));
return { old_range, new_range };
Parser lexer(diff);
return lexer.parse_hunks();
}
}
};

View file

@ -9,6 +9,7 @@
#include <AK/Assertions.h>
#include <AK/Format.h>
#include <AK/GenericLexer.h>
#include <AK/String.h>
#include <AK/StringView.h>
#include <AK/Vector.h>
@ -58,10 +59,21 @@ struct Hunk {
Vector<Line> lines;
};
ErrorOr<Vector<Hunk>> parse_hunks(StringView diff);
HunkLocation parse_hunk_location(StringView location_line);
class Parser : public GenericLexer {
public:
using GenericLexer::GenericLexer;
ErrorOr<Vector<Hunk>> parse_hunks();
private:
Optional<HunkLocation> consume_unified_location();
bool consume_line_number(size_t& number);
};
ErrorOr<Vector<Hunk>> parse_hunks(StringView diff);
}
template<>
struct AK::Formatter<Diff::Line::Operation> : Formatter<FormatString> {
ErrorOr<void> format(FormatBuilder& builder, Diff::Line::Operation operation)