MessageHeaderEncoding.cpp 3.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. /*
  2. * Copyright (c) 2023, Valtteri Koskivuori <vkoskiv@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "QuotedPrintable.h"
  7. #include <AK/Base64.h>
  8. #include <AK/GenericLexer.h>
  9. #include <AK/StringBuilder.h>
  10. #include <LibIMAP/MessageHeaderEncoding.h>
  11. #include <LibTextCodec/Decoder.h>
  12. namespace IMAP {
  13. ErrorOr<ByteBuffer> decode_rfc2047_encoded_words(StringView input)
  14. {
  15. GenericLexer lexer(input);
  16. StringBuilder output;
  17. while (!lexer.is_eof()) {
  18. auto ascii_view = lexer.consume_until("=?"sv);
  19. ByteString ascii = ascii_view.replace("\r"sv, " "sv, ReplaceMode::All);
  20. ascii = ascii.replace("\n"sv, " "sv, ReplaceMode::All);
  21. TRY(output.try_append(ascii));
  22. if (lexer.is_eof())
  23. break;
  24. lexer.consume_specific("=?"sv);
  25. auto charset = lexer.consume_until('?');
  26. lexer.consume();
  27. auto encoding = lexer.consume_until('?');
  28. lexer.consume();
  29. auto encoded_text = lexer.consume_until("?=");
  30. lexer.consume_specific("?="sv);
  31. // RFC 2047 Section 6.2, "...any 'linear-white-space' that separates a pair of adjacent 'encoded-word's is ignored."
  32. // https://datatracker.ietf.org/doc/html/rfc2047#section-6.2
  33. bool found_next_start = false;
  34. int spaces = 0;
  35. for (size_t i = 0; i < lexer.tell_remaining(); ++i) {
  36. if (lexer.peek(i) == ' ' || lexer.peek(i) == '\r' || lexer.peek(i) == '\n') {
  37. spaces++;
  38. if (lexer.peek(i + 1) == '=' && lexer.peek(i + 2) == '?') {
  39. found_next_start = true;
  40. break;
  41. }
  42. } else {
  43. break;
  44. }
  45. }
  46. if (found_next_start) {
  47. for (int i = 0; i < spaces; i++) {
  48. lexer.consume();
  49. }
  50. }
  51. ByteBuffer first_pass_decoded;
  52. if (encoding == 'Q' || encoding == 'q') {
  53. auto maybe_decoded_data = decode_quoted_printable(encoded_text);
  54. if (maybe_decoded_data.is_error()) {
  55. dbgln("Failed to decode quoted-printable rfc2047 text, skipping.");
  56. continue;
  57. }
  58. // RFC 2047 Section 4.2.2, https://datatracker.ietf.org/doc/html/rfc2047#section-4.2
  59. auto decoded_data = maybe_decoded_data.release_value();
  60. for (auto character : decoded_data.bytes()) {
  61. if (character == '_')
  62. first_pass_decoded.append(' ');
  63. else
  64. first_pass_decoded.append(character);
  65. }
  66. } else if (encoding == 'B' || encoding == 'b') {
  67. auto maybe_decoded_data = AK::decode_base64(encoded_text);
  68. if (maybe_decoded_data.is_error()) {
  69. dbgln("Failed to decode base64-encoded rfc2047 text, skipping.");
  70. continue;
  71. }
  72. first_pass_decoded = maybe_decoded_data.release_value();
  73. } else {
  74. dbgln("Unknown encoding \"{}\" found, skipping, original string: \"{}\"", encoding, input);
  75. continue;
  76. }
  77. if (first_pass_decoded.is_empty())
  78. continue;
  79. auto maybe_decoder = TextCodec::decoder_for(charset);
  80. if (!maybe_decoder.has_value()) {
  81. dbgln("No decoder found for charset \"{}\", skipping.", charset);
  82. continue;
  83. }
  84. auto decoded_text = TRY(maybe_decoder->to_utf8(first_pass_decoded));
  85. TRY(output.try_append(decoded_text));
  86. }
  87. return output.to_byte_buffer();
  88. }
  89. }