Parser.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553
  1. /*
  2. * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/ScopeGuard.h>
  7. #include <LibPDF/CommonNames.h>
  8. #include <LibPDF/Document.h>
  9. #include <LibPDF/Filter.h>
  10. #include <LibPDF/Parser.h>
  11. #include <LibTextCodec/Decoder.h>
  12. #include <ctype.h>
  13. namespace PDF {
  14. PDFErrorOr<Vector<Operator>> Parser::parse_operators(Document* document, ReadonlyBytes bytes)
  15. {
  16. Parser parser(document, bytes);
  17. parser.m_disable_encryption = true;
  18. return parser.parse_operators();
  19. }
  20. Parser::Parser(Document* document, ReadonlyBytes bytes)
  21. : m_reader(bytes)
  22. , m_document(document)
  23. {
  24. }
  25. Parser::Parser(ReadonlyBytes bytes)
  26. : m_reader(bytes)
  27. {
  28. }
  29. void Parser::set_document(WeakPtr<Document> const& document)
  30. {
  31. m_document = document;
  32. }
  33. String Parser::parse_comment()
  34. {
  35. if (!m_reader.matches('%'))
  36. return {};
  37. m_reader.consume();
  38. auto comment_start_offset = m_reader.offset();
  39. m_reader.move_until([&](auto) {
  40. return m_reader.matches_eol();
  41. });
  42. String str = StringView(m_reader.bytes().slice(comment_start_offset, m_reader.offset() - comment_start_offset));
  43. m_reader.consume_eol();
  44. m_reader.consume_whitespace();
  45. return str;
  46. }
  47. PDFErrorOr<Value> Parser::parse_value(CanBeIndirectValue can_be_indirect_value)
  48. {
  49. parse_comment();
  50. if (m_reader.matches("null")) {
  51. m_reader.move_by(4);
  52. m_reader.consume_whitespace();
  53. return Value(nullptr);
  54. }
  55. if (m_reader.matches("true")) {
  56. m_reader.move_by(4);
  57. m_reader.consume_whitespace();
  58. return Value(true);
  59. }
  60. if (m_reader.matches("false")) {
  61. m_reader.move_by(5);
  62. m_reader.consume_whitespace();
  63. return Value(false);
  64. }
  65. if (m_reader.matches_number()) {
  66. if (can_be_indirect_value == CanBeIndirectValue::Yes)
  67. return parse_possible_indirect_value_or_ref();
  68. else
  69. return parse_number();
  70. }
  71. if (m_reader.matches('/'))
  72. return MUST(parse_name());
  73. if (m_reader.matches("<<")) {
  74. auto dict = TRY(parse_dict());
  75. if (m_reader.matches("stream"))
  76. return TRY(parse_stream(dict));
  77. return dict;
  78. }
  79. if (m_reader.matches_any('(', '<'))
  80. return parse_string();
  81. if (m_reader.matches('['))
  82. return TRY(parse_array());
  83. return error(String::formatted("Unexpected char \"{}\"", m_reader.peek()));
  84. }
  85. PDFErrorOr<Value> Parser::parse_possible_indirect_value_or_ref()
  86. {
  87. auto first_number = TRY(parse_number());
  88. if (!m_reader.matches_number())
  89. return first_number;
  90. m_reader.save();
  91. auto second_number = parse_number();
  92. if (second_number.is_error()) {
  93. m_reader.load();
  94. return first_number;
  95. }
  96. if (m_reader.matches('R')) {
  97. m_reader.discard();
  98. m_reader.consume();
  99. m_reader.consume_whitespace();
  100. return Value(Reference(first_number.get<int>(), second_number.value().get<int>()));
  101. }
  102. if (m_reader.matches("obj")) {
  103. m_reader.discard();
  104. auto index = first_number.get<int>();
  105. auto generation = second_number.value().get<int>();
  106. VERIFY(index >= 0);
  107. VERIFY(generation >= 0);
  108. return TRY(parse_indirect_value(index, generation));
  109. }
  110. m_reader.load();
  111. return first_number;
  112. }
  113. PDFErrorOr<NonnullRefPtr<IndirectValue>> Parser::parse_indirect_value(u32 index, u32 generation)
  114. {
  115. if (!m_reader.matches("obj"))
  116. return error("Expected \"obj\" at beginning of indirect value");
  117. m_reader.move_by(3);
  118. m_reader.consume_whitespace();
  119. push_reference({ index, generation });
  120. auto value = TRY(parse_value());
  121. if (!m_reader.matches("endobj"))
  122. return error("Expected \"endobj\" at end of indirect value");
  123. m_reader.consume(6);
  124. m_reader.consume_whitespace();
  125. pop_reference();
  126. return make_object<IndirectValue>(index, generation, value);
  127. }
  128. PDFErrorOr<NonnullRefPtr<IndirectValue>> Parser::parse_indirect_value()
  129. {
  130. auto first_number = TRY(parse_number());
  131. auto second_number = TRY(parse_number());
  132. auto index = first_number.get<int>();
  133. auto generation = second_number.get<int>();
  134. VERIFY(index >= 0);
  135. VERIFY(generation >= 0);
  136. return parse_indirect_value(index, generation);
  137. }
  138. PDFErrorOr<Value> Parser::parse_number()
  139. {
  140. size_t start_offset = m_reader.offset();
  141. bool is_float = false;
  142. bool consumed_digit = false;
  143. if (m_reader.matches('+') || m_reader.matches('-'))
  144. m_reader.consume();
  145. while (!m_reader.done()) {
  146. if (m_reader.matches('.')) {
  147. if (is_float)
  148. break;
  149. is_float = true;
  150. m_reader.consume();
  151. } else if (isdigit(m_reader.peek())) {
  152. m_reader.consume();
  153. consumed_digit = true;
  154. } else {
  155. break;
  156. }
  157. }
  158. if (!consumed_digit)
  159. return error("Invalid number");
  160. m_reader.consume_whitespace();
  161. auto string = String(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset));
  162. float f = strtof(string.characters(), nullptr);
  163. if (is_float)
  164. return Value(f);
  165. VERIFY(floorf(f) == f);
  166. return Value(static_cast<int>(f));
  167. }
  168. PDFErrorOr<NonnullRefPtr<NameObject>> Parser::parse_name()
  169. {
  170. if (!m_reader.consume('/'))
  171. return error("Expected Name object to start with \"/\"");
  172. StringBuilder builder;
  173. while (true) {
  174. if (!m_reader.matches_regular_character())
  175. break;
  176. if (m_reader.matches('#')) {
  177. m_reader.consume();
  178. int hex_value = 0;
  179. for (int i = 0; i < 2; i++) {
  180. auto ch = m_reader.consume();
  181. VERIFY(isxdigit(ch));
  182. hex_value *= 16;
  183. if (ch <= '9') {
  184. hex_value += ch - '0';
  185. } else {
  186. hex_value += ch - 'A' + 10;
  187. }
  188. }
  189. builder.append(static_cast<char>(hex_value));
  190. continue;
  191. }
  192. builder.append(m_reader.consume());
  193. }
  194. m_reader.consume_whitespace();
  195. return make_object<NameObject>(builder.to_string());
  196. }
  197. NonnullRefPtr<StringObject> Parser::parse_string()
  198. {
  199. ScopeGuard guard([&] { m_reader.consume_whitespace(); });
  200. String string;
  201. bool is_binary_string;
  202. if (m_reader.matches('(')) {
  203. string = parse_literal_string();
  204. is_binary_string = false;
  205. } else {
  206. string = parse_hex_string();
  207. is_binary_string = true;
  208. }
  209. VERIFY(!string.is_null());
  210. auto string_object = make_object<StringObject>(string, is_binary_string);
  211. if (m_document->security_handler() && !m_disable_encryption)
  212. m_document->security_handler()->decrypt(string_object, m_current_reference_stack.last());
  213. auto unencrypted_string = string_object->string();
  214. if (unencrypted_string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
  215. // The string is encoded in UTF16-BE
  216. string_object->set_string(TextCodec::decoder_for("utf-16be")->to_utf8(unencrypted_string));
  217. } else if (unencrypted_string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
  218. // The string is encoded in UTF-8. This is the default anyways, but if these bytes
  219. // are explicitly included, we have to trim them
  220. string_object->set_string(unencrypted_string.substring(3));
  221. }
  222. return string_object;
  223. }
  224. String Parser::parse_literal_string()
  225. {
  226. VERIFY(m_reader.consume('('));
  227. StringBuilder builder;
  228. auto opened_parens = 0;
  229. while (true) {
  230. if (m_reader.matches('(')) {
  231. opened_parens++;
  232. builder.append(m_reader.consume());
  233. } else if (m_reader.matches(')')) {
  234. m_reader.consume();
  235. if (opened_parens == 0)
  236. break;
  237. opened_parens--;
  238. builder.append(')');
  239. } else if (m_reader.matches('\\')) {
  240. m_reader.consume();
  241. if (m_reader.matches_eol()) {
  242. m_reader.consume_eol();
  243. continue;
  244. }
  245. if (m_reader.done())
  246. return {};
  247. auto ch = m_reader.consume();
  248. switch (ch) {
  249. case 'n':
  250. builder.append('\n');
  251. break;
  252. case 'r':
  253. builder.append('\r');
  254. break;
  255. case 't':
  256. builder.append('\t');
  257. break;
  258. case 'b':
  259. builder.append('\b');
  260. break;
  261. case 'f':
  262. builder.append('\f');
  263. break;
  264. case '(':
  265. builder.append('(');
  266. break;
  267. case ')':
  268. builder.append(')');
  269. break;
  270. case '\\':
  271. builder.append('\\');
  272. break;
  273. default: {
  274. if (ch >= '0' && ch <= '7') {
  275. int octal_value = ch - '0';
  276. for (int i = 0; i < 2; i++) {
  277. auto octal_ch = m_reader.consume();
  278. if (octal_ch < '0' || octal_ch > '7')
  279. break;
  280. octal_value = octal_value * 8 + (octal_ch - '0');
  281. }
  282. builder.append(static_cast<char>(octal_value));
  283. } else {
  284. builder.append(ch);
  285. }
  286. }
  287. }
  288. } else if (m_reader.matches_eol()) {
  289. m_reader.consume_eol();
  290. builder.append('\n');
  291. } else {
  292. builder.append(m_reader.consume());
  293. }
  294. }
  295. return builder.to_string();
  296. }
  297. String Parser::parse_hex_string()
  298. {
  299. VERIFY(m_reader.consume('<'));
  300. StringBuilder builder;
  301. while (true) {
  302. if (m_reader.matches('>')) {
  303. m_reader.consume();
  304. return builder.to_string();
  305. } else {
  306. int hex_value = 0;
  307. for (int i = 0; i < 2; i++) {
  308. auto ch = m_reader.consume();
  309. if (ch == '>') {
  310. // The hex string contains an odd number of characters, and the last character
  311. // is assumed to be '0'
  312. m_reader.consume();
  313. hex_value *= 16;
  314. builder.append(static_cast<char>(hex_value));
  315. return builder.to_string();
  316. }
  317. VERIFY(isxdigit(ch));
  318. hex_value *= 16;
  319. if (ch <= '9') {
  320. hex_value += ch - '0';
  321. } else if (ch >= 'A' && ch <= 'F') {
  322. hex_value += ch - 'A' + 10;
  323. } else {
  324. hex_value += ch - 'a' + 10;
  325. }
  326. }
  327. builder.append(static_cast<char>(hex_value));
  328. }
  329. }
  330. }
  331. PDFErrorOr<NonnullRefPtr<ArrayObject>> Parser::parse_array()
  332. {
  333. if (!m_reader.consume('['))
  334. return error("Expected array to start with \"[\"");
  335. m_reader.consume_whitespace();
  336. Vector<Value> values;
  337. while (!m_reader.matches(']'))
  338. values.append(TRY(parse_value()));
  339. VERIFY(m_reader.consume(']'));
  340. m_reader.consume_whitespace();
  341. return make_object<ArrayObject>(values);
  342. }
  343. PDFErrorOr<NonnullRefPtr<DictObject>> Parser::parse_dict()
  344. {
  345. if (!m_reader.consume('<') || !m_reader.consume('<'))
  346. return error("Expected dict to start with \"<<\"");
  347. m_reader.consume_whitespace();
  348. HashMap<FlyString, Value> map;
  349. while (!m_reader.done()) {
  350. if (m_reader.matches(">>"))
  351. break;
  352. auto name = TRY(parse_name())->name();
  353. auto value = TRY(parse_value());
  354. map.set(name, value);
  355. }
  356. if (!m_reader.consume('>') || !m_reader.consume('>'))
  357. return error("Expected dict to end with \">>\"");
  358. m_reader.consume_whitespace();
  359. return make_object<DictObject>(map);
  360. }
  361. PDFErrorOr<NonnullRefPtr<StreamObject>> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
  362. {
  363. if (!m_reader.matches("stream"))
  364. return error("Expected stream to start with \"stream\"");
  365. m_reader.move_by(6);
  366. if (!m_reader.consume_eol())
  367. return error("Expected \"stream\" to be followed by a newline");
  368. ReadonlyBytes bytes;
  369. auto maybe_length = dict->get(CommonNames::Length);
  370. if (maybe_length.has_value() && (!maybe_length->has<Reference>())) {
  371. // The PDF writer has kindly provided us with the direct length of the stream
  372. m_reader.save();
  373. auto length = TRY(m_document->resolve_to<int>(maybe_length.value()));
  374. m_reader.load();
  375. bytes = m_reader.bytes().slice(m_reader.offset(), length);
  376. m_reader.move_by(length);
  377. m_reader.consume_whitespace();
  378. } else {
  379. // We have to look for the endstream keyword
  380. auto stream_start = m_reader.offset();
  381. while (true) {
  382. m_reader.move_until([&](auto) { return m_reader.matches_eol(); });
  383. auto potential_stream_end = m_reader.offset();
  384. m_reader.consume_eol();
  385. if (!m_reader.matches("endstream"))
  386. continue;
  387. bytes = m_reader.bytes().slice(stream_start, potential_stream_end - stream_start);
  388. break;
  389. }
  390. }
  391. m_reader.move_by(9);
  392. m_reader.consume_whitespace();
  393. auto stream_object = make_object<StreamObject>(dict, MUST(ByteBuffer::copy(bytes)));
  394. if (m_document->security_handler() && !m_disable_encryption)
  395. m_document->security_handler()->decrypt(stream_object, m_current_reference_stack.last());
  396. if (dict->contains(CommonNames::Filter)) {
  397. Vector<FlyString> filters;
  398. // We may either get a single filter or an array of cascading filters
  399. auto filter_object = TRY(dict->get_object(m_document, CommonNames::Filter));
  400. if (filter_object->is<ArrayObject>()) {
  401. auto filter_array = filter_object->cast<ArrayObject>();
  402. for (size_t i = 0; i < filter_array->size(); ++i)
  403. filters.append(TRY(filter_array->get_name_at(m_document, i))->name());
  404. } else {
  405. filters.append(filter_object->cast<NameObject>()->name());
  406. }
  407. for (auto const& filter_type : filters)
  408. stream_object->buffer() = TRY(Filter::decode(stream_object->bytes(), filter_type));
  409. }
  410. return stream_object;
  411. }
  412. PDFErrorOr<Vector<Operator>> Parser::parse_operators()
  413. {
  414. Vector<Operator> operators;
  415. Vector<Value> operator_args;
  416. constexpr static auto is_operator_char = [](char ch) {
  417. return isalpha(ch) || ch == '*' || ch == '\'';
  418. };
  419. m_reader.consume_whitespace();
  420. while (!m_reader.done()) {
  421. auto ch = m_reader.peek();
  422. if (is_operator_char(ch)) {
  423. auto operator_start = m_reader.offset();
  424. while (is_operator_char(ch)) {
  425. m_reader.consume();
  426. if (m_reader.done())
  427. break;
  428. ch = m_reader.peek();
  429. }
  430. auto operator_string = StringView(m_reader.bytes().slice(operator_start, m_reader.offset() - operator_start));
  431. auto operator_type = Operator::operator_type_from_symbol(operator_string);
  432. operators.append(Operator(operator_type, move(operator_args)));
  433. operator_args = Vector<Value>();
  434. m_reader.consume_whitespace();
  435. continue;
  436. }
  437. // Note: We disallow parsing indirect values here, since
  438. // operations like 0 0 0 RG would confuse the parser
  439. auto v = TRY(parse_value(CanBeIndirectValue::No));
  440. operator_args.append(v);
  441. }
  442. return operators;
  443. }
  444. Error Parser::error(
  445. String const& message
  446. #ifdef PDF_DEBUG
  447. ,
  448. SourceLocation loc
  449. #endif
  450. ) const
  451. {
  452. #ifdef PDF_DEBUG
  453. dbgln("\033[31m{} Parser error at offset {}: {}\033[0m", loc, m_reader.offset(), message);
  454. #endif
  455. return Error { Error::Type::Parse, message };
  456. }
  457. }