Regex.cpp 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/ByteString.h>
  7. #include <AK/StringBuilder.h>
  8. #include <AK/Variant.h>
  9. #include <LibRegex/Regex.h>
  10. #include <ctype.h>
  11. #include <regex.h>
  12. #include <stdio.h>
  13. #include <string.h>
  14. #ifndef AK_OS_SERENITY
  15. # error "This file is intended for use on Serenity only to implement POSIX regex.h"
  16. #endif
  17. struct internal_regex_t {
  18. u8 cflags;
  19. u8 eflags;
  20. Optional<Variant<NonnullOwnPtr<Regex<PosixExtended>>, NonnullOwnPtr<Regex<PosixBasic>>>> re;
  21. size_t re_pat_errpos;
  22. ReError re_pat_err;
  23. ByteString re_pat;
  24. };
  25. static internal_regex_t* impl_from(regex_t* re)
  26. {
  27. if (!re)
  28. return nullptr;
  29. return reinterpret_cast<internal_regex_t*>(re->__data);
  30. }
  31. static internal_regex_t const* impl_from(regex_t const* re)
  32. {
  33. return impl_from(const_cast<regex_t*>(re));
  34. }
  35. extern "C" {
  36. int regcomp(regex_t* reg, char const* pattern, int cflags)
  37. {
  38. if (!reg)
  39. return REG_ESPACE;
  40. // Note that subsequent uses of regcomp() without regfree() _will_ leak memory
  41. // This could've been prevented if libc provided a reginit() or similar, but it does not.
  42. reg->__data = new internal_regex_t { 0, 0, {}, 0, ReError::REG_NOERR, {} };
  43. auto* preg = impl_from(reg);
  44. bool is_extended = cflags & REG_EXTENDED;
  45. preg->cflags = cflags;
  46. ByteString pattern_str(pattern);
  47. if (is_extended)
  48. preg->re = make<Regex<PosixExtended>>(pattern_str, PosixOptions {} | (PosixFlags)cflags | PosixFlags::SkipTrimEmptyMatches);
  49. else
  50. preg->re = make<Regex<PosixBasic>>(pattern_str, PosixOptions {} | (PosixFlags)cflags | PosixFlags::SkipTrimEmptyMatches);
  51. auto parser_result = preg->re->visit([](auto& re) { return re->parser_result; });
  52. if (parser_result.error != regex::Error::NoError) {
  53. preg->re_pat_errpos = parser_result.error_token.position();
  54. preg->re_pat_err = (ReError)parser_result.error;
  55. preg->re_pat = pattern;
  56. dbgln("Have Error: {}", (int)parser_result.error);
  57. return (ReError)parser_result.error;
  58. }
  59. reg->re_nsub = parser_result.capture_groups_count;
  60. return REG_NOERR;
  61. }
  62. int regexec(regex_t const* reg, char const* string, size_t nmatch, regmatch_t pmatch[], int eflags)
  63. {
  64. auto const* preg = impl_from(reg);
  65. if (!preg->re.has_value() || preg->re_pat_err) {
  66. if (preg->re_pat_err)
  67. return preg->re_pat_err;
  68. return REG_BADPAT;
  69. }
  70. RegexResult result;
  71. StringView string_view { string, strlen(string) };
  72. if (eflags & REG_SEARCH)
  73. result = preg->re->visit([&](auto& re) { return re->search(string_view, PosixOptions {} | (PosixFlags)eflags); });
  74. else
  75. result = preg->re->visit([&](auto& re) { return re->match(string_view, PosixOptions {} | (PosixFlags)eflags); });
  76. if (result.success) {
  77. auto capture_groups_count = preg->re->visit([](auto& re) { return re->parser_result.capture_groups_count; });
  78. auto size = result.matches.size();
  79. if (size && nmatch && pmatch) {
  80. pmatch[0].rm_cnt = size;
  81. size_t match_index { 0 };
  82. for (size_t i = 0; i < size; ++i) {
  83. pmatch[match_index].rm_so = result.matches.at(i).global_offset;
  84. pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.matches.at(i).view.length();
  85. if (match_index > 0)
  86. pmatch[match_index].rm_cnt = result.capture_group_matches.size();
  87. ++match_index;
  88. if (match_index >= nmatch)
  89. return REG_NOERR;
  90. if (i < result.capture_group_matches.size()) {
  91. auto capture_groups_size = result.capture_group_matches.at(i).size();
  92. for (size_t j = 0; j < capture_groups_count; ++j) {
  93. if (j >= capture_groups_size || !result.capture_group_matches.at(i).at(j).view.length()) {
  94. pmatch[match_index].rm_so = -1;
  95. pmatch[match_index].rm_eo = -1;
  96. pmatch[match_index].rm_cnt = 0;
  97. } else {
  98. pmatch[match_index].rm_so = result.capture_group_matches.at(i).at(j).global_offset;
  99. pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.capture_group_matches.at(i).at(j).view.length();
  100. pmatch[match_index].rm_cnt = 1;
  101. }
  102. ++match_index;
  103. if (match_index >= nmatch)
  104. return REG_NOERR;
  105. }
  106. }
  107. }
  108. if (match_index < nmatch) {
  109. for (size_t i = match_index; i < nmatch; ++i) {
  110. pmatch[i].rm_so = -1;
  111. pmatch[i].rm_eo = -1;
  112. pmatch[i].rm_cnt = 0;
  113. }
  114. }
  115. }
  116. return REG_NOERR;
  117. }
  118. if (nmatch && pmatch) {
  119. pmatch[0].rm_so = -1;
  120. pmatch[0].rm_eo = -1;
  121. pmatch[0].rm_cnt = 0;
  122. }
  123. return REG_NOMATCH;
  124. }
  125. static StringView get_error(ReError errcode)
  126. {
  127. switch (errcode) {
  128. case REG_NOERR:
  129. return "No error"sv;
  130. case REG_NOMATCH:
  131. return "regexec() failed to match."sv;
  132. case REG_BADPAT:
  133. return "Invalid regular expression."sv;
  134. case REG_ECOLLATE:
  135. return "Invalid collating element referenced."sv;
  136. case REG_ECTYPE:
  137. return "Invalid character class type referenced."sv;
  138. case REG_EESCAPE:
  139. return "Trailing \\ in pattern."sv;
  140. case REG_ESUBREG:
  141. return "Number in \\digit invalid or in error."sv;
  142. case REG_EBRACK:
  143. return "[ ] imbalance."sv;
  144. case REG_EPAREN:
  145. return "\\( \\) or ( ) imbalance."sv;
  146. case REG_EBRACE:
  147. return "\\{ \\} imbalance."sv;
  148. case REG_BADBR:
  149. return "Content of \\{ \\} invalid: not a number, number too large, more than two numbers, first larger than second."sv;
  150. case REG_ERANGE:
  151. return "Invalid endpoint in range expression."sv;
  152. case REG_ESPACE:
  153. return "Out of memory."sv;
  154. case REG_BADRPT:
  155. return "?, * or + not preceded by valid regular expression."sv;
  156. case REG_ENOSYS:
  157. return "The implementation does not support the function."sv;
  158. case REG_EMPTY_EXPR:
  159. return "Empty expression provided"sv;
  160. }
  161. return {};
  162. }
  163. size_t regerror(int errcode, regex_t const* reg, char* errbuf, size_t errbuf_size)
  164. {
  165. ByteString error;
  166. auto const* preg = impl_from(reg);
  167. if (!preg)
  168. error = get_error((ReError)errcode);
  169. else
  170. error = preg->re->visit([&](auto& re) { return re->error_string(get_error(preg->re_pat_err)); });
  171. if (!errbuf_size)
  172. return error.length();
  173. if (!error.copy_characters_to_buffer(errbuf, errbuf_size))
  174. return 0;
  175. return error.length();
  176. }
  177. void regfree(regex_t* reg)
  178. {
  179. auto* preg = impl_from(reg);
  180. if (preg) {
  181. delete preg;
  182. reg->__data = nullptr;
  183. }
  184. }
  185. }