Regex.cpp 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/String.h>
  7. #include <AK/StringBuilder.h>
  8. #include <LibRegex/Regex.h>
  9. #include <ctype.h>
  10. #include <stdio.h>
  11. #include <string.h>
  12. #ifdef __serenity__
  13. # include <regex.h>
  14. #else
  15. # include <LibC/regex.h>
  16. #endif
  17. #include <AK/Variant.h>
  18. struct internal_regex_t {
  19. u8 cflags;
  20. u8 eflags;
  21. Optional<Variant<NonnullOwnPtr<Regex<PosixExtended>>, NonnullOwnPtr<Regex<PosixBasic>>>> re;
  22. size_t re_pat_errpos;
  23. ReError re_pat_err;
  24. String re_pat;
  25. size_t re_nsub;
  26. };
  27. static internal_regex_t* impl_from(regex_t* re)
  28. {
  29. if (!re)
  30. return nullptr;
  31. return reinterpret_cast<internal_regex_t*>(re->__data);
  32. }
  33. static const internal_regex_t* impl_from(const regex_t* re)
  34. {
  35. return impl_from(const_cast<regex_t*>(re));
  36. }
  37. extern "C" {
  38. int regcomp(regex_t* reg, const char* pattern, int cflags)
  39. {
  40. if (!reg)
  41. return REG_ESPACE;
  42. // Note that subsequent uses of regcomp() without regfree() _will_ leak memory
  43. // This could've been prevented if libc provided a reginit() or similar, but it does not.
  44. reg->__data = new internal_regex_t { 0, 0, {}, 0, ReError::REG_NOERR, {}, 0 };
  45. auto preg = impl_from(reg);
  46. bool is_extended = cflags & REG_EXTENDED;
  47. preg->cflags = cflags;
  48. String pattern_str(pattern);
  49. if (is_extended)
  50. preg->re = make<Regex<PosixExtended>>(pattern_str, PosixOptions {} | (PosixFlags)cflags | PosixFlags::SkipTrimEmptyMatches);
  51. else
  52. preg->re = make<Regex<PosixBasic>>(pattern_str, PosixOptions {} | (PosixFlags)cflags | PosixFlags::SkipTrimEmptyMatches);
  53. auto parser_result = preg->re->visit([](auto& re) { return re->parser_result; });
  54. if (parser_result.error != regex::Error::NoError) {
  55. preg->re_pat_errpos = parser_result.error_token.position();
  56. preg->re_pat_err = (ReError)parser_result.error;
  57. preg->re_pat = pattern;
  58. dbgln("Have Error: {}", (int)parser_result.error);
  59. return (ReError)parser_result.error;
  60. }
  61. preg->re_nsub = parser_result.capture_groups_count;
  62. return REG_NOERR;
  63. }
  64. int regexec(const regex_t* reg, const char* string, size_t nmatch, regmatch_t pmatch[], int eflags)
  65. {
  66. auto preg = impl_from(reg);
  67. if (!preg->re.has_value() || preg->re_pat_err) {
  68. if (preg->re_pat_err)
  69. return preg->re_pat_err;
  70. return REG_BADPAT;
  71. }
  72. RegexResult result;
  73. if (eflags & REG_SEARCH)
  74. result = preg->re->visit([&](auto& re) { return re->search(string, PosixOptions {} | (PosixFlags)eflags); });
  75. else
  76. result = preg->re->visit([&](auto& re) { return re->match(string, PosixOptions {} | (PosixFlags)eflags); });
  77. if (result.success) {
  78. auto capture_groups_count = preg->re->visit([](auto& re) { return re->parser_result.capture_groups_count; });
  79. auto size = result.matches.size();
  80. if (size && nmatch && pmatch) {
  81. pmatch[0].rm_cnt = size;
  82. size_t match_index { 0 };
  83. for (size_t i = 0; i < size; ++i) {
  84. pmatch[match_index].rm_so = result.matches.at(i).global_offset;
  85. pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.matches.at(i).view.length();
  86. if (match_index > 0)
  87. pmatch[match_index].rm_cnt = result.capture_group_matches.size();
  88. ++match_index;
  89. if (match_index >= nmatch)
  90. return REG_NOERR;
  91. if (i < result.capture_group_matches.size()) {
  92. auto capture_groups_size = result.capture_group_matches.at(i).size();
  93. for (size_t j = 0; j < capture_groups_count; ++j) {
  94. if (j >= capture_groups_size || !result.capture_group_matches.at(i).at(j).view.length()) {
  95. pmatch[match_index].rm_so = -1;
  96. pmatch[match_index].rm_eo = -1;
  97. pmatch[match_index].rm_cnt = 0;
  98. } else {
  99. pmatch[match_index].rm_so = result.capture_group_matches.at(i).at(j).global_offset;
  100. pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.capture_group_matches.at(i).at(j).view.length();
  101. pmatch[match_index].rm_cnt = 1;
  102. }
  103. ++match_index;
  104. if (match_index >= nmatch)
  105. return REG_NOERR;
  106. }
  107. }
  108. }
  109. if (match_index < nmatch) {
  110. for (size_t i = match_index; i < nmatch; ++i) {
  111. pmatch[i].rm_so = -1;
  112. pmatch[i].rm_eo = -1;
  113. pmatch[i].rm_cnt = 0;
  114. }
  115. }
  116. }
  117. return REG_NOERR;
  118. } else {
  119. if (nmatch && pmatch) {
  120. pmatch[0].rm_so = -1;
  121. pmatch[0].rm_eo = -1;
  122. pmatch[0].rm_cnt = 0;
  123. }
  124. }
  125. return REG_NOMATCH;
  126. }
  127. inline static String get_error(ReError errcode)
  128. {
  129. String error;
  130. switch ((ReError)errcode) {
  131. case REG_NOERR:
  132. error = "No error";
  133. break;
  134. case REG_NOMATCH:
  135. error = "regexec() failed to match.";
  136. break;
  137. case REG_BADPAT:
  138. error = "Invalid regular expression.";
  139. break;
  140. case REG_ECOLLATE:
  141. error = "Invalid collating element referenced.";
  142. break;
  143. case REG_ECTYPE:
  144. error = "Invalid character class type referenced.";
  145. break;
  146. case REG_EESCAPE:
  147. error = "Trailing \\ in pattern.";
  148. break;
  149. case REG_ESUBREG:
  150. error = "Number in \\digit invalid or in error.";
  151. break;
  152. case REG_EBRACK:
  153. error = "[ ] imbalance.";
  154. break;
  155. case REG_EPAREN:
  156. error = "\\( \\) or ( ) imbalance.";
  157. break;
  158. case REG_EBRACE:
  159. error = "\\{ \\} imbalance.";
  160. break;
  161. case REG_BADBR:
  162. error = "Content of \\{ \\} invalid: not a number, number too large, more than two numbers, first larger than second.";
  163. break;
  164. case REG_ERANGE:
  165. error = "Invalid endpoint in range expression.";
  166. break;
  167. case REG_ESPACE:
  168. error = "Out of memory.";
  169. break;
  170. case REG_BADRPT:
  171. error = "?, * or + not preceded by valid regular expression.";
  172. break;
  173. case REG_ENOSYS:
  174. error = "The implementation does not support the function.";
  175. break;
  176. case REG_EMPTY_EXPR:
  177. error = "Empty expression provided";
  178. break;
  179. }
  180. return error;
  181. }
  182. size_t regerror(int errcode, const regex_t* reg, char* errbuf, size_t errbuf_size)
  183. {
  184. String error;
  185. auto preg = impl_from(reg);
  186. if (!preg)
  187. error = get_error((ReError)errcode);
  188. else
  189. error = preg->re->visit([&](auto& re) { return re->error_string(get_error(preg->re_pat_err)); });
  190. if (!errbuf_size)
  191. return error.length();
  192. if (!error.copy_characters_to_buffer(errbuf, errbuf_size))
  193. return 0;
  194. return error.length();
  195. }
  196. void regfree(regex_t* reg)
  197. {
  198. auto preg = impl_from(reg);
  199. if (preg) {
  200. delete preg;
  201. reg->__data = nullptr;
  202. }
  203. }
  204. }