Regex.cpp 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <AK/String.h>
  27. #include <AK/StringBuilder.h>
  28. #include <LibRegex/Regex.h>
  29. #include <ctype.h>
  30. #include <stdio.h>
  31. #include <string.h>
  32. #ifdef __serenity__
  33. # include <regex.h>
  34. #else
  35. # include <LibC/regex.h>
  36. #endif
  37. struct internal_regex_t {
  38. u8 cflags;
  39. u8 eflags;
  40. OwnPtr<Regex<PosixExtended>> re;
  41. size_t re_pat_errpos;
  42. ReError re_pat_err;
  43. String re_pat;
  44. size_t re_nsub;
  45. };
  46. static internal_regex_t* impl_from(regex_t* re)
  47. {
  48. if (!re)
  49. return nullptr;
  50. return reinterpret_cast<internal_regex_t*>(re->__data);
  51. }
  52. static const internal_regex_t* impl_from(const regex_t* re)
  53. {
  54. return impl_from(const_cast<regex_t*>(re));
  55. }
  56. extern "C" {
  57. int regcomp(regex_t* reg, const char* pattern, int cflags)
  58. {
  59. if (!reg)
  60. return REG_ESPACE;
  61. // Note that subsequent uses of regcomp() without regfree() _will_ leak memory
  62. // This could've been prevented if libc provided a reginit() or similar, but it does not.
  63. reg->__data = new internal_regex_t { 0, 0, {}, 0, ReError::REG_NOERR, {}, 0 };
  64. auto preg = impl_from(reg);
  65. if (!(cflags & REG_EXTENDED))
  66. return REG_ENOSYS;
  67. preg->cflags = cflags;
  68. String pattern_str(pattern);
  69. preg->re = make<Regex<PosixExtended>>(pattern_str, PosixOptions {} | (PosixFlags)cflags | PosixFlags::SkipTrimEmptyMatches);
  70. auto parser_result = preg->re->parser_result;
  71. if (parser_result.error != regex::Error::NoError) {
  72. preg->re_pat_errpos = parser_result.error_token.position();
  73. preg->re_pat_err = (ReError)parser_result.error;
  74. preg->re_pat = pattern;
  75. dbgln("Have Error: {}", (int)parser_result.error);
  76. return (ReError)parser_result.error;
  77. }
  78. preg->re_nsub = parser_result.capture_groups_count;
  79. return REG_NOERR;
  80. }
  81. int regexec(const regex_t* reg, const char* string, size_t nmatch, regmatch_t pmatch[], int eflags)
  82. {
  83. auto preg = impl_from(reg);
  84. if (!preg->re || preg->re_pat_err) {
  85. if (preg->re_pat_err)
  86. return preg->re_pat_err;
  87. return REG_BADPAT;
  88. }
  89. RegexResult result;
  90. if (eflags & REG_SEARCH)
  91. result = preg->re->search(string, PosixOptions {} | (PosixFlags)eflags);
  92. else
  93. result = preg->re->match(string, PosixOptions {} | (PosixFlags)eflags);
  94. if (result.success) {
  95. auto size = result.matches.size();
  96. if (size && nmatch && pmatch) {
  97. pmatch[0].rm_cnt = size;
  98. size_t match_index { 0 };
  99. for (size_t i = 0; i < size; ++i) {
  100. pmatch[match_index].rm_so = result.matches.at(i).global_offset;
  101. pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.matches.at(i).view.length();
  102. if (match_index > 0)
  103. pmatch[match_index].rm_cnt = result.capture_group_matches.size();
  104. ++match_index;
  105. if (match_index >= nmatch)
  106. return REG_NOERR;
  107. if (i < result.capture_group_matches.size()) {
  108. auto capture_groups_size = result.capture_group_matches.at(i).size();
  109. for (size_t j = 0; j < preg->re->parser_result.capture_groups_count; ++j) {
  110. if (j >= capture_groups_size || !result.capture_group_matches.at(i).at(j).view.length()) {
  111. pmatch[match_index].rm_so = -1;
  112. pmatch[match_index].rm_eo = -1;
  113. pmatch[match_index].rm_cnt = 0;
  114. } else {
  115. pmatch[match_index].rm_so = result.capture_group_matches.at(i).at(j).global_offset;
  116. pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.capture_group_matches.at(i).at(j).view.length();
  117. pmatch[match_index].rm_cnt = 1;
  118. }
  119. ++match_index;
  120. if (match_index >= nmatch)
  121. return REG_NOERR;
  122. }
  123. }
  124. }
  125. if (match_index < nmatch) {
  126. for (size_t i = match_index; i < nmatch; ++i) {
  127. pmatch[i].rm_so = -1;
  128. pmatch[i].rm_eo = -1;
  129. pmatch[i].rm_cnt = 0;
  130. }
  131. }
  132. }
  133. return REG_NOERR;
  134. } else {
  135. if (nmatch && pmatch) {
  136. pmatch[0].rm_so = -1;
  137. pmatch[0].rm_eo = -1;
  138. pmatch[0].rm_cnt = 0;
  139. }
  140. }
  141. return REG_NOMATCH;
  142. }
  143. inline static String get_error(ReError errcode)
  144. {
  145. String error;
  146. switch ((ReError)errcode) {
  147. case REG_NOERR:
  148. error = "No error";
  149. break;
  150. case REG_NOMATCH:
  151. error = "regexec() failed to match.";
  152. break;
  153. case REG_BADPAT:
  154. error = "Invalid regular expression.";
  155. break;
  156. case REG_ECOLLATE:
  157. error = "Invalid collating element referenced.";
  158. break;
  159. case REG_ECTYPE:
  160. error = "Invalid character class type referenced.";
  161. break;
  162. case REG_EESCAPE:
  163. error = "Trailing \\ in pattern.";
  164. break;
  165. case REG_ESUBREG:
  166. error = "Number in \\digit invalid or in error.";
  167. break;
  168. case REG_EBRACK:
  169. error = "[ ] imbalance.";
  170. break;
  171. case REG_EPAREN:
  172. error = "\\( \\) or ( ) imbalance.";
  173. break;
  174. case REG_EBRACE:
  175. error = "\\{ \\} imbalance.";
  176. break;
  177. case REG_BADBR:
  178. error = "Content of \\{ \\} invalid: not a number, number too large, more than two numbers, first larger than second.";
  179. break;
  180. case REG_ERANGE:
  181. error = "Invalid endpoint in range expression.";
  182. break;
  183. case REG_ESPACE:
  184. error = "Out of memory.";
  185. break;
  186. case REG_BADRPT:
  187. error = "?, * or + not preceded by valid regular expression.";
  188. break;
  189. case REG_ENOSYS:
  190. error = "The implementation does not support the function.";
  191. break;
  192. case REG_EMPTY_EXPR:
  193. error = "Empty expression provided";
  194. break;
  195. }
  196. return error;
  197. }
  198. size_t regerror(int errcode, const regex_t* reg, char* errbuf, size_t errbuf_size)
  199. {
  200. String error;
  201. auto preg = impl_from(reg);
  202. if (!preg)
  203. error = get_error((ReError)errcode);
  204. else
  205. error = preg->re->error_string(get_error(preg->re_pat_err));
  206. if (!errbuf_size)
  207. return error.length();
  208. if (!error.copy_characters_to_buffer(errbuf, errbuf_size))
  209. return 0;
  210. return error.length();
  211. }
  212. void regfree(regex_t* reg)
  213. {
  214. auto preg = impl_from(reg);
  215. if (preg) {
  216. delete preg;
  217. reg->__data = nullptr;
  218. }
  219. }
  220. }