RegularExpression.mjs 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. /**
  2. * @author n1474335 [n1474335@gmail.com]
  3. * @copyright Crown Copyright 2018
  4. * @license Apache-2.0
  5. */
  6. import XRegExp from "xregexp";
  7. import Operation from "../Operation.mjs";
  8. import Utils from "../Utils.mjs";
  9. import OperationError from "../errors/OperationError.mjs";
  10. /**
  11. * Regular expression operation
  12. */
  13. class RegularExpression extends Operation {
  14. /**
  15. * RegularExpression constructor
  16. */
  17. constructor() {
  18. super();
  19. this.name = "Regular expression";
  20. this.module = "Regex";
  21. this.description = "Define your own regular expression (regex) to search the input data with, optionally choosing from a list of pre-defined patterns.<br><br>Supports extended regex syntax including the 'dot matches all' flag, named capture groups, full unicode coverage (including <code>\\p{}</code> categories and scripts as well as astral codes) and recursive matching.";
  22. this.infoURL = "https://wikipedia.org/wiki/Regular_expression";
  23. this.inputType = "string";
  24. this.outputType = "html";
  25. this.args = [
  26. {
  27. "name": "Built in regexes",
  28. "type": "populateOption",
  29. "value": [
  30. {
  31. name: "User defined",
  32. value: ""
  33. },
  34. {
  35. name: "IPv4 address",
  36. value: "(?:(?:\\d|[01]?\\d\\d|2[0-4]\\d|25[0-5])\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d|\\d)(?:\\/\\d{1,2})?"
  37. },
  38. {
  39. name: "IPv6 address",
  40. value: "((?=.*::)(?!.*::.+::)(::)?([\\dA-Fa-f]{1,4}:(:|\\b)|){5}|([\\dA-Fa-f]{1,4}:){6})((([\\dA-Fa-f]{1,4}((?!\\3)::|:\\b|(?![\\dA-Fa-f])))|(?!\\2\\3)){2}|(((2[0-4]|1\\d|[1-9])?\\d|25[0-5])\\.?\\b){4})"
  41. },
  42. {
  43. name: "Email address",
  44. value: "(?:[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9](?:[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9-]*[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9])?\\.)+[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9](?:[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9-]*[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\\])"
  45. },
  46. {
  47. name: "URL",
  48. value: "([A-Za-z]+://)([-\\w]+(?:\\.\\w[-\\w]*)+)(:\\d+)?(/[^.!,?\"<>\\[\\]{}\\s\\x7F-\\xFF]*(?:[.!,?]+[^.!,?\"<>\\[\\]{}\\s\\x7F-\\xFF]+)*)?"
  49. },
  50. {
  51. name: "Domain",
  52. value: "\\b((?=[a-z0-9-]{1,63}\\.)(xn--)?[a-z0-9]+(-[a-z0-9]+)*\\.)+[a-z]{2,63}\\b"
  53. },
  54. {
  55. name: "Windows file path",
  56. value: "([A-Za-z]):\\\\((?:[A-Za-z\\d][A-Za-z\\d\\- \\x27_\\(\\)~]{0,61}\\\\?)*[A-Za-z\\d][A-Za-z\\d\\- \\x27_\\(\\)]{0,61})(\\.[A-Za-z\\d]{1,6})?"
  57. },
  58. {
  59. name: "UNIX file path",
  60. value: "(?:/[A-Za-z\\d.][A-Za-z\\d\\-.]{0,61})+"
  61. },
  62. {
  63. name: "MAC address",
  64. value: "[A-Fa-f\\d]{2}(?:[:-][A-Fa-f\\d]{2}){5}"
  65. },
  66. {
  67. name: "Date (yyyy-mm-dd)",
  68. value: "((?:19|20)\\d\\d)[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])"
  69. },
  70. {
  71. name: "Date (dd/mm/yyyy)",
  72. value: "(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.]((?:19|20)\\d\\d)"
  73. },
  74. {
  75. name: "Date (mm/dd/yyyy)",
  76. value: "(0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])[- /.]((?:19|20)\\d\\d)"
  77. },
  78. {
  79. name: "Strings",
  80. value: "[A-Za-z\\d/\\-:.,_$%\\x27\"()<>= !\\[\\]{}@]{4,}"
  81. },
  82. ],
  83. "target": 1
  84. },
  85. {
  86. "name": "Regex",
  87. "type": "text",
  88. "value": ""
  89. },
  90. {
  91. "name": "Case insensitive",
  92. "type": "boolean",
  93. "value": true
  94. },
  95. {
  96. "name": "^ and $ match at newlines",
  97. "type": "boolean",
  98. "value": true
  99. },
  100. {
  101. "name": "Dot matches all",
  102. "type": "boolean",
  103. "value": false
  104. },
  105. {
  106. "name": "Unicode support",
  107. "type": "boolean",
  108. "value": false
  109. },
  110. {
  111. "name": "Astral support",
  112. "type": "boolean",
  113. "value": false
  114. },
  115. {
  116. "name": "Display total",
  117. "type": "boolean",
  118. "value": false
  119. },
  120. {
  121. "name": "Output format",
  122. "type": "option",
  123. "value": ["Highlight matches", "List matches", "List capture groups", "List matches with capture groups"]
  124. }
  125. ];
  126. }
  127. /**
  128. * @param {string} input
  129. * @param {Object[]} args
  130. * @returns {html}
  131. */
  132. run(input, args) {
  133. const [,
  134. userRegex,
  135. i, m, s, u, a,
  136. displayTotal,
  137. outputFormat
  138. ] = args;
  139. let modifiers = "g";
  140. if (i) modifiers += "i";
  141. if (m) modifiers += "m";
  142. if (s) modifiers += "s";
  143. if (u) modifiers += "u";
  144. if (a) modifiers += "A";
  145. if (userRegex && userRegex !== "^" && userRegex !== "$") {
  146. try {
  147. const regex = new XRegExp(userRegex, modifiers);
  148. switch (outputFormat) {
  149. case "Highlight matches":
  150. return regexHighlight(input, regex, displayTotal);
  151. case "List matches":
  152. return Utils.escapeHtml(regexList(input, regex, displayTotal, true, false));
  153. case "List capture groups":
  154. return Utils.escapeHtml(regexList(input, regex, displayTotal, false, true));
  155. case "List matches with capture groups":
  156. return Utils.escapeHtml(regexList(input, regex, displayTotal, true, true));
  157. default:
  158. throw new OperationError("Error: Invalid output format");
  159. }
  160. } catch (err) {
  161. throw new OperationError("Invalid regex. Details: " + err.message);
  162. }
  163. } else {
  164. return Utils.escapeHtml(input);
  165. }
  166. }
  167. }
  168. /**
  169. * Creates a string listing the matches within a string.
  170. *
  171. * @param {string} input
  172. * @param {RegExp} regex
  173. * @param {boolean} displayTotal
  174. * @param {boolean} matches - Display full match
  175. * @param {boolean} captureGroups - Display each of the capture groups separately
  176. * @returns {string}
  177. */
  178. function regexList (input, regex, displayTotal, matches, captureGroups) {
  179. let output = "",
  180. total = 0,
  181. match;
  182. while ((match = regex.exec(input))) {
  183. // Moves pointer when an empty string is matched (prevents infinite loop)
  184. if (match.index === regex.lastIndex) {
  185. regex.lastIndex++;
  186. }
  187. total++;
  188. if (matches) {
  189. output += match[0] + "\n";
  190. }
  191. if (captureGroups) {
  192. for (let i = 1; i < match.length; i++) {
  193. if (matches) {
  194. output += " Group " + i + ": ";
  195. }
  196. output += match[i] + "\n";
  197. }
  198. }
  199. }
  200. if (displayTotal)
  201. output = "Total found: " + total + "\n\n" + output;
  202. return output.slice(0, -1);
  203. }
  204. /**
  205. * Adds HTML highlights to matches within a string.
  206. *
  207. * @private
  208. * @param {string} input
  209. * @param {RegExp} regex
  210. * @param {boolean} displayTotal
  211. * @returns {string}
  212. */
  213. function regexHighlight (input, regex, displayTotal) {
  214. let output = "",
  215. title = "",
  216. hl = 1,
  217. total = 0;
  218. const captureGroups = [];
  219. output = input.replace(regex, (match, ...args) => {
  220. args.pop(); // Throw away full string
  221. const offset = args.pop(),
  222. groups = args;
  223. title = `Offset: ${offset}\n`;
  224. if (groups.length) {
  225. title += "Groups:\n";
  226. for (let i = 0; i < groups.length; i++) {
  227. title += `\t${i+1}: ${Utils.escapeHtml(groups[i] || "")}\n`;
  228. }
  229. }
  230. // Switch highlight
  231. hl = hl === 1 ? 2 : 1;
  232. // Store highlighted match and replace with a placeholder
  233. captureGroups.push(`<span class='hl${hl}' title='${title}'>${Utils.escapeHtml(match)}</span>`);
  234. return `[cc_capture_group_${total++}]`;
  235. });
  236. // Safely escape all remaining text, then replace placeholders
  237. output = Utils.escapeHtml(output);
  238. output = output.replace(/\[cc_capture_group_(\d+)\]/g, (_, i) => {
  239. return captureGroups[i];
  240. });
  241. if (displayTotal)
  242. output = "Total found: " + total + "\n\n" + output;
  243. return output;
  244. }
  245. export default RegularExpression;