// Extract.js
  /**
   * Identifier extraction operations.
   *
   * @author n1474335 [n1474335@gmail.com]
   * @copyright Crown Copyright 2016
   * @license Apache-2.0
   *
   * @namespace
   */
  var Extract = {

      /**
       * Runs search operations across the input data using regular expressions.
       *
       * Every non-empty match of search_regex is written on its own line of the
       * result, unless it is also matched by remove_regex.
       *
       * @private
       * @param {string} input
       * @param {RegExp} search_regex - Expected to carry the global flag; a non-global
       *   expression is evaluated once only (it would otherwise match forever).
       * @param {RegExp} remove_regex - A regular expression defining results to remove from the
       *   final list (may be null)
       * @param {boolean} include_total - Whether or not to include the total number of results
       * @returns {string}
       */
      _search: function(input, search_regex, remove_regex, include_total) {
          var output = "",
              total = 0,
              match;

          while ((match = search_regex.exec(input)) !== null) {
              if (match[0] === "") {
                  // exec() does not advance lastIndex on a zero-length match, so
                  // step over it manually to avoid an infinite loop. Empty matches
                  // are never reported as results.
                  search_regex.lastIndex++;
              } else if (!(remove_regex && remove_regex.test(match[0]))) {
                  total++;
                  output += match[0] + "\n";
              }

              // Without the global flag exec() restarts from index 0 every time.
              if (!search_regex.global) break;
          }

          if (include_total)
              output = "Total found: " + total + "\n\n" + output;

          return output;
      },


      /**
       * @constant
       * @default
       */
      MIN_STRING_LEN: 3,
      /**
       * @constant
       * @default
       */
      DISPLAY_TOTAL: false,

      /**
       * Strings operation.
       *
       * Extracts runs of printable characters that are at least a minimum length.
       *
       * @param {string} input
       * @param {Object[]} args - [minimum length, display total]
       * @returns {string}
       */
      run_strings: function(input, args) {
          var min_len = parseInt(args[0], 10),
              display_total = args[1],
              strings = "[A-Z\\d/\\-:.,_$%'\"()<>= !\\[\\]{}@]",
              regex;

          // The length is spliced into a {n,} quantifier, so it must be a positive
          // integer; anything else falls back to the default.
          if (!(min_len > 0)) {
              min_len = Extract.MIN_STRING_LEN;
          }

          regex = new RegExp(strings + "{" + min_len + ",}", "ig");
          return Extract._search(input, regex, null, display_total);
      },


      /**
       * @constant
       * @default
       */
      INCLUDE_IPV4: true,
      /**
       * @constant
       * @default
       */
      INCLUDE_IPV6: false,
      /**
       * @constant
       * @default
       */
      REMOVE_LOCAL: false,

      /**
       * Extract IP addresses operation.
       *
       * @param {string} input
       * @param {Object[]} args - [include IPv4, include IPv6, remove local, display total]
       * @returns {string} The matches, or an empty string if neither family is selected
       */
      run_ip: function(input, args) {
          var include_ipv4 = args[0],
              include_ipv6 = args[1],
              remove_local = args[2],
              display_total = args[3],
              // Dotted quad with an optional /prefix-length suffix.
              ipv4 = "(?:(?:\\d|[01]?\\d\\d|2[0-4]\\d|25[0-5])\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d|\\d)(?:\\/\\d{1,2})?",
              // Uses numbered back-references (\2, \3); they remain valid when the
              // pattern is appended after ipv4 because ipv4 has no capturing groups.
              ipv6 = "((?=.*::)(?!.*::.+::)(::)?([\\dA-F]{1,4}:(:|\\b)|){5}|([\\dA-F]{1,4}:){6})((([\\dA-F]{1,4}((?!\\3)::|:\\b|(?![\\dA-F])))|(?!\\2\\3)){2}|(((2[0-4]|1\\d|[1-9])?\\d|25[0-5])\\.?\\b){4})",
              patterns = [],
              remove_regex = null;

          if (include_ipv4) patterns.push(ipv4);
          if (include_ipv6) patterns.push(ipv6);

          if (!patterns.length) {
              return "";
          }

          if (remove_local) {
              // Matches results starting with 10., 192.168., 172.16-31. or 127.
              var ten = "10\\..+",
                  oneninetwo = "192\\.168\\..+",
                  oneseventwo = "172\\.(?:1[6-9]|2\\d|3[01])\\..+",
                  onetwoseven = "127\\..+";

              remove_regex = new RegExp("^(?:" + ten + "|" + oneninetwo +
                  "|" + oneseventwo + "|" + onetwoseven + ")");
          }

          return Extract._search(input, new RegExp(patterns.join("|"), "ig"),
              remove_regex, display_total);
      },


      /**
       * Extract email addresses operation.
       *
       * @param {string} input
       * @param {Object[]} args - [display total]
       * @returns {string}
       */
      run_email: function(input, args) {
          var display_total = args[0],
              // Top-level label may be up to 63 letters (the DNS label limit) so
              // that long TLDs such as ".museum" are not truncated.
              regex = /\w[-.\w]*@[-\w]+(?:\.[-\w]+)*\.[A-Z]{2,63}/ig;

          return Extract._search(input, regex, null, display_total);
      },


      /**
       * Extract MAC addresses operation.
       *
       * Matches six hex byte pairs separated by ':' or '-'.
       *
       * @param {string} input
       * @param {Object[]} args - [display total]
       * @returns {string}
       */
      run_mac: function(input, args) {
          var display_total = args[0],
              regex = /[A-F\d]{2}(?:[:-][A-F\d]{2}){5}/ig;

          return Extract._search(input, regex, null, display_total);
      },


      /**
       * Extract URLs operation.
       *
       * @param {string} input
       * @param {Object[]} args - [display total]
       * @returns {string}
       */
      run_urls: function(input, args) {
          var display_total = args[0],
              protocol = "[A-Z]+://",
              hostname = "[-\\w]+(?:\\.\\w[-\\w]*)+",
              port = ":\\d+",
              path = "/[^.!,?;\"'<>()\\[\\]{}\\s\\x7F-\\xFF]*";

          // Punctuation is only part of the path when more path characters follow
          // it, so a sentence-ending '.' or ',' is not swallowed.
          path += "(?:[.!,?]+[^.!,?;\"'<>()\\[\\]{}\\s\\x7F-\\xFF]+)*";

          var regex = new RegExp(protocol + hostname + "(?:" + port +
              ")?(?:" + path + ")?", "ig");

          return Extract._search(input, regex, null, display_total);
      },


      /**
       * Extract domains operation.
       *
       * Matches an optional http(s) scheme, a host name and one of a fixed list
       * of top-level domains.
       *
       * NOTE(review): the pattern has no trailing word boundary, so a listed TLD
       * that is a prefix of a longer word still matches - confirm whether that
       * is intended before tightening it.
       *
       * @param {string} input
       * @param {Object[]} args - [display total]
       * @returns {string}
       */
      run_domains: function(input, args) {
          var display_total = args[0],
              protocol = "https?://",
              hostname = "[-\\w\\.]+",
              tld = "\\.(?:com|net|org|biz|info|co|uk|onion|int|mobi|name|edu|gov|mil|eu|ac|ae|af|de|ca|ch|cn|cy|es|gb|hk|il|in|io|tv|me|nl|no|nz|ro|ru|tr|us|az|ir|kz|uz|pk)+",
              regex = new RegExp("(?:" + protocol + ")?" + hostname + tld, "ig");

          return Extract._search(input, regex, null, display_total);
      },


      /**
       * @constant
       * @default
       */
      INCLUDE_WIN_PATH: true,
      /**
       * @constant
       * @default
       */
      INCLUDE_UNIX_PATH: true,

      /**
       * Extract file paths operation.
       *
       * @param {string} input
       * @param {Object[]} args - [include Windows paths, include UNIX paths, display total]
       * @returns {string} The matches, or an empty string if neither style is selected
       */
      run_file_paths: function(input, args) {
          var include_win_path = args[0],
              include_unix_path = args[1],
              display_total = args[2],
              win_drive = "[A-Z]:\\\\",
              win_name = "[A-Z\\d][A-Z\\d\\- '_\\(\\)]{0,61}",
              win_ext = "[A-Z\\d]{1,6}",
              win_path = win_drive + "(?:" + win_name + "\\\\?)*" + win_name +
                  "(?:\\." + win_ext + ")?",
              unix_path = "(?:/[A-Z\\d.][A-Z\\d\\-.]{0,61})+",
              patterns = [];

          if (include_win_path) patterns.push(win_path);
          if (include_unix_path) patterns.push(unix_path);

          if (!patterns.length) {
              return "";
          }

          return Extract._search(input, new RegExp(patterns.join("|"), "ig"),
              null, display_total);
      },


      /**
       * Extract dates operation.
       *
       * Recognises years 1900-2099 with '-', ' ', '/' or '.' as separators.
       *
       * @param {string} input
       * @param {Object[]} args - [display total]
       * @returns {string}
       */
      run_dates: function(input, args) {
          var display_total = args[0],
              date1 = "(?:19|20)\\d\\d[- /.](?:0[1-9]|1[012])[- /.](?:0[1-9]|[12][0-9]|3[01])", // yyyy-mm-dd
              date2 = "(?:0[1-9]|[12][0-9]|3[01])[- /.](?:0[1-9]|1[012])[- /.](?:19|20)\\d\\d", // dd/mm/yyyy
              date3 = "(?:0[1-9]|1[012])[- /.](?:0[1-9]|[12][0-9]|3[01])[- /.](?:19|20)\\d\\d", // mm/dd/yyyy
              regex = new RegExp(date1 + "|" + date2 + "|" + date3, "ig");

          return Extract._search(input, regex, null, display_total);
      },


      /**
       * Extract all identifiers operation.
       *
       * Runs each extraction in turn (IPv4 and IPv6 with local addresses kept,
       * both file path styles, no totals) and concatenates the results under
       * plain-text headings.
       *
       * @param {string} input
       * @param {Object[]} args - unused
       * @returns {string}
       */
      run_all_idents: function(input, args) {
          var output = "";

          output += "IP addresses\n";
          output += Extract.run_ip(input, [true, true, false]);

          output += "\nEmail addresses\n";
          output += Extract.run_email(input, []);

          output += "\nMAC addresses\n";
          output += Extract.run_mac(input, []);

          output += "\nURLs\n";
          output += Extract.run_urls(input, []);

          output += "\nDomain names\n";
          output += Extract.run_domains(input, []);

          output += "\nFile paths\n";
          output += Extract.run_file_paths(input, [true, true]);

          output += "\nDates\n";
          output += Extract.run_dates(input, []);

          return output;
      },


      /**
       * @constant
       * @default
       */
      XPATH_INITIAL: "",
      /**
       * @constant
       * @default
       */
      XPATH_DELIMITER: "\\n",

      /**
       * Extract information (from an XML document) with an XPath query.
       *
       * @param {string} input - XML source
       * @param {Object[]} args - [XPath query, delimiter placed between results]
       * @returns {string} The serialised results, or an error message if the
       *   input or the query is rejected
       * @throws {Error} If a result node is of an unsupported type
       */
      run_xpath: function(input, args) {
          const query = args[0];
          const delimiter = args[1];
          let xml;
          let result;

          try {
              xml = $.parseXML(input);
          } catch (err) {
              return "Invalid input XML.";
          }

          try {
              result = $.xpath(xml, query);
          } catch (err) {
              return "Invalid XPath. Details:\n" + err.message;
          }

          const serializer = new XMLSerializer();
          const nodeToString = function(node) {
              switch (node.nodeType) {
                  case Node.ELEMENT_NODE:
                  case Node.DOCUMENT_NODE:
                      return serializer.serializeToString(node);
                  case Node.ATTRIBUTE_NODE:
                      return node.value;
                  case Node.COMMENT_NODE:
                      return node.data;
                  case Node.TEXT_NODE:
                      // e.g. results of a "//text()" query
                      return node.wholeText;
                  default:
                      throw new Error(`Unknown Node Type: ${node.nodeType}`);
              }
          };

          // NOTE(review): relies on the last enumerable value of the $.xpath
          // result being its length property - confirm against the plugin in use.
          return Object.values(result).slice(0, -1) // all values except last (length)
              .map(nodeToString)
              .join(delimiter);
      },


      /**
       * @constant
       * @default
       */
      SELECTOR_INITIAL: "",
      /**
       * @constant
       * @default
       */
      CSS_QUERY_DELIMITER: "\\n",

      /**
       * Extract information (from an HTML document) with a CSS selector.
       *
       * NOTE(review): .find() only searches descendants of the parsed nodes, so
       * top-level elements of the input that match the selector themselves are
       * presumably not returned - confirm whether that is intended.
       *
       * @param {string} input - HTML source
       * @param {Object[]} args - [CSS selector, delimiter placed between results]
       * @returns {string} The matched nodes as strings, or an error message if
       *   the input or the selector is rejected
       * @throws {Error} If a result node is of an unsupported type
       */
      run_css_query: function(input, args) {
          const query = args[0];
          const delimiter = args[1];
          let html;
          let result;

          try {
              html = $.parseHTML(input);
          } catch (err) {
              return "Invalid input HTML.";
          }

          try {
              result = $(html).find(query);
          } catch (err) {
              return "Invalid CSS Selector. Details:\n" + err.message;
          }

          const nodeToString = function(node) {
              switch (node.nodeType) {
                  case Node.ELEMENT_NODE:
                  case Node.DOCUMENT_NODE:
                      return node.outerHTML;
                  case Node.ATTRIBUTE_NODE:
                      return node.value;
                  case Node.COMMENT_NODE:
                      return node.data;
                  case Node.TEXT_NODE:
                      return node.wholeText;
                  default:
                      throw new Error(`Unknown Node Type: ${node.nodeType}`);
              }
          };

          // The result is array-like (indexed entries plus length).
          return Array.from(result, function(node) {
              return nodeToString(node);
          }).join(delimiter);
      },

  };