Browse Source

Add operation to normalise unicode

Matthieu 5 years ago
parent
commit
a6fa0628f2

+ 5 - 0
package-lock.json

@@ -14403,6 +14403,11 @@
         "normalize-path": "^2.1.1"
         "normalize-path": "^2.1.1"
       }
       }
     },
     },
+    "unorm": {
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/unorm/-/unorm-1.6.0.tgz",
+      "integrity": "sha512-b2/KCUlYZUeA7JFUuRJZPUtr4gZvBh7tavtv4fvk4+KV9pfGiR6CQAQAWl49ZpR3ts2dk4FYkP7EIgDJoiOLDA=="
+    },
     "unpipe": {
     "unpipe": {
       "version": "1.0.0",
       "version": "1.0.0",
       "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
       "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",

+ 1 - 0
package.json

@@ -147,6 +147,7 @@
     "ssdeep.js": "0.0.2",
     "ssdeep.js": "0.0.2",
     "tesseract.js": "^2.0.0-alpha.15",
     "tesseract.js": "^2.0.0-alpha.15",
     "ua-parser-js": "^0.7.20",
     "ua-parser-js": "^0.7.20",
+    "unorm": "^1.6.0",
     "utf8": "^3.0.0",
     "utf8": "^3.0.0",
     "vkbeautify": "^0.99.3",
     "vkbeautify": "^0.99.3",
     "xmldom": "^0.1.27",
     "xmldom": "^0.1.27",

+ 1 - 0
src/core/config/Categories.json

@@ -39,6 +39,7 @@
             "URL Decode",
             "URL Decode",
             "Escape Unicode Characters",
             "Escape Unicode Characters",
             "Unescape Unicode Characters",
             "Unescape Unicode Characters",
+            "Normalise Unicode",
             "To Quoted Printable",
             "To Quoted Printable",
             "From Quoted Printable",
             "From Quoted Printable",
             "To Punycode",
             "To Punycode",

+ 12 - 0
src/core/lib/ChrEnc.mjs

@@ -164,3 +164,15 @@ export const IO_FORMAT = {
     "Simplified Chinese GB18030 (54936)": 54936,
     "Simplified Chinese GB18030 (54936)": 54936,
 };
 };
 
 
+/**
+ * Unicode Normalisation Forms
+ *
+ * @author Matthieu [m@tthieu.xyz]
+ * @copyright Crown Copyright 2016
+ * @license Apache-2.0
+ */
+
+/**
+ * Names of the Unicode Normalisation Forms, used as the selectable options
+ * of the Normalise Unicode operation.
+ */
+export const UNICODE_NORMALISATION_FORMS = ["NFD", "NFC", "NFKD", "NFKC"];

+ 60 - 0
src/core/operations/NormaliseUnicode.mjs

@@ -0,0 +1,60 @@
+/**
+ * @author Matthieu [m@tthieu.xyz]
+ * @copyright Crown Copyright 2019
+ * @license Apache-2.0
+ */
+
+import Operation from "../Operation.mjs";
+import OperationError from "../errors/OperationError.mjs";
+import unorm from "unorm";
+import {UNICODE_NORMALISATION_FORMS} from "../lib/ChrEnc";
+
+/**
+ * Normalise Unicode operation
+ */
+class NormaliseUnicode extends Operation {
+
+    /**
+     * NormaliseUnicode constructor
+     */
+    constructor() {
+        super();
+
+        this.name = "Normalise Unicode";
+        this.module = "UnicodeNormalisation";
+        this.description = "Transform Unicode to one of the Normalisation Form";
+        this.infoURL = "http://www.unicode.org/reports/tr15/";
+        this.inputType = "string";
+        this.outputType = "string";
+        this.args = [
+            {
+                name: "Normal Form",
+                type: "option",
+                value: UNICODE_NORMALISATION_FORMS
+            }
+        ];
+    }
+
+    /**
+     * @param {string} input
+     * @param {Object[]} args
+     * @returns {string}
+     */
+    run(input, args) {
+        const [normalForm] = args;
+        if (normalForm === "NFD") {
+            return unorm.nfd(input);
+        } else if (normalForm === "NFC") {
+            return unorm.nfc(input);
+        } else if (normalForm === "NFKD") {
+            return unorm.nfkd(input);
+        } else if (normalForm === "NFKC") {
+            return unorm.nfc(input);
+        }
+
+        throw new OperationError("Unknown Normalisation Form");
+    }
+
+}
+
+export default NormaliseUnicode;

+ 1 - 0
tests/operations/index.mjs

@@ -57,6 +57,7 @@ import "./tests/MS.mjs";
 import "./tests/Magic.mjs";
 import "./tests/Magic.mjs";
 import "./tests/MorseCode.mjs";
 import "./tests/MorseCode.mjs";
 import "./tests/NetBIOS.mjs";
 import "./tests/NetBIOS.mjs";
+import "./tests/NormaliseUnicode.mjs";
 import "./tests/OTP.mjs";
 import "./tests/OTP.mjs";
 import "./tests/PGP.mjs";
 import "./tests/PGP.mjs";
 import "./tests/PHP.mjs";
 import "./tests/PHP.mjs";

+ 54 - 0
tests/operations/tests/NormaliseUnicode.mjs

@@ -0,0 +1,54 @@
+/**
+ * Normalise Unicode tests.
+ *
+ * @author Matthieu [m@tthieux.xyz]
+ *
+ * @copyright Crown Copyright 2018
+ * @license Apache-2.0
+ */
+import TestRegister from "../../lib/TestRegister.mjs";
+
+TestRegister.addTests([
+    {
+        name: "Normalise Unicode - NFD",
+        input: "\u00c7\u0043\u0327\u2160",
+        expectedMatch: /C\u0327C\u0327\u2160/,
+        recipeConfig: [
+            {
+                op: "Normalise Unicode",
+                args: ["NFD"],
+            },
+        ],
+    }, {
+        name: "Normalise Unicode - NFC",
+        input: "\u00c7\u0043\u0327\u2160",
+        expectedMatch: /\u00C7\u00C7\u2160/,
+        recipeConfig: [
+            {
+                op: "Normalise Unicode",
+                args: ["NFC"],
+            },
+        ],
+    }, {
+        name: "Normalise Unicode - NFKD",
+        input: "\u00c7\u0043\u0327\u2160",
+        expectedMatch: /C\u0327C\u0327I/,
+        recipeConfig: [
+            {
+                op: "Normalise Unicode",
+                args: ["NFKD"],
+            },
+        ],
+    }, {
+        name: "Normalise Unicode - NFKC",
+        input: "\u00c7\u0043\u0327\u2160",
+        expectedMatch: /\u00C7\u00C7\u2160/,
+        recipeConfig: [
+            {
+                op: "Normalise Unicode",
+                args: ["NFKC"],
+            },
+        ],
+    },
+]);
+