浏览代码

Magic operation now detects UTF8 and gives a probability score for each language

n1474335 7 年之前
父节点
当前提交
6624f25a64
共有 5 个文件被更改,包括 131 次插入和 24 次删除
  1. 1 1
      .eslintignore
  2. 13 0
      package-lock.json
  3. 1 0
      package.json
  4. 13 9
      src/core/FlowControl.js
  5. 103 14
      src/core/lib/Magic.js

+ 1 - 1
.eslintignore

@@ -1,3 +1,3 @@
 src/core/lib/**
 !src/core/lib/Magic.js
-src/core/config/MetaConfig.js
+src/core/config/MetaConfig.js

+ 13 - 0
package-lock.json

@@ -1404,6 +1404,14 @@
         "supports-color": "2.0.0"
       }
     },
+    "chi-squared": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/chi-squared/-/chi-squared-1.1.0.tgz",
+      "integrity": "sha1-iShlz/qOCnIPkhv8nGNcGawqNG0=",
+      "requires": {
+        "gamma": "1.0.0"
+      }
+    },
     "chokidar": {
       "version": "1.7.0",
       "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz",
@@ -4255,6 +4263,11 @@
       "integrity": "sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=",
       "dev": true
     },
+    "gamma": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/gamma/-/gamma-1.0.0.tgz",
+      "integrity": "sha1-mDwck5/iPZMnAVhXEeHZpDDLdMs="
+    },
     "get-caller-file": {
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.2.tgz",

+ 1 - 0
package.json

@@ -72,6 +72,7 @@
     "bootstrap": "^3.3.7",
     "bootstrap-colorpicker": "^2.5.2",
     "bootstrap-switch": "^3.3.4",
+    "chi-squared": "^1.1.0",
     "crypto-api": "^0.7.5",
     "crypto-js": "^3.1.9-1",
     "diff": "^3.4.0",

+ 13 - 9
src/core/FlowControl.js

@@ -278,8 +278,7 @@ const FlowControl = {
             <tr>
                 <th>Recipe (click to load)</th>
                 <th>Data snippet</th>
-                <th>Most likely language\n(lower scores are better)</th>
-                <th>File type</th>
+                <th>Properties</th>
             </tr>`;
 
         options.forEach(option => {
@@ -290,20 +289,25 @@ const FlowControl = {
                     .concat(currentRecipeConfig.slice(state.progress + 1)),
                 recipeURL = "recipe=" + Utils.encodeURIFragment(Utils.generatePrettyRecipe(recipeConfig));
 
-            const language = option.languageScores[0];
-            let fileType = "Unknown";
+            const bestLanguage = option.languageScores[0];
+            let language = "Unknown",
+                fileType = "Unknown";
+
+            if (bestLanguage.probability > 0.00005) {
+                language = Magic.codeToLanguage(bestLanguage.lang) + " " +
+                    (bestLanguage.probability * 100).toFixed(2) + "%";
+            }
 
             if (option.fileType) {
-                fileType = `Extension: ${option.fileType.ext}\nMime type: ${option.fileType.mime}`;
-                if (option.fileType.desc)
-                    fileType += `\nDescription: ${option.fileType.desc}`;
+                fileType = `${option.fileType.mime} (${option.fileType.ext})`;
             }
 
             output += `<tr>
                 <td><a href="#${recipeURL}">${Utils.generatePrettyRecipe(option.recipe, true)}</a></td>
                 <td>${Utils.escapeHtml(Utils.printable(Utils.truncate(option.data, 99)))}</td>
-                <td>${Magic.codeToLanguage(language.lang)}\nScore: ${language.chiSqr.toFixed()}</td>
-                <td>${fileType}</td>
+                <td>Language: ${language}
+File type: ${fileType}
+Valid UTF8: ${option.isUTF8}</td>
             </tr>`;
         });
 

+ 103 - 14
src/core/lib/Magic.js

@@ -3,6 +3,7 @@ import Utils from "../Utils.js";
 import Recipe from "../Recipe.js";
 import Dish from "../Dish.js";
 import FileType from "../operations/FileType.js";
+import chiSquared from "chi-squared";
 
 
 /**
@@ -19,11 +20,12 @@ class Magic {
      * Magic constructor.
      *
      * @param {ArrayBuffer} buf
+     * @param {Object[]} [opPatterns]
      */
-    constructor(buf) {
+    constructor(buf, opPatterns) {
         this.inputBuffer = new Uint8Array(buf);
         this.inputStr = Utils.arrayBufferToStr(buf);
-        this.opPatterns = Magic._generateOpPatterns();
+        this.opPatterns = opPatterns || Magic._generateOpPatterns();
     }
 
     /**
@@ -58,15 +60,17 @@ class Magic {
         let chiSqrs = [];
 
         for (let lang in LANG_FREQS) {
+            let [score, prob] = Magic._chiSqr(inputFreq, LANG_FREQS[lang]);
             chiSqrs.push({
                 lang: lang,
-                chiSqr: Magic._chiSqr(inputFreq, LANG_FREQS[lang])
+                score: score,
+                probability: prob
             });
         }
 
         // Sort results so that the most likely match is at the top
         chiSqrs.sort((a, b) => {
-            return a.chiSqr - b.chiSqr;
+            return a.score - b.score;
         });
 
         return chiSqrs;
@@ -84,6 +88,81 @@ class Magic {
         return FileType.magicType(this.inputBuffer);
     }
 
+    /**
+     * Detects whether the input buffer is valid UTF8.
+     *
+     * Walks the buffer one encoded character at a time and accepts only:
+     *   - single bytes 0x09, 0x0A, 0x0D and 0x20-0x7E (any other byte below
+     *     0x80, e.g. 0x00 or 0x7F, makes the check fail)
+     *   - 2-byte sequences with lead byte 0xC2-0xDF
+     *   - 3-byte sequences with lead byte 0xE0-0xEF, with the second byte
+     *     narrowed for 0xE0 (0xA0-0xBF) and 0xED (0x80-0x9F)
+     *   - 4-byte sequences with lead byte 0xF0-0xF4, with the second byte
+     *     narrowed for 0xF0 (0x90-0xBF) and 0xF4 (0x80-0x8F)
+     * Every remaining continuation byte must be in the range 0x80-0xBF.
+     * An empty buffer is reported as valid because the loop never runs.
+     *
+     * @returns {boolean} true if every byte belongs to an accepted sequence,
+     *     false as soon as one does not
+     */
+    isUTF8() {
+        const bytes = new Uint8Array(this.inputBuffer);
+        let i = 0; // index of the lead byte of the sequence being checked
+        while (i < bytes.length) {
+            if (( // single byte: tab, LF, CR or 0x20-0x7E
+                bytes[i] === 0x09 ||
+                bytes[i] === 0x0A ||
+                bytes[i] === 0x0D ||
+                (0x20 <= bytes[i] && bytes[i] <= 0x7E)
+            )) {
+                i += 1;
+                continue;
+            }
+
+            if (( // non-overlong 2-byte (lead bytes 0xC0 and 0xC1 are not accepted)
+                (0xC2 <= bytes[i] && bytes[i] <= 0xDF) &&
+                (0x80 <= bytes[i+1] && bytes[i+1] <= 0xBF)
+            )) {
+                i += 2;
+                continue;
+            }
+
+            if (( // 3-byte with lead 0xE0: second byte starts at 0xA0 to exclude overlongs
+                bytes[i] === 0xE0 &&
+                (0xA0 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
+                (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF)
+            ) ||
+            ( // straight 3-byte: lead 0xE1-0xEC, 0xEE or 0xEF
+                ((0xE1 <= bytes[i] && bytes[i] <= 0xEC) ||
+                bytes[i] === 0xEE ||
+                bytes[i] === 0xEF) &&
+                (0x80 <= bytes[i + 1] && bytes[i+1] <= 0xBF) &&
+                (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
+            ) ||
+            ( // 3-byte with lead 0xED: second byte stops at 0x9F to exclude surrogates
+                bytes[i] === 0xED &&
+                (0x80 <= bytes[i+1] && bytes[i+1] <= 0x9F) &&
+                (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
+            )) {
+                i += 3;
+                continue;
+            }
+
+            if (( // planes 1-3: lead 0xF0, second byte starts at 0x90
+                bytes[i] === 0xF0 &&
+                (0x90 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
+                (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
+                (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
+            ) ||
+            ( // planes 4-15: lead 0xF1-0xF3
+                (0xF1 <= bytes[i] && bytes[i] <= 0xF3) &&
+                (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
+                (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
+                (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
+            ) ||
+            ( // plane 16: lead 0xF4, second byte stops at 0x8F
+                bytes[i] === 0xF4 &&
+                (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0x8F) &&
+                (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
+                (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
+            )) {
+                i += 4;
+                continue;
+            }
+
+            return false; // no pattern matched; this includes a sequence cut off by the end of the buffer
+        }
+
+        return true;
+    }
 
     /**
      * Speculatively executes matching operations, recording metadata of each result.
@@ -103,6 +182,7 @@ class Magic {
             data: this.inputStr.slice(0, 100),
             languageScores: this.detectLanguage(),
             fileType: this.detectFileType(),
+            isUTF8: this.isUTF8()
         });
 
         // Find any operations that can be run on this data
@@ -122,7 +202,7 @@ class Magic {
             const recipe = new Recipe([opConfig]);
             await recipe.execute(dish, 0);
 
-            const magic = new Magic(dish.get(Dish.ARRAY_BUFFER)),
+            const magic = new Magic(dish.get(Dish.ARRAY_BUFFER), this.opPatterns),
                 speculativeResults = await magic.speculativeExecution(depth-1, [...recipeConfig, opConfig]);
 
             results = results.concat(speculativeResults);
@@ -131,13 +211,17 @@ class Magic {
         // Return a sorted list of possible recipes along with their properties
         return results.sort((a, b) => {
             // Each option is sorted based on its most likely language (lower is better)
-            let aScore = a.languageScores[0].chiSqr,
-                bScore = b.languageScores[0].chiSqr;
+            let aScore = a.languageScores[0].score,
+                bScore = b.languageScores[0].score;
 
             // If a recipe results in a file being detected, it receives a relatively good score
             if (a.fileType) aScore = 500;
             if (b.fileType) bScore = 500;
 
+            // If the result is valid UTF8, its score gets boosted (lower being better)
+            if (a.isUTF8) aScore -= 100;
+            if (b.isUTF8) bScore -= 100;
+
             return aScore - bScore;
         });
     }
@@ -194,19 +278,24 @@ class Magic {
      * https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
      *
      * @private
-     * @param {number[]} observed 
-     * @param {number[]} expected 
-     * @returns {number}
+     * @param {number[]} observed
+     * @param {number[]} expected
+     * @param {number} ddof - Delta degrees of freedom
+     * @returns {number[]} - The score and the probability
      */
-    static _chiSqr(observed, expected) {
+    static _chiSqr(observed, expected, ddof=0) {
         let tmp,
-            res = 0;
+            score = 0;
 
         for (let i = 0; i < observed.length; i++) {
             tmp = observed[i] - expected[i];
-            res += tmp * tmp / expected[i];
+            score += tmp * tmp / expected[i];
         }
-        return res;
+
+        return [
+            score,
+            1 - chiSquared.cdf(score, observed.length - 1 - ddof)
+        ];
     }
 
     /**