add clip-bpe-ts

This commit is contained in:
Abhinav 2024-01-04 23:43:25 +05:30
parent 5b1421a8d1
commit 876f52cd70
4 changed files with 515 additions and 0 deletions

21
thirdparty/clip-bpe-ts/LICENSE vendored Normal file

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 josephrocca

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

26
thirdparty/clip-bpe-ts/README.md vendored Normal file

@@ -0,0 +1,26 @@
# CLIP Byte Pair Encoding JavaScript Port
A JavaScript port of [OpenAI's CLIP byte-pair-encoding tokenizer](https://github.com/openai/CLIP/blob/3bee28119e6b28e75b82b811b87b56935314e6a5/clip/simple_tokenizer.py).
```js
import Tokenizer from "https://deno.land/x/clip_bpe@v0.0.6/mod.js";
let t = new Tokenizer();
t.encode("hello") // [3306]
t.encode("magnificent") // [10724]
t.encode("magnificently") // [9725, 2922]
t.decode(t.encode("HELLO")) // "hello "
t.decode(t.encode("abc123")) // "abc 1 2 3 "
t.decode(t.encode("let's see here")) // "let 's see here "
t.encode("hello world!") // [3306, 1002, 256]
// to encode for CLIP: adds the start and end tokens, trims to a maximum of 77 tokens, and pads with zeros if fewer than 77 tokens:
t.encodeForCLIP("hello world!") // [49406,3306,1002,256,49407,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
```
This encoder/decoder behaves differently from the GPT-2/3 tokenizer (a JavaScript version of that is available [here](https://github.com/latitudegames/GPT-3-Encoder)). For example, it doesn't preserve capital letters, as shown above.
The [Python version](https://github.com/openai/CLIP/blob/3bee28119e6b28e75b82b811b87b56935314e6a5/clip/simple_tokenizer.py) of this tokenizer uses the `ftfy` module to clean up the text before encoding it. I didn't include that module by default because currently the only version available in JavaScript is [this one](https://github.com/josephrocca/ftfy-pyodide), which requires importing a full Python runtime as a WebAssembly module. If you want the `ftfy` cleaning, just import it and clean your text with it before passing it to the `.encode()` method.
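A minimal sketch of that workflow (the `ftfy_pyodide` and `clip_bpe` module URLs below are the ones referenced in this repo; treat the exact versions as an assumption):

```js
import ftfy from "https://deno.land/x/ftfy_pyodide@v0.1.1/mod.js";
import Tokenizer from "https://deno.land/x/clip_bpe@v0.0.6/mod.js";

let t = new Tokenizer();
// fix mojibake / badly decoded text before tokenizing
let cleaned = ftfy.fix_text("doesn´t look quite right");
t.encode(cleaned);
```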
# License
To the extent that there is any original work in this repo, it is MIT Licensed, just like [openai/CLIP](https://github.com/openai/CLIP).

thirdparty/clip-bpe-ts/bpe_simple_vocab_16e6.mjs vendored Normal file

File diff suppressed because one or more lines are too long

466
thirdparty/clip-bpe-ts/mod.ts vendored Normal file

@@ -0,0 +1,466 @@
import * as htmlEntities from 'html-entities';
import bpeVocabData from './bpe_simple_vocab_16e6.mjs';
// import ftfy from "https://deno.land/x/ftfy_pyodide@v0.1.1/mod.js";
function ord(c: string) {
return c.charCodeAt(0);
}
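// Python-style range(): supports range(stop) and range(start, stop, step).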
function range(start: number, stop?: number, step: number = 1) {
if (stop === undefined) {
stop = start;
start = 0;
}
if ((step > 0 && start >= stop) || (step < 0 && start <= stop)) {
return [];
}
const result: number[] = [];
for (let i = start; step > 0 ? i < stop : i > stop; i += step) {
result.push(i);
}
return result;
}
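// Builds the byte-to-unicode table used by the CLIP/GPT-2 BPE: every byte value 0-255
// maps to a printable unicode character, with non-printable bytes remapped to code points 256 and up.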
function bytesToUnicode() {
const bs = [
...range(ord('!'), ord('~') + 1),
...range(ord('¡'), ord('¬') + 1),
...range(ord('®'), ord('ÿ') + 1),
];
const cs = bs.slice(0);
let n = 0;
for (const b of range(2 ** 8)) {
if (!bs.includes(b)) {
bs.push(b);
cs.push(2 ** 8 + n);
n += 1;
}
}
const csString = cs.map((n) => String.fromCharCode(n));
return Object.fromEntries(bs.map((v, i) => [v, csString[i]]));
}
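// Returns the adjacent symbol pairs in a word (a word is an array of symbols, e.g. ['h', 'e', 'llo</w>']).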
function getPairs(word: string | any[]) {
const pairs: [string, string][] = [];
let prevChar = word[0];
for (const char of word.slice(1)) {
pairs.push([prevChar, char]);
prevChar = char;
}
return pairs;
}
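// Decodes HTML entities twice, mirroring the double html.unescape in the Python reference, then trims whitespace.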
function basicClean(text: string) {
// text = ftfy.fix_text(text);
text = htmlEntities.decode(htmlEntities.decode(text));
return text.trim();
}
function whitespaceClean(text: string) {
return text.replace(/\s+/g, ' ').trim();
}
export default class {
byteEncoder;
byteDecoder: {
[k: string]: number;
};
encoder;
decoder: any;
bpeRanks: any;
cache: Record<string, string>;
pat: RegExp;
constructor() {
this.byteEncoder = bytesToUnicode();
this.byteDecoder = Object.fromEntries(
Object.entries(this.byteEncoder).map(([k, v]) => [v, Number(k)])
);
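// Keep the same subset of merge rules as the Python tokenizer: drop the header line and keep the first 49152-256-2 merges.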
let merges = bpeVocabData.text.split('\n');
merges = merges.slice(1, 49152 - 256 - 2 + 1);
const mergedMerges = merges.map((merge) => merge.split(' '));
// There was a bug related to the ordering of Python's .values() output. I'm lazy so I've just copy-pasted the Python output:
let vocab = [
'!',
'"',
'#',
'$',
'%',
'&',
"'",
'(',
')',
'*',
'+',
',',
'-',
'.',
'/',
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9',
':',
';',
'<',
'=',
'>',
'?',
'@',
'A',
'B',
'C',
'D',
'E',
'F',
'G',
'H',
'I',
'J',
'K',
'L',
'M',
'N',
'O',
'P',
'Q',
'R',
'S',
'T',
'U',
'V',
'W',
'X',
'Y',
'Z',
'[',
'\\',
']',
'^',
'_',
'`',
'a',
'b',
'c',
'd',
'e',
'f',
'g',
'h',
'i',
'j',
'k',
'l',
'm',
'n',
'o',
'p',
'q',
'r',
's',
't',
'u',
'v',
'w',
'x',
'y',
'z',
'{',
'|',
'}',
'~',
'¡',
'¢',
'£',
'¤',
'¥',
'¦',
'§',
'¨',
'©',
'ª',
'«',
'¬',
'®',
'¯',
'°',
'±',
'²',
'³',
'´',
'µ',
'¶',
'·',
'¸',
'¹',
'º',
'»',
'¼',
'½',
'¾',
'¿',
'À',
'Á',
'Â',
'Ã',
'Ä',
'Å',
'Æ',
'Ç',
'È',
'É',
'Ê',
'Ë',
'Ì',
'Í',
'Î',
'Ï',
'Ð',
'Ñ',
'Ò',
'Ó',
'Ô',
'Õ',
'Ö',
'×',
'Ø',
'Ù',
'Ú',
'Û',
'Ü',
'Ý',
'Þ',
'ß',
'à',
'á',
'â',
'ã',
'ä',
'å',
'æ',
'ç',
'è',
'é',
'ê',
'ë',
'ì',
'í',
'î',
'ï',
'ð',
'ñ',
'ò',
'ó',
'ô',
'õ',
'ö',
'÷',
'ø',
'ù',
'ú',
'û',
'ü',
'ý',
'þ',
'ÿ',
'Ā',
'ā',
'Ă',
'ă',
'Ą',
'ą',
'Ć',
'ć',
'Ĉ',
'ĉ',
'Ċ',
'ċ',
'Č',
'č',
'Ď',
'ď',
'Đ',
'đ',
'Ē',
'ē',
'Ĕ',
'ĕ',
'Ė',
'ė',
'Ę',
'ę',
'Ě',
'ě',
'Ĝ',
'ĝ',
'Ğ',
'ğ',
'Ġ',
'ġ',
'Ģ',
'ģ',
'Ĥ',
'ĥ',
'Ħ',
'ħ',
'Ĩ',
'ĩ',
'Ī',
'ī',
'Ĭ',
'ĭ',
'Į',
'į',
'İ',
'ı',
'IJ',
'ij',
'Ĵ',
'ĵ',
'Ķ',
'ķ',
'ĸ',
'Ĺ',
'ĺ',
'Ļ',
'ļ',
'Ľ',
'ľ',
'Ŀ',
'ŀ',
'Ł',
'ł',
'Ń',
];
vocab = [...vocab, ...vocab.map((v) => v + '</w>')];
for (const merge of mergedMerges) {
vocab.push(merge.join(''));
}
vocab.push('<|startoftext|>', '<|endoftext|>');
this.encoder = Object.fromEntries(vocab.map((v, i) => [v, i]));
this.decoder = Object.fromEntries(
Object.entries(this.encoder).map(([k, v]) => [v, k])
);
this.bpeRanks = Object.fromEntries(
mergedMerges.map((v, i) => [v.join('·😎·'), i])
); // ·😎· because js doesn't yet have tuples
this.cache = {
'<|startoftext|>': '<|startoftext|>',
'<|endoftext|>': '<|endoftext|>',
};
this.pat =
/<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+/giu;
}
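// Applies BPE to a single token: starting from individual characters (with '</w>' appended to the last one),
// repeatedly merge the adjacent pair with the lowest merge rank until no rankable pair remains. Results are cached per token.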
bpe(token: string) {
if (this.cache[token] !== undefined) {
return this.cache[token];
}
let word = [...token.slice(0, -1), token.slice(-1) + '</w>'];
let pairs = getPairs(word);
if (pairs.length === 0) {
return token + '</w>';
}
// eslint-disable-next-line no-constant-condition
while (1) {
let bigram: [string, string] | null = null;
let minRank = Infinity;
for (const p of pairs) {
const r = this.bpeRanks[p.join('·😎·')];
if (r === undefined) continue;
if (r < minRank) {
minRank = r;
bigram = p;
}
}
if (bigram === null) {
break;
}
const [first, second] = bigram;
const newWord: string[] = [];
let i = 0;
while (i < word.length) {
const j = word.indexOf(first, i);
if (j === -1) {
newWord.push(...word.slice(i));
break;
}
newWord.push(...word.slice(i, j));
i = j;
if (
word[i] === first &&
i < word.length - 1 &&
word[i + 1] === second
) {
newWord.push(first + second);
i += 2;
} else {
newWord.push(word[i]);
i += 1;
}
}
word = newWord;
if (word.length === 1) {
break;
} else {
pairs = getPairs(word);
}
}
const joinedWord = word.join(' ');
this.cache[token] = joinedWord;
return joinedWord;
}
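// Cleans and lowercases the text, splits it with the token pattern, byte-encodes each token,
// runs BPE, and maps the resulting subwords to vocabulary ids.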
encode(text: string) {
const bpeTokens: number[] = [];
text = whitespaceClean(basicClean(text)).toLowerCase();
for (let token of [...text.matchAll(this.pat)].map((m) => m[0])) {
token = [...token]
.map((b) => this.byteEncoder[b.charCodeAt(0) as number])
.join('');
bpeTokens.push(
...this.bpe(token)
.split(' ')
.map((bpeToken: string) => this.encoder[bpeToken])
);
}
return bpeTokens;
}
// Adds the start and end tokens, trims long inputs, and pads with 0s so the result is exactly 77 tokens long.
encodeForCLIP(text: string) {
let tokens = this.encode(text);
tokens.unshift(49406); // start token
tokens = tokens.slice(0, 76);
tokens.push(49407); // end token
while (tokens.length < 77) tokens.push(0);
return tokens;
}
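// Maps token ids back to vocabulary strings, undoes the byte-to-unicode mapping,
// and turns the '</w>' end-of-word markers into spaces.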
decode(tokens: any[]) {
let text = tokens
.map((token: string | number) => this.decoder[token])
.join('');
text = [...text]
.map((c) => this.byteDecoder[c])
.map((v) => String.fromCharCode(v))
.join('')
.replace(/<\/w>/g, ' ');
return text;
}
}