add clip-bpe-ts

This commit is contained in:
Abhinav 2024-01-04 23:43:25 +05:30
parent 5b1421a8d1
commit 876f52cd70
4 changed files with 515 additions and 0 deletions

21
thirdparty/clip-bpe-ts/LICENSE vendored Normal file

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 josephrocca

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

26
thirdparty/clip-bpe-ts/README.md vendored Normal file

@@ -0,0 +1,26 @@
# CLIP Byte Pair Encoding JavaScript Port
A JavaScript port of [OpenAI's CLIP byte-pair-encoding tokenizer](https://github.com/openai/CLIP/blob/3bee28119e6b28e75b82b811b87b56935314e6a5/clip/simple_tokenizer.py).
```js
import Tokenizer from "https://deno.land/x/clip_bpe@v0.0.6/mod.js";
let t = new Tokenizer();
t.encode("hello") // [3306]
t.encode("magnificent") // [10724]
t.encode("magnificently") // [9725, 2922]
t.decode(t.encode("HELLO")) // "hello "
t.decode(t.encode("abc123")) // "abc 1 2 3 "
t.decode(t.encode("let's see here")) // "let 's see here "
t.encode("hello world!") // [3306, 1002, 256]
// to encode for CLIP: adds the start and end tokens, trims to a maximum of 77 tokens, and pads with zeros if fewer than 77 tokens:
t.encodeForCLIP("hello world!") // [49406,3306,1002,256,49407,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
```
This encoder/decoder behaves differently from the GPT-2/3 tokenizer (a JavaScript version of that is available [here](https://github.com/latitudegames/GPT-3-Encoder)). For example, it doesn't preserve capital letters, as shown above.
The [Python version](https://github.com/openai/CLIP/blob/3bee28119e6b28e75b82b811b87b56935314e6a5/clip/simple_tokenizer.py) of this tokenizer uses the `ftfy` module to clean up the text before encoding it. I didn't include that module by default because currently the only version available in JavaScript is [this one](https://github.com/josephrocca/ftfy-pyodide), which requires importing a full Python runtime as a WebAssembly module. If you want the `ftfy` cleaning, just import it and clean your text with it before passing it to the `.encode()` method.
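A minimal sketch of that workflow (the `ftfy_pyodide` and `clip_bpe` module URLs below are the ones referenced in this repo; treat the exact versions as an assumption):

```js
import ftfy from "https://deno.land/x/ftfy_pyodide@v0.1.1/mod.js";
import Tokenizer from "https://deno.land/x/clip_bpe@v0.0.6/mod.js";

let t = new Tokenizer();
// fix mojibake / badly decoded text before tokenizing
let cleaned = ftfy.fix_text("doesn´t look quite right");
t.encode(cleaned);
```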
# License
To the extent that there is any original work in this repo, it is MIT Licensed, just like [openai/CLIP](https://github.com/openai/CLIP).

thirdparty/clip-bpe-ts/bpe_simple_vocab_16e6.mjs vendored Normal file

File diff suppressed because one or more lines are too long

466
thirdparty/clip-bpe-ts/mod.ts vendored Normal file

@@ -0,0 +1,466 @@
import * as htmlEntities from 'html-entities';
import bpeVocabData from './bpe_simple_vocab_16e6.mjs';
// import ftfy from "https://deno.land/x/ftfy_pyodide@v0.1.1/mod.js";
function ord(c: string) {
return c.charCodeAt(0);
}
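// Python-style range(): supports range(stop) and range(start, stop, step).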
function range(start: number, stop?: number, step: number = 1) {
if (stop === undefined) {
stop = start;
start = 0;
}
if ((step > 0 && start >= stop) || (step < 0 && start <= stop)) {
return [];
}
const result: number[] = [];
for (let i = start; step > 0 ? i < stop : i > stop; i += step) {
result.push(i);
}
return result;
}
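// Builds the byte-to-unicode table used by the CLIP/GPT-2 BPE: every byte value 0-255
// maps to a printable unicode character, with non-printable bytes remapped to code points 256 and up.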
function bytesToUnicode() {
const bs = [
...range(ord('!'), ord('~') + 1),
...range(ord('¡'), ord('¬') + 1),
...range(ord('®'), ord('ÿ') + 1),
];
const cs = bs.slice(0);
let n = 0;
for (const b of range(2 ** 8)) {
if (!bs.includes(b)) {
bs.push(b);
cs.push(2 ** 8 + n);
n += 1;
}
}
const csString = cs.map((n) => String.fromCharCode(n));
return Object.fromEntries(bs.map((v, i) => [v, csString[i]]));
}
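// Returns the adjacent symbol pairs in a word (a word is an array of symbols, e.g. ['h', 'e', 'llo</w>']).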
function getPairs(word: string | any[]) {
const pairs: [string, string][] = [];
let prevChar = word[0];
for (const char of word.slice(1)) {
pairs.push([prevChar, char]);
prevChar = char;
}
return pairs;
}
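// Decodes HTML entities twice, mirroring the double html.unescape in the Python reference, then trims whitespace.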
function basicClean(text: string) {
// text = ftfy.fix_text(text);
text = htmlEntities.decode(htmlEntities.decode(text));
return text.trim();
}
function whitespaceClean(text: string) {
return text.replace(/\s+/g, ' ').trim();
}
export default class {
byteEncoder;
byteDecoder: {
[k: string]: number;
};
encoder;
decoder: any;
bpeRanks: any;
cache: Record<string, string>;
pat: RegExp;
constructor() {
this.byteEncoder = bytesToUnicode();
this.byteDecoder = Object.fromEntries(
Object.entries(this.byteEncoder).map(([k, v]) => [v, Number(k)])
);
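// Keep the same subset of merge rules as the Python tokenizer: drop the header line and keep the first 49152-256-2 merges.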
let merges = bpeVocabData.text.split('\n');
merges = merges.slice(1, 49152 - 256 - 2 + 1);
const mergedMerges = merges.map((merge) => merge.split(' '));
// There was a bug related to the ordering of Python's .values() output. I'm lazy so I've just copy-pasted the Python output:
let vocab = [
'!',
'"',
'#',
'$',
'%',
'&',
"'",
'(',
')',
'*',
'+',
',',
'-',
'.',
'/',
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9',
':',
';',
'<',
'=',
'>',
'?',
'@',
'A',
'B',
'C',
'D',
'E',
'F',
'G',
'H',
'I',
'J',
'K',
'L',
'M',
'N',
'O',
'P',
'Q',
'R',
'S',
'T',
'U',
'V',
'W',
'X',
'Y',
'Z',
'[',
'\\',
']',
'^',
'_',
'`',
'a',
'b',
'c',
'd',
'e',
'f',
'g',
'h',
'i',
'j',
'k',
'l',
'm',
'n',
'o',
'p',
'q',
'r',
's',
't',
'u',
'v',
'w',
'x',
'y',
'z',
'{',
'|',
'}',
'~',
'¡',
'¢',
'£',
'¤',
'¥',
'¦',
'§',
'¨',
'©',
'ª',
'«',
'¬',
'®',
'¯',
'°',
'±',
'²',
'³',
'´',
'µ',
'¶',
'·',
'¸',
'¹',
'º',
'»',
'¼',
'½',
'¾',
'¿',
'À',
'Á',
'Â',
'Ã',
'Ä',
'Å',
'Æ',
'Ç',
'È',
'É',
'Ê',
'Ë',
'Ì',
'Í',
'Î',
'Ï',
'Ð',
'Ñ',
'Ò',
'Ó',
'Ô',
'Õ',
'Ö',
'×',
'Ø',
'Ù',
'Ú',
'Û',
'Ü',
'Ý',
'Þ',
'ß',
'à',
'á',
'â',
'ã',
'ä',
'å',
'æ',
'ç',
'è',
'é',
'ê',
'ë',
'ì',
'í',
'î',
'ï',
'ð',
'ñ',
'ò',
'ó',
'ô',
'õ',
'ö',
'÷',
'ø',
'ù',
'ú',
'û',
'ü',
'ý',
'þ',
'ÿ',
'Ā',
'ā',
'Ă',
'ă',
'Ą',
'ą',
'Ć',
'ć',
'Ĉ',
'ĉ',
'Ċ',
'ċ',
'Č',
'č',
'Ď',
'ď',
'Đ',
'đ',
'Ē',
'ē',
'Ĕ',
'ĕ',
'Ė',
'ė',
'Ę',
'ę',
'Ě',
'ě',
'Ĝ',
'ĝ',
'Ğ',
'ğ',
'Ġ',
'ġ',
'Ģ',
'ģ',
'Ĥ',
'ĥ',
'Ħ',
'ħ',
'Ĩ',
'ĩ',
'Ī',
'ī',
'Ĭ',
'ĭ',
'Į',
'į',
'İ',
'ı',
'IJ',
'ij',
'Ĵ',
'ĵ',
'Ķ',
'ķ',
'ĸ',
'Ĺ',
'ĺ',
'Ļ',
'ļ',
'Ľ',
'ľ',
'Ŀ',
'ŀ',
'Ł',
'ł',
'Ń',
];
vocab = [...vocab, ...vocab.map((v) => v + '</w>')];
for (const merge of mergedMerges) {
vocab.push(merge.join(''));
}
vocab.push('<|startoftext|>', '<|endoftext|>');
this.encoder = Object.fromEntries(vocab.map((v, i) => [v, i]));
this.decoder = Object.fromEntries(
Object.entries(this.encoder).map(([k, v]) => [v, k])
);
this.bpeRanks = Object.fromEntries(
mergedMerges.map((v, i) => [v.join('·😎·'), i])
); // ·😎· because js doesn't yet have tuples
this.cache = {
'<|startoftext|>': '<|startoftext|>',
'<|endoftext|>': '<|endoftext|>',
};
this.pat =
/<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+/giu;
}
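// Applies BPE to a single token: starting from individual characters (with '</w>' appended to the last one),
// repeatedly merge the adjacent pair with the lowest merge rank until no rankable pair remains. Results are cached per token.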
bpe(token: string) {
if (this.cache[token] !== undefined) {
return this.cache[token];
}
let word = [...token.slice(0, -1), token.slice(-1) + '</w>'];
let pairs = getPairs(word);
if (pairs.length === 0) {
return token + '</w>';
}
// eslint-disable-next-line no-constant-condition
while (1) {
let bigram: [string, string] | null = null;
let minRank = Infinity;
for (const p of pairs) {
const r = this.bpeRanks[p.join('·😎·')];
if (r === undefined) continue;
if (r < minRank) {
minRank = r;
bigram = p;
}
}
if (bigram === null) {
break;
}
const [first, second] = bigram;
const newWord: string[] = [];
let i = 0;
while (i < word.length) {
const j = word.indexOf(first, i);
if (j === -1) {
newWord.push(...word.slice(i));
break;
}
newWord.push(...word.slice(i, j));
i = j;
if (
word[i] === first &&
i < word.length - 1 &&
word[i + 1] === second
) {
newWord.push(first + second);
i += 2;
} else {
newWord.push(word[i]);
i += 1;
}
}
word = newWord;
if (word.length === 1) {
break;
} else {
pairs = getPairs(word);
}
}
const joinedWord = word.join(' ');
this.cache[token] = joinedWord;
return joinedWord;
}
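// Cleans and lowercases the text, splits it with the token pattern, byte-encodes each token,
// runs BPE, and maps the resulting subwords to vocabulary ids.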
encode(text: string) {
const bpeTokens: number[] = [];
text = whitespaceClean(basicClean(text)).toLowerCase();
for (let token of [...text.matchAll(this.pat)].map((m) => m[0])) {
token = [...token]
.map((b) => this.byteEncoder[b.charCodeAt(0) as number])
.join('');
bpeTokens.push(
...this.bpe(token)
.split(' ')
.map((bpeToken: string) => this.encoder[bpeToken])
);
}
return bpeTokens;
}
// Adds the start and end tokens, trims long inputs, and pads with 0s so the result is exactly 77 tokens long.
encodeForCLIP(text: string) {
let tokens = this.encode(text);
tokens.unshift(49406); // start token
tokens = tokens.slice(0, 76);
tokens.push(49407); // end token
while (tokens.length < 77) tokens.push(0);
return tokens;
}
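// Maps token ids back to vocabulary strings, undoes the byte-to-unicode mapping,
// and turns the '</w>' end-of-word markers into spaces.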
decode(tokens: any[]) {
let text = tokens
.map((token: string | number) => this.decoder[token])
.join('');
text = [...text]
.map((c) => this.byteDecoder[c])
.map((v) => String.fromCharCode(v))
.join('')
.replace(/<\/w>/g, ' ');
return text;
}
}