@@ -1,463 +0,0 @@
-import { app, net } from "electron/main";
-import { existsSync } from "fs";
-import fs from "node:fs/promises";
-import path from "node:path";
-import { writeStream } from "../main/fs";
-import log from "../main/log";
-import { execAsync, isDev } from "../main/util";
-import { CustomErrors, Model, isModel } from "../types/ipc";
-import Tokenizer from "../utils/clip-bpe-ts/mod";
-import { getPlatform } from "../utils/common/platform";
-import { generateTempFilePath } from "../utils/temp";
-import { deleteTempFile } from "./ffmpeg";
-const jpeg = require("jpeg-js");
-
-const CLIP_MODEL_PATH_PLACEHOLDER = "CLIP_MODEL";
-const GGMLCLIP_PATH_PLACEHOLDER = "GGML_PATH";
-const INPUT_PATH_PLACEHOLDER = "INPUT";
-
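-// Invocation templates for the GGML CLIP binary. The placeholder tokens
-// above are substituted with the actual binary, model, and input paths
-// (or, for text, the query string) at invocation time.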
-const IMAGE_EMBEDDING_EXTRACT_CMD: string[] = [
-    GGMLCLIP_PATH_PLACEHOLDER,
-    "-mv",
-    CLIP_MODEL_PATH_PLACEHOLDER,
-    "--image",
-    INPUT_PATH_PLACEHOLDER,
-];
-
-const TEXT_EMBEDDING_EXTRACT_CMD: string[] = [
-    GGMLCLIP_PATH_PLACEHOLDER,
-    "-mt",
-    CLIP_MODEL_PATH_PLACEHOLDER,
-    "--text",
-    INPUT_PATH_PLACEHOLDER,
-];
-const ort = require("onnxruntime-node");
-
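-// Both the ggml and onnx variants of the text and image models are
-// downloaded from models.ente.io on demand and saved in the user data
-// directory; the expected byte sizes below are used to detect and redo
-// partial or corrupt downloads.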
-const TEXT_MODEL_DOWNLOAD_URL = {
-    ggml: "https://models.ente.io/clip-vit-base-patch32_ggml-text-model-f16.gguf",
-    onnx: "https://models.ente.io/clip-text-vit-32-uint8.onnx",
-};
-const IMAGE_MODEL_DOWNLOAD_URL = {
-    ggml: "https://models.ente.io/clip-vit-base-patch32_ggml-vision-model-f16.gguf",
-    onnx: "https://models.ente.io/clip-image-vit-32-float32.onnx",
-};
-
-const TEXT_MODEL_NAME = {
-    ggml: "clip-vit-base-patch32_ggml-text-model-f16.gguf",
-    onnx: "clip-text-vit-32-uint8.onnx",
-};
-const IMAGE_MODEL_NAME = {
-    ggml: "clip-vit-base-patch32_ggml-vision-model-f16.gguf",
-    onnx: "clip-image-vit-32-float32.onnx",
-};
-
-const IMAGE_MODEL_SIZE_IN_BYTES = {
-    ggml: 175957504, // 167.8 MB
-    onnx: 351468764, // 335.2 MB
-};
-const TEXT_MODEL_SIZE_IN_BYTES = {
-    ggml: 127853440, // 121.9 MB
-    onnx: 64173509, // 61.2 MB
-};
-
-/** Return the path where the given {@link modelName} is meant to be saved */
-const getModelSavePath = (modelName: string) =>
-    path.join(app.getPath("userData"), "models", modelName);
-
-async function downloadModel(saveLocation: string, url: string) {
-    // Ensure that the directory we're saving into exists.
-    const saveDir = path.dirname(saveLocation);
-    await fs.mkdir(saveDir, { recursive: true });
-    log.info("downloading clip model");
-    const res = await net.fetch(url);
-    if (!res.ok) throw new Error(`Failed to fetch ${url}: HTTP ${res.status}`);
-    await writeStream(saveLocation, res.body);
-    log.info("clip model downloaded");
-}
-
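-// Promise tracking an in-flight image model download, if any. Concurrent
-// callers of getClipImageModelPath await it instead of starting a second
-// download of the same model.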
-let imageModelDownloadInProgress: Promise<void> | null = null;
-
-const getClipImageModelPath = async (type: "ggml" | "onnx") => {
-    try {
-        const modelSavePath = getModelSavePath(IMAGE_MODEL_NAME[type]);
-        if (imageModelDownloadInProgress) {
-            log.info("waiting for image model download to finish");
-            await imageModelDownloadInProgress;
-        } else {
-            if (!existsSync(modelSavePath)) {
-                log.info("CLIP image model not found, downloading");
-                imageModelDownloadInProgress = downloadModel(
-                    modelSavePath,
-                    IMAGE_MODEL_DOWNLOAD_URL[type],
-                );
-                await imageModelDownloadInProgress;
-            } else {
-                const localFileSize = (await fs.stat(modelSavePath)).size;
-                if (localFileSize !== IMAGE_MODEL_SIZE_IN_BYTES[type]) {
-                    log.info(
-                        `CLIP image model size mismatch (got ${localFileSize} bytes), downloading again`,
-                    );
-                    imageModelDownloadInProgress = downloadModel(
-                        modelSavePath,
-                        IMAGE_MODEL_DOWNLOAD_URL[type],
-                    );
-                    await imageModelDownloadInProgress;
-                }
-            }
-        }
-        return modelSavePath;
-    } finally {
-        imageModelDownloadInProgress = null;
-    }
-};
-
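-// Unlike the image model, the text model download is not awaited: it runs
-// in the background while callers get a MODEL_DOWNLOAD_PENDING error and
-// are expected to retry later.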
-let textModelDownloadInProgress: boolean = false;
-
-const getClipTextModelPath = async (type: "ggml" | "onnx") => {
-    const modelSavePath = getModelSavePath(TEXT_MODEL_NAME[type]);
-    if (textModelDownloadInProgress) {
-        throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING);
-    } else {
-        if (!existsSync(modelSavePath)) {
-            log.info("CLIP text model not found, downloading");
-            textModelDownloadInProgress = true;
-            downloadModel(modelSavePath, TEXT_MODEL_DOWNLOAD_URL[type])
-                .catch((e) => {
-                    // log but otherwise ignore
-                    log.error("CLIP text model download failed", e);
-                })
-                .finally(() => {
-                    textModelDownloadInProgress = false;
-                });
-            throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING);
-        } else {
-            const localFileSize = (await fs.stat(modelSavePath)).size;
-            if (localFileSize !== TEXT_MODEL_SIZE_IN_BYTES[type]) {
-                log.info(
-                    `CLIP text model size mismatch (got ${localFileSize} bytes), downloading again`,
-                );
-                textModelDownloadInProgress = true;
-                downloadModel(modelSavePath, TEXT_MODEL_DOWNLOAD_URL[type])
-                    .catch((e) => {
-                        // log but otherwise ignore
-                        log.error("CLIP text model download failed", e);
-                    })
-                    .finally(() => {
-                        textModelDownloadInProgress = false;
-                    });
-                throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING);
-            }
-        }
-    }
-    return modelSavePath;
-};
-
-function getGGMLClipPath() {
-    return isDev
-        ? path.join("./build", `ggmlclip-${getPlatform()}`)
-        : path.join(process.resourcesPath, `ggmlclip-${getPlatform()}`);
-}
-
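-// Create an ONNX runtime inference session for the model at the given
-// path, using a single intra-op thread and with the CPU memory arena
-// disabled.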
-async function createOnnxSession(modelPath: string) {
-    return await ort.InferenceSession.create(modelPath, {
-        intraOpNumThreads: 1,
-        enableCpuMemArena: false,
-    });
-}
-
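-// Lazily created, cached promise for the ONNX image session, so that
-// concurrent callers share a single session instead of creating duplicates.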
-let onnxImageSessionPromise: Promise<any> | null = null;
-
-async function getOnnxImageSession() {
-    if (!onnxImageSessionPromise) {
-        onnxImageSessionPromise = (async () => {
-            const clipModelPath = await getClipImageModelPath("onnx");
-            return createOnnxSession(clipModelPath);
-        })();
-    }
-    return onnxImageSessionPromise;
-}
-
-let onnxTextSession: any = null;
-
-async function getOnnxTextSession() {
-    if (!onnxTextSession) {
-        const clipModelPath = await getClipTextModelPath("onnx");
-        onnxTextSession = await createOnnxSession(clipModelPath);
-    }
-    return onnxTextSession;
-}
-
-let tokenizer: Tokenizer | null = null;
-function getTokenizer() {
-    if (!tokenizer) {
-        tokenizer = new Tokenizer();
-    }
-    return tokenizer;
-}
-
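-// Compute the CLIP embedding of the given image. The raw image bytes are
-// written to a temporary file, which is handed to the model specific
-// implementation and deleted once we're done.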
-export const computeImageEmbedding = async (
-    model: Model,
-    imageData: Uint8Array,
-): Promise<Float32Array> => {
-    if (!isModel(model)) throw new Error(`Invalid CLIP model ${model}`);
-
-    let tempInputFilePath: string | null = null;
-    try {
-        tempInputFilePath = await generateTempFilePath("");
-        const imageStream = new Response(imageData.buffer).body;
-        await writeStream(tempInputFilePath, imageStream);
-        const embedding = await computeImageEmbedding_(
-            model,
-            tempInputFilePath,
-        );
-        return embedding;
-    } catch (err) {
-        if (isExecError(err)) {
-            const parsedExecError = parseExecError(err);
-            throw Error(parsedExecError);
-        } else {
-            throw err;
-        }
-    } finally {
-        if (tempInputFilePath) {
-            await deleteTempFile(tempInputFilePath);
-        }
-    }
-};
-
-const isExecError = (err: any) => {
-    return err.message.includes("Command failed:");
-};
-
-const parseExecError = (err: any) => {
-    const errMessage = err.message;
-    if (errMessage.includes("Bad CPU type in executable")) {
-        return CustomErrors.UNSUPPORTED_PLATFORM(
-            process.platform,
-            process.arch,
-        );
-    } else {
-        return errMessage;
-    }
-};
-
-async function computeImageEmbedding_(
-    model: Model,
-    inputFilePath: string,
-): Promise<Float32Array> {
-    if (!existsSync(inputFilePath)) {
-        throw new Error("Invalid file path");
-    }
-    switch (model) {
-        case "ggml-clip":
-            return await computeGGMLImageEmbedding(inputFilePath);
-        case "onnx-clip":
-            return await computeONNXImageEmbedding(inputFilePath);
-    }
-}
-
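-// Compute an image embedding by invoking the GGML CLIP binary and parsing
-// the embedding JSON from the last line of its stdout.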
-const computeGGMLImageEmbedding = async (
-    inputFilePath: string,
-): Promise<Float32Array> => {
-    const clipModelPath = await getClipImageModelPath("ggml");
-    const ggmlclipPath = getGGMLClipPath();
-    const cmd = IMAGE_EMBEDDING_EXTRACT_CMD.map((cmdPart) => {
-        if (cmdPart === GGMLCLIP_PATH_PLACEHOLDER) {
-            return ggmlclipPath;
-        } else if (cmdPart === CLIP_MODEL_PATH_PLACEHOLDER) {
-            return clipModelPath;
-        } else if (cmdPart === INPUT_PATH_PLACEHOLDER) {
-            return inputFilePath;
-        } else {
-            return cmdPart;
-        }
-    });
-
-    const { stdout } = await execAsync(cmd);
-    // The embedding is a JSON array on the last line of stdout.
-    const lines = stdout.split("\n");
-    const lastLine = lines[lines.length - 1];
-    const embedding = JSON.parse(lastLine);
-    return new Float32Array(embedding);
-};
-
-const computeONNXImageEmbedding = async (
-    inputFilePath: string,
-): Promise<Float32Array> => {
-    const imageSession = await getOnnxImageSession();
-    const t1 = Date.now();
-    const rgbData = await getRGBData(inputFilePath);
-    const feeds = {
-        input: new ort.Tensor("float32", rgbData, [1, 3, 224, 224]),
-    };
-    const t2 = Date.now();
-    const results = await imageSession.run(feeds);
-    log.info(
-        `onnx image embedding time: ${Date.now() - t1} ms (prep:${
-            t2 - t1
-        } ms, extraction: ${Date.now() - t2} ms)`,
-    );
-    const imageEmbedding = results["output"].data; // Float32Array
-    return normalizeEmbedding(imageEmbedding);
-};
-
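-// Decode the JPEG at inputFilePath and convert it into the 224 x 224 RGB
-// tensor data that the ONNX image model expects: bilinearly resized to fit
-// within 224 x 224, normalized with the CLIP mean and std, and laid out in
-// planar (channel-first) order.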
-async function getRGBData(inputFilePath: string) {
-    const jpegData = await fs.readFile(inputFilePath);
-    const rawImageData = jpeg.decode(jpegData, {
-        useTArray: true,
-        formatAsRGBA: false,
-    });
-
-    const nx: number = rawImageData.width;
-    const ny: number = rawImageData.height;
-    const inputImage: Uint8Array = rawImageData.data;
-
-    const nx2: number = 224;
-    const ny2: number = 224;
-    const totalSize: number = 3 * nx2 * ny2;
-
-    const result: number[] = Array(totalSize).fill(0);
-    const scale: number = Math.max(nx, ny) / 224;
-
-    const nx3: number = Math.round(nx / scale);
-    const ny3: number = Math.round(ny / scale);
-
-    const mean: number[] = [0.48145466, 0.4578275, 0.40821073];
-    const std: number[] = [0.26862954, 0.26130258, 0.27577711];
-
-    for (let y = 0; y < ny3; y++) {
-        for (let x = 0; x < nx3; x++) {
-            for (let c = 0; c < 3; c++) {
-                // Bilinear interpolation of the source pixel.
-                const sx: number = (x + 0.5) * scale - 0.5;
-                const sy: number = (y + 0.5) * scale - 0.5;
-
-                const x0: number = Math.max(0, Math.floor(sx));
-                const y0: number = Math.max(0, Math.floor(sy));
-
-                const x1: number = Math.min(x0 + 1, nx - 1);
-                const y1: number = Math.min(y0 + 1, ny - 1);
-
-                const dx: number = sx - x0;
-                const dy: number = sy - y0;
-
-                const j00: number = 3 * (y0 * nx + x0) + c;
-                const j01: number = 3 * (y0 * nx + x1) + c;
-                const j10: number = 3 * (y1 * nx + x0) + c;
-                const j11: number = 3 * (y1 * nx + x1) + c;
-
-                const v00: number = inputImage[j00];
-                const v01: number = inputImage[j01];
-                const v10: number = inputImage[j10];
-                const v11: number = inputImage[j11];
-
-                const v0: number = v00 * (1 - dx) + v01 * dx;
-                const v1: number = v10 * (1 - dx) + v11 * dx;
-
-                const v: number = v0 * (1 - dy) + v1 * dy;
-
-                const v2: number = Math.min(Math.max(Math.round(v), 0), 255);
-
-                // createTensorWithDataList is dumb compared to reshape, so
-                // the data has to be provided one channel after another.
-                const i: number = y * nx3 + x + c * 224 * 224;
-
-                result[i] = (v2 / 255 - mean[c]) / std[c];
-            }
-        }
-    }
-
-    return result;
-}
-
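-// Normalize the embedding to unit length (L2 norm), in place.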
-const normalizeEmbedding = (embedding: Float32Array) => {
-    let normalization = 0;
-    for (let index = 0; index < embedding.length; index++) {
-        normalization += embedding[index] * embedding[index];
-    }
-    const sqrtNormalization = Math.sqrt(normalization);
-    for (let index = 0; index < embedding.length; index++) {
-        embedding[index] = embedding[index] / sqrtNormalization;
-    }
-    return embedding;
-};
-
-export async function computeTextEmbedding(
-    model: Model,
-    text: string,
-): Promise<Float32Array> {
-    if (!isModel(model)) throw new Error(`Invalid CLIP model ${model}`);
-
-    try {
-        const embedding = await computeTextEmbedding_(model, text);
-        return embedding;
-    } catch (err) {
-        if (isExecError(err)) {
-            const parsedExecError = parseExecError(err);
-            throw Error(parsedExecError);
-        } else {
-            throw err;
-        }
-    }
-}
-
-async function computeTextEmbedding_(
-    model: Model,
-    text: string,
-): Promise<Float32Array> {
-    switch (model) {
-        case "ggml-clip":
-            return await computeGGMLTextEmbedding(text);
-        case "onnx-clip":
-            return await computeONNXTextEmbedding(text);
-    }
-}
-
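-// Compute a text embedding by invoking the GGML CLIP binary, substituting
-// the query text directly into the command template.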
-export async function computeGGMLTextEmbedding(
-    text: string,
-): Promise<Float32Array> {
-    const clipModelPath = await getClipTextModelPath("ggml");
-    const ggmlclipPath = getGGMLClipPath();
-    const cmd = TEXT_EMBEDDING_EXTRACT_CMD.map((cmdPart) => {
-        if (cmdPart === GGMLCLIP_PATH_PLACEHOLDER) {
-            return ggmlclipPath;
-        } else if (cmdPart === CLIP_MODEL_PATH_PLACEHOLDER) {
-            return clipModelPath;
-        } else if (cmdPart === INPUT_PATH_PLACEHOLDER) {
-            return text;
-        } else {
-            return cmdPart;
-        }
-    });
-
-    const { stdout } = await execAsync(cmd);
-    // The embedding is a JSON array on the last line of stdout.
-    const lines = stdout.split("\n");
-    const lastLine = lines[lines.length - 1];
-    const embedding = JSON.parse(lastLine);
-    return new Float32Array(embedding);
-}
-
-export async function computeONNXTextEmbedding(
-    text: string,
-): Promise<Float32Array> {
-    const textSession = await getOnnxTextSession();
-    const t1 = Date.now();
-    const tokenizer = getTokenizer();
-    const tokenizedText = Int32Array.from(tokenizer.encodeForCLIP(text));
-    const feeds = {
-        input: new ort.Tensor("int32", tokenizedText, [1, 77]),
-    };
-    const t2 = Date.now();
-    const results = await textSession.run(feeds);
-    log.info(
-        `onnx text embedding time: ${Date.now() - t1} ms (prep:${
-            t2 - t1
-        } ms, extraction: ${Date.now() - t2} ms)`,
-    );
-    const textEmbedding = results["output"].data; // Float32Array
-    return normalizeEmbedding(textEmbedding);
-}