Document

This commit is contained in:
parent 4f76cfb912
commit 544e6be3fd

2 changed files with 56 additions and 1 deletion
@@ -21,6 +21,47 @@ export interface ClipExtractionStatus {
    indexed: number;
}

/**
 * Use a CLIP based neural network for natural language search.
 *
 * [Note: CLIP based magic search]
 *
 * CLIP (Contrastive Language-Image Pretraining) is a neural network trained on
 * (image, text) pairs. It can be thought of as two separate (but jointly
 * trained) encoders - one for images, and one for text - that both map to the
 * same embedding space.
 *
 * We use this for natural language search within the app (aka "magic search"):
 *
 * 1. Pre-compute an embedding for each image.
 *
 * 2. When the user searches, compute an embedding for the search term.
 *
 * 3. Use cosine similarity to find the image (embedding) closest to the text
 *    (embedding).
 *
 * More details are in the blog post that describes the initial launch of this
 * feature using the GGML runtime:
 * https://ente.io/blog/image-search-with-clip-ggml/
 *
 * Since the initial launch, we've added support for another runtime, ONNX.
 *
 * Note that we don't train the neural network - we use one of the publicly
 * available pre-trained neural networks (which are wholly defined by their
 * connectivity and weights), and use one of the standard ML runtimes to load
 * these weights and instantiate a running network that we can use to compute
 * the embeddings. Theoretically, the same CLIP model can be loaded by different
 * frameworks / runtimes, but in practice each runtime has its own preferred
 * format, and there are also quantization tradeoffs. So for each runtime that
 * we support we download a distinct model (binary encoding of weights).
 *
 * Currently supported runtimes are:
 *
 * - [GGML](https://github.com/monatis/clip.cpp)
 * - [ONNX](https://onnx.ai)
 *
 * Both these currently have one (and only one) associated model.
 */
class ClipServiceImpl {
    private embeddingExtractionInProgress: AbortController | null = null;
    private reRunNeeded = false;
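The doc comment above describes the whole search flow. As a rough, hedged sketch of its step 3 (ranking stored image embeddings by cosine similarity against a query embedding), not code from this commit, the following TypeScript could sit on top of the pre-computed embeddings; the ImageEmbedding shape, the search helper, and the 0.23 threshold are illustrative assumptions.

// Sketch of step 3 of "magic search": rank pre-computed image embeddings by
// cosine similarity to the query (text) embedding. The ImageEmbedding shape,
// the `search` helper, and the threshold are assumptions for illustration.
interface ImageEmbedding {
    fileID: number;
    embedding: Float32Array;
}

// Cosine similarity of two equal-length vectors.
const cosineSimilarity = (a: Float32Array, b: Float32Array): number => {
    let dot = 0,
        normA = 0,
        normB = 0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
};

// File IDs of images scoring above the threshold, best matches first.
const search = (
    textEmbedding: Float32Array,
    imageEmbeddings: ImageEmbedding[],
    threshold = 0.23,
): number[] =>
    imageEmbeddings
        .map(({ fileID, embedding }) => ({
            fileID,
            score: cosineSimilarity(textEmbedding, embedding),
        }))
        .filter(({ score }) => score > threshold)
        .sort((a, b) => b.score - a.score)
        .map(({ fileID }) => fileID);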
@@ -10,6 +10,10 @@ export interface AppUpdateInfo {
    version: string;
}

export type CLIPModel = "ggml-clip" | "onnx-clip";

export const isCLIPModel = (s: unknown) => s == "ggml-clip" || s == "onnx-clip";

export enum Model {
    GGML_CLIP = "ggml-clip",
    ONNX_CLIP = "onnx-clip",
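As a hedged aside (not part of this diff), the enum members use the same string literals as the CLIPModel union, so they pass the isCLIPModel check at runtime; the persistedValue name below is hypothetical.

// Illustration only: Model members are the same string literals as CLIPModel,
// so they satisfy isCLIPModel at runtime.
const m: Model = Model.ONNX_CLIP;
console.log(isCLIPModel(m)); // true, since Model.ONNX_CLIP === "onnx-clip"

// Validating an untyped value (e.g. one read from disk) before use. The cast
// is needed because isCLIPModel, as written, is not a type predicate.
const persistedValue: unknown = "ggml-clip";
const model: CLIPModel | undefined = isCLIPModel(persistedValue)
    ? (persistedValue as CLIPModel)
    : undefined;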
@@ -147,9 +151,19 @@ export interface Electron {

    // - ML

    /**
     * Compute and return a CLIP embedding of the given image.
     *
     * See: [Note: CLIP based magic search]
     *
     * @param model The CLIP model and ML runtime combination to use.
     * @param jpegImageData The raw bytes of the image encoded as a JPEG.
     *
     * @returns A CLIP embedding.
     */
    computeImageEmbedding: (
        model: Model,
        imageData: Uint8Array,
        jpegImageData: Uint8Array,
    ) => Promise<Float32Array>;

    computeTextEmbedding: (model: Model, text: string) => Promise<Float32Array>;
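To show how these two bridge methods fit the flow described in the first hunk's comment, here is a hedged renderer-side sketch; the electron global, the helper names, the choice of Model.ONNX_CLIP, and the assumption that the updated computeImageEmbedding takes (model, jpegImageData) are illustrative, not part of this commit.

// Hypothetical renderer-side usage of the bridge methods above.
declare const electron: Electron;

// Indexing: compute an embedding from an image's JPEG bytes.
const indexImage = (jpegImageData: Uint8Array): Promise<Float32Array> =>
    electron.computeImageEmbedding(Model.ONNX_CLIP, jpegImageData);

// Searching: embed the query text with the same model, then compare the
// result against the stored image embeddings (e.g. with cosine similarity,
// as sketched earlier).
const embedQuery = (query: string): Promise<Float32Array> =>
    electron.computeTextEmbedding(Model.ONNX_CLIP, query);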