Manav Rathi 2024-04-09 15:38:47 +05:30
parent 4f76cfb912
commit 544e6be3fd
2 changed files with 56 additions and 1 deletion


@@ -21,6 +21,47 @@ export interface ClipExtractionStatus {
indexed: number;
}
/**
* Use a CLIP based neural network for natural language search.
*
* [Note: CLIP based magic search]
*
* CLIP (Contrastive Language-Image Pretraining) is a neural network trained on
* (image, text) pairs. It can be thought of as two separate (but jointly
* trained) encoders - one for images, and one for text - that both map to the
* same embedding space.
*
* We use this for natural language search within the app (aka "magic search"):
*
* 1. Pre-compute an embedding for each image.
*
* 2. When the user searches, compute an embedding for the search term.
*
* 3. Use cosine similarity to find the image (embedding) closest to the
*    text (embedding). A minimal sketch of this step follows this diff.
*
* More details are in the blog post that describes the initial launch of this
* feature using the GGML runtime:
* https://ente.io/blog/image-search-with-clip-ggml/
*
* Since the initial launch, we've added support for another runtime, ONNX.
*
* Note that we don't train the neural network - we use one of the publicly
* available pre-trained neural networks (which are wholly defined by their
* connectivity and weights), and use one of the standard ML runtimes to load
* these weights and instantiate a running network that we can use to compute
* the embeddings. Theoretically, the same CLIP model can be loaded by different
* frameworks / runtimes, but in practice each runtime has its own preferred
* format, and there are also quantization tradeoffs. So for each runtime that
* we support we download a distinct model (binary encoding of weights).
*
* Currently supported runtimes are:
*
* - [GGML](https://github.com/monatis/clip.cpp)
* - [ONNX](https://onnx.ai)
*
* Both of these currently have one (and only one) associated model.
*/
class ClipServiceImpl {
private embeddingExtractionInProgress: AbortController | null = null;
private reRunNeeded = false;

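The cosine-similarity ranking in step 3 is small enough to sketch. The helper below is illustrative only (it is not code from this commit); it assumes both embeddings have the same dimension, which holds because CLIP's image and text encoders map into the same embedding space.

/**
 * Illustrative sketch, not part of this commit: cosine similarity between
 * two embeddings of the same length.
 */
const cosineSimilarity = (a: Float32Array, b: Float32Array): number => {
    let dot = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    // Higher values mean the image is semantically closer to the text.
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
};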

@@ -10,6 +10,10 @@ export interface AppUpdateInfo {
version: string;
}
export type CLIPModel = "ggml-clip" | "onnx-clip";
export const isCLIPModel = (s: unknown): s is CLIPModel => s === "ggml-clip" || s === "onnx-clip";
export enum Model {
GGML_CLIP = "ggml-clip",
ONNX_CLIP = "onnx-clip",
@@ -147,9 +151,19 @@ export interface Electron {
// - ML
/**
* Compute and return a CLIP embedding of the given image.
*
* See: [Note: CLIP based magic search]
*
* @param model The CLIP model and ML runtime combination to use.
* @param jpegImageData The raw bytes of the image, encoded as a JPEG.
*
* @returns A CLIP embedding.
*/
computeImageEmbedding: (
model: Model,
imageData: Uint8Array,
jpegImageData: Uint8Array,
) => Promise<Float32Array>;
computeTextEmbedding: (model: Model, text: string) => Promise<Float32Array>;
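As an illustration of how these two methods fit together, a hypothetical caller could score a stored image against a search term as below. The clipScore helper and its electron parameter are assumptions made for this sketch, not part of the interface above; cosineSimilarity is the helper sketched after the first diff.

// Hypothetical call site (not part of this commit): score one image
// against a search term using the Electron interface above.
const clipScore = async (
    electron: Electron,
    jpegImageData: Uint8Array,
    searchTerm: string,
): Promise<number> => {
    const imageEmbedding = await electron.computeImageEmbedding(
        Model.ONNX_CLIP,
        jpegImageData,
    );
    const textEmbedding = await electron.computeTextEmbedding(
        Model.ONNX_CLIP,
        searchTerm,
    );
    return cosineSimilarity(imageEmbedding, textEmbedding);
};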