Document

This commit is contained in:
parent 4f76cfb912
commit 544e6be3fd

2 changed files with 56 additions and 1 deletion
@@ -21,6 +21,47 @@ export interface ClipExtractionStatus {
    indexed: number;
}

/**
 * Use a CLIP based neural network for natural language search.
 *
 * [Note: CLIP based magic search]
 *
 * CLIP (Contrastive Language-Image Pretraining) is a neural network trained on
 * (image, text) pairs. It can be thought of as two separate (but jointly
 * trained) encoders - one for images, and one for text - that both map to the
 * same embedding space.
 *
 * We use this for natural language search within the app (aka "magic search"):
 *
 * 1. Pre-compute an embedding for each image.
 *
 * 2. When the user searches, compute an embedding for the search term.
 *
 * 3. Use cosine similarity to find the image (embedding) closest to the text
 *    (embedding).
 *
 * More details are in the blog post that describes the initial launch of this
 * feature using the GGML runtime:
 * https://ente.io/blog/image-search-with-clip-ggml/
 *
 * Since the initial launch, we've added support for another runtime, ONNX.
 *
 * Note that we don't train the neural network - we use one of the publicly
 * available pre-trained neural networks (which are wholly defined by their
 * connectivity and weights), and use one of the standard ML runtimes to load
 * these weights and instantiate a running network that we can use to compute
 * the embeddings. Theoretically, the same CLIP model can be loaded by different
 * frameworks / runtimes, but in practice each runtime has its own preferred
 * format, and there are also quantization tradeoffs. So for each runtime that
 * we support we download a distinct model (binary encoding of weights).
 *
 * Currently supported runtimes are:
 *
 * - [GGML](https://github.com/monatis/clip.cpp)
 * - [ONNX](https://onnx.ai)
 *
 * Both these currently have one (and only one) associated model.
 */
class ClipServiceImpl {
    private embeddingExtractionInProgress: AbortController | null = null;
    private reRunNeeded = false;
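The doc comment above describes the whole search flow. As a rough, hedged sketch of its step 3 (ranking stored image embeddings by cosine similarity against a query embedding), not code from this commit, the following TypeScript could sit on top of the pre-computed embeddings; the ImageEmbedding shape, the search helper, and the 0.23 threshold are illustrative assumptions.

// Sketch of step 3 of "magic search": rank pre-computed image embeddings by
// cosine similarity to the query (text) embedding. The ImageEmbedding shape,
// the `search` helper, and the threshold are assumptions for illustration.
interface ImageEmbedding {
    fileID: number;
    embedding: Float32Array;
}

// Cosine similarity of two equal-length vectors.
const cosineSimilarity = (a: Float32Array, b: Float32Array): number => {
    let dot = 0,
        normA = 0,
        normB = 0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
};

// File IDs of images scoring above the threshold, best matches first.
const search = (
    textEmbedding: Float32Array,
    imageEmbeddings: ImageEmbedding[],
    threshold = 0.23,
): number[] =>
    imageEmbeddings
        .map(({ fileID, embedding }) => ({
            fileID,
            score: cosineSimilarity(textEmbedding, embedding),
        }))
        .filter(({ score }) => score > threshold)
        .sort((a, b) => b.score - a.score)
        .map(({ fileID }) => fileID);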
@@ -10,6 +10,10 @@ export interface AppUpdateInfo {
    version: string;
}

export type CLIPModel = "ggml-clip" | "onnx-clip";

export const isCLIPModel = (s: unknown) => s == "ggml-clip" || s == "onnx-clip";

export enum Model {
    GGML_CLIP = "ggml-clip",
    ONNX_CLIP = "onnx-clip",
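As a hedged aside (not part of this diff), the enum members use the same string literals as the CLIPModel union, so they pass the isCLIPModel check at runtime; the persistedValue name below is hypothetical.

// Illustration only: Model members are the same string literals as CLIPModel,
// so they satisfy isCLIPModel at runtime.
const m: Model = Model.ONNX_CLIP;
console.log(isCLIPModel(m)); // true, since Model.ONNX_CLIP === "onnx-clip"

// Validating an untyped value (e.g. one read from disk) before use. The cast
// is needed because isCLIPModel, as written, is not a type predicate.
const persistedValue: unknown = "ggml-clip";
const model: CLIPModel | undefined = isCLIPModel(persistedValue)
    ? (persistedValue as CLIPModel)
    : undefined;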
@@ -147,9 +151,19 @@ export interface Electron {

    // - ML

    /**
     * Compute and return a CLIP embedding of the given image.
     *
     * See: [Note: CLIP based magic search]
     *
     * @param model The CLIP model and ML runtime combination to use.
     * @param jpegImageData The raw bytes of the image encoded as a JPEG.
     *
     * @returns A CLIP embedding.
     */
    computeImageEmbedding: (
        model: Model,
        imageData: Uint8Array,
        jpegImageData: Uint8Array,
    ) => Promise<Float32Array>;

    computeTextEmbedding: (model: Model, text: string) => Promise<Float32Array>;
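To show how these two bridge methods fit the flow described in the first hunk's comment, here is a hedged renderer-side sketch; the electron global, the helper names, the choice of Model.ONNX_CLIP, and the assumption that the updated computeImageEmbedding takes (model, jpegImageData) are illustrative, not part of this commit.

// Hypothetical renderer-side usage of the bridge methods above.
declare const electron: Electron;

// Indexing: compute an embedding from an image's JPEG bytes.
const indexImage = (jpegImageData: Uint8Array): Promise<Float32Array> =>
    electron.computeImageEmbedding(Model.ONNX_CLIP, jpegImageData);

// Searching: embed the query text with the same model, then compare the
// result against the stored image embeddings (e.g. with cosine similarity,
// as sketched earlier).
const embedQuery = (query: string): Promise<Float32Array> =>
    electron.computeTextEmbedding(Model.ONNX_CLIP, query);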