Manav Rathi 2024-05-18 09:13:34 +05:30
parent e224ad19d3
commit ae70eb33dd
No known key found for this signature in database
2 changed files with 262 additions and 264 deletions


@@ -1,262 +0,0 @@
import { workerBridge } from "@/next/worker/worker-bridge";
import { euclidean } from "hdbscan";
import { Box, Dimensions, Point, newBox } from "services/face/geom";
import { FaceDetection } from "services/face/types";
import { clamp, getPixelBilinear, normalizePixelBetween0And1 } from "./image";
import { transformFaceDetections } from "./transform-box";
/**
* Detect faces in the given {@link imageBitmap}.
*
* The model used is YOLO, running in an ONNX runtime.
*/
export const detectFaces = async (
imageBitmap: ImageBitmap,
): Promise<Array<FaceDetection>> => {
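// Detections whose centers are within √2 percent (≈ 1.4%) of the image
// width of each other are treated as duplicates of the same face.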
const maxFaceDistancePercent = Math.sqrt(2) / 100;
const maxFaceDistance = imageBitmap.width * maxFaceDistancePercent;
const preprocessResult = preprocessImageBitmapToFloat32ChannelsFirst(
imageBitmap,
640,
640,
);
const data = preprocessResult.data;
const resized = preprocessResult.newSize;
const outputData = await workerBridge.detectFaces(data);
const faces = getFacesFromYOLOOutput(outputData as Float32Array, 0.7);
const inBox = newBox(0, 0, resized.width, resized.height);
const toBox = newBox(0, 0, imageBitmap.width, imageBitmap.height);
const faceDetections = transformFaceDetections(faces, inBox, toBox);
return removeDuplicateDetections(faceDetections, maxFaceDistance);
};
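// A minimal usage sketch (assumes a browser context where
// `createImageBitmap` is available and the ONNX worker bridge has been
// initialized; `imageBlob` is a hypothetical input):
//
//     const bitmap = await createImageBitmap(imageBlob);
//     const detections = await detectFaces(bitmap);
//     for (const { box, probability } of detections)
//         console.log(`face at (${box.x}, ${box.y}), score ${probability}`);
/**
* Convert {@link imageBitmap} into a Float32Array of shape
* [1, 3, requiredHeight, requiredWidth] ("channels first"), resizing it with
* bilinear sampling and padding the remainder as needed.
*/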
const preprocessImageBitmapToFloat32ChannelsFirst = (
imageBitmap: ImageBitmap,
requiredWidth: number,
requiredHeight: number,
maintainAspectRatio: boolean = true,
normFunction: (pixelValue: number) => number = normalizePixelBetween0And1,
) => {
// Create an OffscreenCanvas and set its size.
const offscreenCanvas = new OffscreenCanvas(
imageBitmap.width,
imageBitmap.height,
);
const ctx = offscreenCanvas.getContext("2d");
if (!ctx) throw new Error("Could not obtain a 2D canvas rendering context");
ctx.drawImage(imageBitmap, 0, 0, imageBitmap.width, imageBitmap.height);
const imageData = ctx.getImageData(
0,
0,
imageBitmap.width,
imageBitmap.height,
);
const pixelData = imageData.data;
let scaleW = requiredWidth / imageBitmap.width;
let scaleH = requiredHeight / imageBitmap.height;
if (maintainAspectRatio) {
const scale = Math.min(
requiredWidth / imageBitmap.width,
requiredHeight / imageBitmap.height,
);
scaleW = scale;
scaleH = scale;
}
const scaledWidth = clamp(
Math.round(imageBitmap.width * scaleW),
0,
requiredWidth,
);
const scaledHeight = clamp(
Math.round(imageBitmap.height * scaleH),
0,
requiredHeight,
);
const processedImage = new Float32Array(
1 * 3 * requiredWidth * requiredHeight,
);
// Populate the Float32Array with normalized pixel values
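// The result is laid out "channels first": all red values, then all green
// values, then all blue values.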
let pixelIndex = 0;
const channelOffsetGreen = requiredHeight * requiredWidth;
const channelOffsetBlue = 2 * requiredHeight * requiredWidth;
for (let h = 0; h < requiredHeight; h++) {
for (let w = 0; w < requiredWidth; w++) {
let pixel: {
r: number;
g: number;
b: number;
};
if (w >= scaledWidth || h >= scaledHeight) {
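// Pad the area outside the aspect-ratio-preserving resize with the
// neutral gray (114, 114, 114) commonly used for YOLO letterboxing.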
pixel = { r: 114, g: 114, b: 114 };
} else {
pixel = getPixelBilinear(
w / scaleW,
h / scaleH,
pixelData,
imageBitmap.width,
imageBitmap.height,
);
}
processedImage[pixelIndex] = normFunction(pixel.r);
processedImage[pixelIndex + channelOffsetGreen] = normFunction(
pixel.g,
);
processedImage[pixelIndex + channelOffsetBlue] = normFunction(
pixel.b,
);
pixelIndex++;
}
}
return {
data: processedImage,
originalSize: {
width: imageBitmap.width,
height: imageBitmap.height,
},
newSize: { width: scaledWidth, height: scaledHeight },
};
};
/**
* @param rowOutput A flat Float32Array of shape [25200, 16], where each of
* the 25200 rows represents a candidate bounding box.
*/
const getFacesFromYOLOOutput = (
rowOutput: Float32Array,
minScore: number,
): Array<FaceDetection> => {
const faces: Array<FaceDetection> = [];
// Iterate over each row.
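// Each row holds 16 values: 4 for the box (xCenter, yCenter, width,
// height), 1 for the detection score, 10 for the x and y coordinates of
// the 5 facial landmarks, and a final value that is not used here.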
for (let i = 0; i < rowOutput.length; i += 16) {
const score = rowOutput[i + 4];
if (score < minScore) {
continue;
}
// The first 4 values represent the bounding box's center and size:
//
// (xCenter, yCenter, width, height)
//
const xCenter = rowOutput[i];
const yCenter = rowOutput[i + 1];
const width = rowOutput[i + 2];
const height = rowOutput[i + 3];
const xMin = xCenter - width / 2.0; // topLeft
const yMin = yCenter - height / 2.0; // topLeft
const leftEyeX = rowOutput[i + 5];
const leftEyeY = rowOutput[i + 6];
const rightEyeX = rowOutput[i + 7];
const rightEyeY = rowOutput[i + 8];
const noseX = rowOutput[i + 9];
const noseY = rowOutput[i + 10];
const leftMouthX = rowOutput[i + 11];
const leftMouthY = rowOutput[i + 12];
const rightMouthX = rowOutput[i + 13];
const rightMouthY = rowOutput[i + 14];
const box = new Box({
x: xMin,
y: yMin,
width: width,
height: height,
});
const probability = score as number;
const landmarks = [
new Point(leftEyeX, leftEyeY),
new Point(rightEyeX, rightEyeY),
new Point(noseX, noseY),
new Point(leftMouthX, leftMouthY),
new Point(rightMouthX, rightMouthY),
];
faces.push({ box, landmarks, probability });
}
return faces;
};
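/**
* Convert an absolute face detection into one relative to the given image
* {@link dimensions}. For example, for a 1000×500 px image, a 100×100 px
* box at (50, 100) becomes a 0.1×0.2 box at (0.05, 0.2).
*/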
export const getRelativeDetection = (
faceDetection: FaceDetection,
dimensions: Dimensions,
): FaceDetection => {
const oldBox: Box = faceDetection.box;
const box = new Box({
x: oldBox.x / dimensions.width,
y: oldBox.y / dimensions.height,
width: oldBox.width / dimensions.width,
height: oldBox.height / dimensions.height,
});
const oldLandmarks: Point[] = faceDetection.landmarks;
const landmarks = oldLandmarks.map((l) => {
return new Point(l.x / dimensions.width, l.y / dimensions.height);
});
const probability = faceDetection.probability;
return { box, landmarks, probability };
};
/**
* Removes duplicate face detections from an array of detections.
*
* This function sorts the detections by their probability in descending
* order, then iterates over them. For each detection, it computes the
* Euclidean distance between its center and the centers of the detections
* that follow it; any detection whose center lies within the specified
* threshold (`withinDistance`) of an already retained detection is
* considered a duplicate and is removed.
*
* @param detections - An array of face detections to remove duplicates from.
*
* @param withinDistance - The maximum Euclidean distance between the centers
* of two detections for them to be considered duplicates.
*
* @returns An array of face detections with duplicates removed.
*/
const removeDuplicateDetections = (
detections: Array<FaceDetection>,
withinDistance: number,
) => {
detections.sort((a, b) => b.probability - a.probability);
const isSelected = new Map<number, boolean>();
for (let i = 0; i < detections.length; i++) {
if (isSelected.get(i) === false) {
continue;
}
isSelected.set(i, true);
for (let j = i + 1; j < detections.length; j++) {
if (isSelected.get(j) === false) {
continue;
}
const centeri = getDetectionCenter(detections[i]);
const centerj = getDetectionCenter(detections[j]);
const dist = euclidean(
[centeri.x, centeri.y],
[centerj.x, centerj.y],
);
if (dist <= withinDistance) {
isSelected.set(j, false);
}
}
}
const uniques: Array<FaceDetection> = [];
for (let i = 0; i < detections.length; i++) {
if (isSelected.get(i)) uniques.push(detections[i]);
}
return uniques;
};
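// For example, removeDuplicateDetections(detections, 50) collapses
// detections whose centers lie within 50 pixels of each other, keeping
// the one with the highest probability.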
function getDetectionCenter(detection: FaceDetection) {
const center = new Point(0, 0);
// TODO: Taking the first 4 landmarks is applicable to BlazeFace only;
// this needs to consider the eye, nose and mouth landmarks to compute
// the center.
detection.landmarks?.slice(0, 4).forEach((p) => {
center.x += p.x;
center.y += p.y;
});
return new Point(center.x / 4, center.y / 4);
}


@@ -1,10 +1,10 @@
import { openCache } from "@/next/blob-cache";
import log from "@/next/log";
import { workerBridge } from "@/next/worker/worker-bridge";
import { euclidean } from "hdbscan";
import { Matrix } from "ml-matrix";
import mlIDbStorage from "services/face/db";
import { detectFaces, getRelativeDetection } from "services/face/detect";
import { Box, Point, enlargeBox } from "services/face/geom";
import { Box, Dimensions, Point, enlargeBox, newBox } from "services/face/geom";
import {
DetectedFace,
Face,
@@ -19,15 +19,19 @@ import { defaultMLVersion } from "services/machineLearning/machineLearningService";
import { getSimilarityTransformation } from "similarity-transformation";
import type { EnteFile } from "types/file";
import {
clamp,
createGrayscaleIntMatrixFromNormalized2List,
cropWithRotation,
fetchImageBitmap,
fetchImageBitmapForContext,
getFaceId,
getLocalFile,
getPixelBilinear,
imageBitmapToBlob,
normalizePixelBetween0And1,
warpAffineFloat32List,
} from "./image";
import { transformFaceDetections } from "./transform-box";
/**
* Index faces in the given file.
@@ -112,6 +116,262 @@ const syncFileFaceDetections = async (fileContext: MLSyncFileContext) => {
log.info("[MLService] Detected Faces: ", newMlFile.faces?.length);
};
/**
* Detect faces in the given {@link imageBitmap}.
*
* The model used is YOLO, running in an ONNX runtime.
*/
export const detectFaces = async (
imageBitmap: ImageBitmap,
): Promise<Array<FaceDetection>> => {
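// Detections whose centers are within √2 percent (≈ 1.4%) of the image
// width of each other are treated as duplicates of the same face.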
const maxFaceDistancePercent = Math.sqrt(2) / 100;
const maxFaceDistance = imageBitmap.width * maxFaceDistancePercent;
const preprocessResult = preprocessImageBitmapToFloat32ChannelsFirst(
imageBitmap,
640,
640,
);
const data = preprocessResult.data;
const resized = preprocessResult.newSize;
const outputData = await workerBridge.detectFaces(data);
const faces = getFacesFromYOLOOutput(outputData as Float32Array, 0.7);
const inBox = newBox(0, 0, resized.width, resized.height);
const toBox = newBox(0, 0, imageBitmap.width, imageBitmap.height);
const faceDetections = transformFaceDetections(faces, inBox, toBox);
return removeDuplicateDetections(faceDetections, maxFaceDistance);
};
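// A minimal usage sketch (assumes a browser context where
// `createImageBitmap` is available and the ONNX worker bridge has been
// initialized; `imageBlob` is a hypothetical input):
//
//     const bitmap = await createImageBitmap(imageBlob);
//     const detections = await detectFaces(bitmap);
//     for (const { box, probability } of detections)
//         console.log(`face at (${box.x}, ${box.y}), score ${probability}`);
/**
* Convert {@link imageBitmap} into a Float32Array of shape
* [1, 3, requiredHeight, requiredWidth] ("channels first"), resizing it with
* bilinear sampling and padding the remainder as needed.
*/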
const preprocessImageBitmapToFloat32ChannelsFirst = (
imageBitmap: ImageBitmap,
requiredWidth: number,
requiredHeight: number,
maintainAspectRatio: boolean = true,
normFunction: (pixelValue: number) => number = normalizePixelBetween0And1,
) => {
// Create an OffscreenCanvas and set its size.
const offscreenCanvas = new OffscreenCanvas(
imageBitmap.width,
imageBitmap.height,
);
const ctx = offscreenCanvas.getContext("2d");
if (!ctx) throw new Error("Could not obtain a 2D canvas rendering context");
ctx.drawImage(imageBitmap, 0, 0, imageBitmap.width, imageBitmap.height);
const imageData = ctx.getImageData(
0,
0,
imageBitmap.width,
imageBitmap.height,
);
const pixelData = imageData.data;
let scaleW = requiredWidth / imageBitmap.width;
let scaleH = requiredHeight / imageBitmap.height;
if (maintainAspectRatio) {
const scale = Math.min(
requiredWidth / imageBitmap.width,
requiredHeight / imageBitmap.height,
);
scaleW = scale;
scaleH = scale;
}
const scaledWidth = clamp(
Math.round(imageBitmap.width * scaleW),
0,
requiredWidth,
);
const scaledHeight = clamp(
Math.round(imageBitmap.height * scaleH),
0,
requiredHeight,
);
const processedImage = new Float32Array(
1 * 3 * requiredWidth * requiredHeight,
);
// Populate the Float32Array with normalized pixel values
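// The result is laid out "channels first": all red values, then all green
// values, then all blue values.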
let pixelIndex = 0;
const channelOffsetGreen = requiredHeight * requiredWidth;
const channelOffsetBlue = 2 * requiredHeight * requiredWidth;
for (let h = 0; h < requiredHeight; h++) {
for (let w = 0; w < requiredWidth; w++) {
let pixel: {
r: number;
g: number;
b: number;
};
if (w >= scaledWidth || h >= scaledHeight) {
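// Pad the area outside the aspect-ratio-preserving resize with the
// neutral gray (114, 114, 114) commonly used for YOLO letterboxing.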
pixel = { r: 114, g: 114, b: 114 };
} else {
pixel = getPixelBilinear(
w / scaleW,
h / scaleH,
pixelData,
imageBitmap.width,
imageBitmap.height,
);
}
processedImage[pixelIndex] = normFunction(pixel.r);
processedImage[pixelIndex + channelOffsetGreen] = normFunction(
pixel.g,
);
processedImage[pixelIndex + channelOffsetBlue] = normFunction(
pixel.b,
);
pixelIndex++;
}
}
return {
data: processedImage,
originalSize: {
width: imageBitmap.width,
height: imageBitmap.height,
},
newSize: { width: scaledWidth, height: scaledHeight },
};
};
/**
* @param rowOutput A flat Float32Array of shape [25200, 16], where each of
* the 25200 rows represents a candidate bounding box.
*/
const getFacesFromYOLOOutput = (
rowOutput: Float32Array,
minScore: number,
): Array<FaceDetection> => {
const faces: Array<FaceDetection> = [];
// Iterate over each row.
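// Each row holds 16 values: 4 for the box (xCenter, yCenter, width,
// height), 1 for the detection score, 10 for the x and y coordinates of
// the 5 facial landmarks, and a final value that is not used here.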
for (let i = 0; i < rowOutput.length; i += 16) {
const score = rowOutput[i + 4];
if (score < minScore) {
continue;
}
// The first 4 values represent the bounding box's center and size:
//
// (xCenter, yCenter, width, height)
//
const xCenter = rowOutput[i];
const yCenter = rowOutput[i + 1];
const width = rowOutput[i + 2];
const height = rowOutput[i + 3];
const xMin = xCenter - width / 2.0; // topLeft
const yMin = yCenter - height / 2.0; // topLeft
const leftEyeX = rowOutput[i + 5];
const leftEyeY = rowOutput[i + 6];
const rightEyeX = rowOutput[i + 7];
const rightEyeY = rowOutput[i + 8];
const noseX = rowOutput[i + 9];
const noseY = rowOutput[i + 10];
const leftMouthX = rowOutput[i + 11];
const leftMouthY = rowOutput[i + 12];
const rightMouthX = rowOutput[i + 13];
const rightMouthY = rowOutput[i + 14];
const box = new Box({
x: xMin,
y: yMin,
width: width,
height: height,
});
const probability = score as number;
const landmarks = [
new Point(leftEyeX, leftEyeY),
new Point(rightEyeX, rightEyeY),
new Point(noseX, noseY),
new Point(leftMouthX, leftMouthY),
new Point(rightMouthX, rightMouthY),
];
faces.push({ box, landmarks, probability });
}
return faces;
};
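/**
* Convert an absolute face detection into one relative to the given image
* {@link dimensions}. For example, for a 1000×500 px image, a 100×100 px
* box at (50, 100) becomes a 0.1×0.2 box at (0.05, 0.2).
*/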
export const getRelativeDetection = (
faceDetection: FaceDetection,
dimensions: Dimensions,
): FaceDetection => {
const oldBox: Box = faceDetection.box;
const box = new Box({
x: oldBox.x / dimensions.width,
y: oldBox.y / dimensions.height,
width: oldBox.width / dimensions.width,
height: oldBox.height / dimensions.height,
});
const oldLandmarks: Point[] = faceDetection.landmarks;
const landmarks = oldLandmarks.map((l) => {
return new Point(l.x / dimensions.width, l.y / dimensions.height);
});
const probability = faceDetection.probability;
return { box, landmarks, probability };
};
/**
* Removes duplicate face detections from an array of detections.
*
* This function sorts the detections by their probability in descending
* order, then iterates over them. For each detection, it computes the
* Euclidean distance between its center and the centers of the detections
* that follow it; any detection whose center lies within the specified
* threshold (`withinDistance`) of an already retained detection is
* considered a duplicate and is removed.
*
* @param detections - An array of face detections to remove duplicates from.
*
* @param withinDistance - The maximum Euclidean distance between the centers
* of two detections for them to be considered duplicates.
*
* @returns An array of face detections with duplicates removed.
*/
const removeDuplicateDetections = (
detections: Array<FaceDetection>,
withinDistance: number,
) => {
detections.sort((a, b) => b.probability - a.probability);
const isSelected = new Map<number, boolean>();
for (let i = 0; i < detections.length; i++) {
if (isSelected.get(i) === false) {
continue;
}
isSelected.set(i, true);
for (let j = i + 1; j < detections.length; j++) {
if (isSelected.get(j) === false) {
continue;
}
const centeri = getDetectionCenter(detections[i]);
const centerj = getDetectionCenter(detections[j]);
const dist = euclidean(
[centeri.x, centeri.y],
[centerj.x, centerj.y],
);
if (dist <= withinDistance) {
isSelected.set(j, false);
}
}
}
const uniques: Array<FaceDetection> = [];
for (let i = 0; i < detections.length; i++) {
if (isSelected.get(i)) uniques.push(detections[i]);
}
return uniques;
};
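// For example, removeDuplicateDetections(detections, 50) collapses
// detections whose centers lie within 50 pixels of each other, keeping
// the one with the highest probability.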
function getDetectionCenter(detection: FaceDetection) {
const center = new Point(0, 0);
// TODO: Taking the first 4 landmarks is applicable to BlazeFace only;
// this needs to consider the eye, nose and mouth landmarks to compute
// the center.
detection.landmarks?.slice(0, 4).forEach((p) => {
center.x += p.x;
center.y += p.y;
});
return new Point(center.x / 4, center.y / 4);
}
const syncFileFaceCrops = async (fileContext: MLSyncFileContext) => {
const { newMlFile } = fileContext;
const imageBitmap = await fetchImageBitmapForContext(fileContext);