Browse Source

Support for clustering in buckets

laurenspriem 1 year ago
parent
commit
b1b3bcc534

+ 8 - 8
mobile/lib/face/db.dart

@@ -343,15 +343,13 @@ class FaceMLDataDB {
   Future<Map<String, (int?, Uint8List)>> getFaceEmbeddingMap({
   Future<Map<String, (int?, Uint8List)>> getFaceEmbeddingMap({
     double minScore = kMinHighQualityFaceScore,
     double minScore = kMinHighQualityFaceScore,
     int minClarity = kLaplacianThreshold,
     int minClarity = kLaplacianThreshold,
-    int maxRows = 20000,
+    int maxFaces = 20000,
+    int offset = 0,
+    int batchSize = 10000,
   }) async {
   }) async {
     _logger.info('reading as float');
     _logger.info('reading as float');
     final db = await instance.database;
     final db = await instance.database;
 
 
-    // Define the batch size
-    const batchSize = 10000;
-    int offset = 0;
-
     final Map<String, (int?, Uint8List)> result = {};
     final Map<String, (int?, Uint8List)> result = {};
     while (true) {
     while (true) {
       // Query a batch of rows
       // Query a batch of rows
@@ -373,7 +371,7 @@ class FaceMLDataDB {
         result[faceID] =
         result[faceID] =
             (map[faceClusterId] as int?, map[faceEmbeddingBlob] as Uint8List);
             (map[faceClusterId] as int?, map[faceEmbeddingBlob] as Uint8List);
       }
       }
-      if (result.length >= 20000) {
+      if (result.length >= maxFaces) {
         break;
         break;
       }
       }
       offset += batchSize;
       offset += batchSize;
@@ -419,10 +417,12 @@ class FaceMLDataDB {
     return result;
     return result;
   }
   }
 
 
-  Future<int> getTotalFaceCount() async {
+  Future<int> getTotalFaceCount({
+    double minFaceScore = kMinHighQualityFaceScore,
+  }) async {
     final db = await instance.database;
     final db = await instance.database;
     final List<Map<String, dynamic>> maps = await db.rawQuery(
     final List<Map<String, dynamic>> maps = await db.rawQuery(
-      'SELECT COUNT(*) as count FROM $facesTable WHERE $faceScore > $kMinHighQualityFaceScore AND $faceBlur > $kLaplacianThreshold',
+      'SELECT COUNT(*) as count FROM $facesTable WHERE $faceScore > $minFaceScore AND $faceBlur > $kLaplacianThreshold',
     );
     );
     return maps.first['count'] as int;
     return maps.first['count'] as int;
   }
   }

+ 88 - 36
mobile/lib/services/machine_learning/face_ml/face_ml_service.dart

@@ -365,48 +365,98 @@ class FaceMlService {
 
 
   Future<void> clusterAllImages({
   Future<void> clusterAllImages({
     double minFaceScore = kMinHighQualityFaceScore,
     double minFaceScore = kMinHighQualityFaceScore,
+    bool clusterInBuckets = false,
   }) async {
   }) async {
     _logger.info("`clusterAllImages()` called");
     _logger.info("`clusterAllImages()` called");
 
 
     try {
     try {
-      // Read all the embeddings from the database, in a map from faceID to embedding
-      final clusterStartTime = DateTime.now();
-      final faceIdToEmbedding = await FaceMLDataDB.instance.getFaceEmbeddingMap(
-        minScore: minFaceScore,
-      );
-      final gotFaceEmbeddingsTime = DateTime.now();
-      _logger.info(
-        'read embeddings ${faceIdToEmbedding.length} in ${gotFaceEmbeddingsTime.difference(clusterStartTime).inMilliseconds} ms',
-      );
+      if (clusterInBuckets) {
+        // Get a sense of the total number of faces in the database
+        final int totalFaces = await FaceMLDataDB.instance
+            .getTotalFaceCount(minFaceScore: minFaceScore);
+
+        // Read the creation times from Files DB, in a map from fileID to creation time
+        final fileIDToCreationTime =
+            await FilesDB.instance.getFileIDToCreationTime();
+
+        const int bucketSize = 10000;
+        const int offsetIncrement = 7500;
+        const int batchSize = 5000;
+        int offset = 0;
+
+        while (true) {
+          final faceIdToEmbeddingBucket =
+              await FaceMLDataDB.instance.getFaceEmbeddingMap(
+            minScore: minFaceScore,
+            maxFaces: bucketSize,
+            offset: offset,
+            batchSize: batchSize,
+          );
+          if (faceIdToEmbeddingBucket.isEmpty) {
+            break;
+          }
+          if (offset > totalFaces) {
+            _logger.warning(
+              'offset > totalFaces, this should ideally not happen. offset: $offset, totalFaces: $totalFaces',
+            );
+            break;
+          }
 
 
-      // Read the creation times from Files DB, in a map from fileID to creation time
-      final fileIDToCreationTime =
-          await FilesDB.instance.getFileIDToCreationTime();
-      _logger.info('read creation times from FilesDB in '
-          '${DateTime.now().difference(gotFaceEmbeddingsTime).inMilliseconds} ms');
+          final faceIdToCluster = await FaceLinearClustering.instance.predict(
+            faceIdToEmbeddingBucket,
+            fileIDToCreationTime: fileIDToCreationTime,
+          );
+          if (faceIdToCluster == null) {
+            _logger.warning("faceIdToCluster is null");
+            return;
+          }
 
 
-      // Cluster the embeddings using the linear clustering algorithm, returning a map from faceID to clusterID
-      final faceIdToCluster = await FaceLinearClustering.instance.predict(
-        faceIdToEmbedding,
-        fileIDToCreationTime: fileIDToCreationTime,
-      );
-      if (faceIdToCluster == null) {
-        _logger.warning("faceIdToCluster is null");
-        return;
-      }
-      final clusterDoneTime = DateTime.now();
-      _logger.info(
-        'done with clustering ${faceIdToEmbedding.length} in ${clusterDoneTime.difference(clusterStartTime).inSeconds} seconds ',
-      );
+          await FaceMLDataDB.instance
+              .updatePersonIDForFaceIDIFNotSet(faceIdToCluster);
 
 
-      // Store the updated clusterIDs in the database
-      _logger.info(
-        'Updating ${faceIdToCluster.length} FaceIDs with clusterIDs in the DB',
-      );
-      await FaceMLDataDB.instance
-          .updatePersonIDForFaceIDIFNotSet(faceIdToCluster);
-      _logger.info('Done updating FaceIDs with clusterIDs in the DB, in '
-          '${DateTime.now().difference(clusterDoneTime).inSeconds} seconds');
+          offset += offsetIncrement;
+        }
+      } else {
+        // Read all the embeddings from the database, in a map from faceID to embedding
+        final clusterStartTime = DateTime.now();
+        final faceIdToEmbedding =
+            await FaceMLDataDB.instance.getFaceEmbeddingMap(
+          minScore: minFaceScore,
+        );
+        final gotFaceEmbeddingsTime = DateTime.now();
+        _logger.info(
+          'read embeddings ${faceIdToEmbedding.length} in ${gotFaceEmbeddingsTime.difference(clusterStartTime).inMilliseconds} ms',
+        );
+
+        // Read the creation times from Files DB, in a map from fileID to creation time
+        final fileIDToCreationTime =
+            await FilesDB.instance.getFileIDToCreationTime();
+        _logger.info('read creation times from FilesDB in '
+            '${DateTime.now().difference(gotFaceEmbeddingsTime).inMilliseconds} ms');
+
+        // Cluster the embeddings using the linear clustering algorithm, returning a map from faceID to clusterID
+        final faceIdToCluster = await FaceLinearClustering.instance.predict(
+          faceIdToEmbedding,
+          fileIDToCreationTime: fileIDToCreationTime,
+        );
+        if (faceIdToCluster == null) {
+          _logger.warning("faceIdToCluster is null");
+          return;
+        }
+        final clusterDoneTime = DateTime.now();
+        _logger.info(
+          'done with clustering ${faceIdToEmbedding.length} in ${clusterDoneTime.difference(clusterStartTime).inSeconds} seconds ',
+        );
+
+        // Store the updated clusterIDs in the database
+        _logger.info(
+          'Updating ${faceIdToCluster.length} FaceIDs with clusterIDs in the DB',
+        );
+        await FaceMLDataDB.instance
+            .updatePersonIDForFaceIDIFNotSet(faceIdToCluster);
+        _logger.info('Done updating FaceIDs with clusterIDs in the DB, in '
+            '${DateTime.now().difference(clusterDoneTime).inSeconds} seconds');
+      }
     } catch (e, s) {
     } catch (e, s) {
       _logger.severe("`clusterAllImages` failed", e, s);
       _logger.severe("`clusterAllImages` failed", e, s);
     }
     }
@@ -522,7 +572,9 @@ class FaceMlService {
           _logger.info(
           _logger.info(
             "indexAllImages() analyzed $fileAnalyzedCount images, cooldown for 1 minute",
             "indexAllImages() analyzed $fileAnalyzedCount images, cooldown for 1 minute",
           );
           );
-          await Future.delayed(const Duration(minutes: 1));
+          await Future.delayed(const Duration(minutes: 1), () {
+            _logger.info("indexAllImages() cooldown finished");
+          });
         }
         }
       }
       }