Deduplication: Refactor + Dedupe by hash and size (instead of just size) (#1172)

This commit is contained in:
Neeraj Gupta 2023-05-31 13:39:23 +05:30 committed by GitHub
commit e7e520f9f7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 80 deletions

View file

@ -10,7 +10,6 @@ import 'package:photos/models/ente_file.dart';
import 'package:photos/models/file_type.dart'; import 'package:photos/models/file_type.dart';
import 'package:photos/models/location/location.dart'; import 'package:photos/models/location/location.dart';
import "package:photos/models/metadata/file_magic.dart"; import "package:photos/models/metadata/file_magic.dart";
import 'package:photos/services/feature_flag_service.dart'; import 'package:photos/services/feature_flag_service.dart';
import 'package:photos/utils/date_time_util.dart'; import 'package:photos/utils/date_time_util.dart';
import 'package:photos/utils/exif_util.dart'; import 'package:photos/utils/exif_util.dart';
@ -159,11 +158,11 @@ class File extends EnteFile {
// handle past live photos upload from web client // handle past live photos upload from web client
if (hash == null && if (hash == null &&
fileType == FileType.livePhoto && fileType == FileType.livePhoto &&
metadata.containsKey('imgHash') && metadata.containsKey('imageHash') &&
metadata.containsKey('vidHash')) { metadata.containsKey('videoHash')) {
// convert to imgHash:vidHash // convert to imgHash:vidHash
hash = hash =
'${metadata['imgHash']}$kLivePhotoHashSeparator${metadata['vidHash']}'; '${metadata['imageHash']}$kLivePhotoHashSeparator${metadata['videoHash']}';
} }
metadataVersion = metadata["version"] ?? 0; metadataVersion = metadata["version"] ?? 0;
} }

View file

@ -66,79 +66,33 @@ class DeduplicationService {
} }
} }
List<DuplicateFiles> clubDuplicatesByTime(List<DuplicateFiles> dupes) { List<DuplicateFiles> clubDuplicates(
final result = <DuplicateFiles>[]; List<DuplicateFiles> dupesBySize, {
for (final dupe in dupes) { required String? Function(File) clubbingKey,
final files = <File>[]; }) {
final Map<int, int> creationTimeCounter = {}; final dupesBySizeAndClubKey = <DuplicateFiles>[];
int mostFrequentCreationTime = 0, mostFrequentCreationTimeCount = 0; for (final sizeBasedDupe in dupesBySize) {
// Counts the frequency of creationTimes within the supposed duplicates final Map<String, List<File>> clubKeyToFilesMap = {};
for (final file in dupe.files) { for (final file in sizeBasedDupe.files) {
if (creationTimeCounter.containsKey(file.creationTime!)) { final String? clubKey = clubbingKey(file);
creationTimeCounter[file.creationTime!] = if (clubKey == null || clubKey.isEmpty) {
creationTimeCounter[file.creationTime!]! + 1; continue;
} else {
creationTimeCounter[file.creationTime!] = 0;
} }
if (creationTimeCounter[file.creationTime]! > if (!clubKeyToFilesMap.containsKey(clubKey)) {
mostFrequentCreationTimeCount) { clubKeyToFilesMap[clubKey] = <File>[];
mostFrequentCreationTimeCount =
creationTimeCounter[file.creationTime]!;
mostFrequentCreationTime = file.creationTime!;
} }
files.add(file); clubKeyToFilesMap[clubKey]!.add(file);
} }
// Ignores those files that were not created within the most common creationTime for (final clubbingKey in clubKeyToFilesMap.keys) {
final incorrectDuplicates = <File>{}; final clubbedFiles = clubKeyToFilesMap[clubbingKey]!;
for (final file in files) { if (clubbedFiles.length > 1) {
if (file.creationTime != mostFrequentCreationTime) { dupesBySizeAndClubKey.add(
incorrectDuplicates.add(file); DuplicateFiles(clubbedFiles, sizeBasedDupe.size),
);
} }
} }
files.removeWhere((file) => incorrectDuplicates.contains(file));
if (files.length > 1) {
result.add(DuplicateFiles(files, dupe.size));
}
} }
return result; return dupesBySizeAndClubKey;
}
List<DuplicateFiles> clubDuplicatesByName(List<DuplicateFiles> dupes) {
final result = <DuplicateFiles>[];
for (final dupe in dupes) {
final files = <File>[];
final Map<String, int> fileNameCounter = {};
String mostFrequentFileName = "";
int mostFrequentFileNameCount = 0;
// Counts the frequency of creationTimes within the supposed duplicates
for (final file in dupe.files) {
if (fileNameCounter.containsKey(file.displayName)) {
fileNameCounter[file.displayName] =
fileNameCounter[file.displayName]! + 1;
} else {
fileNameCounter[file.displayName] = 0;
}
if (fileNameCounter[file.displayName]! >
mostFrequentFileNameCount) {
mostFrequentFileNameCount =
fileNameCounter[file.displayName]!;
mostFrequentFileName = file.displayName;
}
files.add(file);
}
// Ignores those files that were not created within the most common creationTime
final incorrectDuplicates = <File>{};
for (final file in files) {
if (file.displayName != mostFrequentFileName) {
incorrectDuplicates.add(file);
}
}
files.removeWhere((file) => incorrectDuplicates.contains(file));
if (files.length > 1) {
result.add(DuplicateFiles(files, dupe.size));
}
}
return result;
} }
Future<DuplicateFilesResponse> _fetchDuplicateFileIDs() async { Future<DuplicateFilesResponse> _fetchDuplicateFileIDs() async {

View file

@ -48,7 +48,7 @@ class _DeduplicatePageState extends State<DeduplicatePage> {
final Set<File> _selectedFiles = <File>{}; final Set<File> _selectedFiles = <File>{};
final Map<int?, int> _fileSizeMap = {}; final Map<int?, int> _fileSizeMap = {};
late List<DuplicateFiles> _duplicates; late List<DuplicateFiles> _duplicates;
bool _shouldClubByCaptureTime = true; bool _shouldClubByCaptureTime = false;
bool _shouldClubByFileName = false; bool _shouldClubByFileName = false;
bool toastShown = false; bool toastShown = false;
@ -56,8 +56,10 @@ class _DeduplicatePageState extends State<DeduplicatePage> {
@override @override
void initState() { void initState() {
_duplicates = _duplicates = DeduplicationService.instance.clubDuplicates(
DeduplicationService.instance.clubDuplicatesByTime(widget.duplicates); widget.duplicates,
clubbingKey: (File f) => f.hash,
);
_selectAllFilesButFirst(); _selectAllFilesButFirst();
super.initState(); super.initState();
@ -228,6 +230,9 @@ class _DeduplicatePageState extends State<DeduplicatePage> {
value: _shouldClubByFileName, value: _shouldClubByFileName,
onChanged: (value) { onChanged: (value) {
_shouldClubByFileName = value!; _shouldClubByFileName = value!;
if (_shouldClubByFileName) {
_shouldClubByCaptureTime = false;
}
_resetEntriesAndSelection(); _resetEntriesAndSelection();
setState(() {}); setState(() {});
}, },
@ -237,6 +242,9 @@ class _DeduplicatePageState extends State<DeduplicatePage> {
value: _shouldClubByCaptureTime, value: _shouldClubByCaptureTime,
onChanged: (value) { onChanged: (value) {
_shouldClubByCaptureTime = value!; _shouldClubByCaptureTime = value!;
if (_shouldClubByCaptureTime) {
_shouldClubByFileName = false;
}
_resetEntriesAndSelection(); _resetEntriesAndSelection();
setState(() {}); setState(() {});
}, },
@ -258,14 +266,18 @@ class _DeduplicatePageState extends State<DeduplicatePage> {
void _resetEntriesAndSelection() { void _resetEntriesAndSelection() {
_duplicates = widget.duplicates; _duplicates = widget.duplicates;
late String? Function(File) clubbingKeyFn;
if (_shouldClubByCaptureTime) { if (_shouldClubByCaptureTime) {
_duplicates = clubbingKeyFn = (File f) => f.creationTime?.toString() ?? '';
DeduplicationService.instance.clubDuplicatesByTime(_duplicates); } else if (_shouldClubByFileName) {
} clubbingKeyFn = (File f) => f.displayName;
if (_shouldClubByFileName) { } else {
_duplicates = clubbingKeyFn = (File f) => f.hash;
DeduplicationService.instance.clubDuplicatesByName(_duplicates);
} }
_duplicates = DeduplicationService.instance.clubDuplicates(
_duplicates,
clubbingKey: clubbingKeyFn,
);
_selectAllFilesButFirst(); _selectAllFilesButFirst();
} }