Deduplication: Refactor + Dedupe by hash and size (instead of just size) (#1172)

This commit is contained in:
Neeraj Gupta 2023-05-31 13:39:23 +05:30 committed by GitHub
commit e7e520f9f7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 80 deletions

View file

@ -10,7 +10,6 @@ import 'package:photos/models/ente_file.dart';
import 'package:photos/models/file_type.dart';
import 'package:photos/models/location/location.dart';
import "package:photos/models/metadata/file_magic.dart";
import 'package:photos/services/feature_flag_service.dart';
import 'package:photos/utils/date_time_util.dart';
import 'package:photos/utils/exif_util.dart';
@ -159,11 +158,11 @@ class File extends EnteFile {
// handle past live photos upload from web client
if (hash == null &&
fileType == FileType.livePhoto &&
metadata.containsKey('imgHash') &&
metadata.containsKey('vidHash')) {
metadata.containsKey('imageHash') &&
metadata.containsKey('videoHash')) {
// convert to imgHash:vidHash
hash =
'${metadata['imgHash']}$kLivePhotoHashSeparator${metadata['vidHash']}';
'${metadata['imageHash']}$kLivePhotoHashSeparator${metadata['videoHash']}';
}
metadataVersion = metadata["version"] ?? 0;
}

View file

@ -66,79 +66,33 @@ class DeduplicationService {
}
}
// NOTE(review): this span reads like a diff whose +/- markers were stripped:
// lines of clubDuplicatesByTime appear interleaved with lines of
// clubDuplicates. Confirm against the repository before treating it as
// compilable Dart.
/// Narrows every duplicate group in [dupes] to the files that share the
/// group's most frequent creationTime. A group is kept only when more than
/// one file remains after that filtering.
List<DuplicateFiles> clubDuplicatesByTime(List<DuplicateFiles> dupes) {
final result = <DuplicateFiles>[];
for (final dupe in dupes) {
final files = <File>[];
final Map<int, int> creationTimeCounter = {};
int mostFrequentCreationTime = 0, mostFrequentCreationTimeCount = 0;
// Counts the frequency of creationTimes within the supposed duplicates
for (final file in dupe.files) {
if (creationTimeCounter.containsKey(file.creationTime!)) {
creationTimeCounter[file.creationTime!] =
creationTimeCounter[file.creationTime!]! + 1;
} else {
// The first sighting is stored as 0, so the counter holds
// "occurrences - 1". Because the comparison below is a strict `>`
// against an initial 0, a creationTime seen only once never becomes the
// most frequent one.
creationTimeCounter[file.creationTime!] = 0;
/// Splits each size-based duplicate group in [dupesBySize] into sub-groups of
/// files for which [clubbingKey] yields the same string.
///
/// Files whose key is null or empty are left out. Only sub-groups holding
/// more than one file are added to the result list, each carrying the size of
/// the group it came from.
List<DuplicateFiles> clubDuplicates(
List<DuplicateFiles> dupesBySize, {
required String? Function(File) clubbingKey,
}) {
final dupesBySizeAndClubKey = <DuplicateFiles>[];
for (final sizeBasedDupe in dupesBySize) {
// Buckets the files of this size group by their clubbing key.
final Map<String, List<File>> clubKeyToFilesMap = {};
for (final file in sizeBasedDupe.files) {
final String? clubKey = clubbingKey(file);
// A missing or empty key cannot be matched against anything; skip the file.
if (clubKey == null || clubKey.isEmpty) {
continue;
}
// Tracks the running maximum of the creationTime counter.
if (creationTimeCounter[file.creationTime]! >
mostFrequentCreationTimeCount) {
mostFrequentCreationTimeCount =
creationTimeCounter[file.creationTime]!;
mostFrequentCreationTime = file.creationTime!;
// Creates the bucket lazily the first time a key is seen.
if (!clubKeyToFilesMap.containsKey(clubKey)) {
clubKeyToFilesMap[clubKey] = <File>[];
}
files.add(file);
clubKeyToFilesMap[clubKey]!.add(file);
}
// Collects the files whose creationTime differs from the most frequent one
// so they can be removed from the group below.
final incorrectDuplicates = <File>{};
for (final file in files) {
if (file.creationTime != mostFrequentCreationTime) {
incorrectDuplicates.add(file);
// NOTE: the loop variable below shadows the `clubbingKey` parameter; inside
// this loop it is a String key, not the key function.
for (final clubbingKey in clubKeyToFilesMap.keys) {
final clubbedFiles = clubKeyToFilesMap[clubbingKey]!;
// A bucket with a single file has no duplicate partner and is dropped.
if (clubbedFiles.length > 1) {
dupesBySizeAndClubKey.add(
DuplicateFiles(clubbedFiles, sizeBasedDupe.size),
);
}
}
files.removeWhere((file) => incorrectDuplicates.contains(file));
// Keeps the group only if at least two files still match.
if (files.length > 1) {
result.add(DuplicateFiles(files, dupe.size));
}
}
return result;
}
/// Narrows every duplicate group in [dupes] to the files that share the
/// group's most frequent displayName. A group is kept only when more than
/// one file remains after that filtering.
List<DuplicateFiles> clubDuplicatesByName(List<DuplicateFiles> dupes) {
final result = <DuplicateFiles>[];
for (final dupe in dupes) {
final files = <File>[];
final Map<String, int> fileNameCounter = {};
String mostFrequentFileName = "";
int mostFrequentFileNameCount = 0;
// Counts the frequency of each displayName within the supposed duplicates
for (final file in dupe.files) {
if (fileNameCounter.containsKey(file.displayName)) {
fileNameCounter[file.displayName] =
fileNameCounter[file.displayName]! + 1;
} else {
// The first sighting is stored as 0, so the counter holds
// "occurrences - 1". Because the comparison below is a strict `>`
// against an initial 0, a name seen only once never becomes the most
// frequent one.
fileNameCounter[file.displayName] = 0;
}
// Tracks the running maximum of the displayName counter.
if (fileNameCounter[file.displayName]! >
mostFrequentFileNameCount) {
mostFrequentFileNameCount =
fileNameCounter[file.displayName]!;
mostFrequentFileName = file.displayName;
}
files.add(file);
}
// Ignores those files whose displayName differs from the most common one
final incorrectDuplicates = <File>{};
for (final file in files) {
if (file.displayName != mostFrequentFileName) {
incorrectDuplicates.add(file);
}
}
files.removeWhere((file) => incorrectDuplicates.contains(file));
// Keeps the group only if at least two files still match.
if (files.length > 1) {
result.add(DuplicateFiles(files, dupe.size));
}
}
return result;
// NOTE(review): `dupesBySizeAndClubKey` is declared by clubDuplicates, not by
// this function; the next line looks like a leftover of a diff whose +/-
// markers were stripped. Confirm against the repository.
return dupesBySizeAndClubKey;
}
Future<DuplicateFilesResponse> _fetchDuplicateFileIDs() async {

View file

@ -48,7 +48,7 @@ class _DeduplicatePageState extends State<DeduplicatePage> {
// Files currently selected on this page (presumably the ones picked for
// deletion; verify against the rest of the class).
final Set<File> _selectedFiles = <File>{};
// NOTE(review): the meaning of the nullable int key is not visible here;
// presumably a file ID mapped to its size. Confirm.
final Map<int?, int> _fileSizeMap = {};
// Duplicate groups currently displayed; assigned in initState and again by
// _resetEntriesAndSelection.
late List<DuplicateFiles> _duplicates;
// Whether duplicates are clubbed by creationTime.
// NOTE(review): the field is declared twice with different initial values,
// which looks like the old and new side of a diff whose +/- markers were
// stripped. Only one declaration can exist in real code.
bool _shouldClubByCaptureTime = true;
bool _shouldClubByCaptureTime = false;
// Whether duplicates are clubbed by displayName. The checkbox handlers turn
// this flag off when capture-time clubbing is enabled, and vice versa.
bool _shouldClubByFileName = false;
// NOTE(review): presumably guards a toast so it is shown once; not
// verifiable from the visible lines.
bool toastShown = false;
@ -56,8 +56,10 @@ class _DeduplicatePageState extends State<DeduplicatePage> {
@override
void initState() {
_duplicates =
DeduplicationService.instance.clubDuplicatesByTime(widget.duplicates);
_duplicates = DeduplicationService.instance.clubDuplicates(
widget.duplicates,
clubbingKey: (File f) => f.hash,
);
_selectAllFilesButFirst();
super.initState();
@ -228,6 +230,9 @@ class _DeduplicatePageState extends State<DeduplicatePage> {
value: _shouldClubByFileName,
onChanged: (value) {
_shouldClubByFileName = value!;
if (_shouldClubByFileName) {
_shouldClubByCaptureTime = false;
}
_resetEntriesAndSelection();
setState(() {});
},
@ -237,6 +242,9 @@ class _DeduplicatePageState extends State<DeduplicatePage> {
value: _shouldClubByCaptureTime,
onChanged: (value) {
_shouldClubByCaptureTime = value!;
if (_shouldClubByCaptureTime) {
_shouldClubByFileName = false;
}
_resetEntriesAndSelection();
setState(() {});
},
@ -258,14 +266,18 @@ class _DeduplicatePageState extends State<DeduplicatePage> {
// NOTE(review): this span reads like a diff whose +/- markers were stripped:
// calls to clubDuplicatesByTime / clubDuplicatesByName appear interleaved
// with the clubbingKeyFn-based lines. Confirm against the repository before
// treating it as compilable Dart.
/// Recomputes [_duplicates] from `widget.duplicates` according to the active
/// clubbing option, then reselects every file except the first of each group.
void _resetEntriesAndSelection() {
// Always restarts from the unfiltered groups handed to the widget.
_duplicates = widget.duplicates;
// Assigned in every branch of the if/else chain below before it is read.
late String? Function(File) clubbingKeyFn;
if (_shouldClubByCaptureTime) {
_duplicates =
DeduplicationService.instance.clubDuplicatesByTime(_duplicates);
}
if (_shouldClubByFileName) {
_duplicates =
DeduplicationService.instance.clubDuplicatesByName(_duplicates);
// A missing creationTime maps to the empty string, which clubDuplicates
// treats as "no key" and therefore skips the file.
clubbingKeyFn = (File f) => f.creationTime?.toString() ?? '';
} else if (_shouldClubByFileName) {
clubbingKeyFn = (File f) => f.displayName;
} else {
// With neither option enabled, files are clubbed by their hash.
clubbingKeyFn = (File f) => f.hash;
}
_duplicates = DeduplicationService.instance.clubDuplicates(
_duplicates,
clubbingKey: clubbingKeyFn,
);
_selectAllFilesButFirst();
}