From 6eeaeb46f2f69d0410d6587989f14c1153fdde33 Mon Sep 17 00:00:00 2001 From: kremnik Date: Wed, 14 Aug 2024 01:53:26 +0300 Subject: [PATCH 1/2] Update internal db storage --- deepface/modules/recognition.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/deepface/modules/recognition.py b/deepface/modules/recognition.py index f755a73..8537337 100644 --- a/deepface/modules/recognition.py +++ b/deepface/modules/recognition.py @@ -1,7 +1,7 @@ # built-in dependencies import os import pickle -from typing import List, Union, Optional, Dict, Any +from typing import List, Union, Optional, Dict, Any, Set import time # 3rd party dependencies @@ -141,7 +141,7 @@ def find( # check each item of representations list has required keys for i, current_representation in enumerate(representations): - missing_keys = list(set(df_cols) - set(current_representation.keys())) + missing_keys = set(df_cols) - set(current_representation.keys()) if len(missing_keys) > 0: raise ValueError( f"{i}-th item does not have some required keys - {missing_keys}." @@ -160,8 +160,6 @@ def find( raise ValueError(f"Nothing is found in {datastore_path}") must_save_pickle = False - new_images = [] - old_images = [] replaced_images = [] if not refresh_database: @@ -172,8 +170,8 @@ def find( # Enforce data consistency amongst on disk images and pickle file if refresh_database: - new_images = list(set(storage_images) - set(pickled_images)) # images added to storage - old_images = list(set(pickled_images) - set(storage_images)) # images removed from storage + new_images = set(storage_images) - set(pickled_images) # images added to storage + old_images = set(pickled_images) - set(storage_images) # images removed from storage # detect replaced images for current_representation in representations: @@ -194,8 +192,8 @@ def find( ) # append replaced images into both old and new images. these will be dropped and re-added. 
- new_images = new_images + replaced_images - old_images = old_images + replaced_images + new_images.update(replaced_images) + old_images.update(replaced_images) # remove old images first if len(old_images) > 0: @@ -316,7 +314,7 @@ def find( def __find_bulk_embeddings( - employees: List[str], + employees: Set[str], model_name: str = "VGG-Face", detector_backend: str = "opencv", enforce_detection: bool = True, From fc5c4b9155a424abd492814e9093b19ce04ed1cf Mon Sep 17 00:00:00 2001 From: kremnik Date: Wed, 14 Aug 2024 20:32:54 +0300 Subject: [PATCH 2/2] Add tests for refresh_database=False --- deepface/modules/recognition.py | 4 +-- tests/test_find.py | 50 +++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/deepface/modules/recognition.py b/deepface/modules/recognition.py index 8537337..baa50ed 100644 --- a/deepface/modules/recognition.py +++ b/deepface/modules/recognition.py @@ -160,7 +160,7 @@ def find( raise ValueError(f"Nothing is found in {datastore_path}") must_save_pickle = False - replaced_images = [] + new_images, old_images, replaced_images = set(), set(), set() if not refresh_database: logger.info( @@ -182,7 +182,7 @@ def find( beta_hash = image_utils.find_image_hash(identity) if alpha_hash != beta_hash: logger.debug(f"Even though {identity} represented before, it's replaced later.") - replaced_images.append(identity) + replaced_images.add(identity) if not silent and (len(new_images) > 0 or len(old_images) > 0 or len(replaced_images) > 0): logger.info( diff --git a/tests/test_find.py b/tests/test_find.py index b6845a4..ffea91b 100644 --- a/tests/test_find.py +++ b/tests/test_find.py @@ -101,3 +101,53 @@ def test_filetype_for_find_bulk_embeddings(): # img47 is webp even though its extension is jpg assert "dataset/img47.jpg" not in imgs + + +def test_find_without_refresh_database(): + import shutil, hashlib + + img_path = os.path.join("dataset", "img1.jpg") + + # 1. Calculate hash of the .pkl file; + # 2. 
Move random image to the temporarily created directory; + # 3. As a result, there will be a difference between the .pkl file and the disk files; + # 4. If refresh_database=False, then .pkl file should not be updated. + # Recalculate hash and compare it with the hash from pt. 1; + # 5. After successful check, the image will be moved back to the original destination; + + pkl_path = "dataset/ds_model_vggface_detector_opencv_aligned_normalization_base_expand_0.pkl" + with open(pkl_path, "rb") as f: + hash_before = hashlib.sha256(f.read()) + + image_name = "img28.jpg" + tmp_dir = "dataset/temp_image" + os.mkdir(tmp_dir) + shutil.move(os.path.join("dataset", image_name), os.path.join(tmp_dir, image_name)) + + dfs = DeepFace.find(img_path=img_path, db_path="dataset", silent=True, refresh_database=False) + + with open(pkl_path, "rb") as f: + hash_after = hashlib.sha256(f.read()) + + shutil.move(os.path.join(tmp_dir, image_name), os.path.join("dataset", image_name)) + os.rmdir(tmp_dir) + + assert hash_before.hexdigest() == hash_after.hexdigest() + + logger.info("✅ .pkl hashes before and after the recognition process are the same") + + assert len(dfs) > 0 + for df in dfs: + assert isinstance(df, pd.DataFrame) + + # one is img1.jpg itself + identity_df = df[df["identity"] == img_path] + assert identity_df.shape[0] > 0 + + # validate reproducibility + assert identity_df["distance"].values[0] < threshold + + df = df[df["identity"] != img_path] + logger.debug(df.head()) + assert df.shape[0] > 0 + logger.info("✅ test find without refresh database done")