From ca9ecbb3cab99ecccbc5286595034987214d0c09 Mon Sep 17 00:00:00 2001 From: "Samuel J. Woodward" Date: Mon, 6 Jan 2025 09:06:37 -0500 Subject: [PATCH 01/11] list_images now stores valid image and PIL exts in sets built ahead of time rather than on each iteration. exact_path is not created unless the file's ext is a valid image ext. --- deepface/commons/image_utils.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/deepface/commons/image_utils.py b/deepface/commons/image_utils.py index b72ce0b..f0facd6 100644 --- a/deepface/commons/image_utils.py +++ b/deepface/commons/image_utils.py @@ -23,18 +23,15 @@ def list_images(path: str) -> List[str]: images (list): list of exact image paths """ images = [] + image_exts = {".jpg", ".jpeg", ".png"} + pil_exts = {"jpeg", "png"} for r, _, f in os.walk(path): for file in f: - exact_path = os.path.join(r, file) - - ext_lower = os.path.splitext(exact_path)[-1].lower() - - if ext_lower not in {".jpg", ".jpeg", ".png"}: - continue - - with Image.open(exact_path) as img: # lazy - if img.format.lower() in {"jpeg", "png"}: - images.append(exact_path) + if os.path.splitext(file)[1].lower() in image_exts: + exact_path = os.path.join(r, file) + with Image.open(exact_path) as img: # lazy + if img.format.lower() in pil_exts: + images.append(exact_path) return images From b11eec0eab1f20c5e7d0b746e73a37e6cba97279 Mon Sep 17 00:00:00 2001 From: "Samuel J. Woodward" Date: Mon, 6 Jan 2025 09:08:24 -0500 Subject: [PATCH 02/11] created a new yield_images generator function to yield the images in a given path. The functionality is equivalent to list_images, but, instead of building then returning a list, it yields the image path at each iteration. 
--- deepface/commons/image_utils.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/deepface/commons/image_utils.py b/deepface/commons/image_utils.py index f0facd6..40bf925 100644 --- a/deepface/commons/image_utils.py +++ b/deepface/commons/image_utils.py @@ -1,7 +1,7 @@ # built-in dependencies import os import io -from typing import List, Union, Tuple +from typing import Generator, List, Union, Tuple import hashlib import base64 from pathlib import Path @@ -35,6 +35,26 @@ def list_images(path: str) -> List[str]: return images +def yield_images(path: str) -> Generator[str]: + """ + List images in a given path + Args: + path (str): path's location + Yields: + image (str): image path + """ + images = [] + image_exts = {".jpg", ".jpeg", ".png"} + pil_exts = {"jpeg", "png"} + for r, _, f in os.walk(path): + for file in f: + if os.path.splitext(file)[1].lower() in image_exts: + exact_path = os.path.join(r, file) + with Image.open(exact_path) as img: # lazy + if img.format.lower() in pil_exts: + yield exact_path + + def find_image_hash(file_path: str) -> str: """ Find the hash of given image file with its properties From 9dc261b080f2e4f4e097da68e3752c7c9e3ddc65 Mon Sep 17 00:00:00 2001 From: "Samuel J. Woodward" Date: Mon, 6 Jan 2025 09:08:48 -0500 Subject: [PATCH 03/11] clarify docstring --- deepface/commons/image_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepface/commons/image_utils.py b/deepface/commons/image_utils.py index 40bf925..941513f 100644 --- a/deepface/commons/image_utils.py +++ b/deepface/commons/image_utils.py @@ -37,7 +37,7 @@ def list_images(path: str) -> List[str]: def yield_images(path: str) -> Generator[str]: """ - List images in a given path + Yield images in a given path Args: path (str): path's location Yields: From 2ee02e0003cff0d33c8bffe228692aadef4cea2b Mon Sep 17 00:00:00 2001 From: "Samuel J. 
Woodward" Date: Mon, 6 Jan 2025 09:11:23 -0500 Subject: [PATCH 04/11] storage_images is now built as a set with the new deepface.commons.image_utils.yield_images generator function. Previously, storage_images was created with deepface.commons.image_utils.list_images as a list, then converted to a set while never being used as purely a list. --- deepface/modules/recognition.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deepface/modules/recognition.py b/deepface/modules/recognition.py index f153132..4dc440f 100644 --- a/deepface/modules/recognition.py +++ b/deepface/modules/recognition.py @@ -168,7 +168,7 @@ def find( pickled_images = [representation["identity"] for representation in representations] # Get the list of images on storage - storage_images = image_utils.list_images(path=db_path) + storage_images = set(image_utils.yield_images(path=db_path)) if len(storage_images) == 0 and refresh_database is True: raise ValueError(f"No item found in {db_path}") @@ -186,8 +186,8 @@ def find( # Enforce data consistency amongst on disk images and pickle file if refresh_database: - new_images = set(storage_images) - set(pickled_images) # images added to storage - old_images = set(pickled_images) - set(storage_images) # images removed from storage + new_images = storage_images - set(pickled_images) # images added to storage + old_images = set(pickled_images) - storage_images # images removed from storage # detect replaced images for current_representation in representations: From 799cb0f6cfb56595535df2b7f6f32de7bfdd2697 Mon Sep 17 00:00:00 2001 From: "Samuel J. Woodward" Date: Mon, 6 Jan 2025 09:13:01 -0500 Subject: [PATCH 05/11] pickled_images is now created using a set comprehension, instead of a list comprehension as before. Like storage_images, all subsequent actions were set and not list actions, so it saves time re-creating the list as a set later on. 
--- deepface/modules/recognition.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deepface/modules/recognition.py b/deepface/modules/recognition.py index 4dc440f..d964693 100644 --- a/deepface/modules/recognition.py +++ b/deepface/modules/recognition.py @@ -165,7 +165,7 @@ def find( ) # embedded images - pickled_images = [representation["identity"] for representation in representations] + pickled_images = {representation["identity"] for representation in representations} # Get the list of images on storage storage_images = set(image_utils.yield_images(path=db_path)) @@ -186,8 +186,8 @@ def find( # Enforce data consistency amongst on disk images and pickle file if refresh_database: - new_images = storage_images - set(pickled_images) # images added to storage - old_images = set(pickled_images) - storage_images # images removed from storage + new_images = storage_images - pickled_images # images added to storage + old_images = pickled_images - storage_images # images removed from storage # detect replaced images for current_representation in representations: From 56d3b66a5cf08484c5975d48f3ab6ccf63d32310 Mon Sep 17 00:00:00 2001 From: "Samuel J. Woodward" Date: Mon, 6 Jan 2025 09:15:38 -0500 Subject: [PATCH 06/11] df_cols is now created as a set. All operations were already set operations on the object. If the order of the columns will need to be maintained in future versions, this should be restored to the original list. 
--- deepface/modules/recognition.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deepface/modules/recognition.py b/deepface/modules/recognition.py index d964693..fc6f35b 100644 --- a/deepface/modules/recognition.py +++ b/deepface/modules/recognition.py @@ -136,7 +136,7 @@ def find( representations = [] # required columns for representations - df_cols = [ + df_cols = { "identity", "hash", "embedding", @@ -144,7 +144,7 @@ def find( "target_y", "target_w", "target_h", - ] + } # Ensure the proper pickle file exists if not os.path.exists(datastore_path): @@ -157,7 +157,7 @@ def find( # check each item of representations list has required keys for i, current_representation in enumerate(representations): - missing_keys = set(df_cols) - set(current_representation.keys()) + missing_keys = df_cols - set(current_representation.keys()) if len(missing_keys) > 0: raise ValueError( f"{i}-th item does not have some required keys - {missing_keys}." From 661f13f3b3ee47322903ca0043d1fcb11ea134a3 Mon Sep 17 00:00:00 2001 From: "Samuel J. Woodward" Date: Mon, 6 Jan 2025 09:22:18 -0500 Subject: [PATCH 07/11] Remove no longer needed images list --- deepface/commons/image_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepface/commons/image_utils.py b/deepface/commons/image_utils.py index 941513f..1393af1 100644 --- a/deepface/commons/image_utils.py +++ b/deepface/commons/image_utils.py @@ -43,7 +43,6 @@ def yield_images(path: str) -> Generator[str]: Yields: image (str): image path """ - images = [] image_exts = {".jpg", ".jpeg", ".png"} pil_exts = {"jpeg", "png"} for r, _, f in os.walk(path): From 9995343e248adbb1d7ed30b77bd0f63e25964d90 Mon Sep 17 00:00:00 2001 From: "Samuel J. 
Woodward" Date: Mon, 6 Jan 2025 09:34:23 -0500 Subject: [PATCH 08/11] delay creating pickled_images until necessary --- deepface/modules/recognition.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/deepface/modules/recognition.py b/deepface/modules/recognition.py index fc6f35b..90e8c29 100644 --- a/deepface/modules/recognition.py +++ b/deepface/modules/recognition.py @@ -164,9 +164,6 @@ def find( f"Consider to delete {datastore_path}" ) - # embedded images - pickled_images = {representation["identity"] for representation in representations} - # Get the list of images on storage storage_images = set(image_utils.yield_images(path=db_path)) @@ -186,6 +183,11 @@ def find( # Enforce data consistency amongst on disk images and pickle file if refresh_database: + # embedded images + pickled_images = { + representation["identity"] for representation in representations + } + new_images = storage_images - pickled_images # images added to storage old_images = pickled_images - storage_images # images removed from storage From 2aa8ebfec89f908030613c183cd7904701d53335 Mon Sep 17 00:00:00 2001 From: "Samuel J. Woodward" Date: Mon, 6 Jan 2025 09:43:29 -0500 Subject: [PATCH 09/11] explicitly specified the SendType and ReturnType parameters for yield_images --- deepface/commons/image_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepface/commons/image_utils.py b/deepface/commons/image_utils.py index 1393af1..b984f91 100644 --- a/deepface/commons/image_utils.py +++ b/deepface/commons/image_utils.py @@ -35,7 +35,7 @@ def list_images(path: str) -> List[str]: return images -def yield_images(path: str) -> Generator[str]: +def yield_images(path: str) -> Generator[str, None, None]: """ Yield images in a given path Args: From 6a7505269cff4663908598e4bd44e62f9cab4291 Mon Sep 17 00:00:00 2001 From: "Samuel J. 
Woodward" Date: Mon, 6 Jan 2025 10:11:10 -0500 Subject: [PATCH 10/11] Test image_utils.yield_images returns the same files as image_utils.list_images --- tests/test_find.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/test_find.py b/tests/test_find.py index ffea91b..de8956d 100644 --- a/tests/test_find.py +++ b/tests/test_find.py @@ -95,12 +95,23 @@ def test_filetype_for_find(): def test_filetype_for_find_bulk_embeddings(): - imgs = image_utils.list_images("dataset") + # List + list_imgs = image_utils.list_images("dataset") - assert len(imgs) > 0 + assert len(list_imgs) > 0 # img47 is webp even though its extension is jpg - assert "dataset/img47.jpg" not in imgs + assert "dataset/img47.jpg" not in list_imgs + + # Generator + gen_imgs = list(image_utils.yield_images("dataset")) + + assert len(gen_imgs) > 0 + + # img47 is webp even though its extension is jpg + assert "dataset/img47.jpg" not in gen_imgs + + assert gen_imgs == list_imgs def test_find_without_refresh_database(): From 83031a427d29322d9476e939c45bca7f6d303724 Mon Sep 17 00:00:00 2001 From: "Samuel J. Woodward" Date: Tue, 7 Jan 2025 10:25:40 -0500 Subject: [PATCH 11/11] image_exts and pil_exts are now global variables and are now named as IMAGE_EXTS and PIL_EXTS to match Python naming conventions. 
--- deepface/commons/image_utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/deepface/commons/image_utils.py b/deepface/commons/image_utils.py index b984f91..868eaf2 100644 --- a/deepface/commons/image_utils.py +++ b/deepface/commons/image_utils.py @@ -14,6 +14,10 @@ from PIL import Image from werkzeug.datastructures import FileStorage +IMAGE_EXTS = {".jpg", ".jpeg", ".png"} +PIL_EXTS = {"jpeg", "png"} + + def list_images(path: str) -> List[str]: """ List images in a given path @@ -23,14 +27,12 @@ def list_images(path: str) -> List[str]: images (list): list of exact image paths """ images = [] - image_exts = {".jpg", ".jpeg", ".png"} - pil_exts = {"jpeg", "png"} for r, _, f in os.walk(path): for file in f: - if os.path.splitext(file)[1].lower() in image_exts: + if os.path.splitext(file)[1].lower() in IMAGE_EXTS: exact_path = os.path.join(r, file) with Image.open(exact_path) as img: # lazy - if img.format.lower() in pil_exts: + if img.format.lower() in PIL_EXTS: images.append(exact_path) return images @@ -43,14 +45,12 @@ def yield_images(path: str) -> Generator[str, None, None]: Yields: image (str): image path """ - image_exts = {".jpg", ".jpeg", ".png"} - pil_exts = {"jpeg", "png"} for r, _, f in os.walk(path): for file in f: - if os.path.splitext(file)[1].lower() in image_exts: + if os.path.splitext(file)[1].lower() in IMAGE_EXTS: exact_path = os.path.join(r, file) with Image.open(exact_path) as img: # lazy - if img.format.lower() in pil_exts: + if img.format.lower() in PIL_EXTS: yield exact_path