Merge pull request #1420 from PyWoody/image_utils

Optimizations for deepface.recognition.find, Optimization and New Iterator Functionality in image_utils
Sefik Ilkin Serengil 2025-01-07 15:42:06 +00:00 committed by GitHub
commit 5bab888411
3 changed files with 52 additions and 23 deletions

deepface/commons/image_utils.py

@@ -1,7 +1,7 @@
 # built-in dependencies
 import os
 import io
-from typing import List, Union, Tuple
+from typing import Generator, List, Union, Tuple
 import hashlib
 import base64
 from pathlib import Path
@@ -14,6 +14,10 @@ from PIL import Image
 from werkzeug.datastructures import FileStorage
 
 
+IMAGE_EXTS = {".jpg", ".jpeg", ".png"}
+PIL_EXTS = {"jpeg", "png"}
+
+
 def list_images(path: str) -> List[str]:
     """
     List images in a given path
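The two new module-level constants split the filter into a cheap extension check (IMAGE_EXTS) and a verification of the format Pillow actually detects (PIL_EXTS), which catches files whose extension does not match their real format. A minimal standalone sketch of that pattern (the helper name is hypothetical, not part of this commit):

    import os
    from PIL import Image

    IMAGE_EXTS = {".jpg", ".jpeg", ".png"}
    PIL_EXTS = {"jpeg", "png"}

    def is_supported_image(path: str) -> bool:
        # cheap filter first: skip files without a known image extension
        if os.path.splitext(path)[1].lower() not in IMAGE_EXTS:
            return False
        # lazy open: Pillow reads only the header to detect the real format
        with Image.open(path) as img:
            return img.format.lower() in PIL_EXTS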
@@ -25,19 +29,31 @@ def list_images(path: str) -> List[str]:
     images = []
     for r, _, f in os.walk(path):
         for file in f:
-            exact_path = os.path.join(r, file)
-
-            ext_lower = os.path.splitext(exact_path)[-1].lower()
-
-            if ext_lower not in {".jpg", ".jpeg", ".png"}:
-                continue
-
-            with Image.open(exact_path) as img:  # lazy
-                if img.format.lower() in {"jpeg", "png"}:
-                    images.append(exact_path)
+            if os.path.splitext(file)[1].lower() in IMAGE_EXTS:
+                exact_path = os.path.join(r, file)
+                with Image.open(exact_path) as img:  # lazy
+                    if img.format.lower() in PIL_EXTS:
+                        images.append(exact_path)
     return images
 
 
+def yield_images(path: str) -> Generator[str, None, None]:
+    """
+    Yield images in a given path
+    Args:
+        path (str): path's location
+    Yields:
+        image (str): image path
+    """
+    for r, _, f in os.walk(path):
+        for file in f:
+            if os.path.splitext(file)[1].lower() in IMAGE_EXTS:
+                exact_path = os.path.join(r, file)
+                with Image.open(exact_path) as img:  # lazy
+                    if img.format.lower() in PIL_EXTS:
+                        yield exact_path
+
+
 def find_image_hash(file_path: str) -> str:
     """
     Find the hash of given image file with its properties
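As a usage sketch of the new API (the "my_db" folder is an assumed example path): yield_images walks the same directory tree as list_images but yields each match lazily, so callers can start processing before the full scan finishes and never hold every path in memory at once:

    from deepface.commons import image_utils

    # eager: builds the complete list before returning
    all_paths = image_utils.list_images(path="my_db")

    # lazy: produces one verified image path at a time
    for image_path in image_utils.yield_images(path="my_db"):
        print(image_path)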

deepface/modules/recognition.py

@@ -136,7 +136,7 @@ def find(
     representations = []
 
     # required columns for representations
-    df_cols = [
+    df_cols = {
         "identity",
         "hash",
         "embedding",
@@ -144,7 +144,7 @@ def find(
         "target_y",
         "target_w",
         "target_h",
-    ]
+    }
 
     # Ensure the proper pickle file exists
     if not os.path.exists(datastore_path):
@@ -157,18 +157,15 @@ def find(
     # check each item of representations list has required keys
     for i, current_representation in enumerate(representations):
-        missing_keys = set(df_cols) - set(current_representation.keys())
+        missing_keys = df_cols - set(current_representation.keys())
         if len(missing_keys) > 0:
             raise ValueError(
                 f"{i}-th item does not have some required keys - {missing_keys}."
                 f"Consider to delete {datastore_path}"
             )
 
-    # embedded images
-    pickled_images = [representation["identity"] for representation in representations]
-
     # Get the list of images on storage
-    storage_images = image_utils.list_images(path=db_path)
+    storage_images = set(image_utils.yield_images(path=db_path))
 
     if len(storage_images) == 0 and refresh_database is True:
         raise ValueError(f"No item found in {db_path}")
@@ -186,8 +183,13 @@ def find(
 
     # Enforce data consistency amongst on disk images and pickle file
     if refresh_database:
-        new_images = set(storage_images) - set(pickled_images)  # images added to storage
-        old_images = set(pickled_images) - set(storage_images)  # images removed from storage
+        # embedded images
+        pickled_images = {
+            representation["identity"] for representation in representations
+        }
+
+        new_images = storage_images - pickled_images  # images added to storage
+        old_images = pickled_images - storage_images  # images removed from storage
 
         # detect replaced images
         for current_representation in representations:
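With storage_images already a set and pickled_images built as a set comprehension, the consistency check reduces to two set differences. A small sketch with hypothetical paths:

    storage_images = {"db/alice.jpg", "db/bob.jpg"}    # found on disk
    pickled_images = {"db/bob.jpg", "db/carol.jpg"}    # stored in the pickle

    new_images = storage_images - pickled_images  # {"db/alice.jpg"}: needs embedding
    old_images = pickled_images - storage_images  # {"db/carol.jpg"}: stale entry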

tests/test_find.py

@@ -95,12 +95,23 @@ def test_filetype_for_find():
 
 def test_filetype_for_find_bulk_embeddings():
-    imgs = image_utils.list_images("dataset")
+    # List
+    list_imgs = image_utils.list_images("dataset")
 
-    assert len(imgs) > 0
+    assert len(list_imgs) > 0
 
     # img47 is webp even though its extension is jpg
-    assert "dataset/img47.jpg" not in imgs
+    assert "dataset/img47.jpg" not in list_imgs
+
+    # Generator
+    gen_imgs = list(image_utils.yield_images("dataset"))
+    assert len(gen_imgs) > 0
+
+    # img47 is webp even though its extension is jpg
+    assert "dataset/img47.jpg" not in gen_imgs
+
+    assert gen_imgs == list_imgs
 
 
 def test_find_without_refresh_database():