Merge pull request #967 from serengil/feat-task-2301-vgg-normalization-layer

vgg normalization layer bug for gpu users
2025-07-23 10:20:03 +00:00 · 2024-01-23 22:37:45 +00:00 · 2024-01-23 22:37:45 +00:00 · 88814e6d2b
commit 88814e6d2b
parent 3265be2d3a 5ffa7bfb95
5 changed files with 84 additions and 141 deletions
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@
 [![PyPI Downloads](https://static.pepy.tech/personalized-badge/deepface?period=total&units=international_system&left_color=grey&right_color=blue&left_text=pypi%20downloads)](https://pepy.tech/project/deepface)
 [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/deepface?color=green&label=conda%20downloads)](https://anaconda.org/conda-forge/deepface)
-[![Stars](https://img.shields.io/github/stars/serengil/deepface?color=yellow&style=flat)](https://github.com/serengil/deepface/stargazers)
+[![Stars](https://img.shields.io/github/stars/serengil/deepface?color=yellow&style=flat&label=%E2%AD%90%20stars)](https://github.com/serengil/deepface/stargazers)
 [![License](http://img.shields.io/:license-MIT-green.svg?style=flat)](https://github.com/serengil/deepface/blob/master/LICENSE)
 [![Tests](https://github.com/serengil/deepface/actions/workflows/tests.yml/badge.svg)](https://github.com/serengil/deepface/actions/workflows/tests.yml)
--- a/deepface/DeepFace.py
+++ b/deepface/DeepFace.py
@ -45,7 +45,7 @@ def build_model(model_name: str) -> Any:
            VGG-Face, Facenet, OpenFace, DeepFace, DeepID for face recognition
            Age, Gender, Emotion, Race for facial attributes
    Returns:
-            built model with corresponding class
+        built_model
    """
    return modeling.build_model(model_name=model_name)
@ -62,57 +62,37 @@ def verify(
 ) -> Dict[str, Any]:
    """
    Verify if an image pair represents the same person or different persons.
    The verification function converts facial images to vectors and calculates the similarity
    between those vectors. Vectors of images of the same person should exhibit higher similarity
    (or lower distance) than vectors of images of different persons.
    Args:
        img1_path (str or np.ndarray): Path to the first image. Accepts exact image path
            as a string, numpy array (BGR), or base64 encoded images.
        img2_path (str or np.ndarray): Path to the second image. Accepts exact image path
            as a string, numpy array (BGR), or base64 encoded images.
        model_name (str): Model for face recognition. Options: VGG-Face, Facenet, Facenet512,
            OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace (default is VGG-Face).
        detector_backend (string): face detector backend. Options: 'opencv', 'retinaface',
            'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8' (default is opencv)
        distance_metric (string): Metric for measuring similarity. Options: 'cosine',
            'euclidean', 'euclidean_l2' (default is cosine).
        enforce_detection (boolean): If no face is detected in an image, raise an exception.
            Set to False to avoid the exception for low-resolution images (default is True).
        align (bool): Flag to enable face alignment (default is True).
        normalization (string): Normalize the input image before feeding it to the model.
            Options: base, raw, Facenet, Facenet2018, VGGFace, VGGFace2, ArcFace (default is base)
    Returns:
-        result (dict): A dictionary containing verification results.
+        result (dict): A dictionary containing verification results with following keys.
        - 'verified' (bool): Indicates whether the images represent the same person (True)
            or different persons (False).
        - 'distance' (float): The distance measure between the face vectors.
            A lower distance indicates higher similarity.
        - 'max_threshold_to_verify' (float): The maximum threshold used for verification.
            If the distance is below this threshold, the images are considered a match.
        - 'model' (str): The chosen face recognition model.
        - 'similarity_metric' (str): The chosen similarity metric for measuring distances.
        - 'facial_areas' (dict): Rectangular regions of interest for faces in both images.
            - 'img1': {'x': int, 'y': int, 'w': int, 'h': int}
                    Region of interest for the first image.
            - 'img2': {'x': int, 'y': int, 'w': int, 'h': int}
                    Region of interest for the second image.
        - 'time' (float): Time taken for the verification process in seconds.
    """
@ -138,77 +118,59 @@ def analyze(
 ) -> List[Dict[str, Any]]:
    """
    Analyze facial attributes such as age, gender, emotion, and race in the provided image.
    Args:
        img_path (str or np.ndarray): The exact path to the image, a numpy array in BGR format,
            or a base64 encoded image. If the source image contains multiple faces, the result will
            include information for each detected face.
        actions (tuple): Attributes to analyze. The default is ('age', 'gender', 'emotion', 'race').
            You can exclude some of these attributes from the analysis if needed.
        enforce_detection (boolean): If no face is detected in an image, raise an exception.
            Set to False to avoid the exception for low-resolution images (default is True).
        detector_backend (string): face detector backend. Options: 'opencv', 'retinaface',
            'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8' (default is opencv).
        distance_metric (string): Metric for measuring similarity. Options: 'cosine',
            'euclidean', 'euclidean_l2' (default is cosine).
        align (boolean): Perform alignment based on the eye positions (default is True).
        silent (boolean): Suppress or allow some log messages for a quieter analysis process
            (default is False).
    Returns:
        results (List[Dict[str, Any]]): A list of dictionaries, where each dictionary represents
-           the analysis results for a detected face.
+           the analysis results for a detected face. Each dictionary in the list contains the
-
+           following keys:
-           Each dictionary in the list contains the following keys:
+        - 'region' (dict): Represents the rectangular region of the detected face in the image.
-
+            - 'x': x-coordinate of the top-left corner of the face.
-           - 'region' (dict): Represents the rectangular region of the detected face in the image.
+            - 'y': y-coordinate of the top-left corner of the face.
-               - 'x': x-coordinate of the top-left corner of the face.
+            - 'w': Width of the detected face region.
-               - 'y': y-coordinate of the top-left corner of the face.
+            - 'h': Height of the detected face region.
-               - 'w': Width of the detected face region.
+        - 'age' (float): Estimated age of the detected face.
-               - 'h': Height of the detected face region.
+        - 'face_confidence' (float): Confidence score for the detected face.
-
+            Indicates the reliability of the face detection.
-           - 'age' (float): Estimated age of the detected face.
+        - 'dominant_gender' (str): The dominant gender in the detected face.
-
+            Either "Man" or "Woman."
-           - 'face_confidence' (float): Confidence score for the detected face.
+        - 'gender' (dict): Confidence scores for each gender category.
-                Indicates the reliability of the face detection.
+            - 'Man': Confidence score for the male gender.
-
+            - 'Woman': Confidence score for the female gender.
-           - 'dominant_gender' (str): The dominant gender in the detected face.
+        - 'dominant_emotion' (str): The dominant emotion in the detected face.
-                Either "Man" or "Woman."
+            Possible values include "sad," "angry," "surprise," "fear," "happy,"
-
+            "disgust," and "neutral."
-           - 'gender' (dict): Confidence scores for each gender category.
+        - 'emotion' (dict): Confidence scores for each emotion category.
-               - 'Man': Confidence score for the male gender.
+            - 'sad': Confidence score for sadness.
-               - 'Woman': Confidence score for the female gender.
+            - 'angry': Confidence score for anger.
-
+            - 'surprise': Confidence score for surprise.
-           - 'dominant_emotion' (str): The dominant emotion in the detected face.
+            - 'fear': Confidence score for fear.
-                Possible values include "sad," "angry," "surprise," "fear," "happy,"
+            - 'happy': Confidence score for happiness.
-                "disgust," and "neutral."
+            - 'disgust': Confidence score for disgust.
-
+            - 'neutral': Confidence score for neutrality.
-           - 'emotion' (dict): Confidence scores for each emotion category.
+        - 'dominant_race' (str): The dominant race in the detected face.
-               - 'sad': Confidence score for sadness.
+            Possible values include "indian," "asian," "latino hispanic,"
-               - 'angry': Confidence score for anger.
+            "black," "middle eastern," and "white."
-               - 'surprise': Confidence score for surprise.
+        - 'race' (dict): Confidence scores for each race category.
-               - 'fear': Confidence score for fear.
+            - 'indian': Confidence score for Indian ethnicity.
-               - 'happy': Confidence score for happiness.
+            - 'asian': Confidence score for Asian ethnicity.
-               - 'disgust': Confidence score for disgust.
+            - 'latino hispanic': Confidence score for Latino/Hispanic ethnicity.
-               - 'neutral': Confidence score for neutrality.
+            - 'black': Confidence score for Black ethnicity.
-
+            - 'middle eastern': Confidence score for Middle Eastern ethnicity.
-           - 'dominant_race' (str): The dominant race in the detected face.
+            - 'white': Confidence score for White ethnicity.
                Possible values include "indian," "asian," "latino hispanic,"
                "black," "middle eastern," and "white."
           - 'race' (dict): Confidence scores for each race category.
               - 'indian': Confidence score for Indian ethnicity.
               - 'asian': Confidence score for Asian ethnicity.
               - 'latino hispanic': Confidence score for Latino/Hispanic ethnicity.
               - 'black': Confidence score for Black ethnicity.
               - 'middle eastern': Confidence score for Middle Eastern ethnicity.
               - 'white': Confidence score for White ethnicity.
    """
    return demography.analyze(
        img_path=img_path,
@ -233,46 +195,36 @@ def find(
 ) -> List[pd.DataFrame]:
    """
    Identify individuals in a database
    Args:
        img_path (str or np.ndarray): The exact path to the image, a numpy array in BGR format,
            or a base64 encoded image. If the source image contains multiple faces, the result will
            include information for each detected face.
        db_path (string): Path to the folder containing image files. All detected faces
            in the database will be considered in the decision-making process.
        model_name (str): Model for face recognition. Options: VGG-Face, Facenet, Facenet512,
-            OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace
+            OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace (default is VGG-Face).
        distance_metric (string): Metric for measuring similarity. Options: 'cosine',
-            'euclidean', 'euclidean_l2'.
+            'euclidean', 'euclidean_l2' (default is cosine).
        enforce_detection (boolean): If no face is detected in an image, raise an exception.
-            Default is True. Set to False to avoid the exception for low-resolution images.
+            Set to False to avoid the exception for low-resolution images (default is True).
        detector_backend (string): face detector backend. Options: 'opencv', 'retinaface',
-            'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8'.
+            'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8' (default is opencv).
-
+        align (boolean): Perform alignment based on the eye positions (default is True).
        align (boolean): Perform alignment based on the eye positions.
        normalization (string): Normalize the input image before feeding it to the model.
-            Default is base. Options: base, raw, Facenet, Facenet2018, VGGFace, VGGFace2, ArcFace
+            Options: base, raw, Facenet, Facenet2018, VGGFace, VGGFace2, ArcFace (default is base).
-
+        silent (boolean): Suppress or allow some log messages for a quieter analysis process
-        silent (boolean): Suppress or allow some log messages for a quieter analysis process.
+            (default is False).
    Returns:
        results (List[pd.DataFrame]): A list of pandas dataframes. Each dataframe corresponds
            to the identity information for an individual detected in the source image.
            The DataFrame columns include:
-
+        - 'identity': Identity label of the detected individual.
-            - 'identity': Identity label of the detected individual.
+        - 'target_x', 'target_y', 'target_w', 'target_h': Bounding box coordinates of the
-            - 'target_x', 'target_y', 'target_w', 'target_h': Bounding box coordinates of the
+                target face in the database.
-                    target face in the database.
+        - 'source_x', 'source_y', 'source_w', 'source_h': Bounding box coordinates of the
-            - 'source_x', 'source_y', 'source_w', 'source_h': Bounding box coordinates of the
+                detected face in the source image.
-                    detected face in the source image.
+        - '{model_name}_{distance_metric}': Similarity score between the faces based on the
-            - '{model_name}_{distance_metric}': Similarity score between the faces based on the
+                specified model and distance metric
                    specified model and distance metric
    """
    return recognition.find(
        img_path=img_path,
@ -302,25 +254,20 @@ def represent(
        img_path (str or np.ndarray): The exact path to the image, a numpy array in BGR format,
            or a base64 encoded image. If the source image contains multiple faces, the result will
            include information for each detected face.
        model_name (str): Model for face recognition. Options: VGG-Face, Facenet, Facenet512,
-            OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace
+            OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace (default is VGG-Face).
        enforce_detection (boolean): If no face is detected in an image, raise an exception.
-            Default is True. Set to False to avoid the exception for low-resolution images.
+            Set to False to avoid the exception for low-resolution images
-
+            (default is True).
        detector_backend (string): face detector backend. Options: 'opencv', 'retinaface',
-            'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8'.
+            'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8' (default is opencv).
-
+        align (boolean): Perform alignment based on the eye positions (default is True).
        align (boolean): Perform alignment based on the eye positions.
        normalization (string): Normalize the input image before feeding it to the model.
            Options: base, raw, Facenet, Facenet2018, VGGFace, VGGFace2, ArcFace
-
+            (default is base).
    Returns:
        results (List[Dict[str, Any]]): A list of dictionaries, each containing the
            following fields:
        - embedding (np.array): Multidimensional vector representing facial features.
            The number of dimensions varies based on the reference model
            (e.g., FaceNet returns 128 dimensions, VGG-Face returns 4096 dimensions).
@ -359,13 +306,13 @@ def stream(
            in the database will be considered in the decision-making process.
        model_name (str): Model for face recognition. Options: VGG-Face, Facenet, Facenet512,
-            OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace
+            OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace (default is VGG-Face).
        detector_backend (string): face detector backend. Options: 'opencv', 'retinaface',
-            'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8'.
+            'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8' (default is opencv).
        distance_metric (string): Metric for measuring similarity. Options: 'cosine',
-            'euclidean', 'euclidean_l2'.
+            'euclidean', 'euclidean_l2' (default is cosine).
        enable_face_analysis (bool): Flag to enable face analysis (default is True).
@ -408,22 +355,15 @@ def extract_faces(
    Args:
        img_path (str or np.ndarray): Path to the first image. Accepts exact image path
            as a string, numpy array (BGR), or base64 encoded images.
        target_size (tuple): final shape of facial image. black pixels will be
-            added to resize the image.
+            added to resize the image (default is (224, 224)).
        detector_backend (string): face detector backend. Options: 'opencv', 'retinaface',
            'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8' (default is opencv)
        enforce_detection (boolean): If no face is detected in an image, raise an exception.
-            Default is True. Set to False to avoid the exception for low-resolution images.
+            Set to False to avoid the exception for low-resolution images (default is True).
        align (bool): Flag to enable face alignment (default is True).
        grayscale (boolean): Flag to convert the image to grayscale before
            processing (default is False).
    Returns:
        results (List[Dict[str, Any]]): A list of dictionaries, where each dictionary contains:
        - "face" (np.ndarray): The detected face as a NumPy array.
--- a/deepface/basemodels/VGGFace.py
+++ b/deepface/basemodels/VGGFace.py
@ -2,9 +2,9 @@ from typing import List
 import os
 import gdown
 import numpy as np
-from deepface.commons import functions
+from deepface.commons import functions, distance
 from deepface.commons.logger import Logger
 from deepface.models.FacialRecognition import FacialRecognition
 from deepface.commons.logger import Logger
 logger = Logger(module="basemodels.VGGFace")
@ -20,9 +20,7 @@ if tf_version == 1:
        Flatten,
        Dropout,
        Activation,
        Lambda,
    )
    from keras import backend as K
 else:
    from tensorflow.keras.models import Model, Sequential
    from tensorflow.keras.layers import (
@ -32,9 +30,7 @@ else:
        Flatten,
        Dropout,
        Activation,
        Lambda,
    )
    from tensorflow.keras import backend as K
 # ---------------------------------------
@ -58,7 +54,11 @@ class VggFaceClient(FacialRecognition):
        """
        # model.predict causes memory issue when it is called in a for loop
        # embedding = model.predict(img, verbose=0)[0].tolist()
-        return self.model(img, training=False).numpy()[0].tolist()
+        # having the normalization layer in the descriptor causes trouble for some gpu users (e.g. issues 957, 966)
        # so we now calculate it the traditional way instead of with the keras backend
        embedding = self.model(img, training=False).numpy()[0].tolist()
        embedding = distance.l2_normalize(embedding)
        return embedding.tolist()
 def base_model() -> Sequential:
@ -144,9 +144,10 @@ def load_model(
    # as described here: https://github.com/serengil/deepface/issues/944
    base_model_output = Sequential()
    base_model_output = Flatten()(model.layers[-5].output)
-    base_model_output = Lambda(lambda x: K.l2_normalize(x, axis=1), name="norm_layer")(
+    # keras backend's l2 normalization layer troubles some gpu users (e.g. issue 957, 966)
-        base_model_output
+    # base_model_output = Lambda(lambda x: K.l2_normalize(x, axis=1), name="norm_layer")(
-    )
+    #     base_model_output
    # )
    vgg_face_descriptor = Model(inputs=model.input, outputs=base_model_output)
    return vgg_face_descriptor
--- a/deepface/commons/distance.py
+++ b/deepface/commons/distance.py
@ -32,7 +32,9 @@ def findEuclideanDistance(
    return euclidean_distance
-def l2_normalize(x: np.ndarray) -> np.ndarray:
+def l2_normalize(x: Union[np.ndarray, list]) -> np.ndarray:
    if isinstance(x, list):
        x = np.array(x)
    return x / np.sqrt(np.sum(np.multiply(x, x)))
--- a/tests/test_find.py
+++ b/tests/test_find.py
@ -6,6 +6,8 @@ from deepface.commons.logger import Logger
 logger = Logger("tests/test_find.py")
 threshold = distance.findThreshold(model_name="VGG-Face", distance_metric="cosine")
 def test_find_with_exact_path():
    img_path = "dataset/img1.jpg"
@ -19,7 +21,7 @@ def test_find_with_exact_path():
        assert identity_df.shape[0] > 0
        # validate reproducibility
-        assert identity_df["VGG-Face_cosine"].values[0] == 0
+        assert identity_df["VGG-Face_cosine"].values[0] < threshold
        df = df[df["identity"] != img_path]
        logger.debug(df.head())
@ -40,7 +42,7 @@ def test_find_with_array_input():
        assert identity_df.shape[0] > 0
        # validate reproducibility
-        assert identity_df["VGG-Face_cosine"].values[0] == 0
+        assert identity_df["VGG-Face_cosine"].values[0] < threshold
        df = df[df["identity"] != img_path]
        logger.debug(df.head())
@ -63,9 +65,7 @@ def test_find_with_extracted_faces():
        assert identity_df.shape[0] > 0
        # validate reproducibility
-        assert identity_df["VGG-Face_cosine"].values[0] < (
+        assert identity_df["VGG-Face_cosine"].values[0] < threshold
            distance.findThreshold(model_name="VGG-Face", distance_metric="cosine")
        )
        df = df[df["identity"] != img_path]
        logger.debug(df.head())