Merge pull request #888 from roboflow/joao/gaze-detection-block
Gaze detection Workflow block
PawelPeczek-Roboflow authored Dec 20, 2024
2 parents 6c5e4fa + 8358782 commit bbd3719
Showing 17 changed files with 612 additions and 23 deletions.
@@ -1,8 +1,6 @@
name: INTEGRATION TESTS - inference CLI + inference CORE

on:
-  pull_request:
-    branches: [main]
  push:
    branches: [main]
  workflow_dispatch:
2 changes: 0 additions & 2 deletions .github/workflows/integration_tests_inference_cli_x86.yml
@@ -1,8 +1,6 @@
name: INTEGRATION TESTS - inference CLI

on:
-  pull_request:
-    branches: [main]
  push:
    branches: [main]
  workflow_dispatch:
2 changes: 0 additions & 2 deletions .github/workflows/integration_tests_inference_models.yml
@@ -1,8 +1,6 @@
name: INTEGRATION TESTS - inference models

on:
-  pull_request:
-    branches: [main]
  push:
    branches: [main]
  workflow_dispatch:
2 changes: 1 addition & 1 deletion .github/workflows/integration_tests_workflows_x86.yml
@@ -44,6 +44,6 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install --upgrade setuptools
-pip install --extra-index-url https://download.pytorch.org/whl/cpu -r requirements/_requirements.txt -r requirements/requirements.cpu.txt -r requirements/requirements.sdk.http.txt -r requirements/requirements.test.unit.txt -r requirements/requirements.http.txt -r requirements/requirements.yolo_world.txt -r requirements/requirements.doctr.txt -r requirements/requirements.sam.txt -r requirements/requirements.transformers.txt
+pip install --extra-index-url https://download.pytorch.org/whl/cpu -r requirements/_requirements.txt -r requirements/requirements.sam.txt -r requirements/requirements.cpu.txt -r requirements/requirements.http.txt -r requirements/requirements.test.unit.txt -r requirements/requirements.doctr.txt -r requirements/requirements.yolo_world.txt -r requirements/requirements.transformers.txt -r requirements/requirements.sdk.http.txt
- name: 🧪 Integration Tests of Workflows
run: ROBOFLOW_API_KEY=${{ secrets.API_KEY }} SKIP_FLORENCE2_TEST=FALSE LOAD_ENTERPRISE_BLOCKS=TRUE python -m pytest tests/workflows/integration_tests
4 changes: 2 additions & 2 deletions inference/core/env.py
@@ -73,10 +73,10 @@
CLIP_MODEL_ID = f"clip/{CLIP_VERSION_ID}"

# Gaze version ID, default is "L2CS"
GAZE_VERSION_ID = os.getenv("GAZE_VERSION_ID", "L2CS")
GAZE_VERSION_ID = os.getenv("GAZE_VERSION_ID", "l2cs")

# Gaze model ID
GAZE_MODEL_ID = f"gaze/{CLIP_VERSION_ID}"
GAZE_MODEL_ID = f"gaze/{GAZE_VERSION_ID}"

# OWLv2 version ID, default is "owlv2-large-patch14-ensemble"
OWLV2_VERSION_ID = os.getenv("OWLV2_VERSION_ID", "owlv2-large-patch14-ensemble")
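The second hunk above fixes GAZE_MODEL_ID, which previously interpolated CLIP_VERSION_ID instead of GAZE_VERSION_ID. A minimal sketch of the effect, assuming the usual "ViT-B-16" default for CLIP_VERSION_ID (an assumption, not shown in this diff):

import os

# Hypothetical illustration of the env.py fix; the CLIP default below is an assumption.
CLIP_VERSION_ID = os.getenv("CLIP_VERSION_ID", "ViT-B-16")
GAZE_VERSION_ID = os.getenv("GAZE_VERSION_ID", "l2cs")

old_gaze_model_id = f"gaze/{CLIP_VERSION_ID}"  # before the fix: "gaze/ViT-B-16"
new_gaze_model_id = f"gaze/{GAZE_VERSION_ID}"  # after the fix: "gaze/l2cs"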
2 changes: 1 addition & 1 deletion inference/core/version.py
@@ -1,4 +1,4 @@
__version__ = "0.31.2rc1"
__version__ = "0.32.0"


if __name__ == "__main__":
2 changes: 2 additions & 0 deletions inference/core/workflows/core_steps/common/utils.py
@@ -11,6 +11,7 @@
from inference.core.entities.requests.clip import ClipCompareRequest
from inference.core.entities.requests.cogvlm import CogVLMInferenceRequest
from inference.core.entities.requests.doctr import DoctrOCRInferenceRequest
+from inference.core.entities.requests.gaze import GazeDetectionInferenceRequest
from inference.core.entities.requests.sam2 import Sam2InferenceRequest
from inference.core.entities.requests.yolo_world import YOLOWorldInferenceRequest
from inference.core.managers.base import ModelManager
@@ -58,6 +59,7 @@ def load_core_model(
        CogVLMInferenceRequest,
        YOLOWorldInferenceRequest,
        Sam2InferenceRequest,
+       GazeDetectionInferenceRequest,
    ],
    core_model: str,
) -> str:
2 changes: 2 additions & 0 deletions inference/core/workflows/core_steps/loader.py
@@ -168,6 +168,7 @@
from inference.core.workflows.core_steps.models.foundation.florence2.v2 import (
Florence2BlockV2,
)
+from inference.core.workflows.core_steps.models.foundation.gaze.v1 import GazeBlockV1
from inference.core.workflows.core_steps.models.foundation.google_gemini.v1 import (
GoogleGeminiBlockV1,
)
@@ -598,6 +599,7 @@ def load_blocks() -> List[Type[WorkflowBlock]]:
        EnvironmentSecretsStoreBlockV1,
        SlackNotificationBlockV1,
        TwilioSMSNotificationBlockV1,
+       GazeBlockV1,
    ]


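As a quick sanity check of the registration above (a sketch, not part of the change), the new block should now be returned by load_blocks():

from inference.core.workflows.core_steps.loader import load_blocks
from inference.core.workflows.core_steps.models.foundation.gaze.v1 import GazeBlockV1

# GazeBlockV1 is appended to the list returned by load_blocks(), so the
# Workflows execution engine can discover the "roboflow_core/gaze@v1" step.
assert GazeBlockV1 in load_blocks()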
Empty file.
247 changes: 247 additions & 0 deletions inference/core/workflows/core_steps/models/foundation/gaze/v1.py
@@ -0,0 +1,247 @@
from typing import List, Literal, Optional, Tuple, Type, Union

import numpy as np
import supervision as sv
from pydantic import ConfigDict, Field

from inference.core.entities.requests.gaze import GazeDetectionInferenceRequest
from inference.core.managers.base import ModelManager
from inference.core.workflows.core_steps.common.entities import StepExecutionMode
from inference.core.workflows.core_steps.common.utils import (
    add_inference_keypoints_to_sv_detections,
    attach_parents_coordinates_to_batch_of_sv_detections,
    attach_prediction_type_info_to_sv_detections_batch,
    convert_inference_detections_batch_to_sv_detections,
    load_core_model,
)
from inference.core.workflows.execution_engine.entities.base import (
    Batch,
    OutputDefinition,
    WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
    BOOLEAN_KIND,
    FLOAT_KIND,
    IMAGE_KIND,
    KEYPOINT_DETECTION_PREDICTION_KIND,
    ImageInputField,
    Selector,
)
from inference.core.workflows.prototypes.block import (
    BlockResult,
    WorkflowBlock,
    WorkflowBlockManifest,
)

LONG_DESCRIPTION = """
Run the L2CS gaze detection model on faces in images.

This block can:

1. Detect faces in images and estimate their gaze direction
2. Estimate gaze direction on pre-cropped face images

The gaze direction is represented by yaw and pitch angles in degrees.
"""


def convert_gaze_detections_to_sv_detections_and_angles(
    images: Batch[WorkflowImageData],
    gaze_predictions: List[dict],
) -> Tuple[List[sv.Detections], List[List[float]], List[List[float]]]:
    """Convert gaze detection results to supervision detections and angle lists."""
    face_predictions = []
    yaw_degrees = []
    pitch_degrees = []

    for single_image, predictions in zip(images, gaze_predictions):
        height, width = single_image.numpy_image.shape[:2]

        # Format predictions for this image
        image_face_preds = {
            "predictions": [],
            "image": {"width": width, "height": height},
        }
        batch_yaw = []
        batch_pitch = []

        for p in predictions:  # predictions is already a list
            p_dict = p.model_dump(by_alias=True, exclude_none=True)
            for pred in p_dict["predictions"]:
                face = pred["face"]

                # Face detection with landmarks
                face_pred = {
                    "x": face["x"],
                    "y": face["y"],
                    "width": face["width"],
                    "height": face["height"],
                    "confidence": face["confidence"],
                    "class": "face",
                    "class_id": 0,
                    "keypoints": [
                        {
                            "x": l["x"],
                            "y": l["y"],
                            "confidence": face["confidence"],
                            "class_name": str(i),
                            "class_id": i,
                        }
                        for i, l in enumerate(face["landmarks"])
                    ],
                }

                image_face_preds["predictions"].append(face_pred)

                # Store angles in degrees
                batch_yaw.append(pred["yaw"] * 180 / np.pi)
                batch_pitch.append(pred["pitch"] * 180 / np.pi)

        face_predictions.append(image_face_preds)
        yaw_degrees.append(batch_yaw)
        pitch_degrees.append(batch_pitch)

    # Process predictions
    face_preds = convert_inference_detections_batch_to_sv_detections(face_predictions)

    # Add keypoints to supervision detections
    for prediction, detections in zip(face_predictions, face_preds):
        add_inference_keypoints_to_sv_detections(
            inference_prediction=prediction["predictions"],
            detections=detections,
        )

    face_preds = attach_prediction_type_info_to_sv_detections_batch(
        predictions=face_preds,
        prediction_type="facial-landmark",
    )
    face_preds = attach_parents_coordinates_to_batch_of_sv_detections(
        images=images,
        predictions=face_preds,
    )

    return face_preds, yaw_degrees, pitch_degrees


class BlockManifest(WorkflowBlockManifest):
    model_config = ConfigDict(
        json_schema_extra={
            "name": "Gaze Detection",
            "version": "v1",
            "short_description": "Detect faces and estimate gaze direction",
            "long_description": LONG_DESCRIPTION,
            "license": "Apache-2.0",
            "block_type": "model",
            "search_keywords": ["gaze", "face"],
        },
        protected_namespaces=(),
    )

    type: Literal["roboflow_core/gaze@v1"]
    images: Selector(kind=[IMAGE_KIND]) = ImageInputField
    do_run_face_detection: Union[bool, Selector(kind=[BOOLEAN_KIND])] = Field(
        default=True,
        description="Whether to run face detection. Set to False if input images are pre-cropped face images.",
    )

    @classmethod
    def get_parameters_accepting_batches(cls) -> List[str]:
        return ["images"]

    @classmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        return [
            OutputDefinition(
                name="face_predictions",
                kind=[KEYPOINT_DETECTION_PREDICTION_KIND],
                description="Facial landmark predictions",
            ),
            OutputDefinition(
                name="yaw_degrees",
                kind=[FLOAT_KIND],
                description="Yaw angle in degrees (-180 to 180, negative is left)",
            ),
            OutputDefinition(
                name="pitch_degrees",
                kind=[FLOAT_KIND],
                description="Pitch angle in degrees (-90 to 90, negative is down)",
            ),
        ]

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        return ">=1.3.0,<2.0.0"


class GazeBlockV1(WorkflowBlock):
    def __init__(
        self,
        model_manager: ModelManager,
        api_key: Optional[str],
        step_execution_mode: StepExecutionMode,
    ):
        self._model_manager = model_manager
        self._api_key = api_key
        self._step_execution_mode = step_execution_mode

    @classmethod
    def get_init_parameters(cls) -> List[str]:
        return ["model_manager", "api_key", "step_execution_mode"]

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        return BlockManifest

    def run(
        self,
        images: Batch[WorkflowImageData],
        do_run_face_detection: bool,
    ) -> BlockResult:
        if self._step_execution_mode is StepExecutionMode.LOCAL:
            return self.run_locally(
                images=images,
                do_run_face_detection=do_run_face_detection,
            )
        else:
            raise ValueError(
                f"Unsupported step execution mode: {self._step_execution_mode}"
            )

    def run_locally(
        self,
        images: Batch[WorkflowImageData],
        do_run_face_detection: bool,
    ) -> BlockResult:
        predictions = []

        for single_image in images:
            inference_request = GazeDetectionInferenceRequest(
                image=single_image.to_inference_format(numpy_preferred=True),
                do_run_face_detection=do_run_face_detection,
                api_key=self._api_key,
            )
            gaze_model_id = load_core_model(
                model_manager=self._model_manager,
                inference_request=inference_request,
                core_model="gaze",
            )
            prediction = self._model_manager.infer_from_request_sync(
                gaze_model_id, inference_request
            )
            predictions.append(prediction)

        # Convert predictions to supervision format and get angles
        face_preds, yaw_degrees, pitch_degrees = (
            convert_gaze_detections_to_sv_detections_and_angles(
                images=images,
                gaze_predictions=predictions,
            )
        )

        return [
            {
                "face_predictions": face_pred,
                "yaw_degrees": yaw,
                "pitch_degrees": pitch,
            }
            for face_pred, yaw, pitch in zip(face_preds, yaw_degrees, pitch_degrees)
        ]
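For context, a minimal workflow specification wiring in the new block could look like the sketch below. Only the step entry (type "roboflow_core/gaze@v1", the images selector, do_run_face_detection) and the output names come from the manifest above; the wrapper fields (version, inputs, outputs) follow the usual Workflows specification layout and are assumptions rather than part of this PR:

# Hedged sketch of a workflow definition using the gaze block.
GAZE_WORKFLOW = {
    "version": "1.0",
    "inputs": [{"type": "WorkflowImage", "name": "image"}],
    "steps": [
        {
            "type": "roboflow_core/gaze@v1",
            "name": "gaze",
            "images": "$inputs.image",
            "do_run_face_detection": True,
        }
    ],
    "outputs": [
        {"type": "JsonField", "name": "face_predictions", "selector": "$steps.gaze.face_predictions"},
        {"type": "JsonField", "name": "yaw_degrees", "selector": "$steps.gaze.yaw_degrees"},
        {"type": "JsonField", "name": "pitch_degrees", "selector": "$steps.gaze.pitch_degrees"},
    ],
}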