Add CV example: Video Dataset Management and Frame-Level Annotation for YOLOv8 (#20)

* Update contact info in fashion_product_images tutorial

* Initial commit - create DataChain for Kinetics dataset

* Add Video Dataset Management example - kinetics_actions_video

* Rename to video-pose-detection-yolov8

* Remove old datachain-logo

* Remove unnecessary src/.gitignore file

* Rename example folder

* Rename example folder

* fix some issues, pick video ids properly

* use GS for data

* use google cloud, local path to cached files

---------

Co-authored-by: Mikhail <[email protected]>
shcheklein and mnrozhkov authored Sep 30, 2024
1 parent 8445997 commit fbc0026
Showing 7 changed files with 17,823 additions and 2 deletions.
3 changes: 1 addition & 2 deletions computer_vision/fashion_product_images/README.md
@@ -53,8 +53,7 @@ datachain query scripts/1-quick-start.py
We'd love to have you join our growing community of DataChain users and contributors! Here's how you can get involved:

- ⭐ Give us a star on [GitHub](https://github.com/iterative/datachain) to show your support
-- 🌐 Visit the [dvc.ai website](https://dvc.ai/) to learn more about our products and services
-- 📞 Contact us to discuss on scaling 🚀 DataChain for your project!
+- 🌐 Visit the [datachain.ai](https://datachain.ai) website to learn more about products and services
- 🙌 Follow us on [LinkedIn](https://www.linkedin.com/company/dvc-ai/) and [Twitter](https://x.com/DVCorg) for the latest updates and insights

Thanks for choosing DataChain, and happy coding! 😄
5 changes: 5 additions & 0 deletions computer_vision/video_pose_detection_yolov8/.gitignore
@@ -0,0 +1,5 @@
.datachain
data
dev
test-notebooks.sh
yolov8n-pose.pt
47 changes: 47 additions & 0 deletions computer_vision/video_pose_detection_yolov8/README.md
@@ -0,0 +1,47 @@
<img src="https://raw.githubusercontent.com/iterative/datachain/main/docs/assets/datachain-sys-theme.svg" alt="Dataset" style="width: 200px;"/>

# Tutorial: Enhancing Video Dataset Management and Frame-Level Annotation for YOLOv8 Pose Detection Projects

This tutorial covers techniques for managing video and image datasets in Video Analytics and Vision AI projects.

📋 Topics covered:
1. Building a Video DataChain for the `kinetics-700-2020` video dataset (see the sketch after this list)
2. Creating a Data Model for YOLOv8 Pose Detection projects
3. Integrating Video-Level Annotations from CSV
4. Extracting and Managing Video Frames
5. Running Pose Detection with YOLOv8 and Saving to DataChain
6. Visualizing Pose Data
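
As a preview of topic 1, here is a minimal sketch of building a video DataChain from a cloud bucket, assuming the public `datachain` API; the bucket URI is a hypothetical placeholder (the notebooks read the data from a Google Cloud bucket):

```python
from datachain import Column, DataChain

# Index video files from a bucket into a DataChain (placeholder URI),
# keep only .mp4 files, and persist the result as a named dataset.
videos = (
    DataChain.from_storage("gs://datachain-demo/kinetics-700-2020", type="binary")
    .filter(Column("file.path").glob("*.mp4"))
)
videos.save("kinetics-videos")
```

Saving produces a named, versioned dataset that can be re-opened later without re-scanning the bucket.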


## 🛠️ Install

```bash
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```

On minimal Linux images (for example, in containers), you might also need to install system libraries that OpenCV depends on:

```bash
apt-get update
apt-get install -y --no-install-recommends libgl1 libglib2.0-0
```

## 🚀 Run Jupyter Notebooks

The tutorial is available as Jupyter Notebooks. Start the Jupyter Notebook server and follow the instructions in the notebooks.

```bash
jupyter notebook
```

## 🤝 Get Involved

We'd love to have you join our growing community of DataChain users and contributors! Here's how you can get involved:

- ⭐ Give us a star on [GitHub](https://github.com/iterative/datachain) to show your support
- 🌐 Visit the [datachain.ai](https://datachain.ai) website to learn more about products and services
- 🙌 Follow us on [LinkedIn](https://www.linkedin.com/company/dvc-ai/) and [Twitter](https://x.com/DVCorg) for the latest updates and insights

Thanks for choosing DataChain, and happy coding! 😄
2 changes: 2 additions & 0 deletions computer_vision/video_pose_detection_yolov8/requirements.txt
@@ -0,0 +1,2 @@
datachain
ultralytics
170 changes: 170 additions & 0 deletions computer_vision/video_pose_detection_yolov8/src/ultralitics_utils.py
@@ -0,0 +1,170 @@
import numpy as np
import torch
from PIL import Image
from ultralytics.engine.results import Boxes, Keypoints, Results


def normalize_to_pixel(value: float, dim_size: int) -> int:
    """
    Convert a normalized coordinate to a pixel value.
    Args:
        value (float): Normalized coordinate value (0-1).
        dim_size (int): Dimension size (width or height).
    Returns:
        int: Pixel coordinate value.
    """
    return int(value * dim_size)


def normalize_bbox(data: dict, height: int, width: int) -> list[float]:
    """
    Convert normalized bounding box coordinates to pixel values.
    Args:
        data (dict): Dictionary containing bounding box data.
        height (int): Image height in pixels.
        width (int): Image width in pixels.
    Returns:
        list[float]: Pixel-space box [x1, y1, x2, y2] plus confidence and class.
    """
    return [
        normalize_to_pixel(data['boxes']['x1'], width),
        normalize_to_pixel(data['boxes']['y1'], height),
        normalize_to_pixel(data['boxes']['x2'], width),
        normalize_to_pixel(data['boxes']['y2'], height),
        data['confidence'],
        data['cls']
    ]


def process_annotation(pose_annotation) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Process a single pose annotation.
    Args:
        pose_annotation: Pose annotation object.
    Returns:
        tuple[torch.Tensor, torch.Tensor]: Processed boxes and keypoints data.
    """
    data = pose_annotation.dict()
    orig_shape = tuple(data['orig_shape'])
    height, width = orig_shape

    keypoints_data = torch.tensor([
        [normalize_to_pixel(x, width),
         normalize_to_pixel(y, height),
         v]
        for x, y, v in zip(
            data['keypoints']['x'],
            data['keypoints']['y'],
            data['keypoints']['visible']
        )
    ])
    keypoints_data = keypoints_data.unsqueeze(0)  # Add batch dimension: shape (1, 17, 3)
    boxes_data = torch.tensor([normalize_bbox(data, height, width)])

    return boxes_data, keypoints_data


def extract_yolo_results(detections: list) -> tuple[Boxes, Keypoints]:
    """
    Extract YOLO results from a list of detections.
    Args:
        detections (list): List of detection objects; must be non-empty.
    Returns:
        tuple[Boxes, Keypoints]: Processed boxes and keypoints.
    """
    if not detections:
        raise ValueError("detections must contain at least one pose annotation")

    all_boxes_data = []
    all_keypoints_data = []

    for pose in detections:
        boxes, keypoints = process_annotation(pose)
        all_boxes_data.append(boxes)
        all_keypoints_data.append(keypoints)

    all_boxes_data = torch.cat(all_boxes_data, dim=0)
    all_keypoints_data = torch.cat(all_keypoints_data, dim=0)

    orig_shape = tuple(detections[0].orig_shape)
    boxes = Boxes(all_boxes_data, orig_shape)
    keypoints = Keypoints(all_keypoints_data, orig_shape)

    return boxes, keypoints


def visualize_ultralytics_results(results: Results, scale: float = 1.0) -> Image.Image:
    """
    Visualize an Ultralytics Results object.
    Args:
        results (Results): Results object from an Ultralytics model.
        scale (float): Scale factor for resizing the image. Default is 1.0.
    Returns:
        Image.Image: Visualized and resized image.
    """
    im_bgr = results.plot(
        font_size=20,
        kpt_radius=5,
    )

    im_rgb = Image.fromarray(im_bgr[..., ::-1])  # BGR -> RGB for PIL

    orig_height, orig_width = results.orig_shape
    new_size = (int(orig_width * scale), int(orig_height * scale))

    im_rgb = im_rgb.resize(new_size, Image.LANCZOS)
    return im_rgb


def fetch_frame_ids(dc_pose) -> list[str]:
    """
    Fetch frame IDs for a given video based on pose confidence.
    Args:
        dc_pose: DataChain pose object.
    Returns:
        list[str]: List of frame IDs.
    """
    return list(dc_pose.distinct('frame.frame_id').collect('frame.frame_id'))


def process_frame2results(frame_file, pose_detections: list) -> Results:
    """
    Process a single frame to prepare it for plotting.
    Args:
        frame_file: Frame file object.
        pose_detections (list): List of pose detections.
    Returns:
        Results: Processed results for plotting.
    """
    img_file_path = frame_file.get_path()
    img_pil = Image.open(img_file_path)
    rgb_array = np.asarray(img_pil)
    if rgb_array.ndim == 3 and rgb_array.shape[2] == 3:
        bgr_array = rgb_array[:, :, ::-1]  # RGB to BGR conversion
    else:
        bgr_array = rgb_array  # Handle grayscale images

    boxes, keypoints = extract_yolo_results(pose_detections)

    return Results(
        bgr_array,
        path=frame_file.get_path(),
        names={0: 'person'},
        boxes=boxes.data,
        keypoints=keypoints.data,
    )
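
For orientation, a minimal sketch of how these helpers fit together on a single frame; the weights file matches the `yolov8n-pose.pt` entry in the example's `.gitignore`, and the frame path and import path are hypothetical placeholders:

```python
from ultralytics import YOLO

from src.ultralitics_utils import visualize_ultralytics_results

# Run YOLOv8 pose detection on one extracted frame (placeholder path)
# and render the annotated predictions at half size.
model = YOLO("yolov8n-pose.pt")
results = model("data/frames/abseiling/frame_0000.jpg")[0]
preview = visualize_ultralytics_results(results, scale=0.5)
preview.save("pose_preview.jpg")
```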
111 changes: 111 additions & 0 deletions computer_vision/video_pose_detection_yolov8/src/video_utils.py
@@ -0,0 +1,111 @@
import os
import shutil
import sys
from typing import Dict

import cv2


def get_video_metadata(video_path: str) -> Dict[str, float]:
    """
    Extract metadata from a video file.
    Args:
        video_path (str): Path to the video file.
    Returns:
        Dict[str, float]: Dictionary containing video metadata.
    Raises:
        SystemExit: If the video file cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print(f"Error: Could not open video file {video_path}")
        sys.exit(1)

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    codec = int(cap.get(cv2.CAP_PROP_FOURCC))
    duration = frame_count / fps if fps > 0 else 0

    metadata = {
        "width": width,
        "height": height,
        "fps": fps,
        "frame_count": frame_count,
        "duration": duration,
        "codec": codec
    }

    cap.release()

    return metadata


def split_video_to_frames(
    video_path: str,
    output_dir: str,
    prefix: str = "frame",
    step: int = 1,
    rewrite: bool = True
) -> None:
    """
    Split a video into frames and save them as images.
    Args:
        video_path (str): Path to the input video file.
        output_dir (str): Directory to save the output frames.
        prefix (str, optional): Prefix for frame filenames. Defaults to "frame".
        step (int, optional): Step size for frame extraction. Defaults to 1.
        rewrite (bool, optional): Whether to overwrite existing output directory. Defaults to True.
    Raises:
        SystemExit: If the video file cannot be opened or if the step size is invalid.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print(f"Error: Could not open video file {video_path}")
        sys.exit(1)

    video_name = os.path.splitext(os.path.basename(video_path))[0].split('_')[0]
    video_output_dir = os.path.join(output_dir, video_name)

    if os.path.exists(video_output_dir):
        if rewrite:
            shutil.rmtree(video_output_dir)
        else:
            print(f"Subdirectory already exists: {video_output_dir}")
            sys.exit(1)

    os.makedirs(video_output_dir)

    fps = cap.get(cv2.CAP_PROP_FPS)

    if not isinstance(step, int) or step <= 0 or step >= fps:
        print("Error: Step must be an integer greater than 0 and less than the video's FPS.")
        sys.exit(1)

    frame_number = 0
    saved_frames = 0

    while True:
        ret, frame = cap.read()

        if not ret:
            break

        if frame_number % step == 0:
            frame_filename = os.path.join(video_output_dir, f"{prefix}_{frame_number:04d}.jpg")
            cv2.imwrite(frame_filename, frame)
            saved_frames += 1

        frame_number += 1

    cap.release()

    print(f"Saved {saved_frames} frames from {video_path} to {video_output_dir}")