From e635d41acd7d2ee6b4b369c06243d6352af0ed7a Mon Sep 17 00:00:00 2001
From: kevin <kevinwinston184@gmail.com>
Date: Mon, 25 Nov 2024 17:45:26 -0500
Subject: [PATCH 01/13] initial changes

---
 src/init.hpp               |  1 +
 src/json_serialization.hpp |  1 +
 src/types.hpp              | 11 ++++++-----
 3 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/src/init.hpp b/src/init.hpp
index 7c458168..2101c4e2 100755
--- a/src/init.hpp
+++ b/src/init.hpp
@@ -24,6 +24,7 @@ namespace gpudrive
         MapVector2 position[MAX_POSITIONS];
         float width;
         float length;
+        float height;
         float heading[MAX_POSITIONS];
         MapVector2 velocity[MAX_POSITIONS];
         bool valid[MAX_POSITIONS];
diff --git a/src/json_serialization.hpp b/src/json_serialization.hpp
index 40e8762c..2cf932d4 100644
--- a/src/json_serialization.hpp
+++ b/src/json_serialization.hpp
@@ -34,6 +34,7 @@ namespace gpudrive
         obj.numPositions = i;
         j.at("width").get_to(obj.width);
         j.at("length").get_to(obj.length);
+        j.at("height").get_to(obj.height);
 
         i = 0;
         for (const auto &h : j.at("heading"))
diff --git a/src/types.hpp b/src/types.hpp
index 13dc4dd4..d8051a11 100755
--- a/src/types.hpp
+++ b/src/types.hpp
@@ -72,6 +72,7 @@ struct AgentID {
     {
         float length;
         float width;
+        float height;
     };
 
     struct Goal
@@ -184,14 +185,14 @@ struct AgentID {
         {
             return SelfObservation{
                 .speed = 0,
-                .vehicle_size = {0, 0},
+                .vehicle_size = {0, 0, 0},
                 .goal = {.position = {0, 0}},
                 .collisionState = 0,
             .id = -1};
         }
     };
 
-    const size_t SelfObservationExportSize = 7;
+    const size_t SelfObservationExportSize = 8;
 
     static_assert(sizeof(SelfObservation) == sizeof(float) * SelfObservationExportSize);
 
@@ -235,7 +236,7 @@ struct AgentID {
             .speed = 0,
             .position = {0, 0},
             .heading = 0,
-            .vehicle_size = {0, 0},
+            .vehicle_size = {0, 0, 0},
             .type = static_cast<float>(EntityType::None),
             .id = -1};
     }
@@ -255,7 +256,7 @@ struct AgentID {
         PartnerObservation obs[consts::kMaxAgentCount - 1];
     };
 
-    const size_t PartnerObservationExportSize = 8;
+    const size_t PartnerObservationExportSize = 9;
 
     static_assert(sizeof(PartnerObservations) == sizeof(float) *
                                                      (consts::kMaxAgentCount - 1) * PartnerObservationExportSize);
@@ -353,7 +354,7 @@ struct AgentID {
         float id;
     };
 
-    const size_t AbsoluteSelfObservationExportSize = 13; // 3 + 4 + 1 + 2 + 2
+    const size_t AbsoluteSelfObservationExportSize = 14; // 3 + 4 + 1 + 2 + 2 ??
 
     static_assert(sizeof(AbsoluteSelfObservation) == sizeof(float) * AbsoluteSelfObservationExportSize);
 

From b81617bbd2d13699edf1b1593b663cb0eba7207e Mon Sep 17 00:00:00 2001
From: kevin <kevinwinston184@gmail.com>
Date: Tue, 3 Dec 2024 18:03:03 -0500
Subject: [PATCH 02/13] using vehicle_size struct and comments

---
 src/init.hpp               |  4 +--
 src/json_serialization.hpp |  6 ++--
 src/level_gen.cpp          |  4 +--
 src/types.hpp              | 59 +++++++++++++++++++-------------------
 4 files changed, 36 insertions(+), 37 deletions(-)

diff --git a/src/init.hpp b/src/init.hpp
index 2101c4e2..3b522034 100755
--- a/src/init.hpp
+++ b/src/init.hpp
@@ -22,9 +22,7 @@ namespace gpudrive
     struct MapObject
     {
         MapVector2 position[MAX_POSITIONS];
-        float width;
-        float length;
-        float height;
+        VehicleSize vehicle_size;
         float heading[MAX_POSITIONS];
         MapVector2 velocity[MAX_POSITIONS];
         bool valid[MAX_POSITIONS];
diff --git a/src/json_serialization.hpp b/src/json_serialization.hpp
index 2cf932d4..86b0b552 100644
--- a/src/json_serialization.hpp
+++ b/src/json_serialization.hpp
@@ -32,9 +32,9 @@ namespace gpudrive
             }
         }
         obj.numPositions = i;
-        j.at("width").get_to(obj.width);
-        j.at("length").get_to(obj.length);
-        j.at("height").get_to(obj.height);
+        j.at("width").get_to(obj.vehicle_size.width);
+        j.at("length").get_to(obj.vehicle_size.length);
+        j.at("height").get_to(obj.vehicle_size.height);
 
         i = 0;
         for (const auto &h : j.at("heading"))
diff --git a/src/level_gen.cpp b/src/level_gen.cpp
index add4cc4f..0385bfd5 100755
--- a/src/level_gen.cpp
+++ b/src/level_gen.cpp
@@ -121,8 +121,8 @@ static inline Entity createAgent(Engine &ctx, const MapObject &agentInit) {
     auto agent = ctx.makeRenderableEntity<Agent>();
     auto agent_iface = ctx.get<AgentInterfaceEntity>(agent).e = ctx.makeEntity<AgentInterface>();
 
-    ctx.get<VehicleSize>(agent) = {.length = agentInit.length, .width = agentInit.width};
-    ctx.get<Scale>(agent) = Diag3x3{.d0 = agentInit.length/2, .d1 = agentInit.width/2, .d2 = 1};
+    ctx.get<VehicleSize>(agent) = {.length = agentInit.vehicle_size.length, .width = agentInit.vehicle_size.width, .height = agentInit.vehicle_size.height};
+    ctx.get<Scale>(agent) = Diag3x3{.d0 = agentInit.vehicle_size.length/2, .d1 = agentInit.vehicle_size.width/2, .d2 = 1};
     ctx.get<Scale>(agent) *= consts::vehicleLengthScale;
     ctx.get<ObjectID>(agent) = ObjectID{(int32_t)SimObject::Agent};
     ctx.get<EntityType>(agent) = agentInit.type;
diff --git a/src/types.hpp b/src/types.hpp
index d8051a11..c8ec7db2 100755
--- a/src/types.hpp
+++ b/src/types.hpp
@@ -64,9 +64,10 @@ namespace gpudrive
         NUM_TYPES = 21,
     };
 
-struct AgentID {
-    int32_t id;
-};
+    struct AgentID 
+    {
+        int32_t id;
+    };
 
     struct VehicleSize
     {
@@ -180,7 +181,7 @@ struct AgentID {
         VehicleSize vehicle_size;
         Goal goal;
         float collisionState;
-    float id;
+        float id;
         static inline SelfObservation zero()
         {
             return SelfObservation{
@@ -188,11 +189,11 @@ struct AgentID {
                 .vehicle_size = {0, 0, 0},
                 .goal = {.position = {0, 0}},
                 .collisionState = 0,
-            .id = -1};
+                .id = -1};
         }
     };
 
-    const size_t SelfObservationExportSize = 8;
+    const size_t SelfObservationExportSize = 8; // 1 + 3 + 2 + 1 + 1
 
     static_assert(sizeof(SelfObservation) == sizeof(float) * SelfObservationExportSize);
 
@@ -218,7 +219,7 @@ struct AgentID {
         }
     };
 
-    const size_t MapObservationExportSize = 9;
+    const size_t MapObservationExportSize = 9; // 2 + 3 + 1 + 1 + 1 + 1
 
     static_assert(sizeof(MapObservation) == sizeof(float) * MapObservationExportSize);
 
@@ -229,37 +230,37 @@ struct AgentID {
         float heading;
         VehicleSize vehicle_size;
         float type;
-    float id;
-
-    static inline PartnerObservation zero() {
-        return PartnerObservation{
-            .speed = 0,
-            .position = {0, 0},
-            .heading = 0,
-            .vehicle_size = {0, 0, 0},
-            .type = static_cast<float>(EntityType::None),
-            .id = -1};
-    }
-};
+        float id;
 
-    struct RoadMapId{
-        int32_t id;
+        static inline PartnerObservation zero() {
+            return PartnerObservation{
+                .speed = 0,
+                .position = {0, 0},
+                .heading = 0,
+                .vehicle_size = {0, 0, 0},
+                .type = static_cast<float>(EntityType::None),
+                .id = -1};
+        }
     };
 
-    const size_t RoadMapIdExportSize = 1;
-
-    static_assert(sizeof(RoadMapId) == sizeof(int) * RoadMapIdExportSize);
-
     // Egocentric observations of other agents
     struct PartnerObservations
     {
         PartnerObservation obs[consts::kMaxAgentCount - 1];
     };
 
-    const size_t PartnerObservationExportSize = 9;
+    const size_t PartnerObservationExportSize = 9; // 1 + 2 + 1 + 3 + 1 + 1
 
     static_assert(sizeof(PartnerObservations) == sizeof(float) *
-                                                     (consts::kMaxAgentCount - 1) * PartnerObservationExportSize);
+        (consts::kMaxAgentCount - 1) * PartnerObservationExportSize);
+
+    struct RoadMapId{
+        int32_t id;
+    };
+
+    const size_t RoadMapIdExportSize = 1;
+
+    static_assert(sizeof(RoadMapId) == sizeof(int) * RoadMapIdExportSize);
 
     struct AgentMapObservations
     {
@@ -341,7 +342,7 @@ struct AgentID {
 
     struct AbsoluteRotation
     {
-        Rotation rotationAsQuat;
+        Rotation rotationAsQuat; // x, y, z, w
         float rotationFromAxis;
     };
 
@@ -354,7 +355,7 @@ struct AgentID {
         float id;
     };
 
-    const size_t AbsoluteSelfObservationExportSize = 14; // 3 + 4 + 1 + 2 + 2 ??
+    const size_t AbsoluteSelfObservationExportSize = 14; // 3 + 5 + 2 + 3 + 1
 
     static_assert(sizeof(AbsoluteSelfObservation) == sizeof(float) * AbsoluteSelfObservationExportSize);
 

From 283caa85b3e9c12c9377fd297b6cd9b9fb6ba35d Mon Sep 17 00:00:00 2001
From: kevin <kevinwinston184@gmail.com>
Date: Wed, 4 Dec 2024 17:25:30 -0500
Subject: [PATCH 03/13] datatype indexing changes

---
 pygpudrive/datatypes/observation.py | 24 ++++++++++++++++--------
 pygpudrive/env/constants.py         |  1 +
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/pygpudrive/datatypes/observation.py b/pygpudrive/datatypes/observation.py
index d86d21d7..2949c906 100644
--- a/pygpudrive/datatypes/observation.py
+++ b/pygpudrive/datatypes/observation.py
@@ -14,6 +14,7 @@ class LocalEgoState:
         speed: Speed of the agent in relative coordinates.
         vehicle_length: Length of the agent's bounding box.
         vehicle_width: Width of the agent's bounding box.
+        vehicle_height: Height of the agent's bounding box.
         rel_goal_x: Relative x-coordinate to the goal.
         rel_goal_y: Relative y-coordinate to the goal.
         is_collided: Whether the agent is in collision with another object.
@@ -25,10 +26,11 @@ def __init__(self, self_obs_tensor: torch.Tensor):
         self.speed = self_obs_tensor[:, :, 0]
         self.vehicle_length = self_obs_tensor[:, :, 1]
         self.vehicle_width = self_obs_tensor[:, :, 2]
-        self.rel_goal_x = self_obs_tensor[:, :, 3]
-        self.rel_goal_y = self_obs_tensor[:, :, 4]
-        self.is_collided = self_obs_tensor[:, :, 5]
-        self.id = self_obs_tensor[:, :, 6]
+        self.vehicle_height = self_obs_tensor[:, :, 3]
+        self.rel_goal_x = self_obs_tensor[:, :, 4]
+        self.rel_goal_y = self_obs_tensor[:, :, 5]
+        self.is_collided = self_obs_tensor[:, :, 6]
+        self.id = self_obs_tensor[:, :, 7]
 
     @classmethod
     def from_tensor(
@@ -48,6 +50,7 @@ def normalize(self):
         self.speed = self.speed / constants.MAX_SPEED
         self.vehicle_length = self.vehicle_length / constants.MAX_VEH_LEN
         self.vehicle_width = self.vehicle_width / constants.MAX_VEH_WIDTH
+        self.vehicle_height = self.vehicle_height / constants.MAX_VEH_HEIGHT
         self.rel_goal_x = normalize_min_max(
             tensor=self.rel_goal_x,
             min_val=constants.MIN_REL_GOAL_COORD,
@@ -70,7 +73,7 @@ def shape(self) -> tuple[int, ...]:
 class GlobalEgoState:
     """A class to represent the ego state of the agent in global coordinates.
     Initialized from abs_self_obs_tensor (src/bindings). For details, see
-    `AbsoluteSelfObservation` in src/types.hpp. Shape: (num_worlds, max_agents, 13).
+    `AbsoluteSelfObservation` in src/types.hpp. Shape: (num_worlds, max_agents, 14).
 
     Attributes:
         pos_x: Global x-coordinate of the agent.
@@ -82,6 +85,7 @@ class GlobalEgoState:
         goal_y: Global y-coordinate of the goal.
         vehicle_length: Length of the agent's bounding box.
         vehicle_width: Width of the agent's bounding box.
+        vehicle_height: Height of the agent's bounding box.
         id: Unique identifier of the agent.
     """
 
@@ -96,7 +100,8 @@ def __init__(self, abs_self_obs_tensor: torch.Tensor):
         self.goal_y = abs_self_obs_tensor[:, :, 9]
         self.vehicle_length = abs_self_obs_tensor[:, :, 10]
         self.vehicle_width = abs_self_obs_tensor[:, :, 11]
-        self.id = abs_self_obs_tensor[:, :, 12]
+        self.vehicle_height = abs_self_obs_tensor[:, :, 12]
+        self.id = abs_self_obs_tensor[:, :, 13]
 
     @classmethod
     def from_tensor(
@@ -130,6 +135,7 @@ class PartnerObs:
     orientation: torch.Tensor
     vehicle_length: torch.Tensor
     vehicle_width: torch.Tensor
+    vehicle_height: torch.Tensor
     agent_type: torch.Tensor
     ids: torch.Tensor
 
@@ -148,8 +154,9 @@ def __init__(self, partner_obs_tensor: torch.Tensor):
         self.orientation = partner_obs_tensor[:, :, :, 3].unsqueeze(-1)
         self.vehicle_length = partner_obs_tensor[:, :, :, 4].unsqueeze(-1)
         self.vehicle_width = partner_obs_tensor[:, :, :, 5].unsqueeze(-1)
-        self.agent_type = partner_obs_tensor[:, :, :, 6].unsqueeze(-1)
-        self.ids = partner_obs_tensor[:, :, :, 7].unsqueeze(-1)
+        self.vehicle_height = partner_obs_tensor[:, :, :, 6].unsqueeze(-1)        
+        self.agent_type = partner_obs_tensor[:, :, :, 7].unsqueeze(-1)
+        self.ids = partner_obs_tensor[:, :, :, 8].unsqueeze(-1)
 
     @classmethod
     def from_tensor(
@@ -180,6 +187,7 @@ def normalize(self):
         self.orientation = self.orientation / constants.MAX_ORIENTATION_RAD
         self.vehicle_length = self.vehicle_length / constants.MAX_VEH_LEN
         self.vehicle_width = self.vehicle_width / constants.MAX_VEH_WIDTH
+        self.vehicle_heights = self.vehicle_heights / constants.MAX_VEH_HEIGHT
         self.agent_type = self.agent_type.long()
         self.ids = self.ids
 
diff --git a/pygpudrive/env/constants.py b/pygpudrive/env/constants.py
index a8ea6201..cb221b55 100644
--- a/pygpudrive/env/constants.py
+++ b/pygpudrive/env/constants.py
@@ -6,6 +6,7 @@
 MAX_SPEED = 100
 MAX_VEH_LEN = 30
 MAX_VEH_WIDTH = 10
+MAX_VEH_HEIGHT = 3 # What's the appropriate value to set this to?
 MIN_REL_GOAL_COORD = -1000
 MAX_REL_GOAL_COORD = 1000
 MIN_REL_AGENT_POS = -1000

From e0e987332ce83cb344a1f187c6cd3cfec7588f1e Mon Sep 17 00:00:00 2001
From: kevin <kevinwinston184@gmail.com>
Date: Wed, 4 Dec 2024 18:50:06 -0500
Subject: [PATCH 04/13] removed comment

---
 pygpudrive/env/constants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pygpudrive/env/constants.py b/pygpudrive/env/constants.py
index cb221b55..9f0d336f 100644
--- a/pygpudrive/env/constants.py
+++ b/pygpudrive/env/constants.py
@@ -6,7 +6,7 @@
 MAX_SPEED = 100
 MAX_VEH_LEN = 30
 MAX_VEH_WIDTH = 10
-MAX_VEH_HEIGHT = 3 # What's the appropriate value to set this to?
+MAX_VEH_HEIGHT = 3
 MIN_REL_GOAL_COORD = -1000
 MAX_REL_GOAL_COORD = 1000
 MIN_REL_AGENT_POS = -1000

From 0a09dec4ed4425447d42eedac4d6f1ce2b18fbda Mon Sep 17 00:00:00 2001
From: kevin <kevinwinston184@gmail.com>
Date: Thu, 5 Dec 2024 17:37:45 -0500
Subject: [PATCH 05/13] minor cleanup

---
 src/level_gen.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/level_gen.cpp b/src/level_gen.cpp
index 0385bfd5..2d49e5ff 100755
--- a/src/level_gen.cpp
+++ b/src/level_gen.cpp
@@ -121,7 +121,7 @@ static inline Entity createAgent(Engine &ctx, const MapObject &agentInit) {
     auto agent = ctx.makeRenderableEntity<Agent>();
     auto agent_iface = ctx.get<AgentInterfaceEntity>(agent).e = ctx.makeEntity<AgentInterface>();
 
-    ctx.get<VehicleSize>(agent) = {.length = agentInit.vehicle_size.length, .width = agentInit.vehicle_size.width, .height = agentInit.vehicle_size.height};
+    ctx.get<VehicleSize>(agent) = agentInit.vehicle_size;
     ctx.get<Scale>(agent) = Diag3x3{.d0 = agentInit.vehicle_size.length/2, .d1 = agentInit.vehicle_size.width/2, .d2 = 1};
     ctx.get<Scale>(agent) *= consts::vehicleLengthScale;
     ctx.get<ObjectID>(agent) = ObjectID{(int32_t)SimObject::Agent};

From 070e76375356b88a27de9c27e402915120a634a2 Mon Sep 17 00:00:00 2001
From: kevin <kevinwinston184@gmail.com>
Date: Fri, 6 Dec 2024 15:53:00 -0500
Subject: [PATCH 06/13] new dataset yay

---
 README.md | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 86ab6757..81572345 100644
--- a/README.md
+++ b/README.md
@@ -211,16 +211,41 @@ We are open-sourcing a policy trained on 1,000 randomly sampled scenarios. You c
 
 ### Download the dataset
 
-Two versions of the dataset are available:
+- Two versions of the dataset are available, a mini version with a 1000 training files and 300 test/validation files, and the full sized dataset with over a 100k unique scenes. 
+- Replace 'GPUDrive_mini' with 'GPUDrive' below if you wish to download the full dataset.)
 
-- a mini-one that is about 1 GB and consists of 1000 training files and 100 validation / test files at: [Dropbox Link](https://www.dropbox.com/sh/8mxue9rdoizen3h/AADGRrHYBb86pZvDnHplDGvXa?dl=0).
-- the full dataset (150 GB) and consists of 134453 training files and 12205 validation / test files: [Dropbox Link](https://www.dropbox.com/sh/wv75pjd8phxizj3/AABfNPWfjQdoTWvdVxsAjUL_a?dl=0)
+Option 1: You can download the dataset programmatically using the Hugging Face `datasets` library:
+```python
+from datasets import load_dataset
+dataset = load_dataset("EMERGE-lab/GPUDrive_mini", cache_dir="data/processed") #OR path/to/your/dir
+```
+Option 2: Use the huggingface-cli:
+
+1. First, install the Hugging Face CLI:
+```bash
+pip install huggingface_hub
+```
+
+2. Log in to your Hugging Face account:
+```bash
+huggingface-cli login
+```
+
+3. Download the dataset:
+```bash
+huggingface-cli download EMERGE-lab/GPUDrive_mini --local-dir data/processed #OR path/to/your/dir
+```
+
+Option 3: Manual Download:
+
+1. Visit https://huggingface.co/datasets/EMERGE-lab/GPUDrive
+2. Navigate to the Files and versions tab.
+3. Download the desired files/directories.
 
-The simulator supports initializing scenes from the `Nocturne` dataset. The input parameter for the simulator `json_path` takes in a path to a directory containing the files in the Nocturne format. The `SceneConfig` dataclass in `pygpudrive/env/config.py` dataclass is used to configure how scenes are selected from a folder with traffic scenarios.
 
 ### Re-building the dataset
 
-GPUDrive is compatible with the complete [Waymo Open Motion Dataset](https://github.com/waymo-research/waymo-open-dataset), which contains over 100,000 scenarios. To download new files and create scenarios for the simulator, follow these three steps.
+GPUDrive is compatible with the complete [Waymo Open Motion Dataset](https://github.com/waymo-research/waymo-open-dataset), which contains over 100,000 scenarios. To download new files and create scenarios for the simulator, follow these three steps. (Note: you would only need to do this if there is a newer version of the Waymo dataset that you'd like to test.)
 
 1. First, head to [https://waymo.com/open/](https://waymo.com/open/) and click on the "download" button a the top. After registering, click on the files from `v1.2.1 March 2024`, the newest version of the dataset at the time of wrting (10/2024). This will lead you a Google Cloud page. From here, you should see a folder structure like this:
 

From 2b4ffa9d37bea0a26a45d8a1fc24e53883dc7e15 Mon Sep 17 00:00:00 2001
From: kevin <kevinwinston184@gmail.com>
Date: Fri, 6 Dec 2024 16:22:24 -0500
Subject: [PATCH 07/13] fixed downloads and added links

---
 README.md | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 81572345..fb1bc873 100644
--- a/README.md
+++ b/README.md
@@ -211,41 +211,41 @@ We are open-sourcing a policy trained on 1,000 randomly sampled scenarios. You c
 
 ### Download the dataset
 
-- Two versions of the dataset are available, a mini version with a 1000 training files and 300 test/validation files, and the full sized dataset with over a 100k unique scenes. 
-- Replace 'GPUDrive_mini' with 'GPUDrive' below if you wish to download the full dataset.)
+- Two versions of the dataset are available, a [mini version](https://huggingface.co/datasets/EMERGE-lab/GPUDrive_mini) with a 1000 training files and 300 test/validation files, and the [full sized dataset](https://huggingface.co/datasets/EMERGE-lab/GPUDrive) with over a 100k unique scenes. 
+- Replace 'GPUDrive_mini' with 'GPUDrive' below if you wish to download the full dataset.
+- To download the dataset you need the huggingface_hub library:
+```bash
+pip install huggingface_hub
+```
+Then you can download the dataset using either Python or the `huggingface-cli` tool.
 
-Option 1: You can download the dataset programmatically using the Hugging Face `datasets` library:
+Option 1: Using Python:
 ```python
-from datasets import load_dataset
-dataset = load_dataset("EMERGE-lab/GPUDrive_mini", cache_dir="data/processed") #OR path/to/your/dir
+>>> from huggingface_hub import snapshot_download
+>>> snapshot_download(repo_id="EMERGE-lab/GPUDrive_mini", repo_type="dataset", local_dir="data/processed")
 ```
 Option 2: Use the huggingface-cli:
 
-1. First, install the Hugging Face CLI:
-```bash
-pip install huggingface_hub
-```
-
-2. Log in to your Hugging Face account:
+1. Log in to your Hugging Face account:
 ```bash
 huggingface-cli login
 ```
 
-3. Download the dataset:
+2. Download the dataset:
 ```bash
-huggingface-cli download EMERGE-lab/GPUDrive_mini --local-dir data/processed #OR path/to/your/dir
+huggingface-cli download EMERGE-lab/GPUDrive_mini --local-dir data/processed --repo-type "dataset"
 ```
 
 Option 3: Manual Download:
 
-1. Visit https://huggingface.co/datasets/EMERGE-lab/GPUDrive
+1. Visit https://huggingface.co/datasets/EMERGE-lab/GPUDrive_mini
 2. Navigate to the Files and versions tab.
 3. Download the desired files/directories.
 
 
 ### Re-building the dataset
 
-GPUDrive is compatible with the complete [Waymo Open Motion Dataset](https://github.com/waymo-research/waymo-open-dataset), which contains over 100,000 scenarios. To download new files and create scenarios for the simulator, follow these three steps. (Note: you would only need to do this if there is a newer version of the Waymo dataset that you'd like to test.)
+If you wish to manually generate the dataset, GPUDrive is compatible with the complete [Waymo Open Motion Dataset](https://github.com/waymo-research/waymo-open-dataset), which contains over 100,000 scenarios. To download new files and create scenarios for the simulator, follow these three steps.
 
 1. First, head to [https://waymo.com/open/](https://waymo.com/open/) and click on the "download" button a the top. After registering, click on the files from `v1.2.1 March 2024`, the newest version of the dataset at the time of wrting (10/2024). This will lead you a Google Cloud page. From here, you should see a folder structure like this:
 

From a8e4bd2d12cf2a5f7dbd055fd49deb2c2052c5f1 Mon Sep 17 00:00:00 2001
From: kevin <kevinwinston184@gmail.com>
Date: Sun, 8 Dec 2024 17:04:41 -0500
Subject: [PATCH 08/13] extract script for large dataset

---
 README.md                    |   5 ++
 data_utils/extract_groups.py | 119 +++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 data_utils/extract_groups.py

diff --git a/README.md b/README.md
index fb1bc873..8e1653f8 100644
--- a/README.md
+++ b/README.md
@@ -242,6 +242,11 @@ Option 3: Manual Download:
 2. Navigate to the Files and versions tab.
 3. Download the desired files/directories.
 
+_NOTE_: If you downloaded the full-sized dataset, its files are grouped into subdirectories of 10k files each (due to Hugging Face constraints). For the path to work with GPUDrive, you need to run:
+```bash
+python data_utils/extract_groups.py #use --help if you've used a custom download path
+```
+
 
 ### Re-building the dataset
 
diff --git a/data_utils/extract_groups.py b/data_utils/extract_groups.py
new file mode 100644
index 00000000..a43e950b
--- /dev/null
+++ b/data_utils/extract_groups.py
@@ -0,0 +1,119 @@
+import argparse
+import shutil
+from pathlib import Path
+from multiprocessing import Pool, cpu_count
+import tqdm
+
+def move_file(args):
+    """Relocate one file into a destination directory, keeping its file name.
+
+    Args:
+        args (tuple): (source_path, target_dir)
+    Returns:
+        str: The original path of the file, as a string.
+    """
+    src, dest_dir = args
+    shutil.move(str(src), str(Path(dest_dir) / src.name))
+    return str(src)
+
+def extract_groups(dataset_dir, num_workers=None):
+    """
+    Extract all files from group directories back to the parent directory using parallel processing.
+    
+    Args:
+        dataset_dir (str): Path to the dataset directory containing group folders
+        num_workers (int, optional): Number of processes to use. Defaults to CPU count.
+
+    Raises:
+        ValueError: If dataset_dir is not an existing directory.
+    """
+    dataset_path = Path(dataset_dir)
+    if not dataset_path.is_dir():
+        raise ValueError(f"Directory {dataset_dir} does not exist")
+
+    # Find all group directories
+    group_dirs = [d for d in dataset_path.iterdir()
+                  if d.is_dir() and d.name.startswith("group_")]
+    if not group_dirs:
+        print(f"No group directories found in {dataset_dir}!")
+        return
+
+    print(f"\nProcessing {dataset_dir}")
+    print(f"Found {len(group_dirs)} group directories")
+
+    # Collect (file, destination) pairs for every JSON file in the group folders
+    all_files = []
+    for group_dir in sorted(group_dirs):
+        all_files.extend((file, dataset_path) for file in group_dir.glob("*.json"))
+
+    total_files = len(all_files)
+    print(f"Total files to process: {total_files}")
+
+    # Use all available CPUs if num_workers is not specified
+    if num_workers is None:
+        num_workers = cpu_count()
+
+    # Move files in parallel; list() drains the iterator so tqdm shows progress
+    with Pool(processes=num_workers) as pool:
+        moves = pool.imap_unordered(move_file, all_files)
+        list(tqdm.tqdm(moves, total=total_files, desc=f"Moving files from {dataset_dir}"))
+
+    # Remove emptied group directories. Path.rmdir() raises OSError if entries remain
+    # (e.g. non-JSON files); report those instead of failing after a successful move.
+    for group_dir in group_dirs:
+        try:
+            group_dir.rmdir()
+        except OSError as e:
+            print(f"Could not remove {group_dir}: {e}")
+
+    print(f"Completed {dataset_dir}")
+    print(f"Total files processed: {total_files}")
+
+def process_default_directory(num_workers=None):
+    """
+    Process the default dataset directory (data/processed/training).
+
+    Errors raised by extract_groups propagate to the caller so that main()
+    can report them and exit with a non-zero status.
+
+    Args:
+        num_workers (int, optional): Number of processes to use.
+            Defaults to the CPU count when None.
+    """
+    default_dir = "data/processed/training"
+    extract_groups(default_dir, num_workers)
+
+def main():
+    """Command-line entry point.
+
+    Parses an optional dataset directory and worker count, then extracts the
+    group folders of that directory, or of the default one when none is given.
+
+    Returns:
+        int: 0 on success, 1 if an exception propagates out of the extraction call.
+    """
+    cli = argparse.ArgumentParser(
+        description="Extract files from group directory back to parent directory in parallel. "
+                  "If no directory is specified, processes data/processed/training by default."
+    )
+    # nargs="?" makes the positional argument optional
+    cli.add_argument("dataset_dir", nargs="?",
+                     help="Path to the dataset directory containing group folders")
+    cli.add_argument("--num_workers", type=int, default=None,
+                     help="Number of processes to use (defaults to number of CPU cores)")
+    opts = cli.parse_args()
+
+    try:
+        if not opts.dataset_dir:
+            # No directory given: fall back to the default location
+            process_default_directory(opts.num_workers)
+        else:
+            # Process the single directory named on the command line
+            extract_groups(opts.dataset_dir, opts.num_workers)
+    except Exception as e:
+        print(f"Error: {e}")
+        return 1
+    return 0
+
+if __name__ == "__main__":
+    exit(main())
\ No newline at end of file

From 098738cccbe8e96f2230fc8881da87c2f9810504 Mon Sep 17 00:00:00 2001
From: kevin <kevinwinston184@gmail.com>
Date: Mon, 9 Dec 2024 11:42:49 -0500
Subject: [PATCH 09/13] added hf to env

---
 environment.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 80666378..9aed3d7c 100644
--- a/environment.yml
+++ b/environment.yml
@@ -61,4 +61,5 @@ dependencies:
       - trove-classifiers==2024.3.25
       - urllib3==2.2.1
       - virtualenv==20.25.1
-      - zipp==3.18.1
\ No newline at end of file
+      - zipp==3.18.1
+      - huggingface_hub==0.26.5
\ No newline at end of file

From 12f9d9f9ac4974ab21b1d9b932c2afc3b18f0d63 Mon Sep 17 00:00:00 2001
From: kevin <kevinwinston184@gmail.com>
Date: Mon, 9 Dec 2024 11:47:51 -0500
Subject: [PATCH 10/13] update dataset size

---
 README.md                    | 4 ++--
 data_utils/extract_groups.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 8e1653f8..41944897 100644
--- a/README.md
+++ b/README.md
@@ -211,7 +211,7 @@ We are open-sourcing a policy trained on 1,000 randomly sampled scenarios. You c
 
 ### Download the dataset
 
-- Two versions of the dataset are available, a [mini version](https://huggingface.co/datasets/EMERGE-lab/GPUDrive_mini) with a 1000 training files and 300 test/validation files, and the [full sized dataset](https://huggingface.co/datasets/EMERGE-lab/GPUDrive) with over a 100k unique scenes. 
+- Two versions of the dataset are available, a [mini version](https://huggingface.co/datasets/EMERGE-lab/GPUDrive_mini) with a 1000 training files and 300 test/validation files, and a [large dataset](https://huggingface.co/datasets/EMERGE-lab/GPUDrive) with 100k unique scenes. 
 - Replace 'GPUDrive_mini' with 'GPUDrive' below if you wish to download the full dataset.
 - To download the dataset you need the huggingface_hub library:
 ```bash
@@ -250,7 +250,7 @@ python data_utils/extract_groups.py #use --help if you've used a custom download
 
 ### Re-building the dataset
 
-If you wish to manually generate the dataset, GPUDrive is compatible with the complete [Waymo Open Motion Dataset](https://github.com/waymo-research/waymo-open-dataset), which contains over 100,000 scenarios. To download new files and create scenarios for the simulator, follow these three steps.
+If you wish to manually generate the dataset, GPUDrive is compatible with the complete [Waymo Open Motion Dataset](https://github.com/waymo-research/waymo-open-dataset), which contains well over 100,000 scenarios. To download new files and create scenarios for the simulator, follow these three steps.
 
 1. First, head to [https://waymo.com/open/](https://waymo.com/open/) and click on the "download" button a the top. After registering, click on the files from `v1.2.1 March 2024`, the newest version of the dataset at the time of wrting (10/2024). This will lead you a Google Cloud page. From here, you should see a folder structure like this:
 
diff --git a/data_utils/extract_groups.py b/data_utils/extract_groups.py
index a43e950b..1d28a435 100644
--- a/data_utils/extract_groups.py
+++ b/data_utils/extract_groups.py
@@ -18,7 +18,7 @@ def move_file(args):
 
 def extract_groups(dataset_dir, num_workers=None):
     """
-    Extract all files from group directories back to the parent directory using parallel processing.
+    Extract all files from group directories back to the parent directory in parallel.
     
     Args:
         dataset_dir (str): Path to the dataset directory containing group folders

From 6b45a14f127fb04e9dc12bf8aff9ce7dbdad184f Mon Sep 17 00:00:00 2001
From: kevin <kevinwinston184@gmail.com>
Date: Mon, 9 Dec 2024 12:05:17 -0500
Subject: [PATCH 11/13] added hf to env

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 41944897..52fe0007 100644
--- a/README.md
+++ b/README.md
@@ -213,7 +213,7 @@ We are open-sourcing a policy trained on 1,000 randomly sampled scenarios. You c
 
 - Two versions of the dataset are available, a [mini version](https://huggingface.co/datasets/EMERGE-lab/GPUDrive_mini) with a 1000 training files and 300 test/validation files, and a [large dataset](https://huggingface.co/datasets/EMERGE-lab/GPUDrive) with 100k unique scenes. 
 - Replace 'GPUDrive_mini' with 'GPUDrive' below if you wish to download the full dataset.
-- To download the dataset you need the huggingface_hub library:
+- To download the dataset you need the huggingface_hub library (you can skip this step if you created your environment from `environment.yml`):
 ```bash
 pip install huggingface_hub
 ```

From c7905960759788270351848d002fa860add72684 Mon Sep 17 00:00:00 2001
From: Daphne Cornelisse <cor.daphne@gmail.com>
Date: Mon, 9 Dec 2024 12:54:16 -0500
Subject: [PATCH 12/13] Fix typo

---
 pygpudrive/datatypes/observation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pygpudrive/datatypes/observation.py b/pygpudrive/datatypes/observation.py
index 2949c906..e2568d2b 100644
--- a/pygpudrive/datatypes/observation.py
+++ b/pygpudrive/datatypes/observation.py
@@ -187,7 +187,7 @@ def normalize(self):
         self.orientation = self.orientation / constants.MAX_ORIENTATION_RAD
         self.vehicle_length = self.vehicle_length / constants.MAX_VEH_LEN
         self.vehicle_width = self.vehicle_width / constants.MAX_VEH_WIDTH
-        self.vehicle_heights = self.vehicle_heights / constants.MAX_VEH_HEIGHT
+        self.vehicle_height = self.vehicle_height / constants.MAX_VEH_HEIGHT
         self.agent_type = self.agent_type.long()
         self.ids = self.ids
 

From f829adc7b3a42c57215cad11c935f4b1f846390b Mon Sep 17 00:00:00 2001
From: kevin <kevinwinston184@gmail.com>
Date: Mon, 9 Dec 2024 16:24:15 -0500
Subject: [PATCH 13/13] minor docstring update

---
 pygpudrive/datatypes/observation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pygpudrive/datatypes/observation.py b/pygpudrive/datatypes/observation.py
index e2568d2b..6b1855c6 100644
--- a/pygpudrive/datatypes/observation.py
+++ b/pygpudrive/datatypes/observation.py
@@ -7,8 +7,8 @@
 
 class LocalEgoState:
     """A class to represent the ego state of the agent in relative coordinates.
-    Initialized from agent_roadmap_tensor (src/bindings). For details, see
-    `agentMapObservations` in src/types.hpp.
+    Initialized from self_observation_tensor (src/bindings). For details, see
+    `SelfObservation` in src/types.hpp.
 
     Attributes:
         speed: Speed of the agent in relative coordinates.
@@ -225,7 +225,7 @@ class LidarObs:
         - Axis 3 represents the lidar points per type, which can be configured in src/consts.hpp as `numLidarSamples`.
         - Axis 4 represents the depth, type and x, y, values of the lidar points.
     Initialized from lidar_tensor (src/bindings).
-    For details, see `LidarObservations` in src/types.hpp.
+    For details, see `Lidar` and `LidarSample` in src/types.hpp.
     """
 
     def __init__(self, lidar_tensor: torch.Tensor):