
Commit

Merge branch 'main' into FixViewer
aaravpandya committed May 2, 2024
2 parents 145794d + 630ffbb commit cde55c4
Showing 35 changed files with 1,249 additions and 430 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/manual.yml
@@ -0,0 +1,34 @@
name: Run C++ tests

on:
  pull_request:

jobs:
  build-and-test:

    runs-on: ubuntu-latest
    defaults:
      run:
        shell: bash -l {0}
    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Install system dependencies
        run: |
          sudo apt update
          sudo apt install -y libx11-dev libxrandr-dev libxinerama-dev libxcursor-dev libxi-dev mesa-common-dev libc++1
      - name: Setup Conda environment
        uses: conda-incubator/setup-miniconda@v2
        with:
          activate-environment: gpudrive
          environment-file: environment.yml

      - name: Install dependencies and build the project with Poetry
        run: |
          poetry install
      - name: Run tests
        run: |
          cd build/
          ctest --rerun-failed --output-on-failure
8 changes: 4 additions & 4 deletions .gitignore
@@ -18,9 +18,9 @@

# Data
/formatted_json_v2_no_tl_train
/data_10
/data_100
/data_1000
/waymo_data_new



# Logging
/wandb
@@ -232,4 +232,4 @@ pyrightconfig.json

*~

# End of https://www.toptal.com/developers/gitignore/api/python,c++
# End of https://www.toptal.com/developers/gitignore/api/python,c++
71 changes: 42 additions & 29 deletions algorithms/ppo/sb3/callbacks.py
@@ -1,4 +1,6 @@
import numpy as np
import torch
import wandb
from stable_baselines3.common.callbacks import BaseCallback


@@ -7,10 +9,12 @@ class MultiAgentCallback(BaseCallback):

    def __init__(
        self,
        config,
        wandb_run=None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.config = config
        self.wandb_run = wandb_run

    def _on_training_start(self) -> None:
@@ -44,8 +48,13 @@ def _on_rollout_end(self) -> None:
            nan=0,
        )

        # TODO: note that this only works when we have a fixed number of agents
        num_controlled_agents = rewards.shape[1]
        # Get the total number of controlled agents we are controlling
        # The number of controllable agents is different per scenario
        num_controlled_agents = self.locals[
            "env"
        ]._get_sum_controlled_valid_agents

        print(f"num_controlled_agents: {num_controlled_agents}")

        # Number of episodes in the rollout
        num_episodes_in_rollout = (
@@ -61,45 +70,49 @@ def _on_rollout_end(self) -> None:
            / num_controlled_agents
        )

        # Rewards for each agent
        for agent_idx in range(num_controlled_agents):
            self.logger.record(
                f"rollout/avg_agent_rew{agent_idx}",
                rewards[:, agent_idx].sum() / num_episodes_in_rollout,
            )
        mean_reward_per_agent_per_episode = (
            rewards.sum() / num_episodes_in_rollout / num_controlled_agents
        )

        observations = (
            self.locals["rollout_buffer"].observations.cpu().detach().numpy()
        )

        num_episodes_in_rollout = np.nan_to_num(
            (
                self.locals["rollout_buffer"]
                .episode_starts.cpu()
                .detach()
                .numpy()
            ),
            nan=0,
        ).sum()

        self.logger.record("rollout/global_step", self.num_timesteps)
        self.logger.record(
            "rollout/num_episodes_in_rollout",
            num_episodes_in_rollout.item() / num_controlled_agents,
            num_episodes_in_rollout.item(),
        )
        self.logger.record("rollout/sum_reward", rewards.sum())
        self.logger.record(
            "rollout/avg_reward",
            (rewards.sum() / (num_episodes_in_rollout)).item(),
            "rollout/avg_reward", mean_reward_per_agent_per_episode.item()
        )

        self.logger.record("rollout/obs_max", observations.max())
        self.logger.record("rollout/obs_min", observations.min())

        # Get categorical max values
        self.logger.record("norm/speed_max", observations[:, :, 0].max())
        self.logger.record("norm/veh_len_max", observations[:, :, 1].max())
        self.logger.record("norm/veh_width_max", observations[:, :, 2].max())
        self.logger.record("norm/goal_coord_x", observations[:, :, 3].max())
        self.logger.record("norm/goal_coord_y", observations[:, :, 4].max())
        self.logger.record("norm/L2_norm_to_goal", observations[:, :, 5].max())
        # Render the environment
        if self.config.render:
            self._create_and_log_video()

    def _create_and_log_video(self):
        """Make a video and log it to wandb.
        Note: Currently only works for a single world."""
        policy = self.model
        env = self.locals["env"]

        obs = env.reset()
        frames = []

        for _ in range(90):

            action, _ = policy.predict(obs.detach().cpu().numpy())

            # Step the environment
            obs, _, _, _ = env.step(action)

            frame = env.render()
            frames.append(frame.T)

        frames = np.array(frames)

        wandb.log({"video": wandb.Video(frames, fps=5, format="gif")})
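Aside: the reward aggregation that _on_rollout_end now logs as rollout/avg_reward can be sketched in isolation with dummy NumPy arrays standing in for the SB3 rollout buffer tensors. The shapes, the fixed episode length, and the assumption that episodes are counted from the episode_starts flags averaged over agents (the exact expression is collapsed between the hunks above) are illustrative, not taken from the repository:

import numpy as np

# Dummy stand-ins for the rollout buffer tensors (rollout length x controlled agents).
rewards = np.nan_to_num(np.random.randn(2048, 10), nan=0)
episode_starts = np.zeros((2048, 10))
episode_starts[::91, :] = 1.0  # assume a new episode begins every 91 steps

num_controlled_agents = rewards.shape[1]  # per-scenario in the real environment
num_episodes_in_rollout = episode_starts.sum() / num_controlled_agents

# The quantity logged as "rollout/avg_reward" in the callback above.
mean_reward_per_agent_per_episode = (
    rewards.sum() / num_episodes_in_rollout / num_controlled_agents
)
print(mean_reward_per_agent_per_episode)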
58 changes: 48 additions & 10 deletions algorithms/ppo/sb3/mappo.py
@@ -81,18 +81,56 @@ def collect_rollouts(
            with torch.no_grad():
                obs_tensor = self._last_obs

                # # EDIT_1: Mask out invalid observations (NaN dimensions and/or dead agents)
                # # Create dummy actions, values and log_probs (NaN)
                # actions = torch.full(fill_value=float('nan'), size=(self.n_envs,)).to(self.device)
                # log_probs = torch.full(fill_value=float('nan'), size=(self.n_envs,), dtype=torch.float32).to(self.device)
                # values = (
                #     torch.full(fill_value=float('nan'), size=(self.n_envs,), dtype=torch.float32)
                #     .unsqueeze(dim=1)
                #     .to(self.device)
                # )
                # TODO: Check
                # EDIT_1: Mask out invalid observations (NaN axes and/or dead agents)
                # Create dummy actions, values and log_probs (NaN)
                actions = torch.full(
                    fill_value=float("nan"), size=(self.n_envs,)
                ).to(self.device)
                log_probs = torch.full(
                    fill_value=float("nan"),
                    size=(self.n_envs,),
                    dtype=torch.float32,
                ).to(self.device)
                values = (
                    torch.full(
                        fill_value=float("nan"),
                        size=(self.n_envs,),
                        dtype=torch.float32,
                    )
                    .unsqueeze(dim=1)
                    .to(self.device)
                )

                # Get indices of alive agent ids
                # Convert env_dead_agent_mask to boolean tensor with the same shape as obs_tensor
                alive_agent_mask = ~(
                    env.dead_agent_mask.reshape(env.num_envs, 1)
                )  # .expand_as(obs_tensor)

                # Use boolean indexing to select elements in obs_tensor
                obs_tensor_alive = obs_tensor[
                    alive_agent_mask.expand_as(obs_tensor)
                ].reshape(-1, obs_tensor.shape[-1])

                # Predict actions, vals and log_probs given obs
                actions_tmp, values_tmp, log_prob_tmp = self.policy(
                    obs_tensor_alive
                )

                # Store
                (
                    actions[alive_agent_mask.squeeze(dim=1)],
                    values[alive_agent_mask.squeeze(dim=1)],
                    log_probs[alive_agent_mask.squeeze(dim=1)],
                ) = (
                    actions_tmp.float(),
                    values_tmp.float(),
                    log_prob_tmp.float(),
                )

                # Predict actions, vals and log_probs given obs
                actions, values, log_probs = self.policy(obs_tensor)
                # actions, values, log_probs = self.policy(obs_tensor)

            # Rescale and perform action
            clipped_actions = actions
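Aside: the masking pattern introduced above (allocate NaN placeholders for every agent slot, run the policy only on observations of agents that are still alive, then write the results back through the boolean mask) can be sketched in isolation as follows. The linear layer, tensor shapes, and the hand-written dead_agent_mask are placeholders for illustration; only the indexing pattern mirrors the diff:

import torch

num_envs, obs_dim = 6, 4
policy = torch.nn.Linear(obs_dim, 1)  # dummy stand-in for self.policy
obs_tensor = torch.randn(num_envs, obs_dim)
dead_agent_mask = torch.tensor([False, True, False, False, True, False])

# NaN placeholders so slots of dead agents keep a well-defined (ignorable) value.
values = torch.full((num_envs, 1), float("nan"))

# Boolean mask of alive agents, shaped to broadcast against the observations.
alive_agent_mask = ~dead_agent_mask.reshape(num_envs, 1)

# Select only the rows belonging to alive agents ...
obs_alive = obs_tensor[alive_agent_mask.expand_as(obs_tensor)].reshape(
    -1, obs_tensor.shape[-1]
)

# ... evaluate the policy on the smaller batch ...
with torch.no_grad():
    values_alive = policy(obs_alive)

# ... and scatter the results back into the full-size tensor.
values[alive_agent_mask.squeeze(dim=1)] = values_alive
print(values)

Keeping the placeholders NaN rather than zero makes it easy to notice downstream if a dead agent's slot is ever consumed by mistake, since NaN propagates through arithmetic.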
File renamed without changes.
23 changes: 23 additions & 0 deletions baselines/config.py
@@ -0,0 +1,23 @@

from dataclasses import dataclass
import torch

@dataclass
class ExperimentConfig:
"""
Configurations for experiments.
"""
# Rendering options
render: bool = False
render_mode: str = "rgb_array"
render_freq: int = 1

# TODO: Logging
log_dir: str = "logs"

# Hyperparameters
policy: str = "MlpPolicy"
seed: int = 42
n_steps: int = 2048
batch_size: int = 256
verbose: int = 0
39 changes: 25 additions & 14 deletions experiments/run_ppo_sb3.py → baselines/run_ppo_sb3.py
@@ -11,49 +11,60 @@
# Import adapted PPO version
from algorithms.ppo.sb3.mappo import MAPPO

from baselines.config import ExperimentConfig

if __name__ == "__main__":

    config = EnvConfig()
    env_config = EnvConfig(
        ego_state=True,
        road_map_obs=False,
        partner_obs=True,
        normalize_obs=False,
    )

    exp_config = ExperimentConfig(
        render=False,
    )

    # Make SB3-compatible environment
    env = SB3MultiAgentEnv(
        config=config,
        config=env_config,
        num_worlds=2,
        max_cont_agents=3,
        max_cont_agents=10,
        data_dir="waymo_data",
        device="cuda",
    )

    # Initialize wandb
    wandb.login()
    run = wandb.init(
        project="please_drive",
        group="single_agent_sparse",
        project="rl_benchmarking",
        group="different_scenes",
        sync_tensorboard=True,
    )
    run_id = run.id

    # Initialize custom callback
    custom_callback = MultiAgentCallback(
        config=exp_config,
        wandb_run=run if run_id is not None else None,
    )

    model = MAPPO(
        policy="MlpPolicy",  # Policy type
        n_steps=2048,  # Number of steps per rollout
        batch_size=256,  # Minibatch size
        env=env,  # Our wrapped environment
        seed=42,  # Always seed for reproducibility
        verbose=0,
        policy=exp_config.policy,
        n_steps=exp_config.n_steps,
        batch_size=exp_config.batch_size,
        env=env,
        seed=exp_config.seed,
        verbose=exp_config.verbose,
        tensorboard_log=f"runs/{run_id}"
        if run_id is not None
        else None,  # Sync with wandb
    )

    # Learn
    model.learn(
        total_timesteps=3_000_000,
        total_timesteps=10_000_000,
        callback=custom_callback,
    )

    run.finish()
    env.close()
File renamed without changes.
