From 2a22cb2bad5291d8bcb47ba48a52342b139e216d Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 4 Feb 2025 17:40:32 +0100 Subject: [PATCH 1/5] speedup feature selection --- src/tabpfn/model/encoders.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/src/tabpfn/model/encoders.py b/src/tabpfn/model/encoders.py index 2ae5e698..ffede3c1 100644 --- a/src/tabpfn/model/encoders.py +++ b/src/tabpfn/model/encoders.py @@ -109,27 +109,25 @@ def select_features(x: torch.Tensor, sel: torch.Tensor) -> torch.Tensor: Returns: The tensor with selected features. """ - new_x = x.clone() - for B in range(x.shape[1]): - if x.shape[1] > 1: - new_x[:, B, :] = torch.cat( - [ - x[:, B, sel[B]], - torch.zeros( - x.shape[0], - x.shape[-1] - sel[B].sum(), - device=x.device, - dtype=x.dtype, - ), - ], - -1, - ) - else: - # If B == 1, we don't need to append zeros, as the number of features can change - new_x = x[:, :, sel[B]] + B, total_features = sel.shape + batch_size = x.shape[0] + + # If B == 1, we don't need to append zeros, as the number of features don't need to be fixed. + if B == 1: + return x[:, :, sel[0]] + + new_x = torch.zeros((batch_size, B, total_features), device=x.device, dtype=x.dtype) + + # For each block, compute the number of selected features. + sel_counts = sel.sum(dim=-1) # shape: (B,) + + for b in range(B): + s = int(sel_counts[b]) + if s > 0: + new_x[:, b, :s] = x[:, b, sel[b]] + return new_x - def remove_outliers( X: torch.Tensor, n_sigma: float = 4, From 6f9ce905c403d945779524b5b8d34fd438a3db78 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 4 Feb 2025 17:47:38 +0100 Subject: [PATCH 2/5] ruff --- src/tabpfn/model/encoders.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/tabpfn/model/encoders.py b/src/tabpfn/model/encoders.py index ffede3c1..7159fdab 100644 --- a/src/tabpfn/model/encoders.py +++ b/src/tabpfn/model/encoders.py @@ -115,19 +115,20 @@ def select_features(x: torch.Tensor, sel: torch.Tensor) -> torch.Tensor: # If B == 1, we don't need to append zeros, as the number of features don't need to be fixed. if B == 1: return x[:, :, sel[0]] - + new_x = torch.zeros((batch_size, B, total_features), device=x.device, dtype=x.dtype) - + # For each block, compute the number of selected features. sel_counts = sel.sum(dim=-1) # shape: (B,) - + for b in range(B): s = int(sel_counts[b]) if s > 0: new_x[:, b, :s] = x[:, b, sel[b]] - + return new_x + def remove_outliers( X: torch.Tensor, n_sigma: float = 4, From 41a5df7801f41bd4f1c57d71509b04ae7c1c54bf Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 4 Feb 2025 17:52:07 +0100 Subject: [PATCH 3/5] ruff --- src/tabpfn/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tabpfn/__init__.py b/src/tabpfn/__init__.py index b021a9e5..32029c76 100644 --- a/src/tabpfn/__init__.py +++ b/src/tabpfn/__init__.py @@ -1,6 +1,7 @@ +from importlib.metadata import version + from tabpfn.classifier import TabPFNClassifier from tabpfn.regressor import TabPFNRegressor -from importlib.metadata import version try: __version__ = version(__name__) From 3a5e0334566efcaec89994a65964663ba32d4011 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 4 Feb 2025 18:31:59 +0100 Subject: [PATCH 4/5] add doc --- src/tabpfn/model/encoders.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/tabpfn/model/encoders.py b/src/tabpfn/model/encoders.py index 7159fdab..8a311c61 100644 --- a/src/tabpfn/model/encoders.py +++ b/src/tabpfn/model/encoders.py @@ -100,25 +100,29 @@ def normalize_data( def select_features(x: torch.Tensor, sel: torch.Tensor) -> torch.Tensor: - """Select features from the input tensor based on the selection mask. + """Select features from the input tensor based on the selection mask, + and arrange them contiguously in the last dimension. + If batch size is bigger than 1, we pad the features with zeros to make the number of features fixed. Args: - x: The input tensor. - sel: The boolean selection mask indicating which features to keep. + x: The input tensor of shape (sequence_length, batch_size, total_features) + sel: The boolean selection mask indicating which features to keep of shape (batch_size, total_features) Returns: The tensor with selected features. + The shape is (sequence_length, batch_size, number_of_selected_features) if batch_size is 1. + The shape is (sequence_length, batch_size, total_features) if batch_size is greater than 1. """ B, total_features = sel.shape - batch_size = x.shape[0] + sequence_length = x.shape[0] # If B == 1, we don't need to append zeros, as the number of features don't need to be fixed. if B == 1: return x[:, :, sel[0]] - new_x = torch.zeros((batch_size, B, total_features), device=x.device, dtype=x.dtype) + new_x = torch.zeros((sequence_length, B, total_features), device=x.device, dtype=x.dtype) - # For each block, compute the number of selected features. + # For each batch, compute the number of selected features. sel_counts = sel.sum(dim=-1) # shape: (B,) for b in range(B): From dbfc9ea5779f1187af26be11925b30681e353df3 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 4 Feb 2025 18:36:08 +0100 Subject: [PATCH 5/5] ruff --- src/tabpfn/model/encoders.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/tabpfn/model/encoders.py b/src/tabpfn/model/encoders.py index 8a311c61..2f247539 100644 --- a/src/tabpfn/model/encoders.py +++ b/src/tabpfn/model/encoders.py @@ -120,7 +120,11 @@ def select_features(x: torch.Tensor, sel: torch.Tensor) -> torch.Tensor: if B == 1: return x[:, :, sel[0]] - new_x = torch.zeros((sequence_length, B, total_features), device=x.device, dtype=x.dtype) + new_x = torch.zeros( + (sequence_length, B, total_features), + device=x.device, + dtype=x.dtype, + ) # For each batch, compute the number of selected features. sel_counts = sel.sum(dim=-1) # shape: (B,)