diff --git a/litgpt/adapter.py b/litgpt/adapter.py
index bef77ece1b..bc095a3ca9 100644
--- a/litgpt/adapter.py
+++ b/litgpt/adapter.py
@@ -9,7 +9,7 @@
 """
 
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Tuple
 
 import torch
 import torch.nn as nn
@@ -28,56 +28,27 @@ class Config(BaseConfig):
 
 
 class GPT(BaseModel):
-    """The implementation is identical to `litgpt.model.GPT` with the exception that
-    the `Block` saves the layer index and passes it down to the attention layer."""
-
+    # Copy & paste from :class:`model.GPT`. Note that :class:`Block` is new here.
     def __init__(self, config: Config) -> None:
         nn.Module.__init__(self)
         assert config.padded_vocab_size is not None
         self.config = config
-        self.lm_head = nn.Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias)
+        self.lm_head = nn.Linear(
+            config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias
+        )
         self.transformer = nn.ModuleDict(
             dict(
                 wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
-                h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)),
+                h=nn.ModuleList(
+                    Block(config, block_idx)
+                    for block_idx in range(config.n_layer)
+                ),
                 ln_f=config.norm_class(config.n_embd, eps=config.norm_eps),
             )
         )
-        self.max_seq_length = self.config.block_size
         self.mask_cache: Optional[torch.Tensor] = None
-
-    def forward(
-        self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None, lm_head_chunk_size: int = 0
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
-        T = idx.size(1)
-        if self.max_seq_length < T:
-            raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.")
-
-        if input_pos is not None:  # use the kv cache
-            cos = self.cos.index_select(0, input_pos)
-            sin = self.sin.index_select(0, input_pos)
-            if self.mask_cache is None:
-                raise TypeError("You need to call `gpt.set_kv_cache()`")
-            mask = self.mask_cache.index_select(2, input_pos)
-        else:
-            cos = self.cos[:T]
-            sin = self.sin[:T]
-            mask = None
-
-        x = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
-        if self.config.scale_embeddings:
-            x = x * (self.config.n_embd**0.5)
-        for block in self.transformer.h:
-            x = block(x, cos, sin, mask, input_pos)
-        x = self.transformer.ln_f(x)
-        if lm_head_chunk_size > 0:
-            # chunk the lm head logits to reduce the peak memory used by autograd
-            return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)]
-        x = self.lm_head(x)  # (b, t, vocab_size)
-        if self.config.final_logit_softcapping is not None:
-            x = torch.tanh(x / self.config.final_logit_softcapping) * self.config.final_logit_softcapping
-        return x
+        self.max_seq_length = self.config.block_size
 
     @classmethod
     def from_name(cls, name: str, **kwargs: Any) -> Self:
@@ -91,30 +62,9 @@ def _init_weights(self, module: nn.Module) -> None:
 
 
 class Block(BaseBlock):
-    """The implementation is identical to `litgpt.model.Block` with the exception that
-    we replace the attention layer where adaption is implemented."""
-
     def __init__(self, config: Config, block_idx: int) -> None:
-        # Skip the parent class __init__ altogether and replace it to avoid useless allocations
-        nn.Module.__init__(self)
-        if not config.parallel_residual and config.shared_attention_norm:
-            raise NotImplementedError(
-                "No checkpoint amongst the ones we support uses this configuration:"
-                " non-parallel residual and shared attention norm."
-            )
-        self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps)
+        super().__init__(config, block_idx)
         self.attn = CausalSelfAttention(config, block_idx)
-        self.post_attention_norm = (
-            config.norm_class(config.n_embd, eps=config.norm_eps) if config.post_attention_norm else nn.Identity()
-        )
-        self.norm_2 = None if config.shared_attention_norm else config.norm_class(config.n_embd, eps=config.norm_eps)
-        self.mlp = config.mlp_class(config)
-        self.post_mlp_norm = (
-            config.norm_class(config.n_embd, eps=config.norm_eps) if config.post_mlp_norm else nn.Identity()
-        )
-
-        self.config = config
 
 
 class CausalSelfAttention(BaseCausalSelfAttention):
@@ -130,12 +80,6 @@ def __init__(self, config: Config, block_idx: int) -> None:
             self.gating_factor = torch.nn.Parameter(torch.zeros(1, 1, config.n_head, 1))
             # kv cache for inference
             self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
-        self.block_idx = block_idx
-        self.apply_sliding_window_attention = (
-            config.sliding_window_size is not None and
-            block_idx % config.sliding_window_layer_stride == 0
-        )
-        self.config = config
 
     def scaled_dot_product_attention(
         self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None
diff --git a/litgpt/adapter_v2.py b/litgpt/adapter_v2.py
index 9b975260f0..e7a203ba6d 100644
--- a/litgpt/adapter_v2.py
+++ b/litgpt/adapter_v2.py
@@ -9,7 +9,7 @@
 """
 
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, Type, Optional
 
 import torch
 import torch.nn as nn
@@ -17,10 +17,9 @@
 
 import litgpt
 from litgpt.adapter import GPT as BaseModel
-from litgpt.adapter import Block as BaseBlock
+from litgpt.model import Block as BaseBlock
 from litgpt.adapter import CausalSelfAttention as BaseCausalSelfAttention
 from litgpt.adapter import Config as BaseConfig
-from litgpt.model import KVCache
 from litgpt.scripts.convert_hf_checkpoint import qkv_reassemble
 from litgpt.utils import map_old_state_dict_weights
@@ -64,54 +63,27 @@ def reset_parameters(self) -> None:
 
 
 class GPT(BaseModel):
+    # Copy & paste from :class:`model.GPT`. Note that :class:`Block` is new here.
     def __init__(self, config: Config) -> None:
-        # Skip the parent class __init__ altogether and replace it to avoid useless allocations
         nn.Module.__init__(self)
         assert config.padded_vocab_size is not None
         self.config = config
-        self.lm_head = AdapterV2Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias)
+        self.lm_head = AdapterV2Linear(
+            config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias
+        )
         self.transformer = nn.ModuleDict(
             dict(
                 wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
-                h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)),
+                h=nn.ModuleList(
+                    Block(config, block_idx)
+                    for block_idx in range(config.n_layer)
+                ),
                 ln_f=config.norm_class(config.n_embd, eps=config.norm_eps),
             )
        )
-        self.max_seq_length = self.config.block_size
         self.mask_cache: Optional[torch.Tensor] = None
-
-    def forward(
-        self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None, lm_head_chunk_size: int = 0
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
-        T = idx.size(1)
-        if self.max_seq_length < T:
-            raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.")
-
-        if input_pos is not None:  # use the kv cache
-            cos = self.cos.index_select(0, input_pos)
-            sin = self.sin.index_select(0, input_pos)
-            if self.mask_cache is None:
-                raise TypeError("You need to call `gpt.set_kv_cache()`")
-            mask = self.mask_cache.index_select(2, input_pos)
-        else:
-            cos = self.cos[:T]
-            sin = self.sin[:T]
-            mask = None
-
-        x = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
-        if self.config.scale_embeddings:
-            x = x * (self.config.n_embd**0.5)
-        for block in self.transformer.h:
-            x = block(x, cos, sin, mask, input_pos)
-        x = self.transformer.ln_f(x)
-        if lm_head_chunk_size > 0:
-            # chunk the lm head logits to reduce the peak memory used by autograd
-            return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)]
-        x = self.lm_head(x)  # (b, t, vocab_size)
-        if self.config.final_logit_softcapping is not None:
-            x = torch.tanh(x / self.config.final_logit_softcapping) * self.config.final_logit_softcapping
-        return x
+        self.max_seq_length = self.config.block_size
 
     @classmethod
     def from_name(cls, name: str, **kwargs: Any) -> Self:
@@ -131,61 +103,30 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa
 
 
 class Block(BaseBlock):
-    """The implementation is identical to `litgpt.model.Block` with the exception that
-    we replace the attention layer where adaption is implemented."""
-
     def __init__(self, config: Config, block_idx: int) -> None:
-        # Skip the parent class __init__ altogether and replace it to avoid useless allocations
-        nn.Module.__init__(self)
-        if not config.parallel_residual and config.shared_attention_norm:
-            raise NotImplementedError(
-                "No checkpoint amongst the ones we support uses this configuration:"
-                " non-parallel residual and shared attention norm."
-            )
-        self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps)
+        super().__init__(config, block_idx)
         self.attn = CausalSelfAttention(config, block_idx)
-        self.post_attention_norm = (
-            config.norm_class(config.n_embd, eps=config.norm_eps) if config.post_attention_norm else nn.Identity()
-        )
-        self.norm_2 = None if config.shared_attention_norm else config.norm_class(config.n_embd, eps=config.norm_eps)
         self.mlp = config.mlp_class(config)
-        self.post_mlp_norm = (
-            config.norm_class(config.n_embd, eps=config.norm_eps) if config.post_mlp_norm else nn.Identity()
-        )
-
-        self.config = config
 
 
 class CausalSelfAttention(BaseCausalSelfAttention):
     """A modification of `litgpt.adapter.CausalSelfAttention` that uses the Adapter V2 Linear class"""
 
+    # Copy&paste from :class:`model.CausalSelfAttention`
     def __init__(self, config: Config, block_idx: int) -> None:
-        # Skip the parent class __init__ altogether and replace it to avoid useless allocations
-        nn.Module.__init__(self)
-        shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
+        super().__init__(config, block_idx)
         # key, query, value projections for all heads, but in a batch
-        self.qkv = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias or config.attn_bias)
+        shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
+        self.qkv = AdapterV2Linear(
+            in_features=config.n_embd,
+            out_features=shape,
+            bias=config.bias or config.attn_bias
+        )
         # output projection
-        # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head`
-        self.proj = AdapterV2Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias)
-        # disabled by default
-        self.kv_cache: Optional[KVCache] = None
-
-        if block_idx >= config.adapter_start_layer:
-            # adapter embedding layer
-            self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd)
-            # gate for adaption
-            self.gating_factor = torch.nn.Parameter(torch.zeros(1, 1, config.n_head, 1))
-            # kv cache for inference
-            self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
-        self.block_idx = block_idx
-        self.apply_sliding_window_attention = (
-            config.sliding_window_size is not None and
-            block_idx % config.sliding_window_layer_stride == 0
+        self.proj = AdapterV2Linear(
+            config.head_size * config.n_head, config.n_embd, bias=config.bias
         )
-        self.config = config
 
     def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
         """For compatibility with base and/or legacy checkpoints."""
         mapping = {
@@ -211,9 +152,12 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa
 class GptNeoxMLP(litgpt.model.GptNeoxMLP):
     def __init__(self, config: Config) -> None:
         nn.Module.__init__(self)
-        self.fc = AdapterV2Linear(config.n_embd, config.intermediate_size, bias=config.bias)
-        self.proj = AdapterV2Linear(config.intermediate_size, config.n_embd, bias=config.bias)
-
+        self.fc = AdapterV2Linear(
+            config.n_embd, config.intermediate_size, bias=config.bias
+        )
+        self.proj = AdapterV2Linear(
+            config.intermediate_size, config.n_embd, bias=config.bias
+        )
         self.config = config
 
     def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
@@ -231,10 +175,15 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa
 class LLaMAMLP(litgpt.model.LLaMAMLP):
     def __init__(self, config: Config) -> None:
         nn.Module.__init__(self)
-        self.fc_1 = AdapterV2Linear(config.n_embd, config.intermediate_size, bias=config.bias)
-        self.fc_2 = AdapterV2Linear(config.n_embd, config.intermediate_size, bias=config.bias)
-        self.proj = AdapterV2Linear(config.intermediate_size, config.n_embd, bias=config.bias)
-
+        self.fc_1 = AdapterV2Linear(
+            config.n_embd, config.intermediate_size, bias=config.bias
+        )
+        self.fc_2 = AdapterV2Linear(
+            config.n_embd, config.intermediate_size, bias=config.bias
+        )
+        self.proj = AdapterV2Linear(
+            config.intermediate_size, config.n_embd, bias=config.bias
+        )
         self.config = config
 
     def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
@@ -264,7 +213,6 @@ def __init__(self, config: Config) -> None:
         nn.Module.__init__(self)
         self.gate = AdapterV2Linear(config.n_embd, config.n_expert, bias=False)
         self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert))
-
         self.config = config
 
     def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
diff --git a/litgpt/generate/base.py b/litgpt/generate/base.py
index 866947beea..273fe7497d 100644
--- a/litgpt/generate/base.py
+++ b/litgpt/generate/base.py
@@ -4,7 +4,7 @@
 import time
 from pathlib import Path
 from pprint import pprint
-from typing import Any, Literal, Optional, Tuple, List, Union, Iterator
+from typing import Any, Literal, Optional, Tuple, List, Union, Iterator, Dict
 import warnings
 
 import lightning as L
@@ -73,15 +73,23 @@ def sample(
     return torch.argmax(logits, dim=-1, keepdim=True)
 
 
-def next_token(model: GPT, input_pos: torch.Tensor, x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
-    logits = model(x, input_pos)
-    _next = sample(logits, **kwargs).to(dtype=torch.int64)
+def next_token(
+    model: GPT,
+    input_pos: torch.Tensor,
+    x: torch.Tensor,
+    input_pos_maxp1: Optional[torch.Tensor] = None,
+    **sample_kwargs: Dict[str, Any],
+) -> torch.Tensor:
+    logits = model(x, input_pos, input_pos_maxp1=input_pos_maxp1)
+    _next = sample(logits, **sample_kwargs).to(dtype=torch.int64)
     return _next
 
+
 def batched_sample(logits: list[torch.Tensor], kwargs: list[dict]) -> torch.Tensor:
     assert len(logits) == len(kwargs), "logits and kwargs must have the same length."
     return torch.stack([sample(l, **sample_args).to(dtype=torch.int64) for sample_args, l in zip(kwargs, logits)], dim=0)
 
+
 def batched_next_token(model: GPT, input_pos: torch.Tensor, x: torch.Tensor, kwargs: Union[dict, list[dict]]) -> torch.Tensor:
     # Where:
     # input_pos is a 1d tensor of shape [seq_length...]
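Note on the `generate_fn` hunk that follows: `input_pos_maxp1` is kept equal to `max(input_pos) + 1` and advanced in lock-step with `input_pos`, so attention only reads the populated prefix of the KV cache instead of the full `max_seq_length`. A minimal sketch of that calling pattern (hypothetical driver code, not part of the patch; it assumes a litgpt `GPT` whose `set_kv_cache(1)` has already been called and a 1-D `prompt` tensor of token ids):

    import torch
    from litgpt.generate.base import next_token  # updated signature shown above

    def greedy_decode(model, prompt, max_new_tokens=20):
        device = prompt.device
        prompt_size = prompt.size(0)
        token = prompt
        input_pos = torch.arange(0, prompt_size, device=device, dtype=torch.int64)
        input_pos_maxp1 = torch.tensor(prompt_size, device=device)  # max(input_pos) + 1
        new_tokens = []
        for i in range(max_new_tokens):
            # the KV cache is sliced to [:input_pos_maxp1] inside the model
            token = next_token(model, input_pos, token.view(1, -1),
                               input_pos_maxp1=input_pos_maxp1, top_k=1)
            new_tokens.append(token)
            if i == 0:
                # after prefilling the prompt, switch to single-position decoding
                input_pos = torch.tensor([prompt_size], device=device, dtype=torch.int64)
            else:
                input_pos.add_(1)
            input_pos_maxp1.add_(1)
        return torch.cat(new_tokens, dim=-1)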
@@ -166,10 +174,19 @@ def generate_fn(
     token = prompt
     prefill_token = True
     input_pos = torch.arange(0, prompt_size, device=device, dtype=torch.int64)
+    input_pos_maxp1 = torch.tensor(prompt_size, device=device)
 
     for current_idx in range(max_returned_tokens - prompt_size):
         # Generate the token
-        token = next_token(model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k, top_p=top_p)
+        token = next_token(
+            model,
+            input_pos,
+            token.view(1, -1),
+            input_pos_maxp1=input_pos_maxp1,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+        )
         tokens.append(token)
         int_token = token.item()
 
@@ -205,6 +222,7 @@ def generate_fn(
             input_pos = torch.tensor([prompt_size], device=device, dtype=torch.int64)
         else:
             input_pos.add_(1)
+        input_pos_maxp1.add_(1)
 
     # Yield any remaining tokens
     if yielded_idx < len(tokens):
diff --git a/litgpt/lora.py b/litgpt/lora.py
index beca761c48..8144695aaf 100644
--- a/litgpt/lora.py
+++ b/litgpt/lora.py
@@ -45,7 +45,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, Tuple, Type, Union, Optional
 
 import torch
 import torch.nn as nn
@@ -481,60 +481,31 @@ def mlp_class(self) -> Type:
 
 
 class GPT(BaseModel):
+    # Copy & paste from :class:`model.GPT`. Note that :class:`Block` is new here.
    def __init__(self, config: Config) -> None:
         nn.Module.__init__(self)
         assert config.padded_vocab_size is not None
         self.config = config
-        self.lm_head = LoRALinear(
+        self.lm_head = create_lora_linear(
+            config,
             config.n_embd,
             config.padded_vocab_size,
             bias=config.lm_head_bias,
-            r=(config.lora_r if config.lora_head else 0),
-            lora_alpha=config.lora_alpha,
-            lora_dropout=config.lora_dropout,
+            use_r=config.lora_head,
         )
         self.transformer = nn.ModuleDict(
             dict(
                 wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
-                h=nn.ModuleList(Block(config, block_idx) for block_idx in range(config.n_layer)),
+                h=nn.ModuleList(
+                    Block(config, block_idx)
+                    for block_idx in range(config.n_layer)
+                ),
                 ln_f=config.norm_class(config.n_embd, eps=config.norm_eps),
             )
         )
-        self.max_seq_length = self.config.block_size
         self.mask_cache: Optional[torch.Tensor] = None
-
-    def forward(
-        self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None, lm_head_chunk_size: int = 0
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
-        T = idx.size(1)
-        if self.max_seq_length < T:
-            raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.")
-
-        if input_pos is not None:  # use the kv cache
-            cos = self.cos.index_select(0, input_pos)
-            sin = self.sin.index_select(0, input_pos)
-            if self.mask_cache is None:
-                raise TypeError("You need to call `gpt.set_kv_cache()`")
-            mask = self.mask_cache.index_select(2, input_pos)
-        else:
-            cos = self.cos[:T]
-            sin = self.sin[:T]
-            mask = None
-
-        x = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
-        if self.config.scale_embeddings:
-            x = x * (self.config.n_embd**0.5)
-        for block in self.transformer.h:
-            x = block(x, cos, sin, mask, input_pos)
-        x = self.transformer.ln_f(x)
-        if lm_head_chunk_size > 0:
-            # chunk the lm head logits to reduce the peak memory used by autograd
-            return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)]
-        x = self.lm_head(x)  # (b, t, vocab_size)
-        if self.config.final_logit_softcapping is not None:
-            x = torch.tanh(x / self.config.final_logit_softcapping) * self.config.final_logit_softcapping
-        return x
+        self.max_seq_length = self.config.block_size
 
     @classmethod
     def from_name(cls, name: str, **kwargs: Any) -> Self:
@@ -555,33 +526,16 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa
 
 class Block(BaseBlock):
     def __init__(self, config: Config, block_idx: int) -> None:
-        nn.Module.__init__(self)
-        if not config.parallel_residual and config.shared_attention_norm:
-            raise NotImplementedError(
-                "No checkpoint amongst the ones we support uses this configuration:"
-                " non-parallel residual and shared attention norm."
-            )
-        self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps)
+        super().__init__(config, block_idx)
         self.attn = CausalSelfAttention(config, block_idx)
-        self.post_attention_norm = (
-            config.norm_class(config.n_embd, eps=config.norm_eps) if config.post_attention_norm else nn.Identity()
-        )
-        self.norm_2 = None if config.shared_attention_norm else config.norm_class(config.n_embd, eps=config.norm_eps)
         self.mlp = config.mlp_class(config)
-        self.post_mlp_norm = (
-            config.norm_class(config.n_embd, eps=config.norm_eps) if config.post_mlp_norm else nn.Identity()
-        )
-
-        self.config = config
 
 
 class CausalSelfAttention(BaseCausalSelfAttention):
     def __init__(self, config: Config, block_idx: int) -> None:
-        # Skip the parent class __init__ altogether and replace it to avoid
-        # useless allocations
-        nn.Module.__init__(self)
-        shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
+        super().__init__(config, block_idx)
         # key, query, value projections for all heads, but in a batch
+        shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
         self.qkv = LoRAQKVLinear(
             in_features=config.n_embd,
             out_features=shape,
@@ -596,23 +550,12 @@ def __init__(self, config: Config, block_idx: int) -> None:
             n_query_groups=config.n_query_groups,
         )
         # output projection
-        # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head`
-        self.proj = LoRALinear(
+        self.proj = create_lora_linear(
+            config,
             config.head_size * config.n_head,
             config.n_embd,
-            bias=config.bias,
-            r=(config.lora_r if config.lora_projection else 0),
-            lora_alpha=config.lora_alpha,
-            lora_dropout=config.lora_dropout,
+            use_r=config.lora_projection,
         )
-        # disabled by default
-        self.kv_cache: Optional[KVCache] = None
-        self.apply_sliding_window_attention = (
-            config.sliding_window_size is not None and
-            block_idx % config.sliding_window_layer_stride == 0
-        )
-
-        self.config = config
 
     def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
         """For compatibility with base and/or legacy checkpoints."""
@@ -633,26 +576,36 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa
         super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
 
 
+def create_lora_linear(
+    config: Config,
+    in_size: int,
+    out_size: int,
+    bias: Optional[Union[float, bool]] = None,
+    use_r: Optional[bool] = None,
+) -> LoRALinear:
+    if bias is None:
+        bias = config.bias
+    if use_r is None:
+        use_r = config.lora_mlp
+    return LoRALinear(
+        in_size,
+        out_size,
+        bias=bias,
+        r=(config.lora_r if use_r else 0),
+        lora_alpha=config.lora_alpha,
+        lora_dropout=config.lora_dropout,
+    )
+
+
 class GptNeoxMLP(litgpt.model.GptNeoxMLP):
     def __init__(self, config: Config) -> None:
         nn.Module.__init__(self)
-        self.fc = LoRALinear(
-            config.n_embd,
-            config.intermediate_size,
-            bias=config.bias,
-            r=(config.lora_r if config.lora_mlp else 0),
-            lora_alpha=config.lora_alpha,
-            lora_dropout=config.lora_dropout,
+        self.fc = create_lora_linear(
+            config, config.n_embd, config.intermediate_size
         )
-        self.proj = LoRALinear(
-            config.intermediate_size,
-            config.n_embd,
-            bias=config.bias,
-            r=(config.lora_r if config.lora_mlp else 0),
-            lora_alpha=config.lora_alpha,
-            lora_dropout=config.lora_dropout,
+        self.proj = create_lora_linear(
+            config, config.intermediate_size, config.n_embd
         )
-
         self.config = config
 
     def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
@@ -670,31 +623,15 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa
 class LLaMAMLP(litgpt.model.LLaMAMLP):
     def __init__(self, config: Config) -> None:
         nn.Module.__init__(self)
-        self.fc_1 = LoRALinear(
-            config.n_embd,
-            config.intermediate_size,
-            bias=config.bias,
-            r=(config.lora_r if config.lora_mlp else 0),
-            lora_alpha=config.lora_alpha,
-            lora_dropout=config.lora_dropout,
+        self.fc_1 = create_lora_linear(
+            config, config.n_embd, config.intermediate_size
        )
-        self.fc_2 = LoRALinear(
-            config.n_embd,
-            config.intermediate_size,
-            bias=config.bias,
-            r=(config.lora_r if config.lora_mlp else 0),
-            lora_alpha=config.lora_alpha,
-            lora_dropout=config.lora_dropout,
+        self.fc_2 = create_lora_linear(
+            config, config.n_embd, config.intermediate_size
         )
-        self.proj = LoRALinear(
-            config.intermediate_size,
-            config.n_embd,
-            bias=config.bias,
-            r=(config.lora_r if config.lora_mlp else 0),
-            lora_alpha=config.lora_alpha,
-            lora_dropout=config.lora_dropout,
+        self.proj = create_lora_linear(
+            config, config.intermediate_size, config.n_embd
         )
-
         self.config = config
 
     def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
@@ -722,16 +659,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 class LLaMAMoE(litgpt.model.LLaMAMoE):
     def __init__(self, config: Config) -> None:
         nn.Module.__init__(self)
-        self.gate = LoRALinear(
-            config.n_embd,
-            config.n_expert,
-            bias=False,
-            r=(config.lora_r if config.lora_mlp else 0),
-            lora_alpha=config.lora_alpha,
-            lora_dropout=config.lora_dropout,
+        self.gate = create_lora_linear(
+            config, config.n_embd, config.n_expert, bias=False
         )
         self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert))
-
         self.config = config
 
     def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
diff --git a/litgpt/model.py b/litgpt/model.py
index cbdf2a4bdd..a89070d8bb 100644
--- a/litgpt/model.py
+++ b/litgpt/model.py
@@ -7,10 +7,12 @@
 """
 
 import math
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Optional, Tuple, Union, List
+from functools import partial
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from typing_extensions import Self
 
 from litgpt.config import Config
@@ -23,16 +25,21 @@ def __init__(self, config: Config) -> None:
         assert config.padded_vocab_size is not None
         self.config = config
 
-        self.lm_head = nn.Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias)
+        self.lm_head = nn.Linear(
+            config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias
+        )
         self.transformer = nn.ModuleDict(
             dict(
                 wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
-                h=nn.ModuleList(Block(config, block_idx) for block_idx in range(config.n_layer)),
+                h=nn.ModuleList(
+                    Block(config, block_idx)
+                    for block_idx in range(config.n_layer)
+                ),
                 ln_f=config.norm_class(config.n_embd, eps=config.norm_eps),
             )
         )
-        self.max_seq_length = self.config.block_size
         self.mask_cache: Optional[torch.Tensor] = None
+        self.max_seq_length = self.config.block_size
 
     @property
     def max_seq_length(self) -> int:
@@ -60,6 +67,8 @@ def max_seq_length(self, value: int) -> None:
             self.cos, self.sin = self.rope_cache(device=self.cos.device)
         # the mask and kv cache size will get updated on `set_kv_cache`. we cannot update it here because we don't know
         # if the kv cache is expected
+        if self.mask_cache is not None and self.mask_cache.shape[-1] < value:
+            print(f"Warning: KV cache has length {self.mask_cache.shape[-1]} < {value} = max_seq_length. Call 'set_kv_cache' before doing any forwards!")
 
     def reset_parameters(self) -> None:
         # Trigger resetting the rope-cache
@@ -74,21 +83,41 @@ def _init_weights(self, module: nn.Module) -> None:
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
 
-    def forward(self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def forward(
+        self,
+        idx: torch.Tensor,
+        input_pos: Optional[torch.Tensor] = None,
+        input_pos_maxp1: Optional[torch.Tensor] = None,
+        lm_head_chunk_size: int = 0,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
         """
+        If `input_pos` is provided, the KV cache uses K and V vectors for
+        positions smaller than entries in `input_pos`. For efficiency, pass
+        `input_pos_maxp1` as `max(input_pos) + 1` if already available from
+        your forward algorithm. This slices the KV cache buffers and speeds
+        up multi-head attention.
+
+        Without `input_pos_maxp1`, the computation uses the full KV cache
+        (`max_seq_length`) with masking applied. Note that inferring
+        `input_pos_maxp1` from `input_pos` causes graph breaks and prevents
+        compilation.
+
         Args:
-            idx (torch.Tensor): Input token indices, shape `(B, T)`
-            input_pos (torch.Tensor, optional): Contains input positions,
-                either with shape `(T,)` or `(B, T)`, if provided. This is used
-                for generative inference, where a KV cache is required. By
-                default, this assumes `input_dim == arange(T)` with all inputs
-                up to `T` provided upfront.
+            idx: Token indices of input sequences, shape `(B, T)`, where `B`
+                is batch size.
+            input_pos: Optional. Positions of input tokens. The default is
+                `arange(T)`. Can have shape `(T,)` or `(B, T)` (batched index).
+            input_pos_maxp1: Optional. See above.
+            lm_head_chunk_size: Optional. If `lm_head_chunk_size > 0`, the final
+                `lm_head` computation is done in chunks of this size.
 
         Returns:
-            torch.Tensor: Output (logits), shape `(B, T, config.padded_vocab_size)`
+            Logit outputs, shape `(B, T, config.padded_vocab_size)`. If
+            `lm_head_chunk_size > 0`, this is a list of chunks of shape
+            `(B, lm_head_chunk_size, config.padded_vocab_size)`, the final
+            entry can be shorter.
+
         """
-        if idx.dim() != 2:
-            raise ValueError(f"idx must have 2 dimensions, idx.shape = {idx.shape}")
         T = idx.size(1)
         if self.max_seq_length < T:
             raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.")
@@ -101,31 +130,49 @@ def forward(self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None) -
                 raise ValueError(f"input_pos.shape[-1] = {input_pos.shape[-1]} != {T} = idx.shape[1], must be the same")
             cos = batched_index_select(self.cos, 0, input_pos)
             sin = batched_index_select(self.sin, 0, input_pos)
+            if input_pos.dim() == 1:
+                cos = cos.unsqueeze(0)
+                sin = sin.unsqueeze(0)
             if self.mask_cache is None:
                 raise TypeError("You need to call `gpt.set_kv_cache()`")
             mask = batched_index_select(self.mask_cache, 2, input_pos)
             if mask.dim() > 4:
                 # the mask cache has a batch dim of 1 in addition to the one
                 # we get if input_pos has a batch dimension
-                mask = mask.squeeze(1)
+                mask = mask.view(*(mask.shape[0:1] + mask.shape[2:]))
+            if input_pos_maxp1 is not None:
+                # Shorten final dimension so it just covers all `input_pos` entries
+                if input_pos_maxp1 > self.max_seq_length:
+                    raise ValueError(f"Positions in 'input_pos' must be in [0,{self.max_seq_length})")
+                mask = mask[..., :input_pos_maxp1]
         else:
             # unsqueeze to have a batch dimension
             cos = self.cos[:T].unsqueeze(0)
             sin = self.sin[:T].unsqueeze(0)
             # `cos`, `sin` have shape (1, T, config.rope_n_elem)
             mask = None  # defaults to causal mask
+            input_pos_maxp1 = None
 
         x = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
         if self.config.scale_embeddings:
-            x = x * torch.tensor(self.config.n_embd**0.5, dtype=x.dtype)
+            x = x * torch.tensor(self.config.n_embd ** 0.5, dtype=x.dtype)
 
         for block in self.transformer.h:
-            x = block(x, cos, sin, mask, input_pos)
+            x = block(x, cos, sin, mask, input_pos, input_pos_maxp1)
         x = self.transformer.ln_f(x)
-        x = self.lm_head(x)  # (B, T, padded_vocab_size)
-        if self.config.final_logit_softcapping is not None:
-            x = do_softcapping(x, self.config.final_logit_softcapping)
-        return x
+        clamp_head = (
+            partial(do_softcapping, thresh=self.config.final_logit_softcapping)
+            if self.config.final_logit_softcapping is not None
+            else nn.Identity()
+        )
+        if lm_head_chunk_size > 0:
+            # chunk the lm head logits to reduce the peak memory used by autograd
+            return [
+                clamp_head(self.lm_head(x_i))
+                for x_i in x.split(lm_head_chunk_size, dim=1)
+            ]
+        else:
+            return clamp_head(self.lm_head(x))  # (B, T, padded_vocab_size)
 
     @classmethod
     def from_name(cls, name: str, **kwargs: Any) -> Self:
@@ -204,7 +251,11 @@ def clear_kv_cache(self) -> None:
 
 
 class Block(nn.Module):
-    def __init__(self, config: Config, block_idx: int) -> None:
+    def __init__(
+        self,
+        config: Config,
+        block_idx: int,
+    ) -> None:
         super().__init__()
         if not config.parallel_residual and config.shared_attention_norm:
             raise NotImplementedError(
@@ -232,6 +283,7 @@ def forward(
         sin: torch.Tensor,
         mask: Optional[torch.Tensor] = None,
         input_pos: Optional[torch.Tensor] = None,
+        input_pos_maxp1: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """
         Non-parallel residual       Parallel residual
@@ -255,7 +307,9 @@
         """
         x_normed = self.norm_1(x)
-        attention_output = self.attn(x_normed, cos, sin, mask, input_pos)
+        attention_output = self.attn(
+            x_normed, cos, sin, mask, input_pos, input_pos_maxp1
+        )
         attention_output = self.post_attention_norm(attention_output)
 
         if self.config.parallel_residual:
@@ -278,16 +332,17 @@ def __init__(self, config: Config, block_idx: int) -> None:
             bias=config.bias or config.attn_bias,
         )
         # output projection
-        # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head`
-        self.proj = nn.Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias)
+        self.proj = nn.Linear(
+            config.head_size * config.n_head, config.n_embd, bias=config.bias
+        )
         # disabled by default
         self.kv_cache: Optional[KVCache] = None
 
         self.apply_sliding_window_attention = (
             config.sliding_window_size is not None and
             block_idx % config.sliding_window_layer_stride == 0
         )
-
         self.config = config
+        self.block_idx = block_idx
 
     def forward(
@@ -296,6 +351,7 @@ def forward(
         sin: torch.Tensor,
         mask: Optional[torch.Tensor] = None,
         input_pos: Optional[torch.Tensor] = None,
+        input_pos_maxp1: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         # Notation:
         # - B | batch size
@@ -304,8 +360,11 @@ def forward(
         # - C* | attentions's embeddings size
         # - nh_(q,k,v) | number of heads for query, key and value
         # - hs | head size
-
-        B, T, C = x.size()
+        head_size = self.config.head_size
+        n_head = self.config.n_head
+        n_query_groups = self.config.n_query_groups
+        rope_n_elem = self.config.rope_n_elem
+        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
 
         # Perform a single multiplication operation using a combined QKV matrix to calculate `query`, `key`, and `value`
         # instead of individually multiplying the input `x` with the respective weight matrices.
@@ -313,16 +372,16 @@ def forward(
         # Define query, key and value sizes.
         # If grouped/multi query is enabled, these sizes are not equal (see the diagram in `lit_gpt/config.py::Config`).
-        query_size = self.config.n_head * self.config.head_size
-        key_size = value_size = self.config.n_query_groups * self.config.head_size
+        query_size = n_head * head_size
+        key_size = value_size = n_query_groups * head_size
         # Split qkv into query, key and value matrices.
         q, k, v = qkv.split((query_size, key_size, value_size), dim=-1)  # 3x(B, T, C*)
 
         # To place the num_heads (nh) dimension right after the batch (B) dimension, the first step is to decouple the
         # embedding size (C) into num_heads (nh) and head_size (hs).
-        q = q.view(B, T, self.config.n_head, self.config.head_size)  # (B, T, nh_q, hs)
-        k = k.view(B, T, self.config.n_query_groups, self.config.head_size)  # (B, T, nh_k, hs)
-        v = v.view(B, T, self.config.n_query_groups, self.config.head_size)  # (B, T, nh_v, hs)
+        q = q.view(B, T, n_head, head_size)  # (B, T, nh_q, hs)
+        k = k.view(B, T, n_query_groups, head_size)  # (B, T, nh_k, hs)
+        v = v.view(B, T, n_query_groups, head_size)  # (B, T, nh_v, hs)
 
         # The tensors `query`, `key`, and `value` are now accurately structured: within each batch element (B), there are
         # multiple heads (nh), and within each head, there is a sequence of elements (T), each represented by a vector
@@ -332,22 +391,28 @@ def forward(
         v = v.transpose(1, 2)  # (B, nh_v, T, hs)
 
         # Unlike standard positional embeddings rotary embeddings must be applied at every layer.
-        q_roped = apply_rope(q[..., : self.config.rope_n_elem], cos, sin)
-        k_roped = apply_rope(k[..., : self.config.rope_n_elem], cos, sin)
-        q = torch.cat((q_roped, q[..., self.config.rope_n_elem :]), dim=-1)  # (B, nh_q, T, hs)
-        k = torch.cat((k_roped, k[..., self.config.rope_n_elem :]), dim=-1)  # (B, nh_k, T, hs)
+        q_roped = apply_rope(q[..., : rope_n_elem], cos, sin)
+        k_roped = apply_rope(k[..., : rope_n_elem], cos, sin)
+        q = torch.cat((q_roped, q[..., rope_n_elem :]), dim=-1)  # (B, nh_q, T, hs)
+        k = torch.cat((k_roped, k[..., rope_n_elem :]), dim=-1)  # (B, nh_k, T, hs)
 
         # Apply kv-cache during inference.
         if input_pos is not None:
             if not isinstance(self.kv_cache, KVCache):
                 raise TypeError("You need to call `gpt.set_kv_cache()`")
             k, v = self.kv_cache(input_pos, k, v)
+            if input_pos_maxp1 is not None:
+                # Subselect along sequence dimension
+                k = k[..., :input_pos_maxp1, :]
+                v = v[..., :input_pos_maxp1, :]
+            # k, v: (B, nh_k, input_pos_maxp1, hs)
+            # If input_pos_maxp1 is None -> max_seq_length
 
         # Grouped queries: balance the number of heads across all three matrices.
         # NOTE: flash attention requires it in training mode.
         # Multi-query: this step can be skipped since there is only 1 head, allowing us to use broadcasting.
-        if self.config.n_query_groups != self.config.n_head and (input_pos is None or self.config.n_query_groups != 1):
-            q_per_kv = self.config.n_head // self.config.n_query_groups
+        if n_query_groups != n_head and (input_pos is None or n_query_groups != 1):
+            q_per_kv = n_head // n_query_groups
             k = k.repeat_interleave(q_per_kv, dim=1)  # (B, nh_q, T, hs)
             v = v.repeat_interleave(q_per_kv, dim=1)  # (B, nh_q, T, hs)
@@ -365,6 +430,7 @@ def forward(
             if mask is None:
                 mask = torch.ones(T, T, dtype=q.dtype, device=q.device).triu(diagonal=1)
                 mask.masked_fill_(mask.bool(), float("-inf"))
+                mask = mask.view(1, 1, *mask.shape)
             sliding_window_bias = torch.ones_like(mask).tril(diagonal=-self.config.sliding_window_size)
             sliding_window_bias.masked_fill_(sliding_window_bias.bool(), float("-inf"))
             mask += sliding_window_bias
@@ -375,7 +441,7 @@ def forward(
         y = self.scaled_dot_product_attention(q, k, v, mask)
 
         # Re-assemble all head outputs side by side.
-        y = y.reshape(B, T, self.config.head_size * self.config.n_head)
+        y = y.reshape(B, T, head_size * n_head)
 
         # Output projection.
         return self.proj(y)  # (B, T, C)
@@ -393,10 +459,10 @@ def scaled_dot_product_attention(
                 mask = torch.ones(q.size(2), q.size(2), dtype=q.dtype, device=q.device).triu(diagonal=1)
                 mask.masked_fill_(mask.bool(), torch.finfo(q.dtype).min)
             scores = scores + mask
-            scores = torch.nn.functional.softmax(scores, dim=-1, dtype=torch.float).to(dtype=q.dtype)
+            scores = F.softmax(scores, dim=-1, dtype=torch.float).to(dtype=q.dtype)
             y = scores @ v
         else:
-            y = torch.nn.functional.scaled_dot_product_attention(
+            y = F.scaled_dot_product_attention(
                 q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale, is_causal=mask is None
             )
         return y.transpose(1, 2)
@@ -423,7 +489,7 @@ def build_kv_cache(
         )
         return KVCache(k_shape, v_shape, device=device, dtype=dtype)
 
-    def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
+    def _load_from_state_dict(self, state_dict: dict, prefix: str, *args: Any, **kwargs: Any) -> None:
         """For compatibility with legacy checkpoints."""
 
         for attr in ("weight", "bias"):
@@ -438,30 +504,38 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa
 class GptNeoxMLP(nn.Module):
     def __init__(self, config: Config) -> None:
         super().__init__()
-        self.fc = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
-        self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias)
-
+        self.fc = nn.Linear(
+            config.n_embd, config.intermediate_size, bias=config.bias
+        )
+        self.proj = nn.Linear(
+            config.intermediate_size, config.n_embd, bias=config.bias
+        )
         self.config = config
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.fc(x)
-        x = torch.nn.functional.gelu(x, approximate=self.config.gelu_approximate)
+        x = F.gelu(x, approximate=self.config.gelu_approximate)
         return self.proj(x)
 
 
 class LLaMAMLP(nn.Module):
     def __init__(self, config: Config) -> None:
         super().__init__()
-        self.fc_1 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
-        self.fc_2 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
-        self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias)
-
+        self.fc_1 = nn.Linear(
+            config.n_embd, config.intermediate_size, bias=config.bias
+        )
+        self.fc_2 = nn.Linear(
+            config.n_embd, config.intermediate_size, bias=config.bias
+        )
+        self.proj = nn.Linear(
+            config.intermediate_size, config.n_embd, bias=config.bias
+        )
         self.config = config
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x_fc_1 = self.fc_1(x)
         x_fc_2 = self.fc_2(x)
-        x = torch.nn.functional.silu(x_fc_1) * x_fc_2
+        x = F.silu(x_fc_1) * x_fc_2
         return self.proj(x)
 
 
@@ -469,7 +543,7 @@ class GemmaMLP(LLaMAMLP):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x_fc_1 = self.fc_1(x)
         x_fc_2 = self.fc_2(x)
-        x = torch.nn.functional.gelu(x_fc_1, approximate=self.config.gelu_approximate) * x_fc_2
+        x = F.gelu(x_fc_1, approximate=self.config.gelu_approximate) * x_fc_2
         return self.proj(x)
 
 
@@ -478,7 +552,6 @@ def __init__(self, config: Config) -> None:
         super().__init__()
         self.gate = nn.Linear(config.n_embd, config.n_expert, bias=False)
         self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert))
-
         self.config = config
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -521,6 +594,7 @@ def build_rope_cache(
 
     Returns:
         Tuple[torch.Tensor, torch.Tensor]: Cosine and sine caches for RoPE.
+        Shapes are `(seq_len, n_elem)`.
 
     """
 
     # Compute the inverse frequencies theta
@@ -546,6 +620,15 @@ def build_rope_cache(
     # Calculate the product of position index and $\theta_i$
     idx_theta = torch.outer(seq_idx, theta).repeat(1, 2)
+    # If `n_elem` is odd, the final dimension of `idx_theta` has size
+    # `n_elem + 1`, so need to cut something off.
+    # Due to a current bug in Hugging Face, in the case `n_elem == 1`, we leave
+    # `idx_theta`, `cos`, `sin` as is. Things work out in `apply_rope` due to
+    # broadcasting. If we shorten `idx_theta`, unit tests comparing to
+    # Hugging Face fail.
+    # https://github.com/huggingface/transformers/issues/35233
+    if idx_theta.shape[-1] > n_elem > 1:
+        idx_theta = idx_theta[..., :n_elem]
 
     return torch.cos(idx_theta), torch.sin(idx_theta)
 
@@ -620,18 +703,32 @@ def batched_index_copy_(t, dim, idx, val):
 
 
 def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
-    # x: (B, nh, T, hs)
-    # sin, cos: (B, T, hs) or (1, T, hs)
-    head_size = x.size(-1)
-    x1 = x[..., : head_size // 2]  # (B, nh, T, hs/2)
-    x2 = x[..., head_size // 2 :]  # (B, nh, T, hs/2)
-    rotated = torch.cat((-x2, x1), dim=-1)  # (B, nh, T, hs)
-    if cos.dim() > 1:
-        # batch dimensions must align
-        # sin/cos are (B, T, hs) so we unsqeeze -3 for nh
-        # we count from back because all of apply_rope does
-        cos = cos.unsqueeze(-3)
-        sin = sin.unsqueeze(-3)
+    """
+    Applies RoPE transform to `x`. Note that `cos`, `sin` need to have a batch
+    dimension.
+
+    Args:
+        x: Input tensor, `(B, ..., T, head_size)`
+        cos: Cached cosines, `(B, T, head_size)` or `(1, T, head_size)`
+        sin: Cached sines, `(B, T, head_size)` or `(1, T, head_size)`
+
+    Returns:
+        Encoded tensor, `(B, ..., T, head_size)`
+    """
+    if cos.dim() != 3:
+        raise ValueError(f"cos must be three-dimensional, but shape is {cos.shape}")
+    if cos.shape != sin.shape:
+        raise ValueError(f"cos, sin must have same shape, but cos.shape={cos.shape}, sin.shape={sin.shape}")
+    head_size_half = x.size(-1) // 2
+    x1 = x[..., : head_size_half]  # (B, ..., T, head_size/2)
+    x2 = x[..., head_size_half :]  # (B, ..., T, head_size/2)
+    rotated = torch.cat((-x2, x1), dim=-1)  # (B, ..., T, head_size)
+    dims_diff = x.dim() - cos.dim()
+    if dims_diff > 0:
+        # Ensure that shapes of `x`, `cos`, `sin` align
+        new_shape = cos.shape[0:1] + (1,) * dims_diff + cos.shape[1:]
+        cos = cos.view(*new_shape)
+        sin = sin.view(*new_shape)
 
     roped = (x * cos) + (rotated * sin)
     return roped.to(dtype=x.dtype)
@@ -642,6 +739,10 @@ def do_softcapping(x: torch.Tensor, thresh: float) -> torch.Tensor:
 
 
 class KVCache(nn.Module):
+    """
+    Buffers `k`, `v` have shape
+    `(batch_size, n_query_groups, max_seq_length, head_size)`.
+    """
     def __init__(
         self,
         k_shape: Tuple[int, int, int, int],
@@ -654,13 +755,28 @@ def __init__(
         self.register_buffer("v", torch.zeros(v_shape, device=device, dtype=dtype), persistent=False)
 
     def forward(self, input_pos: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Writes new values `k` and `v` into the cache at the positions specified
+        by `input_pos` along the sequence dimension (`max_seq_length`). The batch
+        size of `k` and `v` (`bs`) must be smaller or equal to `KVCache` batch
+        size. Returns the full buffers, adjusted to the batch size `bs`.
+
+        Args:
+            input_pos: Position index, `(bs, T)` or `(T,)`
+            k: New values, `(bs, n_query_groups, T, head_size)`
+            v: New values, `(bs, n_query_groups, T, head_size)`
+
+        Returns:
+            k_full, v_full, `(bs, n_query_groups, max_seq_length, head_size)`
+
+        """
         # move the buffer to the activation dtype for when AMP is used
         self.k = self.k.to(k.dtype)
         self.v = self.v.to(v.dtype)
         # update the cache
-        n = k.size(0)
-        k = batched_index_copy_(self.k[:n, ...], -2, input_pos, k)
-        v = batched_index_copy_(self.v[:n, ...], -2, input_pos, v)
+        bs = k.size(0)
+        k = batched_index_copy_(self.k[:bs, ...], -2, input_pos, k)
+        v = batched_index_copy_(self.v[:bs, ...], -2, input_pos, v)
         return k, v
 
     def reset_parameters(self) -> None:
diff --git a/litgpt/utils.py b/litgpt/utils.py
index 2180762617..60e7cd9034 100644
--- a/litgpt/utils.py
+++ b/litgpt/utils.py
@@ -358,7 +358,6 @@ def get_default_supported_precision(training: bool) -> str:
 
     Args:
         training: If True, returns '-mixed' version of the precision; if False, returns '-true' version.
-        use_mps: Flag to determine if MPS should be used when available.
 
     Returns:
         The default precision that is suitable for the task and is supported by the hardware.
diff --git a/tests/test_model.py b/tests/test_model.py
index abd1a767bf..e8a110a409 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -914,7 +914,7 @@ def test_against_original_salamandra(model_name, device, dtype):
     ours_y = ours_model(x)
     theirs_y = theirs_model(x)["logits"].to(dtype)  # HF converts logits to float
     torch.testing.assert_close(ours_y, theirs_y)
-
+
 
 @torch.inference_mode()
 @pytest.mark.parametrize("model_name", ("SmolLM2-135M", "SmolLM2-360M", "SmolLM2-1.7B"))
@@ -1322,3 +1322,65 @@ def test_load_legacy_state_dict():
     attention_2 = CausalSelfAttention(config=config, block_idx=0)
     attention_2.load_state_dict(state_dict)
+
+
+@pytest.mark.parametrize("n_query_groups", (1, 2, 4, 8))
+@torch.inference_mode()
+def test_kv_cache_buffer_shape(n_query_groups):
+    batch_size = 3
+    max_seq_length = 23
+    config = Config(
+        block_size=25,
+        padded_vocab_size=5,
+        n_layer=2,
+        n_head=8,
+        n_embd=16,
+        n_query_groups=n_query_groups,
+    )
+    model = GPT(config)
+    model.max_seq_length = max_seq_length
+    model.set_kv_cache(batch_size)
+    required_shape = (batch_size, n_query_groups, max_seq_length, config.head_size)
+    for block in model.transformer.h:
+        kv_cache = block.attn.kv_cache
+        assert kv_cache is not None
+        assert kv_cache.k.shape == required_shape
+        assert kv_cache.v.shape == required_shape
+
+
+@pytest.mark.parametrize(
+    ("rotary_percentage", "final_dim"),
+    ((0.75, 3), (0.25, 2))
+)
+@torch.inference_mode()
+def test_rope_cos_sin_shapes_if_rope_n_elem_is_odd(rotary_percentage, final_dim):
+    batch_size = 3
+    config = Config(
+        block_size=25,
+        padded_vocab_size=5,
+        n_layer=2,
+        n_head=4,
+        n_embd=16,
+        rotary_percentage=rotary_percentage,
+    )
+    model = GPT(config)
+    required_shape = (config.block_size, final_dim)
+    assert model.cos.shape == required_shape
+    assert model.sin.shape == required_shape
+
+
+def test_forward_with_without_input_pos_maxp1():
+    batch_size = 3
+    config = Config(
+        block_size=25,
+        padded_vocab_size=5,
+        n_layer=2,
+        n_head=8,
+        n_embd=16,
+    )
+    model = GPT(config)
+    model.set_kv_cache(batch_size)
+    idx = torch.randint(0, config.padded_vocab_size, (1, 10))
+    input_pos = torch.arange(1, 11)
+    input_pos_maxp1 = torch.tensor(11)
+    logits_with_maxp1 = model(idx, input_pos, input_pos_maxp1=input_pos_maxp1)
+    logits_no_maxp1 = model(idx, input_pos)
+    torch.testing.assert_close(logits_with_maxp1, logits_no_maxp1)
diff --git a/tests/test_rope.py b/tests/test_rope.py
index 7293e52fa7..0aa10aeb58 100644
--- a/tests/test_rope.py
+++ b/tests/test_rope.py
@@ -13,7 +13,7 @@
 @torch.inference_mode()
 def test_rope_gptneox():
     bs, seq_len, n_head, n_embed = 1, 6, 2, 8
-    head_size = n_embed // n_head
+    head_size = n_embed // n_head  # 4
     x = torch.randint(0, 10000, size=(bs, n_head, seq_len, head_size)).float()
     position_ids = torch.arange(seq_len).unsqueeze(0)
 
@@ -21,9 +21,10 @@ def test_rope_gptneox():
     theirs_cos, theirs_sin = theirs_rot_emb(x, position_ids)
 
     ours_cos_cached, ours_sin_cached = build_rope_cache(seq_len, head_size, device=x.device)
-    # their rope cache has 2 added dimensions and the cos/sin is duplicated
-    torch.testing.assert_close(ours_cos_cached, theirs_cos.squeeze())
-    torch.testing.assert_close(ours_sin_cached, theirs_sin.squeeze())
+    ours_cos_cached = ours_cos_cached.unsqueeze(0)
+    ours_sin_cached = ours_sin_cached.unsqueeze(0)
+    torch.testing.assert_close(ours_cos_cached, theirs_cos)
+    torch.testing.assert_close(ours_sin_cached, theirs_sin)
 
     ours_x_rope = apply_rope(x, ours_cos_cached, ours_sin_cached)
     theirs_x_rope, _ = apply_rotary_pos_emb_gptneo(x, x, theirs_cos, theirs_sin, position_ids)
@@ -47,8 +48,10 @@ def test_rope_llama_2():
 
     # our rope
     ours_cos, ours_sin = build_rope_cache(seq_len, n_elem=head_dim, base=rope_theta)
-    torch.testing.assert_close(theirs_cos.squeeze(0), ours_cos)
-    torch.testing.assert_close(theirs_sin.squeeze(0), ours_sin)
+    ours_cos = ours_cos.unsqueeze(0)
+    ours_sin = ours_sin.unsqueeze(0)
+    torch.testing.assert_close(theirs_cos, ours_cos)
+    torch.testing.assert_close(theirs_sin, ours_sin)
 
     ##################################
     # Compare rotated tensors
@@ -86,8 +89,10 @@ def test_rope_llama_3():
 
     # our rope
     ours_cos, ours_sin = build_rope_cache(seq_len, n_elem=head_dim, base=rope_theta)
-    torch.testing.assert_close(theirs_cos.squeeze(0), ours_cos)
-    torch.testing.assert_close(theirs_sin.squeeze(0), ours_sin)
+    ours_cos = ours_cos.unsqueeze(0)
+    ours_sin = ours_sin.unsqueeze(0)
+    torch.testing.assert_close(theirs_cos, ours_cos)
+    torch.testing.assert_close(theirs_sin, ours_sin)
 
     ##################################
     # Compare rotated tensors
@@ -146,8 +151,10 @@ def test_rope_llama_3_1():
 
     # our rope
     ours_cos, ours_sin = build_rope_cache(seq_len, n_elem=head_dim, base=rope_theta, extra_config=our_rope_config)
-    torch.testing.assert_close(theirs_cos.squeeze(0), ours_cos)
-    torch.testing.assert_close(theirs_sin.squeeze(0), ours_sin)
+    ours_cos = ours_cos.unsqueeze(0)
+    ours_sin = ours_sin.unsqueeze(0)
+    torch.testing.assert_close(theirs_cos, ours_cos)
+    torch.testing.assert_close(theirs_sin, ours_sin)
 
     ##################################
     # Compare rotated tensors
@@ -206,8 +213,10 @@ def test_rope_llama_3_2():
 
     # our rope
     ours_cos, ours_sin = build_rope_cache(seq_len, n_elem=head_dim, base=rope_theta, extra_config=our_rope_config)
-    torch.testing.assert_close(theirs_cos.squeeze(0), ours_cos)
-    torch.testing.assert_close(theirs_sin.squeeze(0), ours_sin)
+    ours_cos = ours_cos.unsqueeze(0)
+    ours_sin = ours_sin.unsqueeze(0)
+    torch.testing.assert_close(theirs_cos, ours_cos)
+    torch.testing.assert_close(theirs_sin, ours_sin)
 
     ##################################
     # Compare rotated tensors
@@ -225,4 +234,24 @@ def test_rope_llama_3_2():
     theirs_q_rot, theirs_k_rot = apply_rotary_pos_emb_llama(queries, keys, theirs_cos, theirs_sin)
     torch.testing.assert_close(theirs_q_rot, ours_q_rot)
     torch.testing.assert_close(theirs_k_rot, ours_k_rot)
-
+
+@torch.inference_mode()
+def test_rope_cos_sin_shapes_if_rope_n_elem_is_odd():
+    bs, seq_len, n_head, n_embed = 1, 6, 2, 8
+    head_size = n_embed // n_head  # 4
+    rotary_percentage = 0.75
+    rope_n_elem = int(head_size * rotary_percentage)  # 3
+    ours_cos, ours_sin = build_rope_cache(seq_len, rope_n_elem)
+    required_shape = (seq_len, rope_n_elem)
+    assert ours_cos.shape == required_shape
+    assert ours_sin.shape == required_shape
+    # Special case: If `rope_n_elem == 1`, the shape is extended. This is to
+    # accommodate a current bug in Hugging Face, ensuring that other unit tests
+    # pass.
+    # https://github.com/huggingface/transformers/issues/35233
+    rotary_percentage = 0.25
+    rope_n_elem = int(head_size * rotary_percentage)  # 1
+    ours_cos, ours_sin = build_rope_cache(seq_len, rope_n_elem)
+    required_shape = (seq_len, rope_n_elem + 1)
+    assert ours_cos.shape == required_shape
+    assert ours_sin.shape == required_shape
diff --git a/tests/test_unsloth_executor.py b/tests/test_unsloth_executor.py
index 15b1c7c673..b62eac7214 100644
--- a/tests/test_unsloth_executor.py
+++ b/tests/test_unsloth_executor.py
@@ -54,6 +54,8 @@ def test_unsloth_rope():
 
     B, nh, T, hs = 2, 32, 64, 16
     cos, sin = build_rope_cache(T, hs, device="cuda")
+    cos = cos.unsqueeze(0)
+    sin = sin.unsqueeze(0)
     q = torch.rand((B, nh, T, hs), device="cuda", requires_grad=True)
 
     def foo(x, cos, sin):
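Taken together, the RoPE-related hunks change the calling convention: `build_rope_cache` still returns `(seq_len, n_elem)`-shaped caches, but `apply_rope` now requires `cos`/`sin` to carry an explicit batch dimension, which is exactly why the tests above add `unsqueeze(0)`. A small usage sketch (illustrative only, assuming litgpt with this patch applied is importable):

    import torch
    from litgpt.model import apply_rope, build_rope_cache

    B, n_head, T, head_size = 2, 4, 8, 16
    cos, sin = build_rope_cache(T, n_elem=head_size)   # each has shape (T, head_size)
    cos, sin = cos.unsqueeze(0), sin.unsqueeze(0)       # (1, T, head_size): batch dim now required
    x = torch.randn(B, n_head, T, head_size)
    y = apply_rope(x, cos, sin)                         # (B, n_head, T, head_size)
    assert y.shape == x.shape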