kilianhaefeli committed
Commit 3a8246f · 1 Parent(s): 55543b4

add new modeling file which contains fixes to batching

Files changed (2)
  1. modeling.py +174 -47
  2. modeling_old.py +785 -0
modeling.py CHANGED
@@ -483,16 +483,69 @@ class Fast_dLLM_QwenModel(Fast_dLLM_QwenPreTrainedModel):
483
  past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1] if not self.training else inputs_embeds.shape[1]//2, device=inputs_embeds.device
484
  )
485
486
  if position_ids is None:
487
- position_ids = cache_position.unsqueeze(0)
488
-
489
  if self.training:
490
  attention_mask = self.gen_mask(labels.shape[1], self.bd_size, labels.shape[0], self.config.num_attention_heads).to(device=inputs_embeds.device)
491
  else:
492
  if use_block_cache and block_past_key_values.get_seq_length() != 0:
493
  attention_mask = None
494
  else:
495
- attention_mask = self.eval_mask(input_ids.shape[1], block_size, past_key_values.get_seq_length() if past_key_values is not None else 0).to(device=inputs_embeds.device)
496
 
497
  hidden_states = inputs_embeds
498
 
@@ -652,7 +705,8 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
652
  def generate(
653
  self,
654
  input_ids,
655
- max_new_tokens,
 
656
  mask_id=151665,
657
  threshold=1,
658
  small_block_size=8,
@@ -664,59 +718,86 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
664
  use_block_cache=False,
665
  **kwargs
666
  ):
667
  num_blocks = max_new_tokens // block_size
668
  original_input_length = input_ids.shape[1]
669
670
  if input_ids.shape[1] > block_size:
671
- output = self.forward(input_ids=input_ids[:, :(input_ids.shape[1] // block_size * block_size)], use_cache=True, update_past_key_values=True, block_size=block_size)
672
  logits, past_key_values = output.logits, output.past_key_values
673
  if input_ids.shape[1] % block_size == 0:
674
  next_token = logits[:, -1:, :].argmax(dim=-1)
675
  input_ids = torch.cat([input_ids, next_token], dim=1)
676
  else:
677
  past_key_values = None
678
 
679
  num_small_blocks = block_size // small_block_size
680
681
  for block_idx in range(num_blocks):
682
- if stop_token in input_ids[:, original_input_length:]:
683
- break
684
  prompt_length = input_ids.shape[1]
685
  # Initialize x_init with mask_id
686
  x_init = mask_id * torch.ones((input_ids.shape[0], block_size-prompt_length%block_size), device=self.device, dtype=torch.long)
687
- x_init = torch.cat([input_ids, x_init], dim=1)
688
 
689
  x_t = x_init.clone()
690
  block_past_key_values = None
 
691
  while True:
692
- if stop_token in x_t[:, prompt_length:]:
693
- stop_token_idx = (x_t[:, prompt_length:] == stop_token).nonzero()[0][1]
694
- if (x_t[:, prompt_length:prompt_length+stop_token_idx] == mask_id).sum() == 0:
695
- break
696
  mask_idx = (x_t[:, -block_size:] == mask_id)
697
- # Decode a complete block, update cache, and generate the next token
 
698
  if mask_idx.sum() == 0:
699
- output = self.forward(input_ids=x_t[:, -block_size:], use_cache=True, past_key_values=past_key_values, update_past_key_values=True, block_size=block_size)
700
  logits, past_key_values = output.logits, output.past_key_values
701
  next_token = logits[:, -1:, :].argmax(dim=-1)
702
  x_t = torch.cat([x_t, next_token], dim=1)
703
  break
704
  for small_block_idx in range(num_small_blocks):
705
- small_block_start_idx = small_block_idx * small_block_size
706
- small_block_end_idx = small_block_start_idx + small_block_size
707
 
708
- start = -block_size + small_block_start_idx
709
  end = None if block_size == small_block_end_idx else -block_size + small_block_end_idx
710
  while True:
711
  mask_idx = (x_t[:, -block_size:] == mask_id)
712
- if mask_idx[:, start:end].sum() == 0:
713
  break
714
- if stop_token in x_t[:, prompt_length:]:
715
- stop_token_idx = (x_t[:, prompt_length:] == stop_token).nonzero()[0][1]
716
- if (x_t[:, prompt_length:prompt_length+stop_token_idx] == mask_id).sum() == 0:
717
- break
718
-
719
  if use_block_cache:
 
720
  if block_past_key_values is None or (x_t[:, -block_size+small_block_start_idx] == mask_id).any():
721
  output = self.forward(input_ids=x_t[:, -block_size:], use_cache=True, past_key_values=past_key_values, update_past_key_values=False, use_block_cache=True)
722
  logits, block_past_key_values = output.logits, output.block_past_key_values
@@ -726,28 +807,71 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
726
  logits = self.forward(input_ids=x_t[:,start:end], use_cache=True, past_key_values=past_key_values, update_past_key_values=False, use_block_cache=True, block_past_key_values=block_past_key_values, replace_position=small_block_start_idx).logits
727
  logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
728
  else:
729
- logits = self.forward(input_ids=x_t[:, -block_size:], use_cache=True, past_key_values=past_key_values, update_past_key_values=False).logits
730
- logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
731
  logits = logits[:, start:end]
732
 
733
-
734
  x_1, p_1t = self.sample_with_top_p(logits, top_p=top_p, temperature=temperature)
735
  # Select tokens with probability greater than threshold from p_1t
736
  x1_p = torch.squeeze(torch.gather(p_1t, dim=-1, index=torch.unsqueeze(x_1, -1)), -1)
737
  x1_p = torch.where(mask_idx[:, start:end], x1_p, -torch.inf)
738
 
739
  unmask_idx = (x1_p > threshold)
 
740
  max_prob_idx = x1_p.argmax(dim=-1)
741
  unmask_idx[torch.arange(x_1.shape[0]), max_prob_idx] = True
742
  unmask_idx = unmask_idx & mask_idx[:, start:end]
743
744
  x_t[:, start:end][unmask_idx] = x_1[unmask_idx]
745
746
  input_ids = x_t
747
- # Truncate stop_token
748
- if stop_token in input_ids[:, original_input_length:]:
749
- stop_token_idx = (input_ids[:, original_input_length:] == stop_token).nonzero()[0][1]
750
- input_ids = input_ids[:, :stop_token_idx+original_input_length+1]
751
  return input_ids
752
 
753
  def sample_with_top_p(self, logits, top_p=0.95, temperature=1.0):
@@ -758,28 +882,31 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
758
  p_1t = torch.softmax(logits, dim=-1)
759
  x_1 = p_1t.argmax(dim=-1)
760
  return x_1, p_1t
761
-
762
- probs = F.softmax(scaled_logits, dim=-1)
763
 
764
- sorted_probs, sorted_indices = torch.sort(probs, descending=True)
765
- cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
766
 
767
- sorted_indices_to_remove = cumulative_probs > top_p
768
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
769
- sorted_indices_to_remove[..., 0] = 0
770
 
771
  indices_to_remove = torch.zeros_like(probs, dtype=torch.bool).scatter_(
772
  dim=-1, index=sorted_indices, src=sorted_indices_to_remove
773
- )
774
-
775
- probs[indices_to_remove] = 0
776
 
777
- # Renormalize so that the probabilities of remaining tokens sum to 1
778
- # Add a small epsilon value to prevent division by zero
779
- probs_sum = torch.sum(probs, dim=-1, keepdim=True)
780
- normalized_probs = probs / probs_sum
781
 
782
- p_1t = normalized_probs
783
- x_1 = torch.multinomial(p_1t[0], num_samples=1).unsqueeze(0).squeeze(-1)
784
 
785
- return x_1, p_1t
 
483
  past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1] if not self.training else inputs_embeds.shape[1]//2, device=inputs_embeds.device
484
  )
485
 
486
+ # --- keep the user/tokenizer padding mask BEFORE you overwrite attention_mask ---
487
+ padding_mask_2d = attention_mask # shape [B, KV_LEN], 1=token, 0=pad
488
+
489
+ # -------------------------
490
+ # Position ids (left padding)
491
+ # -------------------------
492
  if position_ids is None:
493
+ if (padding_mask_2d is not None) and (not self.training):
494
+ # full, per-sample positions over KV_LEN
495
+ pos_full = padding_mask_2d.long().cumsum(-1) - 1 # pads => -1
496
+ pos_full = pos_full.clamp_min(0) # pads => 0
497
+
498
+ q_len = inputs_embeds.shape[1]
499
+ kv_len = pos_full.shape[1]
500
+ if kv_len < q_len:
501
+ raise ValueError(f"attention_mask KV_LEN={kv_len} < input_len={q_len}. "
502
+ "When using cache, pass the FULL mask (past+current).")
503
+
504
+ q_start = kv_len - q_len # assumes current tokens are the last q_len positions
505
+ position_ids = pos_full[:, q_start:]
506
+ else:
507
+ # no padding mask: same positions for all batch elements
508
+ position_ids = cache_position.unsqueeze(0)
509
+
510
+ # -------------------------
511
+ # Attention mask (block-causal + padding), per sample
512
+ # -------------------------
513
  if self.training:
514
  attention_mask = self.gen_mask(labels.shape[1], self.bd_size, labels.shape[0], self.config.num_attention_heads).to(device=inputs_embeds.device)
515
  else:
516
  if use_block_cache and block_past_key_values.get_seq_length() != 0:
517
  attention_mask = None
518
  else:
519
+ # attention_mask = self.eval_mask(input_ids.shape[1], block_size, past_key_values.get_seq_length() if past_key_values is not None else 0).to(device=inputs_embeds.device)
520
+ if padding_mask_2d is None:
521
+ # fallback: original behavior (no padding)
522
+ structural = self.eval_mask(
523
+ seqlen=input_ids.shape[1],
524
+ block_size=block_size,
525
+ cache_seq_len=past_key_values.get_seq_length() if past_key_values is not None else 0,
526
+ ).to(device=inputs_embeds.device)
527
+ attention_mask = structural[None, None, :, :] # [1,1,Q,KV]
528
+ else:
529
+ pad = padding_mask_2d.to(torch.bool) # [B, KV]
530
+ B, kv_len = pad.shape
531
+ q_len = inputs_embeds.shape[1]
532
+ q_start = kv_len - q_len
533
+
534
+ # Per-sample block ids computed from *non-pad* positions
535
+ pos_full = pad.long().cumsum(-1) - 1
536
+ pos_full = pos_full.clamp_min(0)
537
+ block_full = pos_full // block_size # [B, KV]
538
+
539
+ block_q = block_full[:, q_start:] # [B, Q]
540
+ block_k = block_full # [B, KV]
541
+
542
+ structural = block_q.unsqueeze(-1) >= block_k.unsqueeze(-2) # [B, Q, KV]
543
+
544
+ # Mask keys AND queries (only valid tokens participate)
545
+ key_ok = pad[:, None, None, :] # [B,1,1,KV]
546
+ query_ok = pad[:, None, q_start:, None] # [B,1,Q,1]
547
+
548
+ attention_mask = structural[:, None, :, :] & key_ok & query_ok # [B,1,Q,KV]
549
 
550
  hidden_states = inputs_embeds
551
 
 
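A minimal standalone sketch (plain PyTorch, illustrative names only) of the left-padding handling added above: per-sample position ids and the block-causal-plus-padding attention mask are derived from the 2-D padding mask rather than from cache_position alone.

import torch

def positions_and_block_mask(padding_mask_2d, q_len, block_size):
    # padding_mask_2d: [B, KV_LEN], 1 = real token, 0 = left padding
    pad = padding_mask_2d.bool()
    B, kv_len = pad.shape
    q_start = kv_len - q_len                      # current tokens are the last q_len positions
    pos_full = padding_mask_2d.long().cumsum(-1) - 1
    pos_full = pos_full.clamp_min(0)              # pads end up at position 0
    position_ids = pos_full[:, q_start:]          # [B, Q]
    block_full = pos_full // block_size           # [B, KV]
    structural = block_full[:, q_start:].unsqueeze(-1) >= block_full.unsqueeze(-2)  # [B, Q, KV]
    key_ok = pad[:, None, None, :]                # [B, 1, 1, KV]
    query_ok = pad[:, None, q_start:, None]       # [B, 1, Q, 1]
    attention_mask = structural[:, None, :, :] & key_ok & query_ok                  # [B, 1, Q, KV]
    return position_ids, attention_mask

# Example: batch of 2, left-padded to length 8, decoding the last 4 positions
pad_mask = torch.tensor([[0, 0, 1, 1, 1, 1, 1, 1],
                         [1, 1, 1, 1, 1, 1, 1, 1]])
pos, attn = positions_and_block_mask(pad_mask, q_len=4, block_size=4)
print(pos)         # per-sample positions that ignore the left pads
print(attn.shape)  # torch.Size([2, 1, 4, 8])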
705
  def generate(
706
  self,
707
  input_ids,
708
+ attention_mask=None, # --- ADDED ARGUMENT ---
709
+ max_new_tokens=20, # Added default value for safety
710
  mask_id=151665,
711
  threshold=1,
712
  small_block_size=8,
 
718
  use_block_cache=False,
719
  **kwargs
720
  ):
721
+ if use_block_cache:
722
+ raise ValueError("use_block_cache=True is not supported in this generate() implementation.")
723
+ assert attention_mask is not None, "attention_mask must be provided for this generate() implementation."
724
  num_blocks = max_new_tokens // block_size
725
+ device = input_ids.device
726
+ batch_size = input_ids.size(0)
727
  original_input_length = input_ids.shape[1]
728
 
729
+ # Track which sequences in the batch are still active
730
+ unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=device)
731
+
732
+ # Handle prefix processing (Context Encoding)
733
  if input_ids.shape[1] > block_size:
734
+ output = self.forward(input_ids=input_ids[:, :(input_ids.shape[1] // block_size * block_size)], attention_mask=attention_mask[:, :(input_ids.shape[1] // block_size * block_size)], use_cache=True, update_past_key_values=True, block_size=block_size)
735
  logits, past_key_values = output.logits, output.past_key_values
736
  if input_ids.shape[1] % block_size == 0:
737
  next_token = logits[:, -1:, :].argmax(dim=-1)
738
  input_ids = torch.cat([input_ids, next_token], dim=1)
739
+
740
+ # Update finished status
741
+ # unfinished_sequences = unfinished_sequences & (next_token.squeeze(-1) != stop_token).long()
742
+ # Append to mask: If unfinished, append 1. If finished, append 0.
743
+ new_mask_col = unfinished_sequences.unsqueeze(1).to(dtype=attention_mask.dtype)
744
+ attention_mask = torch.cat([attention_mask, new_mask_col], dim=1)
745
  else:
746
  past_key_values = None
747
 
748
  num_small_blocks = block_size // small_block_size
749
 
750
+ iterations = torch.zeros((batch_size,), device=device)
751
+ n_generated_tokens = torch.zeros((batch_size,), device=device)
752
+ finished = torch.zeros((batch_size,), dtype=torch.bool, device=device)
753
  for block_idx in range(num_blocks):
754
+ new_tokens = input_ids[:, original_input_length:]
755
+ has_stop_now = (new_tokens == stop_token).any(dim=1)
756
+ finished |= has_stop_now
757
+
758
+ stop_tokens = torch.where(input_ids[:, original_input_length:] == stop_token, 1.0, 0.0).sum(dim=1)
759
+ if stop_tokens.min() > 0:
760
+ break # break if all sequences have generated the stop token
761
+
762
  prompt_length = input_ids.shape[1]
763
  # Initialize x_init with mask_id
764
  x_init = mask_id * torch.ones((input_ids.shape[0], block_size-prompt_length%block_size), device=self.device, dtype=torch.long)
765
+ x_init = torch.cat([input_ids, x_init], dim=1) # pad the sequence up to a multiple of block_size with mask tokens, e.g. [1, 32]
766
+
767
+ mask_extension = unfinished_sequences.unsqueeze(1).repeat(1, block_size - prompt_length % block_size).to(dtype=attention_mask.dtype)
768
+ curr_attention_mask = torch.cat([attention_mask, mask_extension], dim=1)
769
 
770
  x_t = x_init.clone()
771
  block_past_key_values = None
772
+
773
  while True:
774
+ # mask_idx marks the positions in the current block that are still masked. (Note that the first token is never a mask token, because at least one token must be present to condition on.)
775
  mask_idx = (x_t[:, -block_size:] == mask_id)
776
+
777
+ # If there is no mask id left in the current block of 32, then we have already generated at most 31 tokens here (one token must always be set, otherwise no conditional generation is possible).
778
  if mask_idx.sum() == 0:
779
+ # Here we predict one extra token, which corresponds to the first token of the next block.
780
+ # Why not predict it earlier? Sampling it before the rest of the block is decoded would violate the semi-autoregressive order.
781
+ output = self.forward(input_ids=x_t[:, -block_size:], attention_mask=curr_attention_mask, use_cache=True, past_key_values=past_key_values, update_past_key_values=True, block_size=block_size)
782
  logits, past_key_values = output.logits, output.past_key_values
783
  next_token = logits[:, -1:, :].argmax(dim=-1)
784
  x_t = torch.cat([x_t, next_token], dim=1)
785
+ curr_attention_mask = torch.cat([curr_attention_mask, unfinished_sequences.unsqueeze(1).to(curr_attention_mask.dtype)], dim=1)
786
+ iterations += (~finished).long()
787
+ n_generated_tokens += (~finished).long()
788
  break
789
  for small_block_idx in range(num_small_blocks):
790
+ small_block_start_idx = small_block_idx * small_block_size # 0
791
+ small_block_end_idx = small_block_start_idx + small_block_size # 32
792
 
793
+ start = -block_size + small_block_start_idx # -32
794
  end = None if block_size == small_block_end_idx else -block_size + small_block_end_idx
795
  while True:
796
  mask_idx = (x_t[:, -block_size:] == mask_id)
797
+ if mask_idx[:, start:end].sum() == 0: # [-32:None]: no masked positions remain in this small block, so there is nothing left to predict here
798
  break
799
  if use_block_cache:
800
+ assert False, "use_block_cache=True is not supported in this generate() implementation."
801
  if block_past_key_values is None or (x_t[:, -block_size+small_block_start_idx] == mask_id).any():
802
  output = self.forward(input_ids=x_t[:, -block_size:], use_cache=True, past_key_values=past_key_values, update_past_key_values=False, use_block_cache=True)
803
  logits, block_past_key_values = output.logits, output.block_past_key_values
 
807
  logits = self.forward(input_ids=x_t[:,start:end], use_cache=True, past_key_values=past_key_values, update_past_key_values=False, use_block_cache=True, block_past_key_values=block_past_key_values, replace_position=small_block_start_idx).logits
808
  logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
809
  else:
810
+ # feed the most recent 32 tokens (the current block) as the input
811
+ # the KV cache does not yet contain the KVs for the current block, so they are always recomputed here
812
+ logits = self.forward(input_ids=x_t[:, -block_size:], attention_mask=curr_attention_mask, use_cache=True, past_key_values=past_key_values, update_past_key_values=False,block_size=block_size,).logits
813
+ # the logits to be sampled from are those of the most recent 32 tokens
814
+ # shift by one because of the autoregressive conversion; prepending anything at the start is valid since the first token is never masked anyway
815
+ logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1) # TODO maybe prepend NaN or a sentinel instead
816
  logits = logits[:, start:end]
817
 
 
818
  x_1, p_1t = self.sample_with_top_p(logits, top_p=top_p, temperature=temperature)
819
  # Select tokens with probability greater than threshold from p_1t
820
  x1_p = torch.squeeze(torch.gather(p_1t, dim=-1, index=torch.unsqueeze(x_1, -1)), -1)
821
  x1_p = torch.where(mask_idx[:, start:end], x1_p, -torch.inf)
822
 
823
  unmask_idx = (x1_p > threshold)
824
+ # Ensure at least one token is unmasked in the current small block
825
  max_prob_idx = x1_p.argmax(dim=-1)
826
  unmask_idx[torch.arange(x_1.shape[0]), max_prob_idx] = True
827
  unmask_idx = unmask_idx & mask_idx[:, start:end]
828
 
829
+ # Add 1 to iterations if the sequence is not stopped AND at least one token is generated in this iteration
830
+ iterations += (~finished & unmask_idx.any(dim=1)).long()
831
+
832
+ # count number of generated tokens in this iteration if not stopped
833
+ n_generated_iter = torch.where(finished, 0, unmask_idx.sum(dim=1))
834
+ n_generated_tokens += n_generated_iter
835
+
836
+ # Only update the positions where unmask_idx is True AND the sequence is not finished (TODO: check this; finished sequences are not excluded from the update yet)
837
  x_t[:, start:end][unmask_idx] = x_1[unmask_idx]
838
 
839
+ new_tokens = input_ids[:, original_input_length:]
840
+ has_stop_now = (new_tokens == stop_token).any(dim=1)
841
+ finished |= has_stop_now
842
+
843
  input_ids = x_t
844
+ attention_mask = curr_attention_mask
845
+
846
+ print("Generated iterations per sequence:", iterations)
847
+ print("Generated tokens per sequence:", n_generated_tokens)
848
+ print("Average tokens generated per iteration:", (n_generated_tokens / iterations))
849
+ # Final truncation: for each sequence, keep everything up to its first stop_token and pad the rest
850
+ new_tokens = input_ids[:, original_input_length:]
851
+ has_stop = (new_tokens == stop_token)
852
+
853
+ gen = input_ids[:, original_input_length:] # (B, T)
854
+
855
+ T = gen.size(1)
856
+
857
+ if T > 0:
858
+ device = input_ids.device
859
+ B = input_ids.size(0)
860
+
861
+ idx = torch.arange(T, device=device).unsqueeze(0).expand(B, T)
862
+ stop_mask = gen.eq(stop_token)
863
+
864
+ first_stop = torch.where(stop_mask, idx, torch.full_like(idx, T)).min(dim=1).values
865
+ has_stop = first_stop < T
866
+ keep = torch.where(has_stop, first_stop + 1, torch.full_like(first_stop, T))
867
+
868
+ pad_id = self.config.pad_token_id if getattr(self.config, "pad_token_id", None) is not None else stop_token
869
+ after = idx >= keep.unsqueeze(1)
870
+ gen = gen.clone()
871
+ gen[after] = pad_id
872
+
873
+ input_ids = torch.cat([input_ids[:, :original_input_length], gen], dim=1)
874
+
875
  return input_ids
876
 
877
  def sample_with_top_p(self, logits, top_p=0.95, temperature=1.0):
 
882
  p_1t = torch.softmax(logits, dim=-1)
883
  x_1 = p_1t.argmax(dim=-1)
884
  return x_1, p_1t
885
+ probs = torch.softmax(scaled_logits, dim=-1) # [B, seq_len, vocab_size]
 
886
 
887
+ sorted_probs, sorted_indices = torch.sort(probs, dim=-1, descending=True) # [B, seq_len, sorted(vocab_size)]
888
+ cumulative_probs = torch.cumsum(sorted_probs, dim=-1) # [B, seq_len, cumsum(sorted(vocab_size))]
889
 
890
+ sorted_indices_to_remove = cumulative_probs > top_p # [B, seq_len, bool(sorted(vocab_size))]
891
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() # clone the tensor to avoid in-place operation error
892
+ sorted_indices_to_remove[..., 0] = 0 # always keep at least one token
893
 
894
  indices_to_remove = torch.zeros_like(probs, dtype=torch.bool).scatter_(
895
  dim=-1, index=sorted_indices, src=sorted_indices_to_remove
896
+ ) # [B, seq_len, vocab_size]: start from an all-False tensor and
897
+ # set True at the indices where sorted_indices_to_remove is True
898
+ # we index using the sorted indices in order to put the values back to their original position
899
+
900
+ # prev: probs[indices_to_remove] = 0, indices_to_remove is of the same shape as probs
901
+ # and therefore that operation selected exactly those elements; masked_fill below is the out-of-place equivalent
902
+ probs = probs.masked_fill(indices_to_remove, 0.0)
903
 
904
+ probs_sum = probs.sum(dim=-1, keepdim=True).clamp_min(1e-12)
905
+ p_1t = probs / probs_sum
 
 
906
 
907
+ vocab = p_1t.shape[-1]
908
+ flat = p_1t.reshape(-1, vocab)
909
+ samples = torch.multinomial(flat, num_samples=1).squeeze(-1)
910
+ x_1 = samples.view(*p_1t.shape[:-1])
911
 
912
+ return x_1, p_1t
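A small illustrative sketch (assumed helper name, plain PyTorch) of the per-sequence stop handling used by the batched generate() above: instead of truncating the whole batch at one stop_token, everything after each row's first stop_token is overwritten with a pad id.

import torch

def pad_after_first_stop(gen, stop_token, pad_id):
    # gen: [B, T] generated tokens with the prompt already stripped off
    B, T = gen.shape
    idx = torch.arange(T, device=gen.device).unsqueeze(0).expand(B, T)
    stop_mask = gen.eq(stop_token)
    # position of the first stop token per row, T if there is none
    first_stop = torch.where(stop_mask, idx, torch.full_like(idx, T)).min(dim=1).values
    keep = torch.where(first_stop < T, first_stop + 1, torch.full_like(first_stop, T))
    out = gen.clone()
    out[idx >= keep.unsqueeze(1)] = pad_id        # blank out everything after the first stop
    return out

gen = torch.tensor([[5, 7, 2, 9, 9],     # 2 = stop token
                    [5, 7, 8, 9, 4]])    # no stop token in this row
print(pad_after_first_stop(gen, stop_token=2, pad_id=0))
# tensor([[5, 7, 2, 0, 0],
#         [5, 7, 8, 9, 4]])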
modeling_old.py ADDED
@@ -0,0 +1,785 @@
1
+ from typing import Callable, Optional, Union
2
+ from dataclasses import dataclass
3
+
4
+ import torch
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+ from functools import partial
8
+
9
+ from transformers.activations import ACT2FN
10
+ from transformers.cache_utils import Cache, DynamicCache
11
+ from transformers.generation import GenerationMixin
12
+ from transformers.integrations import use_kernel_forward_from_hub
13
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
14
+ from transformers.modeling_layers import GradientCheckpointingLayer
15
+ from transformers.modeling_outputs import (
16
+ BaseModelOutputWithPast,
17
+ CausalLMOutputWithPast,
18
+ )
19
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
20
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
21
+ from transformers.processing_utils import Unpack
22
+ from transformers.utils import auto_docstring, can_return_tuple, logging
23
+ from .configuration import Fast_dLLM_QwenConfig
24
+ from torch.nn.attention.flex_attention import flex_attention, create_block_mask
25
+ from einops import rearrange, repeat
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+
30
+ @dataclass
31
+ class CausalLMOutputWithPastAndBlockCache(CausalLMOutputWithPast):
32
+ block_past_key_values: Optional[Cache] = None
33
+
34
+ @dataclass
35
+ class BaseModelOutputWithPastAndBlockCache(BaseModelOutputWithPast):
36
+ block_past_key_values: Optional[Cache] = None
37
+
38
+
39
+ @torch.compile(fullgraph=True, mode="max-autotune-no-cudagraphs")
40
+ def fused_flex_attention(q, k, v, mask=None):
41
+ return flex_attention(q, k, v, block_mask=mask, enable_gqa=True)
42
+
43
+ def block_diff_mask(b, h, q_idx, kv_idx, block_size=None, n=None):
44
+ """
45
+ Constructs the specialized block diffusion attention mask for training
46
+ composed of three masks:
47
+ - **Block Diagonal Mask (M_BD)**: Self-attention within noised blocks
48
+ - **Offset Block Causal Mask (M_OBC)**: Cross-attention for conditional context
49
+ - **Block Causal Mask (M_BC)**: Attention to update x0
50
+
51
+ Args:
52
+ b, h: Batch and head indices (ignored for mask logic).
53
+ q_idx, kv_idx: Query and Key indices.
54
+ seq_len: Total sequence length.
55
+ block_size: Defines the block structure.
56
+
57
+ Returns:
58
+ A boolean attention mask.
59
+ """
60
+ # Indicate whether token belongs to xt or x0
61
+ x0_flag_q = (q_idx >= n)
62
+ x0_flag_kv = (kv_idx >= n)
63
+
64
+ # Compute block indices
65
+ block_q = torch.where(x0_flag_q == 1,
66
+ (q_idx - n) // block_size,
67
+ q_idx // block_size)
68
+ block_kv = torch.where(x0_flag_kv == 1,
69
+ (kv_idx - n) // block_size,
70
+ kv_idx // block_size)
71
+
72
+ # **1. Block Diagonal Mask (M_BD) **
73
+ block_diagonal = (block_q == block_kv) & (x0_flag_q == x0_flag_kv)
74
+
75
+ # **2. Offset Block-Causal Mask (M_OBC) **
76
+ offset_block_causal = (
77
+ (block_q > block_kv)
78
+ & (x0_flag_kv == 1)
79
+ & (x0_flag_q == 0)
80
+ )
81
+
82
+ # **3. Block-Causal Mask (M_BC) **
83
+ block_causal = (block_q >= block_kv) & (x0_flag_kv == 1) & (x0_flag_q == 1)
84
+
85
+ # **4. Combine Masks **
86
+ return block_diagonal | offset_block_causal | block_causal
87
+
88
+ def eval_block_diff_mask(q_idx, kv_idx, block_size=None):
89
+ # Compute block indices
90
+ block_q = q_idx // block_size
91
+ block_kv = kv_idx // block_size
92
+
93
+ return block_q >= block_kv
94
+
95
+ class Fast_dLLM_QwenMLP(nn.Module):
96
+ def __init__(self, config):
97
+ super().__init__()
98
+ self.config = config
99
+ self.hidden_size = config.hidden_size
100
+ self.intermediate_size = config.intermediate_size
101
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
102
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
103
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
104
+ self.act_fn = ACT2FN[config.hidden_act]
105
+
106
+ def forward(self, x):
107
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
108
+ return down_proj
109
+
110
+
111
+ def rotate_half(x):
112
+ """Rotates half the hidden dims of the input."""
113
+ x1 = x[..., : x.shape[-1] // 2]
114
+ x2 = x[..., x.shape[-1] // 2 :]
115
+ return torch.cat((-x2, x1), dim=-1)
116
+
117
+
118
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
119
+ """Applies Rotary Position Embedding to the query and key tensors.
120
+
121
+ Args:
122
+ q (`torch.Tensor`): The query tensor.
123
+ k (`torch.Tensor`): The key tensor.
124
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
125
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
126
+ position_ids (`torch.Tensor`, *optional*):
127
+ Deprecated and unused.
128
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
129
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
130
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
131
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
132
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
133
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
134
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
135
+ Returns:
136
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
137
+ """
138
+ cos = cos.unsqueeze(unsqueeze_dim)
139
+ sin = sin.unsqueeze(unsqueeze_dim)
140
+ q_embed = (q * cos) + (rotate_half(q) * sin)
141
+ k_embed = (k * cos) + (rotate_half(k) * sin)
142
+ return q_embed, k_embed
143
+
144
+
145
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
146
+ """
147
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
148
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
149
+ """
150
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
151
+ if n_rep == 1:
152
+ return hidden_states
153
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
154
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
155
+
156
+
157
+ class Fast_dLLM_QwenAttention(nn.Module):
158
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
159
+
160
+ def __init__(self, config: Fast_dLLM_QwenConfig, layer_idx: int):
161
+ super().__init__()
162
+ self.config = config
163
+ self.layer_idx = layer_idx
164
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
165
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
166
+ self.scaling = self.head_dim**-0.5
167
+ self.attention_dropout = config.attention_dropout
168
+ self.is_causal = True
169
+ self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True)
170
+ self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
171
+ self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
172
+ self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
173
+ self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
174
+
175
+ def forward(
176
+ self,
177
+ hidden_states: torch.Tensor,
178
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
179
+ attention_mask: Optional[torch.Tensor],
180
+ past_key_value: Optional[Cache] = None,
181
+ cache_position: Optional[torch.LongTensor] = None,
182
+ update_past_key_values: Optional[bool] = False,
183
+ block_past_key_values: Optional[Cache] = None,
184
+ replace_position: Optional[int] = None,
185
+ **kwargs: Unpack[FlashAttentionKwargs],
186
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
187
+ input_shape = hidden_states.shape[:-1]
188
+ hidden_shape = (*input_shape, -1, self.head_dim)
189
+
190
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
191
+ key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
192
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
193
+
194
+ cos, sin = position_embeddings
195
+ # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
196
+ if self.training:
197
+ #split q into two parts
198
+ q_1 = query_states[:,:,:query_states.shape[2]//2]
199
+ q_2 = query_states[:,:,query_states.shape[2]//2:]
200
+ #split k into two parts
201
+ k_1 = key_states[:,:,:key_states.shape[2]//2]
202
+ k_2 = key_states[:,:,key_states.shape[2]//2:]
203
+ q_1, k_1 = apply_rotary_pos_emb(q_1, k_1, cos, sin)
204
+ q_2, k_2 = apply_rotary_pos_emb(q_2, k_2, cos, sin)
205
+ query_states = torch.cat((q_1, q_2), dim=-2)
206
+ key_states = torch.cat((k_1, k_2), dim=-2)
207
+ else:
208
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
209
+
210
+ if block_past_key_values is not None:
211
+ if len(block_past_key_values) <= self.layer_idx:
212
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
213
+ key_states, value_states = block_past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
214
+ else:
215
+ block_cache_key_states = block_past_key_values[self.layer_idx][0]
216
+ block_cache_value_states = block_past_key_values[self.layer_idx][1]
217
+
218
+ block_cache_key_states[:, :, replace_position:replace_position+key_states.shape[2]] = key_states
219
+ block_cache_value_states[:, :, replace_position:replace_position+value_states.shape[2]] = value_states
220
+ key_states = block_cache_key_states
221
+ value_states = block_cache_value_states
222
+
223
+ if past_key_value is not None:
224
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
225
+ if update_past_key_values:
226
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
227
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
228
+ elif len(past_key_value) > self.layer_idx:
229
+ key_states = torch.cat((past_key_value[self.layer_idx][0], key_states), dim=-2)
230
+ value_states = torch.cat((past_key_value[self.layer_idx][1], value_states), dim=-2)
231
+
232
+ if self.training:
233
+ attn_output = fused_flex_attention(query_states, key_states, value_states, mask=attention_mask)
234
+ attn_output = attn_output.transpose(1, 2).contiguous()
235
+ else:
236
+ attention_interface = ALL_ATTENTION_FUNCTIONS["sdpa"]
237
+
238
+ attn_output, attn_weights = attention_interface(
239
+ self,
240
+ query_states,
241
+ key_states,
242
+ value_states,
243
+ attention_mask,
244
+ is_causal=False,
245
+ dropout=0.0 if not self.training else self.attention_dropout,
246
+ scaling=self.scaling,
247
+ sliding_window=self.sliding_window, # main diff with Llama
248
+ **kwargs,
249
+ )
250
+
251
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
252
+ attn_output = self.o_proj(attn_output)
253
+ return attn_output
254
+
255
+ @use_kernel_forward_from_hub("RMSNorm")
256
+ class Fast_dLLM_QwenRMSNorm(nn.Module):
257
+ def __init__(self, hidden_size, eps=1e-6):
258
+ """
259
+ Fast_dLLM_QwenRMSNorm is equivalent to T5LayerNorm
260
+ """
261
+ super().__init__()
262
+ self.weight = nn.Parameter(torch.ones(hidden_size))
263
+ self.variance_epsilon = eps
264
+
265
+ def forward(self, hidden_states):
266
+ input_dtype = hidden_states.dtype
267
+ hidden_states = hidden_states.to(torch.float32)
268
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
269
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
270
+ return self.weight * hidden_states.to(input_dtype)
271
+
272
+ def extra_repr(self):
273
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
274
+
275
+
276
+ class Fast_dLLM_QwenDecoderLayer(GradientCheckpointingLayer):
277
+ def __init__(self, config: Fast_dLLM_QwenConfig, layer_idx: int):
278
+ super().__init__()
279
+ self.hidden_size = config.hidden_size
280
+
281
+ self.self_attn = Fast_dLLM_QwenAttention(config=config, layer_idx=layer_idx)
282
+
283
+ self.mlp = Fast_dLLM_QwenMLP(config)
284
+ self.input_layernorm = Fast_dLLM_QwenRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
285
+ self.post_attention_layernorm = Fast_dLLM_QwenRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
286
+ self.attention_type = config.layer_types[layer_idx]
287
+
288
+ def forward(
289
+ self,
290
+ hidden_states: torch.Tensor,
291
+ attention_mask: Optional[torch.Tensor] = None,
292
+ position_ids: Optional[torch.LongTensor] = None,
293
+ past_key_value: Optional[Cache] = None,
294
+ use_cache: Optional[bool] = False,
295
+ cache_position: Optional[torch.LongTensor] = None,
296
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
297
+ update_past_key_values: Optional[bool] = False,
298
+ use_block_cache: Optional[bool] = False,
299
+ block_past_key_values: Optional[Cache] = None,
300
+ replace_position: Optional[int] = None,
301
+ **kwargs
302
+ ) -> tuple[torch.Tensor]:
303
+ residual = hidden_states
304
+ hidden_states = self.input_layernorm(hidden_states)
305
+ # Self Attention
306
+ hidden_states = self.self_attn(
307
+ hidden_states=hidden_states,
308
+ attention_mask=attention_mask,
309
+ position_ids=position_ids,
310
+ past_key_value=past_key_value,
311
+ use_cache=use_cache,
312
+ cache_position=cache_position,
313
+ position_embeddings=position_embeddings,
314
+ update_past_key_values=update_past_key_values,
315
+ use_block_cache=use_block_cache,
316
+ block_past_key_values=block_past_key_values,
317
+ replace_position=replace_position,
318
+ **kwargs,
319
+ )
320
+ hidden_states = residual + hidden_states
321
+
322
+ # Fully Connected
323
+ residual = hidden_states
324
+ hidden_states = self.post_attention_layernorm(hidden_states)
325
+ hidden_states = self.mlp(hidden_states)
326
+ hidden_states = residual + hidden_states
327
+ return hidden_states
328
+
329
+
330
+
331
+ class Fast_dLLM_QwenPreTrainedModel(PreTrainedModel):
332
+ config_class = Fast_dLLM_QwenConfig
333
+ base_model_prefix = "model"
334
+ supports_gradient_checkpointing = True
335
+ _no_split_modules = ["Fast_dLLM_QwenDecoderLayer"]
336
+ _skip_keys_device_placement = ["past_key_values"]
337
+ _supports_flash_attn_2 = True
338
+ _supports_sdpa = True
339
+ _supports_flex_attn = True
340
+ _supports_cache_class = True
341
+ _supports_quantized_cache = True
342
+ _supports_static_cache = True
343
+ _supports_attention_backend = True
344
+ _can_record_outputs = {
345
+ "hidden_states": Fast_dLLM_QwenDecoderLayer,
346
+ "attentions": Fast_dLLM_QwenAttention,
347
+ }
348
+
349
+ def _init_weights(self, module):
350
+ std = self.config.initializer_range
351
+ if isinstance(module, nn.Linear):
352
+ module.weight.data.normal_(mean=0.0, std=std)
353
+ if module.bias is not None:
354
+ module.bias.data.zero_()
355
+ elif isinstance(module, nn.Embedding):
356
+ module.weight.data.normal_(mean=0.0, std=std)
357
+ if module.padding_idx is not None:
358
+ module.weight.data[module.padding_idx].zero_()
359
+ elif isinstance(module, Fast_dLLM_QwenRMSNorm):
360
+ module.weight.data.fill_(1.0)
361
+
362
+
363
+ class Fast_dLLM_QwenRotaryEmbedding(nn.Module):
364
+ def __init__(self, config: Fast_dLLM_QwenConfig, device=None):
365
+ super().__init__()
366
+ # BC: "rope_type" was originally "type"
367
+ if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
368
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
369
+ else:
370
+ self.rope_type = "default"
371
+ self.max_seq_len_cached = config.max_position_embeddings
372
+ self.original_max_seq_len = config.max_position_embeddings
373
+
374
+ self.config = config
375
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
376
+
377
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
378
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
379
+ self.original_inv_freq = self.inv_freq
380
+
381
+ @torch.no_grad()
382
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
383
+ def forward(self, x, position_ids):
384
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
385
+ position_ids_expanded = position_ids[:, None, :].float()
386
+
387
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
388
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
389
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
390
+ emb = torch.cat((freqs, freqs), dim=-1)
391
+ cos = emb.cos() * self.attention_scaling
392
+ sin = emb.sin() * self.attention_scaling
393
+
394
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
395
+
396
+
397
+
398
+ class Fast_dLLM_QwenModel(Fast_dLLM_QwenPreTrainedModel):
399
+ def __init__(self, config: Fast_dLLM_QwenConfig):
400
+ super().__init__(config)
401
+ self.padding_idx = config.pad_token_id
402
+ self.vocab_size = config.vocab_size
403
+ self.bd_size = config.bd_size
404
+
405
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
406
+ self.layers = nn.ModuleList(
407
+ [Fast_dLLM_QwenDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
408
+ )
409
+ self.norm = Fast_dLLM_QwenRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
410
+ self.rotary_emb = Fast_dLLM_QwenRotaryEmbedding(config=config)
411
+ self.gradient_checkpointing = True
412
+
413
+ # Initialize weights and apply final processing
414
+ self.post_init()
415
+
416
+ def get_input_embeddings(self):
417
+ return self.embed_tokens
418
+
419
+ def set_input_embeddings(self, value):
420
+ self.embed_tokens = value
421
+
422
+
423
+ def eval_mask(self, seqlen, block_size, cache_seq_len):
424
+ q_indices = torch.arange(seqlen) + cache_seq_len
425
+ k_indices = torch.arange(seqlen + cache_seq_len)
426
+ mask = eval_block_diff_mask(
427
+ q_idx=q_indices[:, None],
428
+ kv_idx=k_indices[None, :],
429
+ block_size=block_size
430
+ )
431
+ return mask
432
+
433
+ def gen_mask(self, seqlen, block_size, B, H):
434
+ mask = create_block_mask(
435
+ partial(block_diff_mask, block_size=block_size, n=seqlen),
436
+ B=B, H=H, Q_LEN=seqlen*2, KV_LEN=seqlen*2)
437
+
438
+ return mask
439
+
440
+ def forward(
441
+ self,
442
+ input_ids: Optional[torch.LongTensor] = None,
443
+ labels: Optional[torch.LongTensor] = None,
444
+ attention_mask: Optional[torch.Tensor] = None,
445
+ position_ids: Optional[torch.LongTensor] = None,
446
+ past_key_values: Optional[Cache] = None,
447
+ inputs_embeds: Optional[torch.FloatTensor] = None,
448
+ use_cache: Optional[bool] = None,
449
+ cache_position: Optional[torch.LongTensor] = None,
450
+ update_past_key_values: Optional[bool] = False,
451
+ block_size: Optional[int] = 32,
452
+ use_block_cache: Optional[bool] = False,
453
+ block_past_key_values: Optional[Cache] = None,
454
+ replace_position: Optional[int] = None,
455
+ **kwargs
456
+ ) -> BaseModelOutputWithPast:
457
+ if (input_ids is None) ^ (inputs_embeds is not None):
458
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
459
+
460
+ if inputs_embeds is None:
461
+ inputs_embeds = self.embed_tokens(input_ids)
462
+
463
+ if use_cache and past_key_values is None:
464
+ past_key_values = DynamicCache()
465
+
466
+ if use_block_cache and block_past_key_values is None:
467
+ block_past_key_values = DynamicCache()
468
+
469
+ if cache_position is None:
470
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
471
+ if self.training:
472
+ cache_position = torch.arange(
473
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1]//2, device=inputs_embeds.device
474
+ )
475
+ else:
476
+ if use_block_cache:
477
+ block_start_position = past_seen_tokens+replace_position if replace_position is not None else past_seen_tokens
478
+ cache_position = torch.arange(
479
+ block_start_position, block_start_position + inputs_embeds.shape[1], device=inputs_embeds.device
480
+ )
481
+ else:
482
+ cache_position = torch.arange(
483
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1] if not self.training else inputs_embeds.shape[1]//2, device=inputs_embeds.device
484
+ )
485
+
486
+ if position_ids is None:
487
+ position_ids = cache_position.unsqueeze(0)
488
+
489
+ if self.training:
490
+ attention_mask = self.gen_mask(labels.shape[1], self.bd_size, labels.shape[0], self.config.num_attention_heads).to(device=inputs_embeds.device)
491
+ else:
492
+ if use_block_cache and block_past_key_values.get_seq_length() != 0:
493
+ attention_mask = None
494
+ else:
495
+ attention_mask = self.eval_mask(input_ids.shape[1], block_size, past_key_values.get_seq_length() if past_key_values is not None else 0).to(device=inputs_embeds.device)
496
+
497
+ hidden_states = inputs_embeds
498
+
499
+ # create position embeddings to be shared across the decoder layers
500
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
501
+
502
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
503
+ hidden_states = decoder_layer(
504
+ hidden_states,
505
+ attention_mask=attention_mask,
506
+ position_ids=position_ids,
507
+ past_key_value=past_key_values,
508
+ use_cache=use_cache,
509
+ cache_position=cache_position,
510
+ position_embeddings=position_embeddings,
511
+ update_past_key_values=update_past_key_values,
512
+ use_block_cache=use_block_cache,
513
+ block_past_key_values=block_past_key_values,
514
+ replace_position=replace_position,
515
+ **kwargs,
516
+ )
517
+
518
+ hidden_states = self.norm(hidden_states)
519
+ return BaseModelOutputWithPastAndBlockCache(
520
+ last_hidden_state=hidden_states,
521
+ past_key_values=past_key_values if use_cache else None,
522
+ block_past_key_values=block_past_key_values if use_block_cache else None,
523
+ )
524
+
525
+
526
+ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
527
+ _tied_weights_keys = ["lm_head.weight"]
528
+ _tp_plan = {"lm_head": "colwise_rep"}
529
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
530
+
531
+ def __init__(self, config):
532
+ super().__init__(config)
533
+ self.model = Fast_dLLM_QwenModel(config)
534
+ self.vocab_size = config.vocab_size
535
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
536
+
537
+ # Initialize weights and apply final processing
538
+ self.post_init()
539
+
540
+ def get_input_embeddings(self):
541
+ return self.model.embed_tokens
542
+
543
+ def set_input_embeddings(self, value):
544
+ self.model.embed_tokens = value
545
+
546
+ def get_output_embeddings(self):
547
+ return self.lm_head
548
+
549
+ def set_output_embeddings(self, new_embeddings):
550
+ self.lm_head = new_embeddings
551
+
552
+ def set_decoder(self, decoder):
553
+ self.model = decoder
554
+
555
+ def get_decoder(self):
556
+ return self.model
557
+
558
+ @can_return_tuple
559
+ def forward(
560
+ self,
561
+ input_ids: Optional[torch.LongTensor] = None,
562
+ attention_mask: Optional[torch.Tensor] = None,
563
+ position_ids: Optional[torch.LongTensor] = None,
564
+ past_key_values: Optional[Cache] = None,
565
+ inputs_embeds: Optional[torch.FloatTensor] = None,
566
+ labels: Optional[torch.LongTensor] = None,
567
+ use_cache: Optional[bool] = None,
568
+ cache_position: Optional[torch.LongTensor] = None,
569
+ logits_to_keep: Union[int, torch.Tensor] = 0,
570
+ update_past_key_values: Optional[bool] = False,
571
+ block_size: Optional[int] = 32,
572
+ use_block_cache: Optional[bool] = False,
573
+ block_past_key_values: Optional[Cache] = None,
574
+ replace_position: Optional[int] = None,
575
+ mask_id: Optional[int] = 151665,
576
+ **kwargs
577
+ ) -> CausalLMOutputWithPastAndBlockCache:
578
+
579
+ if self.training:
580
+ original_labels = labels.clone()
581
+ original_input_ids = input_ids.clone()
582
+
583
+ noisy_input_ids = input_ids.clone()
584
+
585
+ input_ids = input_ids.reshape(input_ids.shape[0] * input_ids.shape[1] // self.model.bd_size, self.model.bd_size)
586
+ b, l = input_ids.shape
587
+ t = torch.rand((b,), device=input_ids.device)
588
+ eps=1e-3
589
+ p_mask = (1 - eps) * t + eps
590
+ p_mask = p_mask[:, None].repeat(1, l)
591
+
592
+ mask_indices = torch.rand((b, l), device=input_ids.device) < p_mask
593
+ x_t = torch.where(mask_indices, mask_id, input_ids).reshape(labels.shape)
594
+ noisy_input_ids[labels != -100] = x_t[labels != -100]
595
+ mask = (noisy_input_ids != mask_id)
596
+ labels[mask] = -100
597
+ input_ids = torch.cat([noisy_input_ids, input_ids.reshape(labels.shape)], dim=1)
598
+
599
+ complementary_noisy_input_ids = original_input_ids.clone()
600
+ complementary_labels = original_labels.clone()
601
+
602
+ complementary_input_ids = original_input_ids.reshape(original_input_ids.shape[0] * original_input_ids.shape[1] // self.model.bd_size, self.model.bd_size)
603
+
604
+ complementary_mask_indices = ~mask_indices
605
+ complementary_x_t = torch.where(complementary_mask_indices, mask_id, complementary_input_ids).reshape(labels.shape)
606
+ complementary_noisy_input_ids[complementary_labels != -100] = complementary_x_t[complementary_labels != -100]
607
+ complementary_mask = (complementary_noisy_input_ids != mask_id)
608
+ complementary_labels[complementary_mask] = -100
609
+ complementary_input_ids = torch.cat([complementary_noisy_input_ids, complementary_input_ids.reshape(complementary_labels.shape)], dim=1)
610
+
611
+ input_ids = torch.cat([input_ids, complementary_input_ids], dim=0)
612
+ labels = torch.cat([labels, complementary_labels], dim=0)
613
+
614
+ outputs: BaseModelOutputWithPastAndBlockCache = self.model(
615
+ input_ids=input_ids,
616
+ labels=labels,
617
+ attention_mask=attention_mask,
618
+ position_ids=position_ids,
619
+ past_key_values=past_key_values,
620
+ inputs_embeds=inputs_embeds,
621
+ use_cache=use_cache,
622
+ cache_position=cache_position,
623
+ update_past_key_values=update_past_key_values,
624
+ block_size=block_size,
625
+ use_block_cache=use_block_cache,
626
+ block_past_key_values=block_past_key_values,
627
+ replace_position=replace_position,
628
+ **kwargs,
629
+ )
630
+
631
+ hidden_states = outputs.last_hidden_state
632
+ if self.training:
633
+ hidden_states = hidden_states[:, :hidden_states.shape[1]//2, :]
634
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
635
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
636
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
637
+
638
+ loss = None
639
+ if labels is not None:
640
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
641
+
642
+ return CausalLMOutputWithPastAndBlockCache(
643
+ loss=loss,
644
+ logits=logits,
645
+ past_key_values=outputs.past_key_values,
646
+ hidden_states=outputs.hidden_states,
647
+ attentions=outputs.attentions,
648
+ block_past_key_values=outputs.block_past_key_values,
649
+ )
650
+
651
+ @torch.no_grad()
652
+ def generate(
653
+ self,
654
+ input_ids,
655
+ max_new_tokens,
656
+ mask_id=151665,
657
+ threshold=1,
658
+ small_block_size=8,
659
+ block_size=32,
660
+ stop_token=151645,
661
+ stopping_criteria=None,
662
+ top_p=0.95,
663
+ temperature=0,
664
+ use_block_cache=False,
665
+ **kwargs
666
+ ):
667
+ num_blocks = max_new_tokens // block_size
668
+ original_input_length = input_ids.shape[1]
669
+
670
+ if input_ids.shape[1] > block_size:
671
+ output = self.forward(input_ids=input_ids[:, :(input_ids.shape[1] // block_size * block_size)], use_cache=True, update_past_key_values=True, block_size=block_size)
672
+ logits, past_key_values = output.logits, output.past_key_values
673
+ if input_ids.shape[1] % block_size == 0:
674
+ next_token = logits[:, -1:, :].argmax(dim=-1)
675
+ input_ids = torch.cat([input_ids, next_token], dim=1)
676
+ else:
677
+ past_key_values = None
678
+
679
+ num_small_blocks = block_size // small_block_size
680
+
681
+ for block_idx in range(num_blocks):
682
+ if stop_token in input_ids[:, original_input_length:]:
683
+ break
684
+ prompt_length = input_ids.shape[1]
685
+ # Initialize x_init with mask_id
686
+ x_init = mask_id * torch.ones((input_ids.shape[0], block_size-prompt_length%block_size), device=self.device, dtype=torch.long)
687
+ x_init = torch.cat([input_ids, x_init], dim=1)
688
+
689
+ x_t = x_init.clone()
690
+ block_past_key_values = None
691
+ while True:
692
+ if stop_token in x_t[:, prompt_length:]:
693
+ stop_token_idx = (x_t[:, prompt_length:] == stop_token).nonzero()[0][1]
694
+ if (x_t[:, prompt_length:prompt_length+stop_token_idx] == mask_id).sum() == 0:
695
+ break
696
+ mask_idx = (x_t[:, -block_size:] == mask_id)
697
+ # Decode a complete block, update cache, and generate the next token
698
+ if mask_idx.sum() == 0:
699
+ output = self.forward(input_ids=x_t[:, -block_size:], use_cache=True, past_key_values=past_key_values, update_past_key_values=True, block_size=block_size)
700
+ logits, past_key_values = output.logits, output.past_key_values
701
+ next_token = logits[:, -1:, :].argmax(dim=-1)
702
+ x_t = torch.cat([x_t, next_token], dim=1)
703
+ break
704
+ for small_block_idx in range(num_small_blocks):
705
+ small_block_start_idx = small_block_idx * small_block_size
706
+ small_block_end_idx = small_block_start_idx + small_block_size
707
+
708
+ start = -block_size + small_block_start_idx
709
+ end = None if block_size == small_block_end_idx else -block_size + small_block_end_idx
710
+ while True:
711
+ mask_idx = (x_t[:, -block_size:] == mask_id)
712
+ if mask_idx[:, start:end].sum() == 0:
713
+ break
714
+ if stop_token in x_t[:, prompt_length:]:
715
+ stop_token_idx = (x_t[:, prompt_length:] == stop_token).nonzero()[0][1]
716
+ if (x_t[:, prompt_length:prompt_length+stop_token_idx] == mask_id).sum() == 0:
717
+ break
718
+
719
+ if use_block_cache:
720
+ if block_past_key_values is None or (x_t[:, -block_size+small_block_start_idx] == mask_id).any():
721
+ output = self.forward(input_ids=x_t[:, -block_size:], use_cache=True, past_key_values=past_key_values, update_past_key_values=False, use_block_cache=True)
722
+ logits, block_past_key_values = output.logits, output.block_past_key_values
723
+ logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
724
+ logits = logits[:, start:end]
725
+ else:
726
+ logits = self.forward(input_ids=x_t[:,start:end], use_cache=True, past_key_values=past_key_values, update_past_key_values=False, use_block_cache=True, block_past_key_values=block_past_key_values, replace_position=small_block_start_idx).logits
727
+ logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
728
+ else:
729
+ logits = self.forward(input_ids=x_t[:, -block_size:], use_cache=True, past_key_values=past_key_values, update_past_key_values=False).logits
730
+ logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
731
+ logits = logits[:, start:end]
732
+
733
+
734
+ x_1, p_1t = self.sample_with_top_p(logits, top_p=top_p, temperature=temperature)
735
+ # Select tokens with probability greater than threshold from p_1t
736
+ x1_p = torch.squeeze(torch.gather(p_1t, dim=-1, index=torch.unsqueeze(x_1, -1)), -1)
737
+ x1_p = torch.where(mask_idx[:, start:end], x1_p, -torch.inf)
738
+
739
+ unmask_idx = (x1_p > threshold)
740
+ max_prob_idx = x1_p.argmax(dim=-1)
741
+ unmask_idx[torch.arange(x_1.shape[0]), max_prob_idx] = True
742
+ unmask_idx = unmask_idx & mask_idx[:, start:end]
743
+
744
+ x_t[:, start:end][unmask_idx] = x_1[unmask_idx]
745
+
746
+ input_ids = x_t
747
+ # Truncate stop_token
748
+ if stop_token in input_ids[:, original_input_length:]:
749
+ stop_token_idx = (input_ids[:, original_input_length:] == stop_token).nonzero()[0][1]
750
+ input_ids = input_ids[:, :stop_token_idx+original_input_length+1]
751
+ return input_ids
752
+
753
+ def sample_with_top_p(self, logits, top_p=0.95, temperature=1.0):
754
+ # Calculate probabilities
755
+ if temperature > 0:
756
+ scaled_logits = logits / temperature
757
+ else:
758
+ p_1t = torch.softmax(logits, dim=-1)
759
+ x_1 = p_1t.argmax(dim=-1)
760
+ return x_1, p_1t
761
+
762
+ probs = F.softmax(scaled_logits, dim=-1)
763
+
764
+ sorted_probs, sorted_indices = torch.sort(probs, descending=True)
765
+ cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
766
+
767
+ sorted_indices_to_remove = cumulative_probs > top_p
768
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
769
+ sorted_indices_to_remove[..., 0] = 0
770
+
771
+ indices_to_remove = torch.zeros_like(probs, dtype=torch.bool).scatter_(
772
+ dim=-1, index=sorted_indices, src=sorted_indices_to_remove
773
+ )
774
+
775
+ probs[indices_to_remove] = 0
776
+
777
+ # Renormalize so that the probabilities of remaining tokens sum to 1
778
+ # Add a small epsilon value to prevent division by zero
779
+ probs_sum = torch.sum(probs, dim=-1, keepdim=True)
780
+ normalized_probs = probs / probs_sum
781
+
782
+ p_1t = normalized_probs
783
+ x_1 = torch.multinomial(p_1t[0], num_samples=1).unsqueeze(0).squeeze(-1)
784
+
785
+ return x_1, p_1t
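For reference, a self-contained sketch of batched nucleus (top-p) sampling in the shape the new modeling.py uses (the old sampler above draws only from p_1t[0], i.e. batch element 0); function names and the epsilon value are illustrative.

import torch

def top_p_sample(logits, top_p=0.95, temperature=1.0):
    # logits: [B, T, V]
    probs = torch.softmax(logits / temperature, dim=-1)
    sorted_probs, sorted_idx = torch.sort(probs, dim=-1, descending=True)
    cum = sorted_probs.cumsum(dim=-1)
    remove = cum > top_p
    remove[..., 1:] = remove[..., :-1].clone()    # shift so the token that crosses top_p is kept
    remove[..., 0] = False                        # always keep the most likely token
    to_remove = torch.zeros_like(probs, dtype=torch.bool).scatter_(-1, sorted_idx, remove)
    probs = probs.masked_fill(to_remove, 0.0)
    probs = probs / probs.sum(dim=-1, keepdim=True).clamp_min(1e-12)
    flat = probs.reshape(-1, probs.shape[-1])     # torch.multinomial expects a 2-D input
    samples = torch.multinomial(flat, num_samples=1).squeeze(-1)
    return samples.view(*probs.shape[:-1]), probs

x, p = top_p_sample(torch.randn(2, 3, 10), top_p=0.9, temperature=0.7)
print(x.shape)  # torch.Size([2, 3])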