-
Notifications
You must be signed in to change notification settings - Fork 335
feat: fuse add and rmsnorm #1368
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |||||
| import triton | ||||||
| import triton.language as tl | ||||||
| import os | ||||||
| from lightllm.common.triton_utils.autotuner import autotune | ||||||
|
|
||||||
| rmsnorm_num_warps = int(os.getenv("RMSNORM_WARPS", "8")) | ||||||
|
|
||||||
|
|
@@ -48,6 +49,69 @@ def _rms_norm_fwd_fused( | |||||
| tl.store(Y + cols * y_stride1, y.to(Y.dtype.element_ty), mask=mask) | ||||||
|
|
||||||
|
|
||||||
| @triton.jit | ||||||
| def _add_rms_norm_fwd_fused( | ||||||
| X, | ||||||
| R, | ||||||
| Y, | ||||||
| W, | ||||||
| x_stride0, | ||||||
| x_stride1, | ||||||
| r_stride0, | ||||||
| r_stride1, | ||||||
| y_stride0, | ||||||
| y_stride1, | ||||||
| N, | ||||||
| eps, | ||||||
| HAS_WEIGHT: tl.constexpr, | ||||||
| SINGLE_PASS: tl.constexpr, | ||||||
| BLOCK_SIZE: tl.constexpr, | ||||||
| ): | ||||||
| row = tl.program_id(0) | ||||||
| X += row * x_stride0 | ||||||
| R += row * r_stride0 | ||||||
| Y += row * y_stride0 | ||||||
|
|
||||||
| if SINGLE_PASS: | ||||||
| cols = tl.arange(0, BLOCK_SIZE) | ||||||
| mask = cols < N | ||||||
| x = tl.load(X + cols * x_stride1, mask=mask, other=0.0).to(tl.float32) | ||||||
| r = tl.load(R + cols * r_stride1, mask=mask, other=0.0).to(tl.float32) | ||||||
| x = (x + r).to(X.dtype.element_ty) | ||||||
| tl.store(X + cols * x_stride1, x, mask=mask) | ||||||
|
|
||||||
| x = x.to(tl.float32) | ||||||
| var = tl.sum(x * x, axis=0) / N | ||||||
| y = x * (1 / tl.sqrt(var + eps)) | ||||||
| if HAS_WEIGHT: | ||||||
| w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32) | ||||||
| y *= w | ||||||
| tl.store(Y + cols * y_stride1, y.to(Y.dtype.element_ty), mask=mask) | ||||||
| else: | ||||||
| _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32) | ||||||
| for off in range(0, N, BLOCK_SIZE): | ||||||
| cols = off + tl.arange(0, BLOCK_SIZE) | ||||||
| mask = cols < N | ||||||
| x = tl.load(X + cols * x_stride1, mask=mask, other=0.0).to(tl.float32) | ||||||
| r = tl.load(R + cols * r_stride1, mask=mask, other=0.0).to(tl.float32) | ||||||
| x = (x + r).to(X.dtype.element_ty) | ||||||
| tl.store(X + cols * x_stride1, x, mask=mask) | ||||||
| x = x.to(tl.float32) | ||||||
| _var += x * x | ||||||
|
|
||||||
| var = tl.sum(_var, axis=0) / N | ||||||
| rstd = 1 / tl.sqrt(var + eps) | ||||||
| for off in range(0, N, BLOCK_SIZE): | ||||||
| cols = off + tl.arange(0, BLOCK_SIZE) | ||||||
| mask = cols < N | ||||||
| x = tl.load(X + cols * x_stride1, mask=mask, other=0.0).to(tl.float32) | ||||||
| y = x * rstd | ||||||
| if HAS_WEIGHT: | ||||||
| w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32) | ||||||
| y *= w | ||||||
| tl.store(Y + cols * y_stride1, y.to(Y.dtype.element_ty), mask=mask) | ||||||
|
|
||||||
|
|
||||||
| def rmsnorm_forward(x: torch.Tensor, weight: torch.Tensor, eps: float, out=None): | ||||||
| # allocate output | ||||||
| y = torch.empty_like(x) if out is None else out | ||||||
|
|
@@ -60,7 +124,7 @@ def rmsnorm_forward(x: torch.Tensor, weight: torch.Tensor, eps: float, out=None) | |||||
| assert y.data_ptr() == y_arg.data_ptr() | ||||||
| M, N = x_arg.shape | ||||||
| # Less than 64KB per feature: enqueue fused kernel | ||||||
| MAX_FUSED_SIZE = 65536 // x.element_size() | ||||||
| MAX_FUSED_SIZE = 65536 // x_arg.element_size() | ||||||
| BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) | ||||||
| # print("BLOCK_SIZE:", BLOCK_SIZE) | ||||||
| if N > BLOCK_SIZE: | ||||||
|
|
@@ -86,6 +150,81 @@ def rmsnorm_forward(x: torch.Tensor, weight: torch.Tensor, eps: float, out=None) | |||||
| return y | ||||||
|
|
||||||
|
|
||||||
| def _get_add_rmsnorm_configs(): | ||||||
| return [{"num_warps": nw} for nw in [4, 8, 16]] | ||||||
|
|
||||||
|
|
||||||
| def _get_add_rmsnorm_static_key( | ||||||
| x_arg: torch.Tensor, residual_arg: torch.Tensor, y_arg: torch.Tensor, weight: torch.Tensor | ||||||
| ): | ||||||
| return { | ||||||
| "x_dtype": str(x_arg.dtype), | ||||||
| "residual_dtype": str(residual_arg.dtype), | ||||||
| "out_dtype": str(y_arg.dtype), | ||||||
| "weight_dtype": "none" if weight is None else str(weight.dtype), | ||||||
| "N": x_arg.shape[1], | ||||||
| "has_weight": weight is not None, | ||||||
| } | ||||||
|
|
||||||
|
|
||||||
| @autotune( | ||||||
| kernel_name="add_rmsnorm_forward:v1", | ||||||
| configs_gen_func=_get_add_rmsnorm_configs, | ||||||
| static_key_func=_get_add_rmsnorm_static_key, | ||||||
| run_key_func=lambda x_arg: x_arg.shape[0], | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
Suggested change
|
||||||
| mutates_args=["x_arg", "y_arg"], | ||||||
| ) | ||||||
| def _add_rmsnorm_forward( | ||||||
| x_arg: torch.Tensor, | ||||||
| residual_arg: torch.Tensor, | ||||||
| y_arg: torch.Tensor, | ||||||
| weight: torch.Tensor, | ||||||
| eps: float, | ||||||
| run_config: dict = None, | ||||||
| ): | ||||||
| M, N = x_arg.shape | ||||||
| MAX_FUSED_SIZE = 65536 // x_arg.element_size() | ||||||
| BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) | ||||||
| if N > BLOCK_SIZE: | ||||||
| raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") | ||||||
| if BLOCK_SIZE > 16384: | ||||||
| BLOCK_SIZE = 16384 | ||||||
| if not run_config: | ||||||
| run_config = {"num_warps": rmsnorm_num_warps} | ||||||
| _add_rms_norm_fwd_fused[(M,)]( | ||||||
| x_arg, | ||||||
| residual_arg, | ||||||
| y_arg, | ||||||
| weight, | ||||||
| x_arg.stride(0), | ||||||
| x_arg.stride(1), | ||||||
| residual_arg.stride(0), | ||||||
| residual_arg.stride(1), | ||||||
| y_arg.stride(0), | ||||||
| y_arg.stride(1), | ||||||
| N, | ||||||
| eps, | ||||||
| HAS_WEIGHT=weight is not None, | ||||||
| SINGLE_PASS=N <= BLOCK_SIZE, | ||||||
| BLOCK_SIZE=BLOCK_SIZE, | ||||||
| num_warps=run_config["num_warps"], | ||||||
| ) | ||||||
| return y_arg | ||||||
|
|
||||||
|
|
||||||
| def add_rmsnorm_forward(x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float, out=None): | ||||||
| y = torch.empty_like(x) if out is None else out | ||||||
| x_arg = x.view(-1, x.shape[-1]) | ||||||
| residual_arg = residual.view(-1, x.shape[-1]) | ||||||
| y_arg = y.view(-1, x.shape[-1]) | ||||||
| assert x_arg.shape == residual_arg.shape == y_arg.shape | ||||||
| if weight is not None: | ||||||
| assert x_arg.shape[-1] == weight.shape[0] | ||||||
| assert y.data_ptr() == y_arg.data_ptr() | ||||||
| _add_rmsnorm_forward(x_arg, residual_arg, y_arg, weight, eps) | ||||||
| return y | ||||||
|
|
||||||
|
|
||||||
| def torch_rms_norm(x, weight, eps): | ||||||
| return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * weight | ||||||
|
|
||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
_get_add_rmsnorm_static_keyfunction is used as thestatic_key_funcfor the@autotunedecorator on_add_rmsnorm_forward. However,_add_rmsnorm_forwardaccepts 6 arguments (x_arg,residual_arg,y_arg,weight,eps,run_config), while_get_add_rmsnorm_static_keyonly accepts 4. When the autotuner invokes this function with all arguments, it will raise aTypeErrorat runtime. Adding*args, **kwargsto the signature will make it robust against extra arguments.