Demo 2: CUDA Backend for Tanh Approximation in GELU Activation Layer
(Depends on Ch.08)
This demo notebook shows how to use a CUDA backend to accelerate the GELU activation function, using a Padé-type rational approximation (Pade44) for tanh. We build on the previous demo (demo01) and show how to offload the computation to the GPU using Numba and a custom backend; a reference sketch of the formulas involved follows the list below.
The notebook demonstrates:
- How to configure and use a GPU backend for vectorized ufuncs
- How to run and test the optimized GELU function on CUDA
- How to compare results with the original NumPy implementation
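For orientation, here is a minimal NumPy sketch of the two formulas involved, assuming demo01's gelu_tanh_forward follows the standard tanh-based GELU (the constants 0.5, 2/π, and 0.044715 are visible in the extracted program later in this notebook) and that Pade44 denotes the rational tanh approximation whose coefficients (10, 105, 45) appear in the lowered MLIR:

import numpy as np

def gelu_tanh_reference(x):
    # Tanh-based GELU approximation; matches the constants in the
    # extracted program shown below.
    t = np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)
    return 0.5 * x * (1 + np.tanh(t))

def pade44_tanh(t):
    # Rational tanh approximation; the coefficients 10, 105, and 45
    # match those visible in the lowered MLIR further down.
    return (10 * t**3 + 105 * t) / (t**4 + 45 * t**2 + 105)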
from numba import cuda
from ch08_gpu_offload import GPUBackend
from ch08_gpu_offload import gpu_compiler_config as _ch08_gpu_compiler_config
from demo01_gelu_tanh_approx import *
from utils.report import Report
Setup GPU Backend
Define a backend that combines the ufunc backend and GPU backend, enabling compilation and execution of vectorized functions on CUDA devices.
class GpuUfuncBackend(Backend, GPUBackend):
    """Ufunc + GPU backend."""

    def __init__(self, compile_only: bool = False):
        # Initialize the GPU side directly; `compile_only=True` keeps
        # the backend usable on machines without a CUDA device.
        GPUBackend.__init__(self, compile_only)
gpu_compiler_config = {
    **_ch08_gpu_compiler_config,
    "converter_class": ExtendEGraphToRVSDG,
    "cost_model": MyCostModel(),
    # Fall back to compile-only mode when no CUDA device is present.
    "backend": GpuUfuncBackend(compile_only=not cuda.is_available()),
}
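When no CUDA device is present, the backend is constructed with compile_only=True, so the compilation pipeline below can still run end to end; the test and benchmark cells further down additionally guard actual execution with cuda.is_available(). A quick way to check which mode you are in:

if __name__ == "__main__":
    print("CUDA available:", cuda.is_available())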
Configure the CUDA Ufunc Pipeline
Set up the pipeline to compile the GELU function as a CUDA-accelerated vectorized ufunc, using the GPU backend and the Pade44 tanh approximation.
report = Report("Pipeline execution report", enable_nested_metadata=True)
cuda_vectorized_gelu = ufunc_vectorize(
    input_type=Float32,
    ndim=1,
    compiler_config={
        **gpu_compiler_config,
        "pipeline_report": report,
        "pipeline_debug": True,
    },
    # Reuse the rulesets imported from demo01.
    extra_ruleset=additional_rules | optimize_rules,
)(gelu_tanh_forward)
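The decorated result, cuda_vectorized_gelu, is called like a ufunc: the test below passes a 1-D float32 array, and the benchmark additionally supplies a preallocated output array via out=.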
if __name__ == "__main__":
    report.display()
Pipeline execution report
_Region_1 = Region("804", InPorts(Vec[String]("!io", "a")))
GraphRoot(
    Term.Func(
        "1274",
        "transformed_gelu_tanh_forward",
        Term.RegionEnd(
            _Region_1,
            PortList(
                Vec[Port](
                    Port("!io", _Region_1.get(0)),
                    Port(
                        "!ret",
                        Nb_Mul_Float32(
                            Nb_Mul_Float32(
                                Npy_cast_f64_to_f32(Term.LiteralF64(0.5)),
                                _Region_1.get(1),
                            ),
                            Nb_Add_Float32(
                                Npy_cast_i64_to_f32(Term.LiteralI64(1)),
                                Npy_tanh_float32(
                                    Nb_Mul_Float32(
                                        Npy_sqrt_float32(
                                            Nb_Div_Float32(
                                                Npy_cast_i64_to_f32(Term.LiteralI64(2)),
                                                Npy_cast_f64_to_f32(Term.LiteralF64(3.141592653589793)),
                                            )
                                        ),
                                        Nb_Add_Float32(
                                            _Region_1.get(1),
                                            Nb_Mul_Float32(
                                                Npy_cast_f64_to_f32(Term.LiteralF64(0.044715)),
                                                Nb_Pow_Float32_Int64(_Region_1.get(1), Term.LiteralI64(3)),
                                            ),
                                        ),
                                    )
                                ),
                            ),
                        ),
                    ),
                )
            ),
        ),
    )
)
time elapsed 269.30ms
timing breakdown:
  46.46ms: [debug] initial egraph
  194.40ms: [debug] saturated egraph
  28.44ms: [debug] egglog.extract
module {
  func.func @func(%arg0: f32) -> f32 attributes {llvm.emit_c_interface} {
    %cst = arith.constant 5.000000e-01 : f64
    %c1_i64 = arith.constant 1 : i64
    %cst_0 = arith.constant 1.000000e+01 : f64
    %c2_i64 = arith.constant 2 : i64
    %cst_1 = arith.constant 3.1415926535897931 : f64
    %cst_2 = arith.constant 4.471500e-02 : f64
    %cst_3 = arith.constant 1.000000e+00 : f64
    %cst_4 = arith.constant 1.050000e+02 : f64
    %cst_5 = arith.constant 4.500000e+01 : f64
    cf.br ^bb1
  ^bb1:  // pred: ^bb0
    %c0_i32 = arith.constant 0 : i32
    %0 = arith.truncf %cst : f64 to f32
    %1 = arith.mulf %0, %arg0 : f32
    %2 = arith.sitofp %c1_i64 : i64 to f32
    %3 = arith.truncf %cst_0 : f64 to f32
    %4 = arith.sitofp %c2_i64 : i64 to f32
    %5 = arith.truncf %cst_1 : f64 to f32
    %6 = arith.divf %4, %5 : f32
    %7 = math.sqrt %6 : f32
    %8 = arith.truncf %cst_2 : f64 to f32
    %9 = arith.truncf %cst_3 : f64 to f32
    %10 = arith.mulf %arg0, %9 : f32
    %11 = arith.mulf %arg0, %10 : f32
    %12 = arith.mulf %arg0, %11 : f32
    %13 = arith.mulf %8, %12 : f32
    %14 = arith.addf %arg0, %13 : f32
    %15 = arith.mulf %7, %14 : f32
    %16 = arith.mulf %15, %9 : f32
    %17 = arith.mulf %15, %16 : f32
    %18 = arith.mulf %15, %17 : f32
    %19 = arith.mulf %3, %18 : f32
    %20 = arith.truncf %cst_4 : f64 to f32
    %21 = arith.mulf %20, %15 : f32
    %22 = arith.addf %19, %21 : f32
    %23 = arith.mulf %15, %18 : f32
    %24 = arith.truncf %cst_5 : f64 to f32
    %25 = arith.mulf %24, %17 : f32
    %26 = arith.addf %23, %25 : f32
    %27 = arith.addf %26, %20 : f32
    %28 = arith.divf %22, %27 : f32
    %29 = arith.addf %2, %28 : f32
    %30 = arith.mulf %1, %29 : f32
    return %30 : f32
  }
}
time elapsed 3.23ms
timing breakdown:
  3.23ms: Lowered module
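Between the extracted term (which still contains Npy_tanh_float32) and the lowered module above, the tanh call has been expanded into a rational function. Reading off the constants 10, 105, and 45:

$$\tanh(t) \approx \frac{10t^3 + 105t}{t^4 + 45t^2 + 105},$$

which is exactly what truncating the classical continued fraction tanh(t) = t/(1 + t²/(3 + t²/(5 + t²/7))) yields, consistent with the Pade44 approximation from demo01.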
#map = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
#map1 = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
module attributes {gpu.container_module} {
  llvm.func @sqrtf(f32) -> f32 attributes {memory = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, sym_visibility = "private"}
  llvm.func @func(%arg0: f32) -> f32 attributes {llvm.emit_c_interface} {
    %0 = llvm.mlir.constant(4.500000e+01 : f32) : f32
    %1 = llvm.mlir.constant(1.050000e+02 : f32) : f32
    %2 = llvm.mlir.constant(2.000000e+00 : f32) : f32
    %3 = llvm.mlir.constant(1.000000e+01 : f32) : f32
    %4 = llvm.mlir.constant(1.000000e+00 : f32) : f32
    %5 = llvm.mlir.constant(5.000000e-01 : f32) : f32
    %6 = llvm.mlir.constant(3.1415926535897931 : f64) : f64
    %7 = llvm.mlir.constant(4.471500e-02 : f64) : f64
    %8 = llvm.fmul %arg0, %5 : f32
    %9 = llvm.fptrunc %6 : f64 to f32
    %10 = llvm.fdiv %2, %9 : f32
    %11 = llvm.call @sqrtf(%10) : (f32) -> f32
    %12 = llvm.fptrunc %7 : f64 to f32
    %13 = llvm.fmul %arg0, %arg0 : f32
    %14 = llvm.fmul %arg0, %13 : f32
    %15 = llvm.fmul %12, %14 : f32
    %16 = llvm.fadd %arg0, %15 : f32
    %17 = llvm.fmul %11, %16 : f32
    %18 = llvm.fmul %17, %17 : f32
    %19 = llvm.fmul %17, %18 : f32
    %20 = llvm.fmul %19, %3 : f32
    %21 = llvm.fmul %17, %1 : f32
    %22 = llvm.fadd %20, %21 : f32
    %23 = llvm.fmul %17, %19 : f32
    %24 = llvm.fmul %18, %0 : f32
    %25 = llvm.fadd %23, %24 : f32
    %26 = llvm.fadd %25, %1 : f32
    %27 = llvm.fdiv %22, %26 : f32
    %28 = llvm.fadd %27, %4 : f32
    %29 = llvm.fmul %8, %28 : f32
    llvm.return %29 : f32
  }
  llvm.func @_mlir_ciface_func(%arg0: f32) -> f32 attributes {llvm.emit_c_interface} {
    %0 = llvm.call @func(%arg0) : (f32) -> f32
    llvm.return %0 : f32
  }
  llvm.func @ufunc(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64) attributes {llvm.emit_c_interface} {
    %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %1 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %2 = llvm.insertvalue %arg6, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %3 = llvm.insertvalue %arg7, %2[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %4 = llvm.insertvalue %arg8, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %5 = llvm.insertvalue %arg9, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %6 = builtin.unrealized_conversion_cast %5 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<?xf32>
    %7 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %8 = llvm.insertvalue %arg0, %7[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %9 = llvm.insertvalue %arg1, %8[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %10 = llvm.insertvalue %arg2, %9[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %11 = llvm.insertvalue %arg3, %10[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %12 = llvm.insertvalue %arg4, %11[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %13 = builtin.unrealized_conversion_cast %12 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<?xf32>
    %14 = llvm.mlir.constant(4.471500e-02 : f64) : f64
    %15 = llvm.mlir.constant(3.1415926535897931 : f64) : f64
    %16 = llvm.mlir.constant(5.000000e-01 : f32) : f32
    %17 = llvm.mlir.constant(1.000000e+00 : f32) : f32
    %18 = llvm.mlir.constant(1.000000e+01 : f32) : f32
    %19 = llvm.mlir.constant(2.000000e+00 : f32) : f32
    %20 = llvm.mlir.constant(1.050000e+02 : f32) : f32
    %21 = llvm.mlir.constant(4.500000e+01 : f32) : f32
    %22 = llvm.mlir.constant(0 : index) : i64
    %23 = builtin.unrealized_conversion_cast %arg3 : i64 to index
    %24 = llvm.mlir.constant(0 : index) : i64
    %25 = builtin.unrealized_conversion_cast %24 : i64 to index
    %26 = llvm.mlir.constant(1 : index) : i64
    %27 = builtin.unrealized_conversion_cast %26 : i64 to index
    %28 = llvm.mlir.constant(1 : index) : i64
    %29 = builtin.unrealized_conversion_cast %28 : i64 to index
    %30 = affine.apply #map(%23)[%25, %27]
    gpu.launch_func @ufunc_kernel::@ufunc_kernel blocks in (%30, %29, %29) threads in (%29, %29, %29) args(%27 : index, %25 : index, %13 : memref<?xf32>, %16 : f32, %15 : f64, %19 : f32, %14 : f64, %18 : f32, %20 : f32, %21 : f32, %17 : f32, %6 : memref<?xf32>)
    llvm.return
  }
  llvm.func @_mlir_ciface_ufunc(%arg0: !llvm.ptr, %arg1: !llvm.ptr) attributes {llvm.emit_c_interface} {
    %0 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %1 = llvm.extractvalue %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %2 = llvm.extractvalue %0[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %3 = llvm.extractvalue %0[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %4 = llvm.extractvalue %0[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %5 = llvm.extractvalue %0[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %6 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %7 = llvm.extractvalue %6[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %8 = llvm.extractvalue %6[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %9 = llvm.extractvalue %6[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %10 = llvm.extractvalue %6[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %11 = llvm.extractvalue %6[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    llvm.call @ufunc(%1, %2, %3, %4, %5, %7, %8, %9, %10, %11) : (!llvm.ptr, !llvm.ptr, i64, i64, i64, !llvm.ptr, !llvm.ptr, i64, i64, i64) -> ()
    llvm.return
  }
  gpu.module @ufunc_kernel {
    llvm.func @sqrtf(f32) -> f32 attributes {memory = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, sym_visibility = "private"}
    gpu.func @ufunc_kernel(%arg0: index, %arg1: index, %arg2: memref<?xf32>, %arg3: f32, %arg4: f64, %arg5: f32, %arg6: f64, %arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: memref<?xf32>) kernel attributes {known_block_size = array<i32: 1, 1, 1>} {
      %0 = builtin.unrealized_conversion_cast %arg11 : memref<?xf32> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
      %1 = builtin.unrealized_conversion_cast %arg2 : memref<?xf32> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
      %block_id_x = gpu.block_id x
      %block_id_y = gpu.block_id y
      %block_id_z = gpu.block_id z
      %thread_id_x = gpu.thread_id x
      %thread_id_y = gpu.thread_id y
      %thread_id_z = gpu.thread_id z
      %grid_dim_x = gpu.grid_dim x
      %grid_dim_y = gpu.grid_dim y
      %grid_dim_z = gpu.grid_dim z
      %block_dim_x = gpu.block_dim x
      %block_dim_y = gpu.block_dim y
      %block_dim_z = gpu.block_dim z
      %2 = affine.apply #map1(%block_id_x)[%arg0, %arg1]
      %3 = builtin.unrealized_conversion_cast %2 : index to i64
      %4 = llvm.extractvalue %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
      %5 = llvm.getelementptr %4[%3] : (!llvm.ptr, i64) -> !llvm.ptr, f32
      %6 = llvm.load %5 : !llvm.ptr -> f32
      %7 = llvm.fmul %6, %arg3 : f32
      %8 = llvm.fptrunc %arg4 : f64 to f32
      %9 = llvm.fdiv %arg5, %8 : f32
      %10 = llvm.call @sqrtf(%9) : (f32) -> f32
      %11 = llvm.fptrunc %arg6 : f64 to f32
      %12 = llvm.fmul %6, %6 : f32
      %13 = llvm.fmul %6, %12 : f32
      %14 = llvm.fmul %11, %13 : f32
      %15 = llvm.fadd %6, %14 : f32
      %16 = llvm.fmul %10, %15 : f32
      %17 = llvm.fmul %16, %16 : f32
      %18 = llvm.fmul %16, %17 : f32
      %19 = llvm.fmul %18, %arg7 : f32
      %20 = llvm.fmul %16, %arg8 : f32
      %21 = llvm.fadd %19, %20 : f32
      %22 = llvm.fmul %16, %18 : f32
      %23 = llvm.fmul %17, %arg9 : f32
      %24 = llvm.fadd %22, %23 : f32
      %25 = llvm.fadd %24, %arg8 : f32
      %26 = llvm.fdiv %21, %25 : f32
      %27 = llvm.fadd %26, %arg10 : f32
      %28 = llvm.fmul %7, %27 : f32
      %29 = llvm.extractvalue %0[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
      %30 = llvm.getelementptr %29[%3] : (!llvm.ptr, i64) -> !llvm.ptr, f32
      llvm.store %28, %30 : f32, !llvm.ptr
      gpu.return
    }
  }
}
time elapsed 6.50ms
timing breakdown:
  6.50ms: MLIR optimized
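Note the launch configuration in the GPU module above: known_block_size is 1×1×1 and the grid size comes from #map (a ceildiv over the array length), so the kernel launches one single-thread block per element; each kernel instance loads one f32, evaluates the fused rational GELU, and stores one result.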
Test GELU Ufunc on CUDA
Run the compiled CUDA ufunc on a random input and compare the result to the original NumPy implementation. If CUDA is unavailable, skip the test.
if __name__ == "__main__":
    if not cuda.is_available():
        print("SKIPPED. CUDA unavailable")
    else:
        # Relative tolerance: the Pade44 tanh is close to, but not
        # bit-identical with, np.tanh in the reference implementation.
        relclose = lambda x, y: np.allclose(x, y, rtol=1e-6)
        input_val = np.random.random(100).astype(np.float32)
        report.display()
        run_test(
            gelu_tanh_forward,
            cuda_vectorized_gelu,
            (input_val,),
            equal=relclose,
            verbose=True,
        )
SKIPPED. CUDA unavailable
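The relaxed tolerance above (rtol=1e-6) reflects that the Pade44 tanh is not bit-identical to np.tanh. A host-only sketch (no GPU required) of the approximation error over the argument range this demo actually produces; for x in [0, 1), the tanh argument stays below about 0.84:

import numpy as np

x = np.linspace(0.0, 1.0, 1001)
t = np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)  # tanh argument range
pade = (10 * t**3 + 105 * t) / (t**4 + 45 * t**2 + 105)
# The error grows with |t|: about 5e-6 at t = 1, much smaller below.
print("max |pade44 - tanh|:", np.max(np.abs(pade - np.tanh(t))))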
Benchmark
if __name__ == "__main__":
    if not cuda.is_available():
        print("SKIPPED. CUDA unavailable")
    else:
        input_val = np.random.random(300000).astype(np.float32)
        out = np.zeros_like(input_val)
        print("original")
        %timeit gelu_tanh_forward(input_val)
        print("superoptimized")
        %timeit cuda_vectorized_gelu(input_val, out=out)
SKIPPED. CUDA unavailable