Demo 1: Tanh Approximation for GELU Activation Layer¶
(Depends on Ch.06)
This demo notebook shows how to use e-graphs and cost-based rewriting to
optimize the GELU activation function, which is widely used in deep learning.
We demonstrate how to encode and optimize a Padé-style rational approximation
("Pade44") for tanh() as used in GELU, and how to lower the optimized
computation to MLIR and run it efficiently.
The notebook demonstrates:
- How to extend the compiler for NumPy and GELU-specific rewrites
- How to encode and optimize the Pade44 approximation for tanh
- How to lower and run the optimized computation using MLIR
- How to compare the results and performance of the original and optimized versions
$$ \text{GELU}(x) \approx 0.5x \left(1 + \tanh\left(\sqrt{\frac{2}{\pi}} \left(x + 0.044715x^3\right)\right)\right) $$
Imports and Setup¶
import mlir.dialects.arith as arith
import mlir.dialects.math as math
import mlir.ir as ir
import numpy as np
from egglog import (
    Expr,
    StringLike,
    Vec,
    function,
    i64,
    i64Like,
    rewrite,
    rule,
    ruleset,
    set_,
    subsume,
    union,
)
from sealir import rvsdg
from sealir.eqsat.py_eqsat import (
    Py_AddIO,
    Py_AttrIO,
    Py_Call,
    Py_DivIO,
    Py_LoadGlobal,
    Py_MulIO,
    Py_PowIO,
)
from sealir.eqsat.rvsdg_eqsat import (
    Term,
    TermList,
)
from sealir.rvsdg import grammar as rg

from ch04_1_typeinfer_ifelse import (
    ExtendEGraphToRVSDG as ch04_1_ExtendEGraphToRVSDG,
)
from ch04_1_typeinfer_ifelse import (
    Grammar,
    NbOp_Base,
    String,
    Type,
    TypeFloat64,
    TypeInt64,
    TypeVar,
    make_rules_for_binop,
    setup_argtypes,
)
from ch05_typeinfer_array import MyCostModel as ch06_CostModel
from ch05_typeinfer_array import (
    base_ruleset,
)
from ch06_mlir_backend import LowerStates, jit_compiler, run_test
from ch07_mlir_ufunc import Backend as UfuncBackend
from ch07_mlir_ufunc import (
    Float32,
    TypeFloat32,
    ufunc_compiler,
    ufunc_vectorize,
)
from utils.report import Report
The GELU Function¶
Define the GELU activation function using NumPy, with the tanh-based approximation.
def gelu_tanh_forward(a):
    dt = np.float32
    result = (
        dt(0.5)
        * a
        * (
            dt(1)
            + np.tanh(np.sqrt(dt(2) / dt(np.pi)) * (a + dt(0.044715) * a**3))
        )
    )
    return result
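As a quick standalone check (assuming SciPy is available; this cell is illustrative and not part of the pipeline), we can compare the tanh-based form against the exact GELU, which is defined via the Gaussian error function:

from scipy.special import erf  # extra dependency used only for this check

def gelu_exact(x):
    # Exact GELU: x * Phi(x), where Phi is the standard normal CDF.
    return 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))

xs = np.linspace(-5.0, 5.0, 1001)
diff = np.abs(gelu_exact(xs) - gelu_tanh_forward(xs.astype(np.float32)))
print("max |exact - tanh approx|:", diff.max())
# The deviation is small (on the order of 1e-3 or less), which is why
# the tanh form is a popular stand-in for the exact definition.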
Extend the Compiler for Needed Features¶
Add type inference and rules for NumPy modules, operations, and attributes required for the GELU function.
Type Inference for NumPy Module¶
class Module(Expr):
    def __init__(self, name: StringLike): ...
    def toType(self) -> Type: ...


@function
def ModuleGetAttr(mod: Module, attrname: StringLike) -> Term: ...
@ruleset
def facts_numpy_module(io: Term, name: String, op: Term, args: Vec[Term]):
    yield rule(
        op == Py_LoadGlobal(io, name),
        name == String("np"),
    ).then(set_(TypeVar(op).getType()).to(Module("numpy").toType()))

    # ------ attributes ------
    numpy_mod = Module("numpy")

    def unary_func(fname, target_func):
        return rule(
            op
            == (
                stmt := Py_Call(
                    func=ModuleGetAttr(numpy_mod, fname),
                    io=io,
                    args=TermList(args),
                )
            ),
            args.length() == i64(1),
        ).then(
            subsume(stmt),
            union(op.getPort(0)).with_(io),
            union(op.getPort(1)).with_(target_func(args[0])),
        )

    # np.pi
    const_pi = ModuleGetAttr(numpy_mod, "pi")
    yield rewrite(
        const_pi,
        subsume=True,
    ).to(Term.LiteralF64(np.pi))
    # np.float32
    yield unary_func("float32", Npy_float32)
    # np.sqrt
    yield unary_func("sqrt", Npy_sqrt)
    # np.tanh
    yield unary_func("tanh", Npy_tanh)
Type Inference for NumPy Operations¶
@function
def Npy_float32(val: Term) -> Term: ...
@function
def Npy_sqrt(val: Term) -> Term: ...
@function
def Npy_tanh(val: Term) -> Term: ...
@function
def Npy_cast_f64_to_f32(val: Term) -> Term: ...
@function
def Npy_cast_i64_to_f32(val: Term) -> Term: ...
@function
def Npy_sqrt_float32(val: Term) -> Term: ...
@function
def Npy_tanh_float32(val: Term) -> Term: ...
@ruleset
def ruleset_typeinfer_numpy_functions(res: Term, arg: Term):
    # float32()
    yield rewrite(Npy_float32(arg), subsume=True).to(
        Npy_cast_f64_to_f32(arg),
        TypeVar(arg).getType() == TypeFloat64,
    )
    yield rewrite(Npy_float32(arg), subsume=True).to(
        Npy_cast_i64_to_f32(arg),
        TypeVar(arg).getType() == TypeInt64,
    )
    for fn in [Npy_cast_f64_to_f32, Npy_cast_i64_to_f32]:
        yield rule(
            res == fn(arg),
        ).then(set_(TypeVar(res).getType()).to(TypeFloat32))

    # others
    for func, typed_func in [
        (Npy_sqrt, Npy_sqrt_float32),
        (Npy_tanh, Npy_tanh_float32),
    ]:
        yield rewrite(func(arg), subsume=True).to(
            typed_func(arg),
            TypeVar(arg).getType() == TypeFloat32,
        )
        yield rule(
            res == typed_func(arg),
        ).then(set_(TypeVar(res).getType()).to(TypeFloat32))
Handle module.attr¶
@ruleset
def ruleset_module(
    io: Term, name: String, modname: String, op: Term, obj: Term
):
    # Get attribute
    yield rule(
        op == Py_AttrIO(io, obj, name),
        TypeVar(obj).getType() == Module(modname).toType(),
    ).then(
        # Shortcut io
        union(op.getPort(0)).with_(io),
        # Setup getattr
        union(op.getPort(1)).with_(ModuleGetAttr(Module(modname), name)),
    )
Type Inference for float32 Operations¶
@function
def Nb_Add_Float32(lhs: Term, rhs: Term) -> Term: ...
@function
def Nb_Mul_Float32(lhs: Term, rhs: Term) -> Term: ...
@function
def Nb_Div_Float32(lhs: Term, rhs: Term) -> Term: ...
@function
def Nb_Pow_Float32_Int64(lhs: Term, rhs: Term) -> Term: ...
@ruleset
def ruleset_typeinfer_f32_ops(res: Term, x: Term, y: Term):
    yield from make_rules_for_binop(
        Py_AddIO, TypeFloat32, TypeFloat32, Nb_Add_Float32, TypeFloat32
    )
    yield from make_rules_for_binop(
        Py_MulIO, TypeFloat32, TypeFloat32, Nb_Mul_Float32, TypeFloat32
    )
    yield from make_rules_for_binop(
        Py_DivIO, TypeFloat32, TypeFloat32, Nb_Div_Float32, TypeFloat32
    )
    yield from make_rules_for_binop(
        Py_PowIO, TypeFloat32, TypeInt64, Nb_Pow_Float32_Int64, TypeFloat32
    )
additional_rules = (
    facts_numpy_module
    | ruleset_module
    | ruleset_typeinfer_numpy_functions
    | ruleset_typeinfer_f32_ops
)
Extend the RVSDG Grammar¶
SExpr = rvsdg.grammar.SExpr


class NbOp_F64_to_F32(NbOp_Base):
    operand: SExpr


class NbOp_I64_to_F32(NbOp_Base):
    operand: SExpr


class NpyOp_Sqrt_Float32(NbOp_Base):
    operand: SExpr


class NpyOp_Tanh_Float32(NbOp_Base):
    operand: SExpr


class NbOp_Mul_Float32(NbOp_Base):
    lhs: SExpr
    rhs: SExpr


class NbOp_Div_Float32(NbOp_Base):
    lhs: SExpr
    rhs: SExpr


class NbOp_Add_Float32(NbOp_Base):
    lhs: SExpr
    rhs: SExpr


class NbOp_Pow_Float32_Int64(NbOp_Base):
    lhs: SExpr
    rhs: SExpr


class NbOp_module(NbOp_Base):
    name: str
class ExtendEGraphToRVSDG(ch04_1_ExtendEGraphToRVSDG):
    def handle_Term(self, op: str, children: dict | list, grm: Grammar):
        match op, children:
            case "Py_Float", {"val": float(arg)}:
                return grm.write(rg.PyFloat(arg))
            case "Npy_cast_f64_to_f32", {"val": expr}:
                return grm.write(NbOp_F64_to_F32(expr))
            case "Npy_cast_i64_to_f32", {"val": expr}:
                return grm.write(NbOp_I64_to_F32(expr))
            case "Nb_Mul_Float32", {"lhs": lhs, "rhs": rhs}:
                return grm.write(NbOp_Mul_Float32(lhs=lhs, rhs=rhs))
            case "Nb_Add_Float32", {"lhs": lhs, "rhs": rhs}:
                return grm.write(NbOp_Add_Float32(lhs=lhs, rhs=rhs))
            case "Nb_Div_Float32", {"lhs": lhs, "rhs": rhs}:
                return grm.write(NbOp_Div_Float32(lhs=lhs, rhs=rhs))
            case "Nb_Pow_Float32_Int64", {"lhs": lhs, "rhs": rhs}:
                return grm.write(NbOp_Pow_Float32_Int64(lhs=lhs, rhs=rhs))
            case "Npy_sqrt_float32", {"val": val}:
                return grm.write(NpyOp_Sqrt_Float32(val))
            case "Npy_tanh_float32", {"val": val}:
                return grm.write(NpyOp_Tanh_Float32(val))
            # ---
            case "ModuleGetAttr", {"mod": mod, "attrname": str(attrname)}:
                return grm.write(rg.Undef(str(op)))
            case _:
                # Use parent's implementation for other terms.
                return super().handle_Term(op, children, grm)

    def handle_Module(
        self, key: str, op: str, children: dict | list, grm: Grammar
    ):
        return grm.write(rg.Undef(str(key)))
Extend the Backend to Support NumPy Operations¶
The Backend class extends UfuncBackend to handle the specific NumPy operations used in the GELU function (see the driver sketch after this list for how the `yield`-based lowering protocol works). It provides MLIR lowering for:
- Basic arithmetic operations (add, multiply, divide)
- Type conversions (f64->f32, i64->f32)
- NumPy mathematical functions (sqrt, tanh, pow)
- Fallback handling for undefined operations
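Note the coroutine style in lower_expr below: each case yields a child expression and receives the corresponding lowered MLIR value back. A minimal sketch of a driver for this protocol (simplified and hypothetical; the real LowerStates driver also threads state and is not shown here) could look like:

def drive_lowering(expr, lower_expr):
    # Hypothetical driver for generator-based lowering (sketch only).
    gen = lower_expr(expr)
    try:
        child = next(gen)  # the generator requests a child expression
        while True:
            value = drive_lowering(child, lower_expr)  # lower the child first
            child = gen.send(value)  # hand back its MLIR value, get next request
    except StopIteration as stop:
        return stop.value  # the generator's return value is the lowered op

This style keeps the backend cases flat: they never recurse explicitly, so traversal order, caching, and state management can live in one place.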
class Backend(UfuncBackend):
    def __init__(self):
        super().__init__()
        self.f32 = ir.F32Type.get(context=self.context)

    def get_mlir_type(self, seal_ty):
        match seal_ty.name:
            case "Float32":
                return self.f32
        return super().get_mlir_type(seal_ty)

    def lower_expr(self, expr: SExpr, state: LowerStates):
        match expr:
            case NbOp_Add_Float32(lhs, rhs):
                lhs = yield lhs
                rhs = yield rhs
                return arith.addf(lhs, rhs)
            case NbOp_Mul_Float32(lhs, rhs):
                lhs = yield lhs
                rhs = yield rhs
                return arith.mulf(lhs, rhs)
            case NbOp_Div_Float32(lhs, rhs):
                lhs = yield lhs
                rhs = yield rhs
                return arith.divf(lhs, rhs)
            case NbOp_F64_to_F32(val):
                val = yield val
                return arith.truncf(self.f32, val)
            case NbOp_I64_to_F32(val):
                val = yield val
                return arith.sitofp(self.f32, val)
            case NpyOp_Tanh_Float32(val):
                val = yield val
                return math.tanh(val)
            case NpyOp_Sqrt_Float32(val):
                val = yield val
                return math.sqrt(val)
            case NbOp_Pow_Float32_Int64(val, p):
                val = yield val
                p = yield p
                return math.powf(val, arith.sitofp(val.type, p))
            case rg.Undef(str(name)):
                return arith.constant(self.i32, 0)
        return (yield from super().lower_expr(expr, state))
Cost Model¶
Assign a higher cost to the transcendental functions and a lower cost to arithmetic and type-conversion operations. This encourages the optimizer to prefer rational approximations over transcendental functions when possible.
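To see why these numbers steer extraction toward the rewrite, here is a back-of-the-envelope tally (illustrative only; it assumes simple arithmetic ops cost about 1 each, which is not the model's exact accounting):

# Hypothetical cost comparison, not the real extraction arithmetic.
COSTS = {"tanh": 100, "pow": 50, "mul": 1, "add": 1, "div": 1}

original = COSTS["tanh"] + COSTS["pow"]  # tanh(...) plus the a**3 inside GELU
# Fully expanded Padé form: 12 multiplies, 3 adds, 1 divide.
pade = 12 * COSTS["mul"] + 3 * COSTS["add"] + COSTS["div"]
print(original, "vs", pade)  # 150 vs 16: extraction picks the rational form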
class MyCostModel(ch06_CostModel):
    def get_cost_function(self, nodename, op, ty, cost, children):
        match op:
            case "Npy_tanh" | "Npy_sqrt" | "Npy_float32":
                cost = float("inf")  # suppress untyped op
            case "Npy_tanh_float32":
                cost = 100
            case "Npy_sqrt_float32":
                cost = 50
            case "Nb_Pow_Float32_Int64":
                cost = 50
        # Fallthrough to parent's cost function
        return super().get_cost_function(nodename, op, ty, cost, children)
Run the Extended Pipeline¶
Compile and run the original GELU function using the extended pipeline and report the results.
compiler_config = dict(
    converter_class=ExtendEGraphToRVSDG,
    backend=Backend(),
    cost_model=MyCostModel(),
)
if __name__ == "__main__":
    report = Report("Pipeline execution report", enable_nested_metadata=True)
    jit_func = jit_compiler(
        fn=gelu_tanh_forward,
        argtypes=(Float32,),
        ruleset=(
            base_ruleset | setup_argtypes(TypeFloat32) | additional_rules
        ),
        pipeline_report=report,
        **compiler_config,
    ).jit_func
    report.display()
    run_test(gelu_tanh_forward, jit_func, (0.234,), verbose=True)
Pipeline execution report
time elapsed 0.00ms timing breakdown:
module {
  func.func @func(%arg0: f32) -> f32 attributes {llvm.emit_c_interface} {
    %cst = arith.constant 5.000000e-01 : f64
    %c1_i64 = arith.constant 1 : i64
    %c2_i64 = arith.constant 2 : i64
    %cst_0 = arith.constant 3.1415926535897931 : f64
    %cst_1 = arith.constant 4.471500e-02 : f64
    %c3_i64 = arith.constant 3 : i64
    cf.br ^bb1
  ^bb1:  // pred: ^bb0
    %c0_i32 = arith.constant 0 : i32
    %0 = arith.truncf %cst : f64 to f32
    %1 = arith.mulf %0, %arg0 : f32
    %2 = arith.sitofp %c1_i64 : i64 to f32
    %3 = arith.sitofp %c2_i64 : i64 to f32
    %4 = arith.truncf %cst_0 : f64 to f32
    %5 = arith.divf %3, %4 : f32
    %6 = math.sqrt %5 : f32
    %7 = arith.truncf %cst_1 : f64 to f32
    %8 = arith.sitofp %c3_i64 : i64 to f32
    %9 = math.powf %arg0, %8 : f32
    %10 = arith.mulf %7, %9 : f32
    %11 = arith.addf %arg0, %10 : f32
    %12 = arith.mulf %6, %11 : f32
    %13 = math.tanh %12 : f32
    %14 = arith.addf %2, %13 : f32
    %15 = arith.mulf %1, %14 : f32
    return %15 : f32
  }
}
time elapsed 2.73ms timing breakdown: 2.73ms: Lowered module
module {
  llvm.func @tanhf(f32) -> f32 attributes {memory = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, sym_visibility = "private"}
  llvm.func @powf(f32, f32) -> f32 attributes {memory = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, sym_visibility = "private"}
  llvm.func @sqrtf(f32) -> f32 attributes {memory = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, sym_visibility = "private"}
  llvm.func @func(%arg0: f32) -> f32 attributes {llvm.emit_c_interface} {
    %0 = llvm.mlir.constant(3.000000e+00 : f32) : f32
    %1 = llvm.mlir.constant(2.000000e+00 : f32) : f32
    %2 = llvm.mlir.constant(1.000000e+00 : f32) : f32
    %3 = llvm.mlir.constant(5.000000e-01 : f32) : f32
    %4 = llvm.mlir.constant(3.1415926535897931 : f64) : f64
    %5 = llvm.mlir.constant(4.471500e-02 : f64) : f64
    llvm.br ^bb1
  ^bb1:  // pred: ^bb0
    %6 = llvm.fmul %arg0, %3 : f32
    %7 = llvm.fptrunc %4 : f64 to f32
    %8 = llvm.fdiv %1, %7 : f32
    %9 = llvm.call @sqrtf(%8) : (f32) -> f32
    %10 = llvm.fptrunc %5 : f64 to f32
    %11 = llvm.call @powf(%arg0, %0) : (f32, f32) -> f32
    %12 = llvm.fmul %10, %11 : f32
    %13 = llvm.fadd %arg0, %12 : f32
    %14 = llvm.fmul %9, %13 : f32
    %15 = llvm.call @tanhf(%14) : (f32) -> f32
    %16 = llvm.fadd %15, %2 : f32
    %17 = llvm.fmul %6, %16 : f32
    llvm.return %17 : f32
  }
  llvm.func @_mlir_ciface_func(%arg0: f32) -> f32 attributes {llvm.emit_c_interface} {
    %0 = llvm.call @func(%arg0) : (f32) -> f32
    llvm.return %0 : f32
  }
}
time elapsed 1.90ms timing breakdown: 1.90ms: MLIR optimized
Testing report
(0.234,)
0.13864579796791077
0.1386458
Add Rules to Optimize¶
Add rewrite rules for the Pade44 approximation of tanh(x) and for expanding power operations into multiplication chains. These rules enable the optimizer to replace expensive transcendental functions with efficient arithmetic.
Define Pade44 approximation rewrite rule for tanh(x)¶
The Pade44 approximation replaces tanh(x) with a rational function:
$$ \tanh(x) \approx \frac{10x^3 + 105x}{x^4 + 45x^2 + 105} $$
This approximation provides good accuracy for small to moderate values of x while avoiding the computational cost of the transcendental tanh function. The rational form allows for efficient evaluation using only basic arithmetic operations (addition, multiplication, division) and power operations.
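As a quick numerical sanity check (plain NumPy, independent of the e-graph machinery):

def tanh_pade44(x):
    # The rational form encoded by the rewrite rule below.
    return (10 * x**3 + 105 * x) / (x**4 + 45 * x**2 + 105)

xs = np.linspace(-3.0, 3.0, 601)
print("max abs error on [-3, 3]:", np.abs(np.tanh(xs) - tanh_pade44(xs)).max())
# Accuracy degrades for large |x|: the rational form behaves like 10/x
# as x grows instead of saturating at 1, so it suits moderate inputs only.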
@ruleset
def pade44_tanh_expansion(x: Term):
    flt = lambda f: Npy_float32(Term.LiteralF64(float(f)))
    liti64 = Term.LiteralI64
    pow = Nb_Pow_Float32_Int64
    mul = Nb_Mul_Float32
    add = Nb_Add_Float32
    div = Nb_Div_Float32
    # Rewrite tanh(x) to the pade44 approximation.
    # Note the use of pow(); these calls will be optimized
    # by the `pow_expansion` ruleset.
    yield rewrite(Npy_tanh_float32(x)).to(
        div(
            add(mul(flt(10), pow(x, liti64(3))), mul(flt(105), x)),
            add(
                add(pow(x, liti64(4)), mul(flt(45), pow(x, liti64(2)))),
                flt(105),
            ),
        )
    )
Define the expansion of power operations (e.g., x^N) into a sequence of multiplications. This converts expensive power operations into more efficient multiplication chains; for example, x^3 becomes x * x * x, and x^0 becomes 1.0. Note: the exponent N must be a compile-time constant for this optimization to apply.
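For intuition, the same recursion written as an ordinary (hypothetical) Python function over strings:

def expand_pow(x, n):
    # Mirrors the rewrite below: pow(x, n) -> x * pow(x, n - 1); pow(x, 0) -> 1.0
    if n == 0:
        return "1.0"
    return f"({x} * {expand_pow(x, n - 1)})"

print(expand_pow("x", 3))  # (x * (x * (x * 1.0)))

The trailing multiply by 1.0 is still visible in the unoptimized MLIR further down (%9 holds 1.0) and is folded away by the later MLIR/LLVM optimization passes.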
@ruleset
def pow_expansion(x: Term, ival: i64):
    # Rules to expand pow(x, i) into multiplications
    powf = Nb_Pow_Float32_Int64
    lit64 = Term.LiteralI64
    mulf = Nb_Mul_Float32
    yield rewrite(powf(x, lit64(ival))).to(
        mulf(x, powf(x, lit64(ival - 1))),
        ival >= 1,
    )
    yield rewrite(powf(x, lit64(i64(0))), subsume=True).to(
        Npy_float32(Term.LiteralF64(float(1))),
    )
Combine the rules. Rulesets compose with the `|` operator.
optimize_rules = pade44_tanh_expansion | pow_expansion
Run the Optimized Function¶
Compile and run the optimized GELU function using the new rules, and report the results.
if __name__ == "__main__":
    report = Report("Pipeline execution report", enable_nested_metadata=True)
    jit_func = jit_compiler(
        fn=gelu_tanh_forward,
        argtypes=(Float32,),
        ruleset=(
            base_ruleset
            | setup_argtypes(TypeFloat32)
            | additional_rules
            | optimize_rules
        ),
        pipeline_report=report,
        **compiler_config,
    ).jit_func
    report.display()
Pipeline execution report
time elapsed 0.00ms timing breakdown:
module {
  func.func @func(%arg0: f32) -> f32 attributes {llvm.emit_c_interface} {
    %cst = arith.constant 5.000000e-01 : f64
    %c1_i64 = arith.constant 1 : i64
    %cst_0 = arith.constant 1.000000e+01 : f64
    %c2_i64 = arith.constant 2 : i64
    %cst_1 = arith.constant 3.1415926535897931 : f64
    %cst_2 = arith.constant 4.471500e-02 : f64
    %cst_3 = arith.constant 1.000000e+00 : f64
    %cst_4 = arith.constant 1.050000e+02 : f64
    %cst_5 = arith.constant 4.500000e+01 : f64
    cf.br ^bb1
  ^bb1:  // pred: ^bb0
    %c0_i32 = arith.constant 0 : i32
    %0 = arith.truncf %cst : f64 to f32
    %1 = arith.mulf %0, %arg0 : f32
    %2 = arith.sitofp %c1_i64 : i64 to f32
    %3 = arith.truncf %cst_0 : f64 to f32
    %4 = arith.sitofp %c2_i64 : i64 to f32
    %5 = arith.truncf %cst_1 : f64 to f32
    %6 = arith.divf %4, %5 : f32
    %7 = math.sqrt %6 : f32
    %8 = arith.truncf %cst_2 : f64 to f32
    %9 = arith.truncf %cst_3 : f64 to f32
    %10 = arith.mulf %arg0, %9 : f32
    %11 = arith.mulf %arg0, %10 : f32
    %12 = arith.mulf %arg0, %11 : f32
    %13 = arith.mulf %8, %12 : f32
    %14 = arith.addf %arg0, %13 : f32
    %15 = arith.mulf %7, %14 : f32
    %16 = arith.mulf %15, %9 : f32
    %17 = arith.mulf %15, %16 : f32
    %18 = arith.mulf %15, %17 : f32
    %19 = arith.mulf %3, %18 : f32
    %20 = arith.truncf %cst_4 : f64 to f32
    %21 = arith.mulf %20, %15 : f32
    %22 = arith.addf %19, %21 : f32
    %23 = arith.mulf %15, %18 : f32
    %24 = arith.truncf %cst_5 : f64 to f32
    %25 = arith.mulf %24, %17 : f32
    %26 = arith.addf %23, %25 : f32
    %27 = arith.addf %26, %20 : f32
    %28 = arith.divf %22, %27 : f32
    %29 = arith.addf %2, %28 : f32
    %30 = arith.mulf %1, %29 : f32
    return %30 : f32
  }
}
time elapsed 2.13ms timing breakdown: 2.13ms: Lowered module
module {
  llvm.func @sqrtf(f32) -> f32 attributes {memory = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, sym_visibility = "private"}
  llvm.func @func(%arg0: f32) -> f32 attributes {llvm.emit_c_interface} {
    %0 = llvm.mlir.constant(4.500000e+01 : f32) : f32
    %1 = llvm.mlir.constant(1.050000e+02 : f32) : f32
    %2 = llvm.mlir.constant(2.000000e+00 : f32) : f32
    %3 = llvm.mlir.constant(1.000000e+01 : f32) : f32
    %4 = llvm.mlir.constant(1.000000e+00 : f32) : f32
    %5 = llvm.mlir.constant(5.000000e-01 : f32) : f32
    %6 = llvm.mlir.constant(3.1415926535897931 : f64) : f64
    %7 = llvm.mlir.constant(4.471500e-02 : f64) : f64
    llvm.br ^bb1
  ^bb1:  // pred: ^bb0
    %8 = llvm.fmul %arg0, %5 : f32
    %9 = llvm.fptrunc %6 : f64 to f32
    %10 = llvm.fdiv %2, %9 : f32
    %11 = llvm.call @sqrtf(%10) : (f32) -> f32
    %12 = llvm.fptrunc %7 : f64 to f32
    %13 = llvm.fmul %arg0, %arg0 : f32
    %14 = llvm.fmul %arg0, %13 : f32
    %15 = llvm.fmul %12, %14 : f32
    %16 = llvm.fadd %arg0, %15 : f32
    %17 = llvm.fmul %11, %16 : f32
    %18 = llvm.fmul %17, %17 : f32
    %19 = llvm.fmul %17, %18 : f32
    %20 = llvm.fmul %19, %3 : f32
    %21 = llvm.fmul %17, %1 : f32
    %22 = llvm.fadd %20, %21 : f32
    %23 = llvm.fmul %17, %19 : f32
    %24 = llvm.fmul %18, %0 : f32
    %25 = llvm.fadd %23, %24 : f32
    %26 = llvm.fadd %25, %1 : f32
    %27 = llvm.fdiv %22, %26 : f32
    %28 = llvm.fadd %27, %4 : f32
    %29 = llvm.fmul %8, %28 : f32
    llvm.return %29 : f32
  }
  llvm.func @_mlir_ciface_func(%arg0: f32) -> f32 attributes {llvm.emit_c_interface} {
    %0 = llvm.call @func(%arg0) : (f32) -> f32
    llvm.return %0 : f32
  }
}
time elapsed 1.69ms timing breakdown: 1.69ms: MLIR optimized
Compare the Result¶
Compare the output of the original and optimized GELU functions, allowing for a small tolerance due to floating-point approximation.
if __name__ == "__main__":
    relclose = lambda x, y: np.allclose(x, y, rtol=1e-6)
    run_test(
        gelu_tanh_forward, jit_func, (0.234,), equal=relclose, verbose=True
    )
Testing report
(0.234,)
0.13864579796791077
0.1386458
Ufunc Version¶
Demonstrate vectorized (ufunc) compilation and execution of the optimized GELU function, and compare results for a batch of inputs.
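Conceptually, ufunc_vectorize wraps the compiled scalar kernel in an elementwise loop over the input array; the @ufunc function in the MLIR dump below is exactly that loop. As a mental model only (hypothetical helper, not the library's API), the behavior corresponds to:

def reference_vectorize(scalar_fn):
    # Pure-Python model of the generated ufunc wrapper: apply the scalar
    # kernel to each element, writing results into `out`.
    def wrapper(arr, out=None):
        out = np.empty_like(arr) if out is None else out
        for i, v in enumerate(arr.ravel()):
            out.flat[i] = scalar_fn(v)
        return out
    return wrapper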
if __name__ == "__main__":
    report = Report("Pipeline execution report", enable_nested_metadata=True)
    vectorized_gelu = ufunc_vectorize(
        input_type=Float32,
        ndim=1,
        compiler_config={**compiler_config, "pipeline_report": report},
        extra_ruleset=additional_rules | optimize_rules,
    )(gelu_tanh_forward)
    report.display()

    relclose = lambda x, y: np.allclose(x, y, rtol=1e-6)
    input_val = np.random.random(100).astype(np.float32)
    run_test(
        gelu_tanh_forward,
        vectorized_gelu,
        (input_val,),
        equal=relclose,
        verbose=True,
    )
Pipeline execution report
time elapsed 0.00ms timing breakdown:
module {
  func.func @func(%arg0: f32) -> f32 attributes {llvm.emit_c_interface} {
    %cst = arith.constant 5.000000e-01 : f64
    %c1_i64 = arith.constant 1 : i64
    %cst_0 = arith.constant 1.000000e+01 : f64
    %c2_i64 = arith.constant 2 : i64
    %cst_1 = arith.constant 3.1415926535897931 : f64
    %cst_2 = arith.constant 4.471500e-02 : f64
    %cst_3 = arith.constant 1.000000e+00 : f64
    %cst_4 = arith.constant 1.050000e+02 : f64
    %cst_5 = arith.constant 4.500000e+01 : f64
    cf.br ^bb1
  ^bb1:  // pred: ^bb0
    %c0_i32 = arith.constant 0 : i32
    %0 = arith.truncf %cst : f64 to f32
    %1 = arith.mulf %0, %arg0 : f32
    %2 = arith.sitofp %c1_i64 : i64 to f32
    %3 = arith.truncf %cst_0 : f64 to f32
    %4 = arith.sitofp %c2_i64 : i64 to f32
    %5 = arith.truncf %cst_1 : f64 to f32
    %6 = arith.divf %4, %5 : f32
    %7 = math.sqrt %6 : f32
    %8 = arith.truncf %cst_2 : f64 to f32
    %9 = arith.truncf %cst_3 : f64 to f32
    %10 = arith.mulf %arg0, %9 : f32
    %11 = arith.mulf %arg0, %10 : f32
    %12 = arith.mulf %arg0, %11 : f32
    %13 = arith.mulf %8, %12 : f32
    %14 = arith.addf %arg0, %13 : f32
    %15 = arith.mulf %7, %14 : f32
    %16 = arith.mulf %15, %9 : f32
    %17 = arith.mulf %15, %16 : f32
    %18 = arith.mulf %15, %17 : f32
    %19 = arith.mulf %3, %18 : f32
    %20 = arith.truncf %cst_4 : f64 to f32
    %21 = arith.mulf %20, %15 : f32
    %22 = arith.addf %19, %21 : f32
    %23 = arith.mulf %15, %18 : f32
    %24 = arith.truncf %cst_5 : f64 to f32
    %25 = arith.mulf %24, %17 : f32
    %26 = arith.addf %23, %25 : f32
    %27 = arith.addf %26, %20 : f32
    %28 = arith.divf %22, %27 : f32
    %29 = arith.addf %2, %28 : f32
    %30 = arith.mulf %1, %29 : f32
    return %30 : f32
  }
}
time elapsed 2.06ms timing breakdown: 2.06ms: Lowered module
module {
  llvm.func @sqrtf(f32) -> f32 attributes {memory = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, sym_visibility = "private"}
  llvm.func @func(%arg0: f32) -> f32 attributes {llvm.emit_c_interface} {
    %0 = llvm.mlir.constant(4.500000e+01 : f32) : f32
    %1 = llvm.mlir.constant(1.050000e+02 : f32) : f32
    %2 = llvm.mlir.constant(2.000000e+00 : f32) : f32
    %3 = llvm.mlir.constant(1.000000e+01 : f32) : f32
    %4 = llvm.mlir.constant(1.000000e+00 : f32) : f32
    %5 = llvm.mlir.constant(5.000000e-01 : f32) : f32
    %6 = llvm.mlir.constant(3.1415926535897931 : f64) : f64
    %7 = llvm.mlir.constant(4.471500e-02 : f64) : f64
    llvm.br ^bb1
  ^bb1:  // pred: ^bb0
    %8 = llvm.fmul %arg0, %5 : f32
    %9 = llvm.fptrunc %6 : f64 to f32
    %10 = llvm.fdiv %2, %9 : f32
    %11 = llvm.call @sqrtf(%10) : (f32) -> f32
    %12 = llvm.fptrunc %7 : f64 to f32
    %13 = llvm.fmul %arg0, %arg0 : f32
    %14 = llvm.fmul %arg0, %13 : f32
    %15 = llvm.fmul %12, %14 : f32
    %16 = llvm.fadd %arg0, %15 : f32
    %17 = llvm.fmul %11, %16 : f32
    %18 = llvm.fmul %17, %17 : f32
    %19 = llvm.fmul %17, %18 : f32
    %20 = llvm.fmul %19, %3 : f32
    %21 = llvm.fmul %17, %1 : f32
    %22 = llvm.fadd %20, %21 : f32
    %23 = llvm.fmul %17, %19 : f32
    %24 = llvm.fmul %18, %0 : f32
    %25 = llvm.fadd %23, %24 : f32
    %26 = llvm.fadd %25, %1 : f32
    %27 = llvm.fdiv %22, %26 : f32
    %28 = llvm.fadd %27, %4 : f32
    %29 = llvm.fmul %8, %28 : f32
    llvm.return %29 : f32
  }
  llvm.func @_mlir_ciface_func(%arg0: f32) -> f32 attributes {llvm.emit_c_interface} {
    %0 = llvm.call @func(%arg0) : (f32) -> f32
    llvm.return %0 : f32
  }
  llvm.func @ufunc(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64) attributes {llvm.emit_c_interface} {
    %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %1 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %2 = llvm.insertvalue %arg6, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %3 = llvm.insertvalue %arg7, %2[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %4 = llvm.insertvalue %arg8, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %5 = llvm.insertvalue %arg9, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %6 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %7 = llvm.insertvalue %arg0, %6[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %8 = llvm.insertvalue %arg1, %7[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %9 = llvm.insertvalue %arg2, %8[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %10 = llvm.insertvalue %arg3, %9[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %11 = llvm.insertvalue %arg4, %10[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %12 = llvm.mlir.constant(1 : index) : i64
    %13 = llvm.mlir.constant(0 : index) : i64
    %14 = llvm.extractvalue %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    llvm.br ^bb1(%13 : i64)
  ^bb1(%15: i64):  // 2 preds: ^bb0, ^bb2
    %16 = llvm.icmp "slt" %15, %14 : i64
    llvm.cond_br %16, ^bb2, ^bb3
  ^bb2:  // pred: ^bb1
    %17 = llvm.extractvalue %11[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %18 = llvm.getelementptr %17[%15] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %19 = llvm.load %18 : !llvm.ptr -> f32
    %20 = llvm.call @func(%19) : (f32) -> f32
    %21 = llvm.extractvalue %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %22 = llvm.getelementptr %21[%15] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    llvm.store %20, %22 : f32, !llvm.ptr
    %23 = llvm.add %15, %12 : i64
    llvm.br ^bb1(%23 : i64)
  ^bb3:  // pred: ^bb1
    llvm.return
  }
  llvm.func @_mlir_ciface_ufunc(%arg0: !llvm.ptr, %arg1: !llvm.ptr) attributes {llvm.emit_c_interface} {
    %0 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %1 = llvm.extractvalue %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %2 = llvm.extractvalue %0[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %3 = llvm.extractvalue %0[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %4 = llvm.extractvalue %0[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %5 = llvm.extractvalue %0[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %6 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %7 = llvm.extractvalue %6[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %8 = llvm.extractvalue %6[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %9 = llvm.extractvalue %6[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %10 = llvm.extractvalue %6[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %11 = llvm.extractvalue %6[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    llvm.call @ufunc(%1, %2, %3, %4, %5, %7, %8, %9, %10, %11) : (!llvm.ptr, !llvm.ptr, i64, i64, i64, !llvm.ptr, !llvm.ptr, i64, i64, i64) -> ()
    llvm.return
  }
}
time elapsed 3.10ms timing breakdown: 3.10ms: MLIR optimized
Testing report
(array([0.53754056, 0.43233588, 0.10236798, 0.46508574, 0.58549654, 0.8923938 , 0.2757588 , 0.08432309, 0.8884488 , 0.8587762 , 0.6457952 , 0.11737864, 0.9524025 , 0.9533351 , 0.3317381 , 0.1680605 , 0.0680922 , 0.17509076, 0.07749847, 0.972892 , 0.43774995, 0.9594856 , 0.3802283 , 0.4063943 , 0.6109139 , 0.82032937, 0.96378887, 0.21142137, 0.8676044 , 0.8083702 , 0.53161585, 0.36552882, 0.2643556 , 0.00743109, 0.21400109, 0.45045698, 0.87366974, 0.7190363 , 0.54310775, 0.18760236, 0.72153026, 0.61276686, 0.9093058 , 0.91447484, 0.5454792 , 0.5947745 , 0.9831064 , 0.7109356 , 0.15139958, 0.9206733 , 0.8234191 , 0.41100514, 0.445924 , 0.15229666, 0.13950907, 0.19849318, 0.02099547, 0.2286102 , 0.02923151, 0.2675173 , 0.8535502 , 0.43683502, 0.48123544, 0.91435367, 0.8591862 , 0.60497624, 0.6497597 , 0.47559726, 0.745626 , 0.068115 , 0.93472457, 0.32189476, 0.51888883, 0.08371709, 0.08502585, 0.24354592, 0.34441778, 0.91207695, 0.6931497 , 0.3731865 , 0.9115862 , 0.10682532, 0.09896134, 0.53871924, 0.1919585 , 0.24836382, 0.7622515 , 0.9566354 , 0.53254044, 0.7762569 , 0.8494233 , 0.5688198 , 0.61417764, 0.8689236 , 0.5286076 , 0.07920949, 0.7584067 , 0.72011465, 0.7771502 , 0.89118475], dtype=float32),)
[0.3787034 0.28846663 0.05535727 0.31581023 0.42205015 0.72621244 0.16783419 0.0449948 0.7220623 0.69101226 0.47835886 0.06417319 0.78993255 0.7909312 0.2089769 0.09524503 0.03589438 0.09971316 0.04114288 0.8119259 0.2929388 0.7975227 0.24642435 0.26730743 0.44554824 0.65123934 0.8021408 0.12341041 0.70021915 0.63897943 0.37344074 0.23489869 0.15973456 0.00373758 0.12513152 0.30351046 0.70656013 0.5492407 0.38366732 0.10775949 0.55169916 0.44727504 0.74406016 0.74953294 0.3857873 0.43058652 0.8229314 0.54127514 0.0848093 0.75610644 0.6544156 0.27103543 0.2997272 0.08536569 0.07749382 0.11486163 0.01067358 0.13497381 0.0149566 0.16197063 0.6855748 0.2921817 0.3295473 0.7494046 0.6914393 0.44002742 0.482128 0.3247327 0.5755953 0.03590702 0.77105045 0.20157808 0.36220664 0.04465127 0.04539355 0.14520308 0.2186094 0.74699306 0.5238929 0.2408843 0.7464735 0.05795657 0.05338125 0.3797528 0.11058928 0.14853863 0.5922312 0.7944669 0.37426063 0.60633624 0.68128777 0.40682715 0.44859096 0.7015972 0.37077665 0.04210514 0.5883735 0.5503034 0.60723865 0.72494 ]
[0.3787034 0.28846663 0.05535727 0.31581023 0.42205015 0.7262126 0.16783419 0.0449948 0.72206247 0.6910124 0.4783589 0.06417319 0.78993297 0.7909316 0.2089769 0.09524503 0.03589438 0.09971316 0.04114288 0.8119263 0.2929388 0.79752314 0.24642432 0.26730743 0.44554824 0.6512394 0.8021411 0.12341041 0.7002193 0.63897943 0.37344074 0.23489869 0.15973456 0.00373758 0.12513152 0.30351046 0.7065603 0.54924077 0.38366732 0.10775949 0.55169916 0.44727504 0.74406034 0.74953324 0.3857873 0.43058652 0.82293195 0.54127514 0.0848093 0.75610673 0.65441567 0.27103543 0.2997272 0.08536569 0.07749382 0.11486163 0.01067358 0.13497381 0.0149566 0.16197063 0.68557495 0.2921817 0.3295473 0.7494048 0.6914394 0.44002742 0.48212805 0.3247327 0.5755953 0.03590702 0.77105075 0.20157808 0.36220664 0.04465127 0.04539355 0.14520308 0.2186094 0.7469933 0.5238929 0.2408843 0.7464738 0.05795657 0.05338125 0.37975284 0.11058928 0.14853863 0.5922312 0.7944673 0.37426063 0.60633624 0.6812878 0.40682715 0.44859102 0.7015974 0.37077665 0.04210514 0.58837354 0.55030346 0.6072387 0.7249402 ]
Benchmark¶
if __name__ == "__main__":
    input_val = np.random.random(300000).astype(np.float32)
    out = np.zeros_like(input_val)
    print("original")
    %timeit gelu_tanh_forward(input_val)
    print("superoptimized")
    %timeit vectorized_gelu(input_val, out=out)
original
4.35 ms ± 16.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
superoptimized
260 μs ± 3.46 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)