`-convert-math-to-llvm` does not lower `math.tanh` (other math ops lower fine)

I am trying to run an MLIR program:

// NOTE(review): this snippet references #map, which is not defined in the paste —
// presumably the 4-D identity map affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>;
// confirm against the full module.
memref.global "private" @global_seed : memref<i64> = dense<0>
  // forward: a chain of elementwise linalg.generic ops, a batch matmul over a
  // collapsed view, then more elementwise ops — all on 128x64x8x8 f32 tensors.
  // The math.tanh at %9 is the op reported as not lowering.
  func.func @forward(%arg0: tensor<128x64x8x8xf32>) -> tensor<128x64x8x8xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<128x64x8x8xf32>
    // %1 = %arg0 * %arg0 (elementwise square)
    %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg0 : tensor<128x64x8x8xf32>, tensor<128x64x8x8xf32>) outs(%0 : tensor<128x64x8x8xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %16 = arith.mulf %in, %in_0 : f32
      linalg.yield %16 : f32
    } -> tensor<128x64x8x8xf32>
    // %2 = |%1| (elementwise absolute value)
    %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<128x64x8x8xf32>) outs(%0 : tensor<128x64x8x8xf32>) {
    ^bb0(%in: f32, %out: f32):
      %16 = math.absf %in : f32
      linalg.yield %16 : f32
    } -> tensor<128x64x8x8xf32>
    // %3 = sin(%2)
    %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<128x64x8x8xf32>) outs(%0 : tensor<128x64x8x8xf32>) {
    ^bb0(%in: f32, %out: f32):
      %16 = math.sin %in : f32
      linalg.yield %16 : f32
    } -> tensor<128x64x8x8xf32>
    // %4: ReLU-style clamp — yield %in when %in is unordered-greater-than 0.0
    // (i.e. > 0 or NaN), otherwise 0.0.
    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<128x64x8x8xf32>) outs(%0 : tensor<128x64x8x8xf32>) {
    ^bb0(%in: f32, %out: f32):
      %16 = arith.cmpf ugt, %in, %cst : f32
      %17 = arith.select %16, %in, %cst : f32
      linalg.yield %17 : f32
    } -> tensor<128x64x8x8xf32>
    // %5: element-by-element copy of %4, materialized through tensor.extract at
    // the current (i, j, k, l) index.
    %5 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%0 : tensor<128x64x8x8xf32>) {
    ^bb0(%out: f32):
      %16 = linalg.index 0 : index
      %17 = linalg.index 1 : index
      %18 = linalg.index 2 : index
      %19 = linalg.index 3 : index
      %extracted = tensor.extract %4[%16, %17, %18, %19] : tensor<128x64x8x8xf32>
      linalg.yield %extracted : f32
    } -> tensor<128x64x8x8xf32>
    // Collapse 128x64 -> 8192 batch dims, zero-fill an accumulator, and do a
    // batched 8x8 matmul of the collapsed tensor with itself; expand back to 4-D.
    %collapsed = tensor.collapse_shape %5 [[0, 1], [2], [3]] : tensor<128x64x8x8xf32> into tensor<8192x8x8xf32>
    %6 = tensor.empty() : tensor<8192x8x8xf32>
    %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<8192x8x8xf32>) -> tensor<8192x8x8xf32>
    %8 = linalg.batch_matmul ins(%collapsed, %collapsed : tensor<8192x8x8xf32>, tensor<8192x8x8xf32>) outs(%7 : tensor<8192x8x8xf32>) -> tensor<8192x8x8xf32>
    %expanded = tensor.expand_shape %8 [[0, 1], [2], [3]] : tensor<8192x8x8xf32> into tensor<128x64x8x8xf32>
    // %9 = tanh(%expanded) — the op left un-lowered by -convert-math-to-llvm
    // per the report below.
    %9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<128x64x8x8xf32>) outs(%0 : tensor<128x64x8x8xf32>) {
    ^bb0(%in: f32, %out: f32):
      %16 = math.tanh %in : f32
      linalg.yield %16 : f32
    } -> tensor<128x64x8x8xf32>
    // %10 = exp(%9)
    %10 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9 : tensor<128x64x8x8xf32>) outs(%0 : tensor<128x64x8x8xf32>) {
    ^bb0(%in: f32, %out: f32):
      %16 = math.exp %in : f32
      linalg.yield %16 : f32
    } -> tensor<128x64x8x8xf32>
    // %11 = %10 ^ %10 (elementwise powf of the tensor with itself)
    %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<128x64x8x8xf32>, tensor<128x64x8x8xf32>) outs(%0 : tensor<128x64x8x8xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %16 = math.powf %in, %in_0 : f32
      linalg.yield %16 : f32
    } -> tensor<128x64x8x8xf32>
    // %12: same ReLU-style clamp as %4, applied to %11.
    %12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<128x64x8x8xf32>) outs(%0 : tensor<128x64x8x8xf32>) {
    ^bb0(%in: f32, %out: f32):
      %16 = arith.cmpf ugt, %in, %cst : f32
      %17 = arith.select %16, %in, %cst : f32
      linalg.yield %17 : f32
    } -> tensor<128x64x8x8xf32>
    // %13 = %12 + %12 (elementwise doubling)
    %13 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12, %12 : tensor<128x64x8x8xf32>, tensor<128x64x8x8xf32>) outs(%0 : tensor<128x64x8x8xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %16 = arith.addf %in, %in_0 : f32
      linalg.yield %16 : f32
    } -> tensor<128x64x8x8xf32>
    // %14 = cos(%13)
    %14 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<128x64x8x8xf32>) outs(%0 : tensor<128x64x8x8xf32>) {
    ^bb0(%in: f32, %out: f32):
      %16 = math.cos %in : f32
      linalg.yield %16 : f32
    } -> tensor<128x64x8x8xf32>
    // %15 = sin(%14), the function result.
    %15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14 : tensor<128x64x8x8xf32>) outs(%0 : tensor<128x64x8x8xf32>) {
    ^bb0(%in: f32, %out: f32):
      %16 = math.sin %in : f32
      linalg.yield %16 : f32
    } -> tensor<128x64x8x8xf32>
    return %15 : tensor<128x64x8x8xf32>
  }

When trying to run it using mlir-cpu-runner, I am facing this error: `error: Dialect 'math' not found for custom op 'math.tanh'`.

It turns out that math.tanh is not getting lowered with the other math operations after using -convert-math-to-llvm.
The following is the full lowering passes applied:
-loop-invariant-code-motion -cse -canonicalize -cse -eliminate-empty-tensors -empty-tensor-to-alloc-tensor --one-shot-bufferize="bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" -convert-linalg-to-loops -convert-vector-to-scf -canonicalize -lower-affine -expand-strided-metadata -finalize-memref-to-llvm -convert-scf-to-cf -lower-affine -convert-math-to-llvm -convert-arith-to-llvm -convert-vector-to-llvm -convert-cf-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts

Is this something on my end? Am I missing a lowering pass?
Any help is appreciated.

You also need the `-convert-math-to-libm` pass. `-convert-math-to-llvm` only lowers the math ops that map onto LLVM intrinsics; `math.tanh` is not one of them, so it is left behind in the math dialect — hence the "Dialect 'math' not found" error once everything else is LLVM. `-convert-math-to-libm` rewrites the remaining ops (like `math.tanh`) into calls to the C math library (`tanhf`/`tanh`). Add it to your pipeline before `-convert-func-to-llvm` (e.g. right next to `-convert-math-to-llvm`), and make sure the runner links a runtime providing libm.