diff --git a/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.cubin b/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..6f17df72bc1f83d928266d2fd4d686b44fb68eae
Binary files /dev/null and b/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.cubin differ
diff --git a/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttgir b/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..0b5f40b5e7eb9129cbd52fb5bfd25419609a39a6
--- /dev/null
+++ b/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttgir
@@ -0,0 +1,125 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<512> : tensor<2x1xi32, #blocked>
+    %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
+    %cst_1 = arith.constant dense<256> : tensor<1x256xi32, #blocked1>
+    %cst_2 = arith.constant dense<256> : tensor<2x1xi32, #blocked>
+    %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
+    %cst_5 = arith.constant dense<256> : tensor<2x1xi64, #blocked>
+    %cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked>
+    %cst_7 = arith.constant dense<0> : tensor<2x1xi64, #blocked>
+    %cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked2>
+    %cst_9 = arith.constant dense<50257> : tensor<2x1xi64, #blocked2>
+    %cst_10 = arith.constant 0.000000e+00 : f32
+    %cst_11 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked>
+    %cst_12 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked>
+    %cst_13 = arith.constant dense<0.000000e+00> : tensor<2x256xf32, #blocked>
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked1>
+    %c2_i32 = arith.constant 2 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c2_i32 : i32
+    %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
+    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked>
+    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<2x1xi32, #blocked2>
+    %6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked>
+    %7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked2>
+    %8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked>
+    %9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked2>
+    %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+    %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
+    %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
+    %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x256xi32, #blocked1>
+    %14 = tt.splat %arg0 : (!tt.ptr<i64>) -> tensor<2x1x!tt.ptr<i64>, #blocked>
+    %15 = tt.splat %arg0 : (!tt.ptr<i64>) -> tensor<2x1x!tt.ptr<i64>, #blocked2>
+    %16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr<i64>, #blocked>, tensor<2x1xi32, #blocked>
+    %17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr<i64>, #blocked2>, tensor<2x1xi32, #blocked2>
+    %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked>
+    %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked2>
+    %20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked>
+    %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
+    %22 = arith.cmpi slt, %13, %cst_1 : tensor<1x256xi32, #blocked1>
+    %23 = arith.muli %20, %cst_2 : tensor<2x1xi32, #blocked>
+    %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<2x256xi32, #blocked>
+    %25 = tt.broadcast %23 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
+    %26 = arith.addi %24, %25 : tensor<2x256xi32, #blocked>
+    %27 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<2x256x!tt.ptr<f32>, #blocked>
+    %28 = tt.addptr %27, %26 : tensor<2x256x!tt.ptr<f32>, #blocked>, tensor<2x256xi32, #blocked>
+    %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<2x256xi1, #blocked>
+    %30 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
+    %31 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked>
+    %32 = arith.addi %19, %cst_9 : tensor<2x1xi64, #blocked2>
+    %33 = arith.cmpi slt, %18, %cst_7 : tensor<2x1xi64, #blocked>
+    %34 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked2>
+    %35 = arith.select %33, %31, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked>
+    %36 = arith.select %34, %32, %19 : tensor<2x1xi1, #blocked2>, tensor<2x1xi64, #blocked2>
+    %37 = arith.cmpi sge, %36, %cst_8 : tensor<2x1xi64, #blocked2>
+    %38 = arith.cmpi slt, %36, %cst_9 : tensor<2x1xi64, #blocked2>
+    %39 = arith.andi %37, %38 : tensor<2x1xi1, #blocked2>
+    tt.assert %39, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
+    %40 = arith.muli %35, %cst_5 : tensor<2x1xi64, #blocked>
+    %41 = tt.broadcast %40 : (tensor<2x1xi64, #blocked>) -> tensor<2x256xi64, #blocked>
+    %42 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
+    %43 = tt.broadcast %42 : (tensor<1x256xi64, #blocked>) -> tensor<2x256xi64, #blocked>
+    %44 = arith.addi %43, %41 : tensor<2x256xi64, #blocked>
+    %45 = tt.splat %arg1 : (!tt.ptr<f32>) -> tensor<2x256x!tt.ptr<f32>, #blocked>
+    %46 = tt.addptr %45, %44 : tensor<2x256x!tt.ptr<f32>, #blocked>, tensor<2x256xi64, #blocked>
+    %47 = tt.load %46, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
+    %48 = arith.addf %47, %30 : tensor<2x256xf32, #blocked>
+    %49 = arith.addf %48, %cst_13 : tensor<2x256xf32, #blocked>
+    %50 = arith.subf %48, %49 : tensor<2x256xf32, #blocked>
+    %51 = arith.mulf %48, %50 : tensor<2x256xf32, #blocked>
+    %52 = arith.addf %51, %cst_13 : tensor<2x256xf32, #blocked>
+    %53 = arith.select %29, %49, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
+    %54 = arith.select %29, %52, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
+    %55 = arith.select %21, %cst_3, %cst_4 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
+    %56 = tt.broadcast %55 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
+    %57:3 = "tt.reduce"(%53, %54, %56) <{axis = 1 : i32}> ({
+    ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
+      %82 = arith.subf %arg10, %arg7 : f32
+      %83 = arith.addf %arg9, %arg12 : f32
+      %84 = arith.cmpf oeq, %83, %cst_10 : f32
+      %85 = arith.divf %arg12, %83 : f32
+      %86 = arith.select %84, %cst_10, %85 : f32
+      %87 = arith.mulf %82, %86 : f32
+      %88 = arith.addf %arg7, %87 : f32
+      %89 = arith.addf %arg8, %arg11 : f32
+      %90 = arith.mulf %82, %82 : f32
+      %91 = arith.mulf %90, %arg9 : f32
+      %92 = arith.mulf %91, %86 : f32
+      %93 = arith.addf %89, %92 : f32
+      tt.reduce.return %88, %93, %83 : f32, f32, f32
+    }) : (tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
+    %58 = tt.expand_dims %57#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
+    %59 = tt.expand_dims %57#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
+    %60 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
+    %61 = tt.splat %arg3 : (!tt.ptr<f32>) -> tensor<1x256x!tt.ptr<f32>, #blocked1>
+    %62 = tt.addptr %61, %13 : tensor<1x256x!tt.ptr<f32>, #blocked1>, tensor<1x256xi32, #blocked1>
+    %63 = tt.load %62, %22, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked1>
+    tt.assert %39, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
+    %64 = tt.load %46, %29, %cst_13 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
+    %65 = arith.addf %64, %60 : tensor<2x256xf32, #blocked>
+    %66 = tt.broadcast %58 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
+    %67 = arith.subf %65, %66 : tensor<2x256xf32, #blocked>
+    %68 = arith.divf %59, %cst_12 : tensor<2x1xf32, #blocked>
+    %69 = arith.addf %68, %cst_11 : tensor<2x1xf32, #blocked>
+    %70 = tt.extern_elementwise %69 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked>
+    %71 = tt.broadcast %70 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
+    %72 = arith.mulf %67, %71 : tensor<2x256xf32, #blocked>
+    %73 = triton_gpu.convert_layout %63 : (tensor<1x256xf32, #blocked1>) -> tensor<1x256xf32, #blocked>
+    %74 = tt.broadcast %73 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
+    %75 = arith.mulf %72, %74 : tensor<2x256xf32, #blocked>
+    %76 = arith.muli %8, %cst_2 : tensor<2x1xi32, #blocked>
+    %77 = tt.broadcast %76 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
+    %78 = arith.addi %24, %77 : tensor<2x256xi32, #blocked>
+    %79 = tt.splat %arg4 : (!tt.ptr<bf16>) -> tensor<2x256x!tt.ptr<bf16>, #blocked>
+    %80 = tt.addptr %79, %78 : tensor<2x256x!tt.ptr<bf16>, #blocked>, tensor<2x256xi32, #blocked>
+    %81 = arith.truncf %75 : tensor<2x256xf32, #blocked> to tensor<2x256xbf16, #blocked>
+    tt.store %80, %81, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ptx b/.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..878f5f2999862ec7cfc2b979db9c71f4ee0b2648
--- /dev/null
+++ b/.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ptx
@@ -0,0 +1,782 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+    // .globl triton__0d1d2d3d4d5d6d7d8de9de
+.extern .shared .align 1 .b8 global_smem[];
+
+.visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
+    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
+    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
+    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
+    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
+    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
+    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
+    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
+    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
+    .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
+    .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
+)
+.maxntid 64, 1, 1
+{
+    .reg .pred %p<45>;
+    .reg .b16 %rs<5>;
+    .reg .b32 %r<106>;
+    .reg .f32 %f<90>;
+    .reg .b64 %rd<44>;
+    .loc 1 18 0
+$L__func_begin0:
+    .loc 1 18 0
+
+    ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
+    ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
+$L__tmp0:
+    .loc 1 26 26
+    mov.u32 %r74, %tid.x;
+    and.b32 %r75, %r74, 31;
+    ld.param.u64 %rd27, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
+    ld.param.u64 %rd28, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
+    ld.param.u64 %rd29, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
+    shl.b32 %r76, %r74, 2;
+    ld.param.u64 %rd30, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
+    and.b32 %r77, %r76, 252;
+    ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
+    ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
+    .loc 1 23 28
+    mov.u32 %r1, %ctaid.x;
+    .loc 1 30 40
+    shl.b32 %r78, %r1, 8;
+    .loc 1 30 36
+    or.b32 %r79, %r78, %r77;
+    .loc 1 30 30
+    mul.wide.s32 %rd33, %r79, 2;
+    add.s64 %rd1, %rd26, %rd33;
+    mov.b32 %r4, 0;
+    mov.pred %p1, -1;
+    .loc 1 30 46
+    mov.u32 %r2, 0x0;
+    mov.u32 %r3, 0x0;
+    @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
+    @!%p1 mov.u32 %r2, %r4;
+    @!%p1 mov.u32 %r3, %r4;
+    cvt.u16.u32 %rs1, %r2;
+    { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
+    cvt.u16.u32 %rs3, %r3;
+    { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
+    .loc 1 30 67
+    cvt.f32.bf16 %r6, %rs1;
+    mov.b32 %f1, %r6;
+    cvt.f32.bf16 %r7, %rs2;
+    mov.b32 %f2, %r7;
+    cvt.f32.bf16 %r8, %rs3;
+    mov.b32 %f3, %r8;
+    cvt.f32.bf16 %r9, %rs4;
+    mov.b32 %f4, %r9;
+    .loc 1 31 30
+    cvt.u64.u32 %rd34, %r77;
+    mul.wide.u32 %rd35, %r77, 4;
+    add.s64 %rd2, %rd27, %rd35;
+    .loc 1 31 35
+    mov.u32 %r10, 0x0;
+    mov.u32 %r11, 0x0;
+    mov.u32 %r12, 0x0;
+    mov.u32 %r13, 0x0;
+    @%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
+    @!%p1 mov.u32 %r10, %r4;
+    @!%p1 mov.u32 %r11, %r4;
+    @!%p1 mov.u32 %r12, %r4;
+    @!%p1 mov.u32 %r13, %r4;
+    mov.b32 %f5, %r10;
+    mov.b32 %f6, %r11;
+    mov.b32 %f7, %r12;
+    mov.b32 %f8, %r13;
+    .loc 1 32 30
+    mul.wide.s32 %rd36, %r79, 4;
+    add.s64 %rd3, %rd28, %rd36;
+    .loc 1 32 46
+    mov.u32 %r18, 0x0;
+    mov.u32 %r19, 0x0;
+    mov.u32 %r20, 0x0;
+    mov.u32 %r21, 0x0;
+    @%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
+    @!%p1 mov.u32 %r18, %r4;
+    @!%p1 mov.u32 %r19, %r4;
+    @!%p1 mov.u32 %r20, %r4;
+    @!%p1 mov.u32 %r21, %r4;
+    mov.b32 %f9, %r18;
+    mov.b32 %f10, %r19;
+    mov.b32 %f11, %r20;
+    mov.b32 %f12, %r21;
+    .loc 1 33 30
+    mul.wide.s32 %rd37, %r1, 4;
+    add.s64 %rd4, %rd29, %rd37;
+    .loc 1 33 35
+    mov.u32 %r26, 0x0;
+    @%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ];
+    mov.b32 %f13, %r26;
+    mov.u32 %r27, 0x0;
+    @%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ];
+    mov.u32 %r28, 0x0;
+    @%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ];
+    mov.u32 %r29, 0x0;
+    @%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ];
+    .loc 1 34 31
+    add.s64 %rd8, %rd30, %rd37;
+    .loc 1 34 36
+    mov.u32 %r55, 0x0;
+    @%p1 ld.global.L1::evict_last.b32 { %r55 }, [ %rd8 + 0 ];
+    mov.b32 %f14, %r55;
+    mov.u32 %r31, 0x0;
+    @%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ];
+    mov.u32 %r32, 0x0;
+    @%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ];
+    mov.u32 %r33, 0x0;
+    @%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ];
+    .loc 1 35 31
+    mul.wide.s32 %rd38, %r1, 8;
+    add.s64 %rd13, %rd31, %rd38;
+    .loc 1 35 36
+    mov.u64 %rd12, 0x0;
+    @%p1 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd13 + 0 ];
+    mov.u64 %rd14, 0x0;
+    @%p1 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd13 + 0 ];
+    mov.u64 %rd16, 0x0;
+    @%p1 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd13 + 0 ];
+    mov.u64 %rd18, 0x0;
+    @%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd13 + 0 ];
+    .loc 1 36 35
+    add.s64 %rd20, %rd25, %rd36;
+    .loc 1 36 51
+    mov.u32 %r34, 0x0;
+    mov.u32 %r35, 0x0;
+    mov.u32 %r36, 0x0;
+    mov.u32 %r37, 0x0;
+    @%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd20 + 0 ];
+    @!%p1 mov.u32 %r34, %r4;
+    @!%p1 mov.u32 %r35, %r4;
+    @!%p1 mov.u32 %r36, %r4;
+    @!%p1 mov.u32 %r37, %r4;
+    mov.b32 %f15, %r34;
+    mov.b32 %f16, %r35;
+    mov.b32 %f17, %r36;
+    mov.b32 %f18, %r37;
+    .loc 1 38 18
+    mul.f32 %f19, %f1, %f5;
+    mul.f32 %f20, %f2, %f6;
+    mul.f32 %f21, %f3, %f7;
+    mul.f32 %f22, %f4, %f8;
+$L__tmp1:
+    .loc 2 233 15
+    fma.rn.f32 %f23, %f1, %f5, %f20;
+    fma.rn.f32 %f24, %f3, %f7, %f23;
+    fma.rn.f32 %f25, %f4, %f8, %f24;
+$L__tmp2:
+    .loc 2 243 36
+    mov.b32 %r80, %f25;
+    shfl.sync.bfly.b32 %r81, %r80, 16, 31, -1;
+    mov.b32 %f26, %r81;
+$L__tmp3:
+    .loc 2 233 15
+    add.f32 %f27, %f25, %f26;
+$L__tmp4:
+    .loc 2 243 36
+    mov.b32 %r82, %f27;
+    shfl.sync.bfly.b32 %r83, %r82, 8, 31, -1;
+    mov.b32 %f28, %r83;
+$L__tmp5:
+    .loc 2 233 15
+    add.f32 %f29, %f27, %f28;
+$L__tmp6:
+    .loc 2 243 36
+    mov.b32 %r84, %f29;
+    shfl.sync.bfly.b32 %r85, %r84, 4, 31, -1;
+    mov.b32 %f30, %r85;
+$L__tmp7:
+    .loc 2 233 15
+    add.f32 %f31, %f29, %f30;
+$L__tmp8:
+    .loc 2 243 36
+    mov.b32 %r86, %f31;
+    shfl.sync.bfly.b32 %r87, %r86, 2, 31, -1;
+    mov.b32 %f32, %r87;
+$L__tmp9:
+    .loc 2 233 15
+    add.f32 %f33, %f31, %f32;
+$L__tmp10:
+    .loc 2 243 36
+    mov.b32 %r88, %f33;
+    shfl.sync.bfly.b32 %r89, %r88, 1, 31, -1;
+    mov.b32 %f34, %r89;
+$L__tmp11:
+    .loc 2 233 15
+    add.f32 %f35, %f33, %f34;
+$L__tmp12:
+    .loc 2 243 36
+    setp.eq.s32 %p31, %r75, 0;
+    shr.u32 %r90, %r74, 3;
+    and.b32 %r91, %r90, 4;
+    mov.u32 %r92, global_smem;
+    add.s32 %r42, %r92, %r91;
+    mov.b32 %r43, %f35;
+    @%p31 st.shared.b32 [ %r42 + 0 ], %r43;
+    bar.sync 0;
+    setp.lt.s32 %p32, %r74, 2;
+    add.s32 %r45, %r92, %r76;
+    @%p32 ld.shared.b32 %r44, [ %r45 + 0 ];
+    mov.b32 %f36, %r44;
+    shfl.sync.bfly.b32 %r93, %r44, 1, 31, -1;
+    mov.b32 %f37, %r93;
+$L__tmp13:
+    .loc 2 233 15
+    add.f32 %f38, %f36, %f37;
+$L__tmp14:
+    .loc 2 243 36
+    and.b32 %r94, %r74, 1;
+    setp.eq.b32 %p41, %r94, 1;
+    not.pred %p42, %p41;
+    and.pred %p33, %p32, %p42;
+    mov.b32 %r47, %f38;
+    @%p33 st.shared.b32 [ %r45 + 0 ], %r47;
+    bar.sync 0;
+    ld.shared.f32 %f39, [global_smem];
+$L__tmp15:
+    .loc 3 8 15
+    add.f32 %f40, %f39, 0f00000000;
+$L__tmp16:
+    .loc 1 42 19
+    sub.f32 %f41, %f9, %f13;
+    sub.f32 %f42, %f10, %f13;
+    sub.f32 %f43, %f11, %f13;
+    sub.f32 %f44, %f12, %f13;
+    .loc 1 43 20
+    mul.f32 %f45, %f41, %f14;
+    mul.f32 %f46, %f42, %f14;
+    mul.f32 %f47, %f43, %f14;
+    mul.f32 %f48, %f44, %f14;
+    .loc 1 44 19
+    mul.f32 %f49, %f20, %f46;
+$L__tmp17:
+    .loc 2 243 36
+    bar.sync 0;
+$L__tmp18:
+    .loc 2 233 15
+    fma.rn.f32 %f50, %f19, %f45, %f49;
+    fma.rn.f32 %f51, %f21, %f47, %f50;
+    fma.rn.f32 %f52, %f22, %f48, %f51;
+$L__tmp19:
+    .loc 2 243 36
+    mov.b32 %r95, %f52;
+    shfl.sync.bfly.b32 %r96, %r95, 16, 31, -1;
+    mov.b32 %f53, %r96;
+$L__tmp20:
+    .loc 2 233 15
+    add.f32 %f54, %f52, %f53;
+$L__tmp21:
+    .loc 2 243 36
+    mov.b32 %r97, %f54;
+    shfl.sync.bfly.b32 %r98, %r97, 8, 31, -1;
+    mov.b32 %f55, %r98;
+$L__tmp22:
+    .loc 2 233 15
+    add.f32 %f56, %f54, %f55;
+$L__tmp23:
+    .loc 2 243 36
+    mov.b32 %r99, %f56;
+    shfl.sync.bfly.b32 %r100, %r99, 4, 31, -1;
+    mov.b32 %f57, %r100;
+$L__tmp24:
+    .loc 2 233 15
+    add.f32 %f58, %f56, %f57;
+$L__tmp25:
+    .loc 2 243 36
+    mov.b32 %r101, %f58;
+    shfl.sync.bfly.b32 %r102, %r101, 2, 31, -1;
+    mov.b32 %f59, %r102;
+$L__tmp26:
+    .loc 2 233 15
+    add.f32 %f60, %f58, %f59;
+$L__tmp27:
+    .loc 2 243 36
+    mov.b32 %r103, %f60;
+    shfl.sync.bfly.b32 %r104, %r103, 1, 31, -1;
+    mov.b32 %f61, %r104;
+$L__tmp28:
+    .loc 2 233 15
+    add.f32 %f62, %f60, %f61;
+$L__tmp29:
+    .loc 2 243 36
+    mov.b32 %r49, %f62;
+    @%p31 st.shared.b32 [ %r42 + 0 ], %r49;
+    bar.sync 0;
+    @%p32 ld.shared.b32 %r50, [ %r45 + 0 ];
+    mov.b32 %f63, %r50;
+    shfl.sync.bfly.b32 %r105, %r50, 1, 31, -1;
+    mov.b32 %f64, %r105;
+$L__tmp30:
+    .loc 2 233 15
+    add.f32 %f65, %f63, %f64;
+$L__tmp31:
+    .loc 2 243 36
+    mov.b32 %r53, %f65;
+    @%p33 st.shared.b32 [ %r45 + 0 ], %r53;
+    bar.sync 0;
+    ld.shared.f32 %f66, [global_smem];
+$L__tmp32:
+    .loc 3 8 15
+    add.f32 %f67, %f66, 0f00000000;
+$L__tmp33:
+    .loc 1 49 21
+    setp.eq.s64 %p43, %rd12, -1;
+    mov.b32 %r56, 1132462080;
+    .loc 1 51 20
+    div.full.f32 %r54, %r55, %r56;
+    mov.b32 %f68, %r54;
+    .loc 1 53 20
+    neg.f32 %f69, %f40;
+    fma.rn.f32 %f70, %f19, 0f43800000, %f69;
+    fma.rn.f32 %f71, %f20, 0f43800000, %f69;
+    fma.rn.f32 %f72, %f21, 0f43800000, %f69;
+    fma.rn.f32 %f73, %f22, 0f43800000, %f69;
+    .loc 1 55 20
+    neg.f32 %f74, %f45;
+    fma.rn.f32 %f75, %f74, %f67, %f70;
+    neg.f32 %f76, %f46;
+    fma.rn.f32 %f77, %f76, %f67, %f71;
+    neg.f32 %f78, %f47;
+    fma.rn.f32 %f79, %f78, %f67, %f72;
+    neg.f32 %f80, %f48;
+    fma.rn.f32 %f81, %f80, %f67, %f73;
+    .loc 1 57 20
+    fma.rn.f32 %f82, %f68, %f75, %f15;
+    fma.rn.f32 %f83, %f68, %f77, %f16;
+    fma.rn.f32 %f84, %f68, %f79, %f17;
+    fma.rn.f32 %f85, %f68, %f81, %f18;
+    .loc 1 59 35
+    selp.f32 %f86, 0f00000000, %f82, %p43;
+    selp.f32 %f87, 0f00000000, %f83, %p43;
+    selp.f32 %f88, 0f00000000, %f84, %p43;
+    selp.f32 %f89, 0f00000000, %f85, %p43;
+    .loc 1 61 20
+    setp.lt.s64 %p44, %rd12, 0;
+    .loc 1 63 56
+    shl.b64 %rd39, %rd12, 8;
+    add.s64 %rd40, %rd39, 12865792;
+    selp.b64 %rd41, %rd40, %rd39, %p44;
+    .loc 1 63 52
+    or.b64 %rd42, %rd41, %rd34;
+    .loc 1 63 30
+    shl.b64 %rd43, %rd42, 2;
+    add.s64 %rd21, %rd32, %rd43;
+    add.s64 %rd22, %rd21, 4;
+    add.s64 %rd23, %rd21, 8;
+    add.s64 %rd24, %rd21, 12;
+    .loc 1 63 83
+    mov.b32 %r67, %f86;
+    mov.u32 %r66, 0x0;
+    @%p1 atom.global.gpu.acq_rel.add.f32 %r66, [ %rd21 + 0 ], %r67;
+    mov.b32 %r69, %f87;
+    mov.u32 %r68, 0x0;
+    @%p1 atom.global.gpu.acq_rel.add.f32 %r68, [ %rd22 + 0 ], %r69;
+    mov.b32 %r71, %f88;
+    mov.u32 %r70, 0x0;
+    @%p1 atom.global.gpu.acq_rel.add.f32 %r70, [ %rd23 + 0 ], %r71;
+    mov.b32 %r73, %f89;
+    mov.u32 %r72, 0x0;
+    @%p1 atom.global.gpu.acq_rel.add.f32 %r72, [ %rd24 + 0 ], %r73;
+    .loc 1 63 4
+    ret;
+$L__tmp34:
+$L__func_end0:
+
+}
+    .file 1 "/tmp/torchinductor_root/qr/cqryxm46jcxyr3qdktqirn53eap7h3pjjqiqavyqqyvflabjpvmd.py"
+    .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+    .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+    .section .debug_abbrev
+    {
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+    }
+    .section .debug_info
+    {
+.b32 407
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 113
+.b8 114
+.b8 121
+.b8 120
+.b8 109
+.b8 52
+.b8 54
+.b8 106
+.b8 99
+.b8 120
+.b8 121
+.b8 114
+.b8 51
+.b8 113
+.b8 100
+.b8 107
+.b8 116
+.b8 113
+.b8 105
+.b8 114
+.b8 110
+.b8 53
+.b8 51
+.b8 101
+.b8 97
+.b8 112
+.b8 55
+.b8 104
+.b8 51
+.b8 112
+.b8 106
+.b8 106
+.b8 113
+.b8 105
+.b8 113
+.b8 97
+.b8 118
+.b8 121
+.b8 113
+.b8 113
+.b8 121
+.b8 118
+.b8 102
+.b8 108
+.b8 97
+.b8 98
+.b8 106
+.b8 112
+.b8 118
+.b8 109
+.b8 100
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 113
+.b8 114
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 101
+.b8 57
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 101
+.b8 57
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 41
+.b8 57
+.b8 5
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 41
+.b8 57
+.b8 5
+.b32 125
+.b64 $L__tmp15
+.b64 $L__tmp16
+.b8 3
+.b8 41
+.b8 44
+.b8 5
+.b32 125
+.b64 $L__tmp17
+.b64 $L__tmp32
+.b8 2
+.b8 47
+.b8 59
+.b8 4
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 47
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp32
+.b64 $L__tmp33
+.b8 3
+.b8 47
+.b8 45
+.b8 0
+.b8 0
+    }
+    .section .debug_pubnames
+    {
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 411
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 101
+.b8 57
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+    }
+    .section .debug_pubtypes
+    {
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 411
+.b32 0
+$L__pubTypes_end0:
+    }
+    .section .debug_loc { }
diff --git a/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.cubin b/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..a507cf313a4764eede03b5cd03f0cc9917090f2a
Binary files /dev/null and b/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.cubin differ
diff --git a/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.llir b/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..d034eca6434698ea4f1637ab108ae64b71fc8f55
--- /dev/null
+++ b/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.llir
@@ -0,0 +1,156 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %11 = lshr i32 %10, 2, !dbg !8
+  %12 = and i32 %11, 63, !dbg !8
+  %13 = and i32 %10, 3, !dbg !9
+  %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !10
+  %15 = sext i32 %14 to i64, !dbg !11
+  %16 = shl nsw i64 %15, 6, !dbg !12
+  %17 = zext nneg i32 %12 to i64
+  %18 = or i64 %16, %17, !dbg !13
+  %19 = getelementptr i64, ptr addrspace(1) %1, i64 %18, !dbg !14
+  %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #2, !dbg !15
+  %21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #2, !dbg !16
+  %22 = bitcast i32 %21 to float, !dbg !16
+  %23 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #2, !dbg !17
+  %24 = bitcast i32 %23 to float, !dbg !17
+  %25 = mul nsw i64 %18, 50257, !dbg !18
+  %.not = icmp eq i64 %20, -1, !dbg !19
+  %26 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %22, float %24) #2, !dbg !20
+  %27 = select i1 %.not, float 0.000000e+00, float %26, !dbg !21
+  %28 = getelementptr float, ptr addrspace(1) %0, i64 %25
+  br label %29, !dbg !22
+
+29:                                               ; preds = %9, %29
+  %30 = phi float [ 0.000000e+00, %9 ], [ %40, %29 ]
+  %31 = phi i32 [ 0, %9 ], [ %41, %29 ]
+  %32 = or i32 %31, %13, !dbg !23
+  %33 = zext nneg i32 %32 to i64, !dbg !23
+  %34 = icmp ult i32 %32, 50257, !dbg !24
+  %35 = getelementptr float, ptr addrspace(1) %28, i64 %33, !dbg !25
+  %36 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %35, i1 %34, i32 0, i1 %34) #2, !dbg !26
+  %37 = bitcast i32 %36 to float, !dbg !26
+  %38 = fmul float %27, %37, !dbg !27
+  %39 = select i1 %34, float %38, float -0.000000e+00, !dbg !28
+  %40 = fadd float %30, %39, !dbg !28
+  %41 = add nuw nsw i32 %31, 4, !dbg !22
+  %42 = icmp ult i32 %31, 50253, !dbg !22
+  br i1 %42, label %29, label %43, !dbg !22
+
+43:                                               ; preds = %29
+  %44 = bitcast float %40 to i32, !dbg !29
+  %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 2, i32 31), !dbg !29
+  %46 = bitcast i32 %45 to float, !dbg !29
+  %47 = fadd float %40, %46, !dbg !33
+  %48 = bitcast float %47 to i32, !dbg !29
+  %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 1, i32 31), !dbg !29
+  %50 = bitcast i32 %49 to float, !dbg !29
+  %51 = fadd float %47, %50, !dbg !33
+  br label %52, !dbg !37
+
+52:                                               ; preds = %43, %52
+  %53 = phi i32 [ 0, %43 ], [ %75, %52 ]
+  %54 = or i32 %53, %13, !dbg !38
+  %55 = zext nneg i32 %54 to i64, !dbg !38
+  %56 = icmp ult i32 %54, 50257, !dbg !39
+  %57 = add nsw i64 %25, %55, !dbg !40
+  %58 = getelementptr i16, ptr addrspace(1) %4, i64 %57, !dbg !41
+  %59 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %58, i1 %56, i16 0, i1 %56) #2, !dbg !42
+  %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #2, !dbg !43
+  %61 = getelementptr float, ptr addrspace(1) %0, i64 %57, !dbg !44
+  %62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %61, i1 %56, i32 0, i1 %56) #2, !dbg !45
+  %63 = bitcast i32 %62 to float, !dbg !45
+  %64 = getelementptr i16, ptr addrspace(1) %5, i64 %57, !dbg !46
+  %65 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %64, i1 %56, i16 0, i1 %56) #2, !dbg !47
+  %66 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %65) #2, !dbg !48
+  %67 = fmul float %27, %63, !dbg !49
+  %68 = fmul float %66, 0x3FF7154760000000, !dbg !50
+  %69 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %68) #2, !dbg !50
+  %70 = fmul float %51, %69, !dbg !51
+  %71 = fsub float %67, %70, !dbg !52
+  %72 = fadd float %60, %71, !dbg !53
+  %73 = getelementptr i16, ptr addrspace(1) %6, i64 %57, !dbg !54
+  %74 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %72) #2, !dbg !55
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %74, ptr addrspace(1) %73, i1 %56) #2, !dbg !55
+  %75 = add nuw nsw i32 %53, 4, !dbg !37
+  %76 = icmp ult i32 %53, 50253, !dbg !37
+  br i1 %76, label %52, label %77, !dbg !37
+
+77:                                               ; preds = %52
+  ret void, !dbg !56
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
+!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
+!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 22, column: 44, scope: !5)
+!9 = !DILocation(line: 24, column: 33, scope: !5)
+!10 = !DILocation(line: 21, column: 28, scope: !5)
+!11 = !DILocation(line: 21, column: 34, scope: !5)
+!12 = !DILocation(line: 21, column: 46, scope: !5)
+!13 = !DILocation(line: 22, column: 23, scope: !5)
+!14 = !DILocation(line: 26, column: 30, scope: !5)
+!15 = !DILocation(line: 26, column: 35, scope: !5)
+!16 = !DILocation(line: 27, column: 19, scope: !5)
+!17 = !DILocation(line: 29, column: 19, scope: !5)
+!18 = !DILocation(line: 36, column: 46, scope: !5)
+!19 = !DILocation(line: 38, column: 23, scope: !5)
+!20 = !DILocation(line: 39, column: 22, scope: !5)
+!21 = !DILocation(line: 41, column: 37, scope: !5)
+!22 = !DILocation(line: 32, column: 36, scope: !5)
+!23 = !DILocation(line: 33, column: 27, scope: !5)
+!24 = !DILocation(line: 34, column: 25, scope: !5)
+!25 = !DILocation(line: 36, column: 34, scope: !5)
+!26 = !DILocation(line: 36, column: 52, scope: !5)
+!27 = !DILocation(line: 42, column: 23, scope: !5)
+!28 = !DILocation(line: 45, column: 40, scope: !5)
+!29 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32)
+!30 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
+!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!32 = !DILocation(line: 46, column: 27, scope: !30)
+!33 = !DILocation(line: 233, column: 15, scope: !34, inlinedAt: !35)
+!34 = distinct !DILexicalBlockFile(scope: !30, file: !31, discriminator: 0)
+!35 = !DILocation(line: 243, column: 36, scope: !34, inlinedAt: !36)
+!36 = !DILocation(line: 46, column: 27, scope: !34)
+!37 = !DILocation(line: 51, column: 36, scope: !5)
+!38 = !DILocation(line: 52, column: 27, scope: !5)
+!39 = !DILocation(line: 53, column: 25, scope: !5)
+!40 = !DILocation(line: 55, column: 41, scope: !5)
+!41 = !DILocation(line: 55, column: 35, scope: !5)
+!42 = !DILocation(line: 55, column: 53, scope: !5)
+!43 = !DILocation(line: 55, column: 105, scope: !5)
+!44 = !DILocation(line: 56, column: 35, scope: !5)
+!45 = !DILocation(line: 56, column: 53, scope: !5)
+!46 = !DILocation(line: 57, column: 35, scope: !5)
+!47 = !DILocation(line: 57, column: 53, scope: !5)
+!48 = !DILocation(line: 57, column: 105, scope: !5)
+!49 = !DILocation(line: 63, column: 24, scope: !5)
+!50 = !DILocation(line: 65, column: 23, scope: !5)
+!51 = !DILocation(line: 66, column: 24, scope: !5)
+!52 = !DILocation(line: 67, column: 24, scope: !5)
+!53 = !DILocation(line: 69, column: 24, scope: !5)
+!54 = !DILocation(line: 70, column: 29, scope: !5)
+!55 = !DILocation(line: 70, column: 54, scope: !5)
+!56 = !DILocation(line: 51, column: 4, scope: !5)
diff --git a/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ptx b/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..eddd3a9bbcda244c1e45dfdc008de58b4d748e7a
--- /dev/null
+++ b/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ptx
@@ -0,0 +1,525 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+    // .globl triton__0d1d2d3d4d5d6d7de8
+
+.visible .entry triton__0d1d2d3d4d5d6d7de8(
+    .param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
+    .param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
+    .param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
+    .param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
+    .param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
+    .param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
+    .param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
+    .param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
+    .param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
+)
+.maxntid 256, 1, 1
+{
+    .reg .pred %p<16>;
+    .reg .b16 %rs<9>;
+    .reg .b32 %r<31>;
+    .reg .f32 %f<23>;
+    .reg .b64 %rd<51>;
+    .loc 1 18 0
+$L__func_begin0:
+    .loc 1 18 0
+
+    ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8_param_6];
+    ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8_param_5];
+    ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8_param_4];
+    ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7de8_param_0];
+    ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7de8_param_1];
+$L__tmp0:
+    .loc 1 22 44
+    mov.u32 %r13, %tid.x;
+    ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7de8_param_2];
+    bfe.u32 %r14, %r13, 2, 6;
+    ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6d7de8_param_3];
+    .loc 1 24 33
+    and.b32 %r1, %r13, 3;
+    .loc 1 21 28
+    mov.u32 %r6, %ctaid.x;
+    .loc 1 21 34
+    cvt.s64.s32 %rd1, %r6;
+    .loc 1 21 46
+    mul.wide.s32 %rd27, %r6, 64;
+    cvt.u64.u32 %rd2, %r14;
+    .loc 1 22 23
+    or.b64 %rd28, %rd27, %rd2;
+    .loc 1 26 30
+    shl.b64 %rd29, %rd28, 3;
+    add.s64 %rd22, %rd26, %rd29;
+    mov.pred %p1, -1;
+    .loc 1 26 35
+    mov.u64 %rd21, 0x0;
+    @%p1 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd22 + 0 ];
+    .loc 1 27 19
+    mov.u32 %r10, 0x0;
+    @%p1 ld.global.b32 { %r10 }, [ %rd23 + 0 ];
+    .loc 1 29 19
+    mov.u32 %r11, 0x0;
+    @%p1 ld.global.b32 { %r11 }, [ %rd24 + 0 ];
+    .loc 1 38 23
+    setp.eq.s64 %p4, %rd21, -1;
+    .loc 1 39 22
+    div.full.f32 %r9, %r10, %r11;
+    mov.b32 %f6, %r9;
+    .loc 1 41 37
+    selp.f32 %f1, 0f00000000, %f6, %p4;
+    .loc 1 32 36
+    mul.wide.s32 %rd30, %r6, 12865792;
+    mul.wide.u32 %rd31, %r14, 201028;
+    add.s64 %rd32, %rd30, %rd31;
+    cvt.u64.u32 %rd33, %r13;
+    and.b64 %rd3, %rd33, 3;
+    mul.wide.u32 %rd34, %r1, 4;
+    add.s64 %rd35, %rd32, %rd34;
+    add.s64 %rd50, %rd25, %rd35;
+    mov.f32 %f22, 0f00000000;
+    mov.b32 %r29, -4;
+    mov.u64 %rd46, %rd50;
+$L__BB0_1:
+    add.s32 %r29, %r29, 4;
+    .loc 1 33 27
+    add.s32 %r17, %r29, %r1;
+    .loc 1 34 25
+    setp.lt.u32 %p5, %r17, 50257;
+    mov.b32 %r16, 0;
+    .loc 1 36 52
+    mov.u32 %r15, 0x0;
+    @%p5 ld.global.L1::evict_last.b32 { %r15 }, [ %rd46 + 0 ];
+    @!%p5 mov.u32 %r15, %r16;
+    mov.b32 %f7, %r15;
+    .loc 1 42 23
+    mul.f32 %f8, %f1, %f7;
+    .loc 1 45 40
+    selp.f32 %f9, %f8, 0f80000000, %p5;
+    add.f32 %f22, %f22, %f9;
+    .loc 1 32 36
+    add.s64 %rd46, %rd46, 16;
+    setp.lt.u32 %p7, %r29, 50253;
+    @%p7 bra $L__BB0_1;
+$L__tmp1:
+    .loc 2 243 36
+    mov.b32 %r19, %f22;
+    shfl.sync.bfly.b32 %r20, %r19, 2, 31, -1;
+    mov.b32 %f10, %r20;
+$L__tmp2:
+    .loc 2 233 15
+    add.f32 %f11, %f22, %f10;
+$L__tmp3:
+    .loc 2 243 36
+    mov.b32 %r21, %f11;
+    shfl.sync.bfly.b32 %r22, %r21, 1, 31, -1;
+    mov.b32 %f12, %r22;
+$L__tmp4:
+    .loc 2 233 15
+    add.f32 %f4, %f11, %f12;
+$L__tmp5:
+    .loc 1 51 36
+    mul.lo.s64 %rd37, %rd1, 3216448;
+    mul.lo.s64 %rd38, %rd2, 50257;
+    add.s64 %rd39, %rd37, %rd38;
+    add.s64 %rd40, %rd39, %rd3;
+    shl.b64 %rd41, %rd40, 1;
+    add.s64 %rd49, %rd20, %rd41;
+    add.s64 %rd48, %rd19, %rd41;
+    add.s64 %rd47, %rd18, %rd41;
+    mov.b32 %r30, -4;
+    mov.u16 %rs2, 0;
+$L__BB0_3:
+    add.s32 %r30, %r30, 4;
+    .loc 1 52 27
+    add.s32 %r28, %r30, %r1;
+    .loc 1 53 25
+    setp.lt.u32 %p8, %r28, 50257;
+    .loc 1 55 53
+    mov.u16 %rs1, 0x0;
+    @%p8 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd47 + 0 ];
+    @!%p8 mov.u16 %rs1, %rs2;
+    .loc 1 55 105
+    cvt.f32.bf16 %r23, %rs1;
+    mov.b32 %f15, %r23;
+    .loc 1 56 53
+    mov.u32 %r24, 0x0;
+    @%p8 ld.global.L1::evict_first.b32 { %r24 }, [ %rd50 + 0 ];
+    @!%p8 mov.u32 %r24, %r16;
+    mov.b32 %f16, %r24;
+    .loc 1 57 53
+    mov.u16 %rs4, 0x0;
+    @%p8 ld.global.L1::evict_first.b16 { %rs4 }, [ %rd48 + 0 ];
+    @!%p8 mov.u16 %rs4, %rs2;
+    .loc 1 57 105
+    cvt.f32.bf16 %r26, %rs4;
+    mov.b32 %f17, %r26;
+    .loc 1 65 23
+    mul.f32 %f14, %f17, 0f3FB8AA3B;
+    ex2.approx.f32 %f13, %f14;
+    .loc 1 66 24
+    mul.f32 %f18, %f4, %f13;
+    .loc 1 67 24
+    neg.f32 %f19, %f18;
+    fma.rn.f32 %f20, %f1, %f16, %f19;
+    .loc 1 69 24
+    add.f32 %f21, %f15, %f20;
+    .loc 1 70 54
+    mov.b32 %r27, %f21;
+    cvt.rn.bf16.f32 %rs7, %r27;
+    @%p8 st.global.b16 [ %rd49 + 0 ], { %rs7 };
+    .loc 1 51 36
+    add.s64 %rd50, %rd50, 16;
+    add.s64 %rd49, %rd49, 8;
+    add.s64 %rd48, %rd48, 8;
+    add.s64 %rd47, %rd47, 8;
+    setp.lt.u32 %p15, %r30, 50253;
+    @%p15 bra $L__BB0_3;
+    .loc 1 51 4
+    ret;
+$L__tmp6:
+$L__func_end0:
+
+}
+    .file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
+    .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+    .section .debug_abbrev
+    {
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+    }
+    .section .debug_info
+    {
+.b32 278
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 107
+.b8 122
+.b8 103
+.b8 108
+.b8 55
+.b8 116
+.b8 104
+.b8 98
+.b8 52
+.b8 120
+.b8 100
+.b8 102
+.b8 107
+.b8 102
+.b8 110
+.b8 100
+.b8 50
+.b8 116
+.b8 105
+.b8 100
+.b8 107
+.b8 115
+.b8 54
+.b8 109
+.b8 116
+.b8 53
+.b8 102
+.b8 51
+.b8 104
+.b8 97
+.b8 117
+.b8 119
+.b8 102
+.b8 121
+.b8 106
+.b8 102
+.b8 108
+.b8 98
+.b8 116
+.b8 122
+.b8 121
+.b8 101
+.b8 112
+.b8 111
+.b8 53
+.b8 111
+.b8 120
+.b8 107
+.b8 118
+.b8 104
+.b8 107
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 107
+.b8 122
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 101
+.b8 56
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 101
+.b8 56
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp4
+.b8 2
+.b8 46
+.b8 27
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp5
+.b8 2
+.b8 46
+.b8 27
+.b8 4
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp5
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 0
+.b8 0
+    }
+    .section .debug_pubnames
+    {
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 282
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 101
+.b8 56
+.b8 0
+.b32 0
+$L__pubNames_end0:
+    }
+    .section .debug_pubtypes
+    {
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 282
+.b32 0
+$L__pubTypes_end0:
+    }
+    .section .debug_loc { }
diff --git a/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttgir b/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..2507581c9ec09567723ca23b5a7385d8f18f3c79
--- /dev/null
+++ b/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttgir
@@ -0,0 +1,92 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x1xf32, #blocked>
+    %cst_0 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
+    %cst_1 = arith.constant dense<-1> : tensor<64x1xi64, #blocked>
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>
+    %c64_i64 = arith.constant 64 : i64
+    %cst_3 = arith.constant dense<50257> : tensor<1x4xi64, #blocked>
+    %c0_i32 = arith.constant 0 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c50257_i32 = arith.constant 50257 : i32
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked>
+    %0 = tt.get_program_id x : i32
+    %1 = arith.extsi %0 : i32 to i64
+    %2 = arith.muli %1, %c64_i64 : i64
+    %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
+    %5 = arith.extsi %4 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked>
+    %6 = tt.splat %2 : (i64) -> tensor<64x1xi64, #blocked>
+    %7 = arith.addi %6, %5 : tensor<64x1xi64, #blocked>
+    %8 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+    %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>
+    %10 = arith.extsi %9 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
+    %11 = tt.splat %arg1 : (!tt.ptr<i64>) -> tensor<64x1x!tt.ptr<i64>, #blocked>
+    %12 = tt.addptr %11, %7 : tensor<64x1x!tt.ptr<i64>, #blocked>, tensor<64x1xi64, #blocked>
+    %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
+    %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32>, i32
+    %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
+    %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32>, i32
+    %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
+    %18 = arith.muli %7, %cst_0 : tensor<64x1xi64, #blocked>
+    %19 = tt.broadcast %18 : (tensor<64x1xi64, #blocked>) -> tensor<64x4xi64, #blocked>
+    %20 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<64x4x!tt.ptr<f32>, #blocked>
+    %21 = arith.cmpi ne, %13, %cst_1 : tensor<64x1xi64, #blocked>
+    %22 = arith.divf %15, %17 : f32
+    %23 = tt.splat %22 : (f32) -> tensor<64x1xf32, #blocked>
+    %24 = arith.select %21, %23, %cst : tensor<64x1xi1, #blocked>, tensor<64x1xf32, #blocked>
+    %25 = tt.broadcast %24 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
+    %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c4_i32 iter_args(%arg10 = %cst_2) -> (tensor<64x4xf32, #blocked>) : i32 {
+      %33 = arith.extsi %arg9 : i32 to i64
+      %34 = tt.splat %33 : (i64) -> tensor<1x4xi64, #blocked>
+      %35 = arith.addi %34, %10 : tensor<1x4xi64, #blocked>
+      %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x4xi64, #blocked>
+      %37 = tt.broadcast %35 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
+      %38 = arith.addi %37, %19 : tensor<64x4xi64, #blocked>
+      %39 = tt.addptr %20, %38 : tensor<64x4x!tt.ptr<f32>, #blocked>, tensor<64x4xi64, #blocked>
+      %40 = tt.broadcast %36 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
+      %41 = tt.load %39, %40, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
+      %42 = arith.mulf %41, %25 : tensor<64x4xf32, #blocked>
+      %43 = arith.addf %arg10, %42 : tensor<64x4xf32, #blocked>
+      %44 = arith.select %40, %43, %arg10 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
+      scf.yield %44 : tensor<64x4xf32, #blocked>
+    }
+    %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
+    ^bb0(%arg9: f32, %arg10: f32):
+      %33 = arith.addf %arg9, %arg10 : f32
+      tt.reduce.return %33 : f32
+    }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
+    %29 = tt.splat %arg4 : (!tt.ptr<bf16>) -> tensor<64x4x!tt.ptr<bf16>, #blocked>
+    %30 = tt.splat %arg5 : (!tt.ptr<bf16>) -> tensor<64x4x!tt.ptr<bf16>, #blocked>
+    %31 = tt.broadcast %28 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
+    %32 = tt.splat %arg6 : (!tt.ptr<bf16>) -> tensor<64x4x!tt.ptr<bf16>, #blocked>
+    scf.for %arg9 = %c0_i32 to %c50257_i32 step %c4_i32 : i32 {
+      %33 = arith.extsi %arg9 : i32 to i64
+      %34 = tt.splat %33 : (i64) -> tensor<1x4xi64, #blocked>
+      %35 = arith.addi %34, %10 : tensor<1x4xi64, #blocked>
+      %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x4xi64, #blocked>
+      %37 = tt.broadcast %35 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
+      %38 = arith.addi %37, %19 : tensor<64x4xi64, #blocked>
+      %39 = tt.addptr %29, %38 : tensor<64x4x!tt.ptr<bf16>, #blocked>, tensor<64x4xi64, #blocked>
+      %40 = tt.broadcast %36 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
+      %41 = tt.load %39, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16, #blocked>
+      %42 = arith.extf %41 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked>
+      %43 = tt.addptr %20, %38 : tensor<64x4x!tt.ptr<f32>, #blocked>, tensor<64x4xi64, #blocked>
+      %44 = tt.load %43, %40, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
+      %45 = tt.addptr %30, %38 : tensor<64x4x!tt.ptr<bf16>, #blocked>, tensor<64x4xi64, #blocked>
+      %46 = tt.load %45, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16, #blocked>
+      %47 = arith.extf %46 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked>
+      %48 = arith.mulf %44, %25 : tensor<64x4xf32, #blocked>
+      %49 = math.exp %47 : tensor<64x4xf32, #blocked>
+      %50 = arith.mulf %49, %31 : tensor<64x4xf32, #blocked>
+      %51 = arith.subf %48, %50 : tensor<64x4xf32, #blocked>
+      %52 = arith.addf %42, %51 : tensor<64x4xf32, #blocked>
+      %53 = tt.addptr %32, %38 : tensor<64x4x!tt.ptr<bf16>, #blocked>, tensor<64x4xi64, #blocked>
+      %54 = arith.truncf %52 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked>
+      tt.store %53, %54, %40 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16, #blocked>
+    }
+    tt.return
+  }
+}
diff --git a/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttir b/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..9c541dda348c665152360aa9472a076897e8a3cc
--- /dev/null
+++ b/.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttir
@@ -0,0 +1,99 @@
+module {
+  tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x1xf32>
+    %c50257_i32 = arith.constant 50257 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_1 = arith.constant dense<50257> : tensor<64x1xi64>
+    %cst_2 = arith.constant dense<50257> : tensor<1x4xi64>
+    %c64_i64 = arith.constant 64 : i64
+    %cst_3 = arith.constant dense<-1> : tensor<64x1xi64>
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
+    %0 = tt.get_program_id x : i32
+    %1 = arith.extsi %0 : i32 to i64
+    %2 = arith.muli %1, %c64_i64 : i64
+    %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
+    %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
+    %5 = arith.extsi %4 : tensor<64x1xi32> to tensor<64x1xi64>
+    %6 = tt.splat %2 : (i64) -> tensor<64x1xi64>
+    %7 = arith.addi %6, %5 : tensor<64x1xi64>
+    %8 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
+    %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
+    %10 = arith.extsi %9 : tensor<1x4xi32> to tensor<1x4xi64>
+    %11 = tt.splat %arg1 : (!tt.ptr<i64>) -> tensor<64x1x!tt.ptr<i64>>
+    %12 = tt.addptr %11, %7 : tensor<64x1x!tt.ptr<i64>>, tensor<64x1xi64>
+    %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
+    %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32>, i32
+    %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
+    %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32>, i32
+    %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
+    %18 = arith.muli %7, %cst_1 : tensor<64x1xi64>
+    %19 = tt.broadcast %18 : (tensor<64x1xi64>) -> tensor<64x4xi64>
+    %20 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<64x4x!tt.ptr<f32>>
+    %21 = arith.cmpi ne, %13, %cst_3 : tensor<64x1xi64>
+    %22 = arith.divf %15, %17 : f32
+    %23 = tt.splat %22 : (f32) -> tensor<64x1xf32>
+    %24 = arith.select %21, %23, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32>
+    %25 = tt.broadcast %24 : (tensor<64x1xf32>) -> tensor<64x4xf32>
+    %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c4_i32 iter_args(%arg10 = %cst_4) -> (tensor<64x4xf32>) : i32 {
+      %41 = arith.extsi %arg9 : i32 to i64
+      %42 = tt.splat %41 : (i64) -> tensor<1x4xi64>
+      %43 = arith.addi %42, %10 : tensor<1x4xi64>
+      %44 = arith.cmpi slt, %43, %cst_2 : tensor<1x4xi64>
+      %45 = tt.broadcast %43 : (tensor<1x4xi64>) -> tensor<64x4xi64>
+      %46 = arith.addi %45, %19 : tensor<64x4xi64>
+      %47 = tt.addptr %20, %46 : tensor<64x4x!tt.ptr<f32>>, tensor<64x4xi64>
+      %48 = tt.broadcast %44 : (tensor<1x4xi1>) -> tensor<64x4xi1>
+      %49 = tt.load %47, %48, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
+      %50 = arith.mulf %49, %25 : tensor<64x4xf32>
+      %51 = arith.addf %arg10, %50 : tensor<64x4xf32>
+      %52 = arith.select %48, %51, %arg10 : tensor<64x4xi1>, tensor<64x4xf32>
+      scf.yield %52 : tensor<64x4xf32>
+    }
+    %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
+    ^bb0(%arg9: f32, %arg10: f32):
+      %41 = arith.addf %arg9, %arg10 : f32
+      tt.reduce.return %41 : f32
+    }) : (tensor<64x4xf32>) -> tensor<64xf32>
+    %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
+    %29 = arith.muli %7, %cst_1 : tensor<64x1xi64>
+    %30 = tt.broadcast %29 : (tensor<64x1xi64>) -> tensor<64x4xi64>
+    %31 = tt.splat %arg4 : (!tt.ptr<bf16>) -> tensor<64x4x!tt.ptr<bf16>>
+    %32 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<64x4x!tt.ptr<f32>>
+    %33 = tt.splat %arg5 : (!tt.ptr<bf16>) -> tensor<64x4x!tt.ptr<bf16>>
+    %34 = arith.cmpi ne, %13, %cst_3 : tensor<64x1xi64>
+    %35 = arith.divf %15, %17 : f32
+    %36 = tt.splat %35 : (f32) -> tensor<64x1xf32>
+    %37 = arith.select %34, %36, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32>
+    %38 = tt.broadcast %37 : (tensor<64x1xf32>) -> tensor<64x4xf32>
+    %39 = tt.broadcast %28 : (tensor<64x1xf32>) -> tensor<64x4xf32>
+    %40 = tt.splat %arg6 : (!tt.ptr<bf16>) -> tensor<64x4x!tt.ptr<bf16>>
+    scf.for %arg9 = %c0_i32 to %c50257_i32 step %c4_i32 : i32 {
+      %41 = arith.extsi %arg9 : i32 to i64
+      %42 = tt.splat %41 : (i64) -> tensor<1x4xi64>
+      %43 = arith.addi %42, %10 : tensor<1x4xi64>
+      %44 = arith.cmpi slt, %43, %cst_2 : tensor<1x4xi64>
+      %45 = tt.broadcast %43 : (tensor<1x4xi64>) -> tensor<64x4xi64>
+      %46 = arith.addi %45, %30 : tensor<64x4xi64>
+      %47 = tt.addptr %31, %46 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi64>
+      %48 = tt.broadcast %44 : (tensor<1x4xi1>) -> tensor<64x4xi1>
+      %49 = tt.load %47, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16>
+      %50 = arith.extf %49 : tensor<64x4xbf16> to tensor<64x4xf32>
+      %51 = tt.addptr %32, %46 : tensor<64x4x!tt.ptr<f32>>, tensor<64x4xi64>
+      %52 = tt.load %51, %48, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
+      %53 = tt.addptr %33, %46 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi64>
+      %54 = tt.load %53, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16>
+      %55 = arith.extf %54 : tensor<64x4xbf16> to tensor<64x4xf32>
+      %56 = arith.mulf %52, %38 : tensor<64x4xf32>
+      %57 = math.exp %55 : tensor<64x4xf32>
+      %58 = arith.mulf %57, %39 : tensor<64x4xf32>
+      %59 = arith.subf %56, %58 : tensor<64x4xf32>
+      %60 = arith.addf %50, %59 : tensor<64x4xf32>
+      %61 = tt.addptr %40, %46 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi64>
+      %62 = arith.truncf %60 : tensor<64x4xf32> to tensor<64x4xbf16>
+      tt.store %61, %62, %48 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16>
+    }
+    tt.return
+  }
+}
diff --git a/.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttir b/.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..74dfc2d7f11a2890b307d525c3b85e5ab664b620
--- /dev/null
+++ b/.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttir
@@ -0,0 +1,99 @@
+module {
+  tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<8x512xbf16>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<8x1xf32>
+    %c50257_i32 = arith.constant 50257 : i32
+    %c512_i32 = arith.constant 512 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_1 = arith.constant dense<50257> : tensor<8x1xi64>
+    %cst_2 = arith.constant dense<50257> : tensor<1x512xi64>
+    %c8_i64 = arith.constant 8 : i64
+    %cst_3 = arith.constant dense<-1> : tensor<8x1xi64>
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<8x512xf32>
+    %0 = tt.get_program_id x : i32
+    %1 = arith.extsi %0 : i32 to i64
+    %2 = arith.muli %1, %c8_i64 : i64
+    %3 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
+    %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<8xi32>) -> tensor<8x1xi32>
+    %5 = arith.extsi %4 : tensor<8x1xi32> to tensor<8x1xi64>
+    %6 = tt.splat %2 : (i64) -> tensor<8x1xi64>
+    %7 = arith.addi %6, %5 : tensor<8x1xi64>
+    %8 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
+    %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<512xi32>) -> tensor<1x512xi32>
+    %10 = arith.extsi %9 : tensor<1x512xi32> to tensor<1x512xi64>
+    %11 = tt.splat %arg1 : (!tt.ptr<i64>) -> tensor<8x1x!tt.ptr<i64>>
+    %12 = tt.addptr %11, %7 : tensor<8x1x!tt.ptr<i64>>, tensor<8x1xi64>
+    %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x1xi64>
+    %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32>, i32
+    %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
+    %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32>, i32
+    %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
+    %18 = arith.muli %7, %cst_1 : tensor<8x1xi64>
+    %19 = tt.broadcast %18 : (tensor<8x1xi64>) -> tensor<8x512xi64>
+    %20 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<8x512x!tt.ptr<f32>>
+    %21 = arith.cmpi ne, %13, %cst_3 : tensor<8x1xi64>
+    %22 = arith.divf %15, %17 : f32
+    %23 = tt.splat %22 : (f32) -> tensor<8x1xf32>
+    %24 = arith.select %21, %23, %cst_0 : tensor<8x1xi1>, tensor<8x1xf32>
+    %25 = tt.broadcast %24 : (tensor<8x1xf32>) -> tensor<8x512xf32>
+    %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c512_i32 iter_args(%arg10 = %cst_4) -> (tensor<8x512xf32>) : i32 {
+      %41 = arith.extsi %arg9 : i32 to i64
+      %42 = tt.splat %41 : (i64) -> tensor<1x512xi64>
+      %43 = arith.addi %42, %10 : tensor<1x512xi64>
+      %44 = arith.cmpi slt, %43, %cst_2 : tensor<1x512xi64>
+      %45 = tt.broadcast %43 : (tensor<1x512xi64>) -> tensor<8x512xi64>
+      %46 = arith.addi %45, %19 : tensor<8x512xi64>
+      %47 = tt.addptr %20, %46 : tensor<8x512x!tt.ptr<f32>>, tensor<8x512xi64>
+      %48 = tt.broadcast %44 : (tensor<1x512xi1>) -> tensor<8x512xi1>
+      %49 = tt.load %47, %48, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x512xf32>
+      %50 = arith.mulf %49, %25 : tensor<8x512xf32>
+      %51 = arith.addf %arg10, %50 : tensor<8x512xf32>
+      %52 = arith.select %48, %51, %arg10 : tensor<8x512xi1>, tensor<8x512xf32>
+      scf.yield %52 : tensor<8x512xf32>
+    }
+    %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
+    ^bb0(%arg9: f32, %arg10: f32):
+      %41 = arith.addf %arg9, %arg10 : f32
+      tt.reduce.return %41 : f32
+    }) : (tensor<8x512xf32>) -> tensor<8xf32>
+    %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<8xf32>) -> tensor<8x1xf32>
+    %29 = arith.muli %7, %cst_1 : tensor<8x1xi64>
+    %30 = tt.broadcast %29 : (tensor<8x1xi64>) -> tensor<8x512xi64>
+    %31 = tt.splat %arg4 : (!tt.ptr<bf16>) -> tensor<8x512x!tt.ptr<bf16>>
+    %32 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<8x512x!tt.ptr<f32>>
+    %33 = tt.splat %arg5 : (!tt.ptr<bf16>) -> tensor<8x512x!tt.ptr<bf16>>
+    %34 = arith.cmpi ne, %13, %cst_3 : tensor<8x1xi64>
+    %35 = arith.divf %15, %17 : f32
+    %36 = tt.splat %35 : (f32) -> tensor<8x1xf32>
+    %37 = arith.select %34, %36, %cst_0 : tensor<8x1xi1>, tensor<8x1xf32>
+    %38 = tt.broadcast %37 : (tensor<8x1xf32>) -> tensor<8x512xf32>
+    %39 = tt.broadcast %28 : (tensor<8x1xf32>) -> tensor<8x512xf32>
+    %40 = tt.splat %arg6 : (!tt.ptr<bf16>) -> tensor<8x512x!tt.ptr<bf16>>
+    scf.for %arg9 = %c0_i32 to %c50257_i32 step %c512_i32 : i32 {
+      %41 = arith.extsi %arg9 : i32 to i64
+      %42 = tt.splat %41 : (i64) -> tensor<1x512xi64>
+      %43 = arith.addi %42, %10 : tensor<1x512xi64>
+      %44 = arith.cmpi slt, %43, %cst_2 : tensor<1x512xi64>
+      %45 = tt.broadcast %43 : (tensor<1x512xi64>) -> tensor<8x512xi64>
+      %46 = arith.addi %45, %30 : tensor<8x512xi64>
+      %47 = tt.addptr %31, %46 : tensor<8x512x!tt.ptr<bf16>>, tensor<8x512xi64>
+      %48 = tt.broadcast %44 : (tensor<1x512xi1>) -> tensor<8x512xi1>
+      %49 = tt.load %47, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xbf16>
+      %50 = arith.extf %49 : tensor<8x512xbf16> to tensor<8x512xf32>
+      %51 = tt.addptr %32, %46 : tensor<8x512x!tt.ptr<f32>>, tensor<8x512xi64>
+      %52 = tt.load %51, %48, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xf32>
+      %53 = tt.addptr %33, %46 : tensor<8x512x!tt.ptr<bf16>>, tensor<8x512xi64>
+      %54 = tt.load %53, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xbf16>
+      %55 = arith.extf %54 : tensor<8x512xbf16> to tensor<8x512xf32>
+      %56 = arith.mulf %52, %38 : tensor<8x512xf32>
+      %57 = math.exp %55 : tensor<8x512xf32>
+      %58 = arith.mulf %57, %39 : tensor<8x512xf32>
+      %59 = arith.subf %56, %58 : tensor<8x512xf32>
+      %60 = arith.addf %50, %59 : tensor<8x512xf32>
+      %61 = tt.addptr %40, %46 : tensor<8x512x!tt.ptr<bf16>>, tensor<8x512xi64>
+      %62 = arith.truncf %60 : tensor<8x512xf32> to tensor<8x512xbf16>
+      tt.store %61, %62, %48 {cache = 1 : i32, evict = 1 : i32} : tensor<8x512xbf16>
+    }
+    tt.return
+  }
+}
diff --git a/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin b/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin
new file mode 100644
0000000000000000000000000000000000000000..a41f85bf446fd199b0ca8ee65d4f48219156278b Binary files /dev/null and b/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin differ diff --git a/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.llir b/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..5ce5384541c607020921ddb3f7450a59734695c4 --- /dev/null +++ b/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.llir @@ -0,0 +1,43 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 { + %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %4 = shl i32 %3, 1, !dbg !8 + %5 = and i32 %4, 510, !dbg !8 + %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9 + %7 = shl i32 %6, 9, !dbg !10 + %8 = or i32 %7, %5, !dbg !11 + %9 = icmp slt i32 %8, 12865792, !dbg !12 + %10 = sext i32 %8 to i64, !dbg !13 + %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !13 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 0, i32 0, ptr addrspace(1) %11, i1 %9) #1, !dbg !14 + ret void, !dbg !15 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py", directory: "/tmp/torchinductor_root/4y") +!3 = !{ptr @triton__0d1de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1de, !"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 36, scope: !5) +!9 = !DILocation(line: 20, column: 28, scope: !5) +!10 = !DILocation(line: 20, column: 33, scope: !5) +!11 = !DILocation(line: 21, column: 23, scope: !5) +!12 = !DILocation(line: 22, column: 21, scope: !5) +!13 = !DILocation(line: 25, column: 25, scope: !5) +!14 = !DILocation(line: 25, column: 36, scope: !5) +!15 = !DILocation(line: 25, column: 4, scope: !5) diff --git a/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ptx b/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..60dc5c09b9b0527f6e98d65a2aefc0d9d18712a3 --- /dev/null +++ b/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ptx @@ -0,0 +1,278 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1de + +.visible .entry triton__0d1de( + .param .u64 triton__0d1de_param_0, + .param .u32 triton__0d1de_param_1 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<2>; + .reg .b32 %r<9>; + .reg .b64 %rd<4>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd2, [triton__0d1de_param_0]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 
%r6, %r5, 510; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r7, %r1, 9; + .loc 1 21 23 + or.b32 %r8, %r7, %r6; + .loc 1 22 21 + setp.lt.s32 %p1, %r8, 12865792; + .loc 1 25 25 + mul.wide.s32 %rd3, %r8, 4; + add.s64 %rd1, %rd2, %rd3; + mov.b32 %r2, 0; + .loc 1 25 36 + @%p1 st.global.v2.b32 [ %rd1 + 0 ], { %r2, %r2 }; + .loc 1 25 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/4y/c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 172 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 52 +.b8 121 +.b8 115 +.b8 101 +.b8 108 +.b8 100 +.b8 119 +.b8 109 +.b8 117 +.b8 51 +.b8 116 +.b8 111 +.b8 53 +.b8 50 +.b8 112 +.b8 98 +.b8 104 +.b8 50 +.b8 109 +.b8 100 +.b8 50 +.b8 111 +.b8 101 +.b8 117 +.b8 102 +.b8 114 +.b8 113 +.b8 51 +.b8 102 +.b8 99 +.b8 100 +.b8 109 +.b8 97 +.b8 112 +.b8 107 +.b8 116 +.b8 52 +.b8 110 +.b8 120 +.b8 100 +.b8 122 +.b8 109 +.b8 121 +.b8 113 +.b8 116 +.b8 103 +.b8 100 +.b8 50 +.b8 121 +.b8 115 +.b8 112 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 52 +.b8 121 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 176 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 176 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttgir b/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..264bcb3e46ad8faf7ec57d76cebbee0e3d6c9ab8 --- /dev/null +++ b/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttgir @@ -0,0 +1,18 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<12865792> : 
tensor<512xi32, #blocked> + %c512_i32 = arith.constant 512 : i32 + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> + %5 = arith.cmpi slt, %4, %cst : tensor<512xi32, #blocked> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + tt.store %7, %cst_0, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttir b/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a2b3493da23a3f13c51a60229f3dc4aeebbbfee0 --- /dev/null +++ b/.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttir @@ -0,0 +1,17 @@ +module { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<512xf32> + %cst_0 = arith.constant dense<12865792> : tensor<512xi32> + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> + %3 = tt.splat %1 : (i32) -> tensor<512xi32> + %4 = arith.addi %3, %2 : tensor<512xi32> + %5 = arith.cmpi slt, %4, %cst_0 : tensor<512xi32> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr> + %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + tt.store %7, %cst, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32> + tt.return + } +} diff --git a/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.cubin b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a3e8322ca46a193b593712ce59ff85743c529845 Binary files /dev/null and b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.cubin differ diff --git a/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.llir b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..6aac9921ae9465e5100834d6f6563760f8bddbff --- /dev/null +++ b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.llir @@ -0,0 +1,296 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3d4d5d6d7de8de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8) local_unnamed_addr !dbg !5 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %11 = and i32 %10, 31, !dbg !8 + %12 = lshr i32 %10, 5, !dbg !8 + %13 = and i32 %12, 1, !dbg !8 + %urem = shl i32 %10, 2, !dbg !8 + %14 = and i32 %urem, 252, !dbg !8 + %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9 + %16 = shl i32 %15, 8, !dbg !10 + %17 = or i32 %16, %14, !dbg !11 + %18 = sext i32 %17 to i64, !dbg !12 + %19 = getelementptr i16, ptr addrspace(1) %1, i64 %18, !dbg !12 + %20 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, 
$1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %19, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13 + %21 = extractvalue { i32, i32 } %20, 0, !dbg !13 + %22 = extractvalue { i32, i32 } %20, 1, !dbg !13 + %23 = trunc i32 %21 to i16, !dbg !13 + %extelt.offset = lshr i32 %21, 16, !dbg !13 + %24 = trunc i32 %extelt.offset to i16, !dbg !13 + %25 = trunc i32 %22 to i16, !dbg !13 + %extelt.offset1 = lshr i32 %22, 16, !dbg !13 + %26 = trunc i32 %extelt.offset1 to i16, !dbg !13 + %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #3, !dbg !14 + %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14 + %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14 + %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %26) #3, !dbg !14 + %31 = zext nneg i32 %14 to i64, !dbg !15 + %32 = getelementptr float, ptr addrspace(1) %2, i64 %31, !dbg !15 + %33 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %32, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %34 = extractvalue { i32, i32, i32, i32 } %33, 0, !dbg !16 + %35 = extractvalue { i32, i32, i32, i32 } %33, 1, !dbg !16 + %36 = extractvalue { i32, i32, i32, i32 } %33, 2, !dbg !16 + %37 = extractvalue { i32, i32, i32, i32 } %33, 3, !dbg !16 + %38 = bitcast i32 %34 to float, !dbg !16 + %39 = bitcast i32 %35 to float, !dbg !16 + %40 = bitcast i32 %36 to float, !dbg !16 + %41 = bitcast i32 %37 to float, !dbg !16 + %42 = getelementptr float, ptr addrspace(1) %3, i64 %18, !dbg !17 + %43 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18 + %44 = extractvalue { i32, i32, i32, i32 } %43, 0, !dbg !18 + %45 = extractvalue { i32, i32, i32, i32 } %43, 1, !dbg !18 + %46 = extractvalue { i32, i32, i32, i32 } %43, 2, !dbg !18 + %47 = extractvalue { i32, i32, i32, i32 } %43, 3, !dbg !18 + %48 = bitcast i32 %44 to float, !dbg !18 + %49 = bitcast i32 %45 to float, !dbg !18 + %50 = bitcast i32 %46 to float, !dbg !18 + %51 = bitcast i32 %47 to float, !dbg !18 + %52 = sext i32 %15 to i64, !dbg !19 + %53 = getelementptr float, ptr addrspace(1) %4, i64 %52, !dbg !19 + %54 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20 + %55 = bitcast i32 %54 to float, !dbg !20 + %56 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20 + %57 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20 + %58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 
];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20 + %59 = getelementptr float, ptr addrspace(1) %5, i64 %52, !dbg !21 + %60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22 + %61 = bitcast i32 %60 to float, !dbg !22 + %62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22 + %63 = bitcast i32 %62 to float, !dbg !22 + %64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22 + %65 = bitcast i32 %64 to float, !dbg !22 + %66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22 + %67 = bitcast i32 %66 to float, !dbg !22 + %68 = getelementptr float, ptr addrspace(1) %0, i64 %18, !dbg !23 + %69 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %68, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !24 + %70 = extractvalue { i32, i32, i32, i32 } %69, 0, !dbg !24 + %71 = extractvalue { i32, i32, i32, i32 } %69, 1, !dbg !24 + %72 = extractvalue { i32, i32, i32, i32 } %69, 2, !dbg !24 + %73 = extractvalue { i32, i32, i32, i32 } %69, 3, !dbg !24 + %74 = bitcast i32 %70 to float, !dbg !24 + %75 = bitcast i32 %71 to float, !dbg !24 + %76 = bitcast i32 %72 to float, !dbg !24 + %77 = bitcast i32 %73 to float, !dbg !24 + %78 = fmul float %27, %38, !dbg !25 + %79 = fmul float %28, %39, !dbg !25 + %80 = fmul float %29, %40, !dbg !25 + %81 = fmul float %30, %41, !dbg !25 + %82 = fadd float %78, %79, !dbg !26 + %83 = fadd float %80, %82, !dbg !26 + %84 = fadd float %81, %83, !dbg !26 + %85 = bitcast float %84 to i32, !dbg !32 + %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 16, i32 31), !dbg !32 + %87 = bitcast i32 %86 to float, !dbg !32 + %88 = fadd float %84, %87, !dbg !26 + %89 = bitcast float %88 to i32, !dbg !32 + %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 8, i32 31), !dbg !32 + %91 = bitcast i32 %90 to float, !dbg !32 + %92 = fadd float %88, %91, !dbg !26 + %93 = bitcast float %92 to i32, !dbg !32 + %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 4, i32 31), !dbg !32 + %95 = bitcast i32 %94 to float, !dbg !32 + %96 = fadd float %92, %95, !dbg !26 + %97 = bitcast float %96 to i32, !dbg !32 + %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 2, i32 31), !dbg !32 + %99 = bitcast i32 %98 to float, !dbg !32 + %100 = fadd float %96, %99, !dbg !26 + %101 = bitcast float %100 to i32, !dbg !32 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 1, i32 31), !dbg !32 + %103 = bitcast i32 %102 to float, !dbg !32 + %104 = fadd float %100, %103, !dbg !26 + %105 = icmp eq i32 %11, 0, !dbg !32 + %106 = zext nneg i32 %13 to i64, !dbg !32 + %107 = getelementptr float, ptr addrspace(3) @global_smem, i64 %106, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, float %104, i1 %105) 
#3, !dbg !32 + tail call void @llvm.nvvm.barrier0(), !dbg !32 + %108 = icmp slt i32 %10, 2, !dbg !32 + %109 = sext i32 %10 to i64, !dbg !32 + %110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !32 + %111 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %110, i1 %108) #3, !dbg !32 + %112 = bitcast float %111 to i32, !dbg !32 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !32 + %114 = bitcast i32 %113 to float, !dbg !32 + %115 = fadd float %111, %114, !dbg !26 + %116 = and i32 %10, 1, !dbg !32 + %117 = icmp eq i32 %116, 0, !dbg !32 + %118 = and i1 %108, %117, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %115, i1 %118) #3, !dbg !32 + tail call void @llvm.nvvm.barrier0(), !dbg !32 + %119 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32 + %120 = fadd float %119, 0.000000e+00, !dbg !34 + %121 = fsub float %48, %55, !dbg !38 + %122 = fsub float %49, %55, !dbg !38 + %123 = fsub float %50, %55, !dbg !38 + %124 = fsub float %51, %55, !dbg !38 + %125 = fmul float %121, %61, !dbg !39 + %126 = fmul float %122, %61, !dbg !39 + %127 = fmul float %123, %61, !dbg !39 + %128 = fmul float %124, %61, !dbg !39 + %129 = fmul float %78, %125, !dbg !40 + %130 = fmul float %79, %126, !dbg !40 + %131 = fmul float %80, %127, !dbg !40 + %132 = fmul float %81, %128, !dbg !40 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %133 = fadd float %129, %130, !dbg !43 + %134 = fadd float %131, %133, !dbg !43 + %135 = fadd float %132, %134, !dbg !43 + %136 = bitcast float %135 to i32, !dbg !41 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 16, i32 31), !dbg !41 + %138 = bitcast i32 %137 to float, !dbg !41 + %139 = fadd float %135, %138, !dbg !43 + %140 = bitcast float %139 to i32, !dbg !41 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !41 + %142 = bitcast i32 %141 to float, !dbg !41 + %143 = fadd float %139, %142, !dbg !43 + %144 = bitcast float %143 to i32, !dbg !41 + %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !41 + %146 = bitcast i32 %145 to float, !dbg !41 + %147 = fadd float %143, %146, !dbg !43 + %148 = bitcast float %147 to i32, !dbg !41 + %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 2, i32 31), !dbg !41 + %150 = bitcast i32 %149 to float, !dbg !41 + %151 = fadd float %147, %150, !dbg !43 + %152 = bitcast float %151 to i32, !dbg !41 + %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !41 + %154 = bitcast i32 %153 to float, !dbg !41 + %155 = fadd float %151, %154, !dbg !43 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, float %155, i1 %105) #3, !dbg !41 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %156 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %110, i1 %108) #3, !dbg !41 + %157 = bitcast float %156 to i32, !dbg !41 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !41 + %159 = bitcast i32 %158 to float, !dbg !41 + %160 = fadd float %156, %159, !dbg !43 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %160, i1 %118) #3, !dbg !41 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %161 = load float, ptr addrspace(3) @global_smem, 
align 4, !dbg !41 + %162 = fadd float %161, 0.000000e+00, !dbg !46 + %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %61, float 2.560000e+02) #3, !dbg !48 + %164 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %63, float 2.560000e+02) #3, !dbg !48 + %165 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %65, float 2.560000e+02) #3, !dbg !48 + %166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %67, float 2.560000e+02) #3, !dbg !48 + %167 = fmul float %78, 2.560000e+02, !dbg !49 + %168 = fmul float %79, 2.560000e+02, !dbg !49 + %169 = fmul float %80, 2.560000e+02, !dbg !49 + %170 = fmul float %81, 2.560000e+02, !dbg !49 + %171 = fsub float %167, %120, !dbg !50 + %172 = fsub float %168, %120, !dbg !50 + %173 = fsub float %169, %120, !dbg !50 + %174 = fsub float %170, %120, !dbg !50 + %175 = fmul float %125, %162, !dbg !51 + %176 = fmul float %126, %162, !dbg !51 + %177 = fmul float %127, %162, !dbg !51 + %178 = fmul float %128, %162, !dbg !51 + %179 = fsub float %171, %175, !dbg !52 + %180 = fsub float %172, %176, !dbg !52 + %181 = fsub float %173, %177, !dbg !52 + %182 = fsub float %174, %178, !dbg !52 + %183 = fmul float %163, %179, !dbg !53 + %184 = fmul float %163, %180, !dbg !53 + %185 = fmul float %163, %181, !dbg !53 + %186 = fmul float %163, %182, !dbg !53 + %187 = fadd float %183, %74, !dbg !54 + %188 = fadd float %184, %75, !dbg !54 + %189 = fadd float %185, %76, !dbg !54 + %190 = fadd float %186, %77, !dbg !54 + %191 = bitcast float %187 to i32, !dbg !55 + %192 = bitcast float %188 to i32, !dbg !55 + %193 = bitcast float %189 to i32, !dbg !55 + %194 = bitcast float %190 to i32, !dbg !55 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %191, i32 %192, i32 %193, i32 %194, ptr addrspace(1) %68, i1 true) #3, !dbg !55 + %195 = getelementptr i16, ptr addrspace(1) %6, i64 %18, !dbg !56 + %196 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %187) #3, !dbg !57 + %197 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %188) #3, !dbg !57 + %198 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %189) #3, !dbg !57 + %199 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %190) #3, !dbg !57 + %200 = insertelement <2 x i16> undef, i16 %196, i64 0, !dbg !57 + %201 = insertelement <2 x i16> %200, i16 %197, i64 1, !dbg !57 + %202 = bitcast <2 x i16> %201 to i32, !dbg !57 + %203 = insertelement <2 x i16> undef, i16 %198, i64 0, !dbg !57 + %204 = insertelement <2 x i16> %203, i16 %199, i64 1, !dbg !57 + %205 = bitcast <2 x i16> %204 to i32, !dbg !57 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %202, i32 %205, ptr addrspace(1) %195, i1 true) #3, !dbg !57 + ret void, !dbg !58 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} 
+!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py", directory: "/tmp/torchinductor_root/sn") +!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"maxntidx", i32 64} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8de", linkageName: "triton__0d1d2d3d4d5d6d7de8de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 26, column: 26, scope: !5) +!9 = !DILocation(line: 23, column: 28, scope: !5) +!10 = !DILocation(line: 30, column: 40, scope: !5) +!11 = !DILocation(line: 30, column: 36, scope: !5) +!12 = !DILocation(line: 30, column: 30, scope: !5) +!13 = !DILocation(line: 30, column: 46, scope: !5) +!14 = !DILocation(line: 30, column: 67, scope: !5) +!15 = !DILocation(line: 31, column: 30, scope: !5) +!16 = !DILocation(line: 31, column: 35, scope: !5) +!17 = !DILocation(line: 32, column: 30, scope: !5) +!18 = !DILocation(line: 32, column: 46, scope: !5) +!19 = !DILocation(line: 33, column: 30, scope: !5) +!20 = !DILocation(line: 33, column: 35, scope: !5) +!21 = !DILocation(line: 34, column: 31, scope: !5) +!22 = !DILocation(line: 34, column: 36, scope: !5) +!23 = !DILocation(line: 35, column: 35, scope: !5) +!24 = !DILocation(line: 35, column: 51, scope: !5) +!25 = !DILocation(line: 37, column: 18, scope: !5) +!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30) +!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0) +!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!29 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0) +!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31) +!31 = !DILocation(line: 40, column: 57, scope: !27) +!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33) +!33 = !DILocation(line: 40, column: 57, scope: !29) +!34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37) +!35 = distinct !DILexicalBlockFile(scope: !5, file: !36, discriminator: 0) +!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!37 = !DILocation(line: 40, column: 44, scope: !35) +!38 = !DILocation(line: 41, column: 19, scope: !5) +!39 = !DILocation(line: 42, column: 20, scope: !5) +!40 = !DILocation(line: 43, column: 19, scope: !5) +!41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42) +!42 = !DILocation(line: 46, column: 59, scope: !29) +!43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44) +!44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45) +!45 = !DILocation(line: 46, column: 59, scope: !27) +!46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47) +!47 = !DILocation(line: 46, column: 45, scope: !35) +!48 = !DILocation(line: 48, column: 20, scope: !5) +!49 = !DILocation(line: 49, column: 19, scope: !5) +!50 = !DILocation(line: 50, column: 20, scope: !5) +!51 = !DILocation(line: 51, column: 20, scope: !5) +!52 = !DILocation(line: 52, column: 20, scope: !5) +!53 = !DILocation(line: 53, column: 20, scope: !5) +!54 = !DILocation(line: 54, 
column: 20, scope: !5) +!55 = !DILocation(line: 56, column: 51, scope: !5) +!56 = !DILocation(line: 57, column: 25, scope: !5) +!57 = !DILocation(line: 57, column: 48, scope: !5) +!58 = !DILocation(line: 57, column: 4, scope: !5) diff --git a/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..7a6024757479d55932884faecaac8e7c2a0102b4 --- /dev/null +++ b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx @@ -0,0 +1,743 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7de8de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3d4d5d6d7de8de( + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_6, + .param .u32 triton__0d1d2d3d4d5d6d7de8de_param_7, + .param .u32 triton__0d1d2d3d4d5d6d7de8de_param_8 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<37>; + .reg .b16 %rs<9>; + .reg .b32 %r<110>; + .reg .f32 %f<86>; + .reg .b64 %rd<26>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8de_param_0]; + ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r76, %tid.x; + and.b32 %r77, %r76, 31; + ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8de_param_2]; + ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8de_param_3]; + ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8de_param_4]; + shl.b32 %r78, %r76, 2; + ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8de_param_5]; + and.b32 %r79, %r78, 252; + ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7de8de_param_6]; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r80, %r1, 8; + .loc 1 30 36 + or.b32 %r81, %r80, %r79; + .loc 1 30 30 + mul.wide.s32 %rd22, %r81, 2; + add.s64 %rd1, %rd16, %rd22; + mov.b32 %r4, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r4; + @!%p1 mov.u32 %r3, %r4; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + cvt.u16.u32 %rs3, %r3; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } + .loc 1 30 67 + cvt.f32.bf16 %r6, %rs1; + mov.b32 %f1, %r6; + cvt.f32.bf16 %r7, %rs2; + mov.b32 %f2, %r7; + cvt.f32.bf16 %r8, %rs3; + mov.b32 %f3, %r8; + cvt.f32.bf16 %r9, %rs4; + mov.b32 %f4, %r9; + .loc 1 31 30 + mul.wide.u32 %rd23, %r79, 4; + add.s64 %rd2, %rd17, %rd23; + .loc 1 31 35 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r4; + @!%p1 mov.u32 %r11, %r4; + @!%p1 mov.u32 %r12, %r4; + @!%p1 mov.u32 %r13, %r4; + mov.b32 %f5, %r10; + mov.b32 %f6, %r11; + mov.b32 %f7, %r12; + mov.b32 %f8, %r13; + .loc 1 32 30 + mul.wide.s32 %rd24, %r81, 4; + add.s64 %rd3, %rd18, %rd24; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + @%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r4; + @!%p1 mov.u32 %r19, %r4; + @!%p1 mov.u32 %r20, %r4; + @!%p1 mov.u32 %r21, %r4; + mov.b32 %f9, 
%r18; + mov.b32 %f10, %r19; + mov.b32 %f11, %r20; + mov.b32 %f12, %r21; + .loc 1 33 30 + mul.wide.s32 %rd25, %r1, 4; + add.s64 %rd4, %rd19, %rd25; + .loc 1 33 35 + mov.u32 %r26, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ]; + mov.b32 %f13, %r26; + mov.u32 %r27, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ]; + mov.u32 %r28, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ]; + mov.u32 %r29, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ]; + .loc 1 34 31 + add.s64 %rd8, %rd20, %rd25; + .loc 1 34 36 + mov.u32 %r55, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r55 }, [ %rd8 + 0 ]; + mov.b32 %f14, %r55; + mov.u32 %r31, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ]; + mov.u32 %r32, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ]; + mov.u32 %r33, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ]; + .loc 1 35 35 + add.s64 %rd12, %rd15, %rd24; + .loc 1 35 51 + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + mov.u32 %r37, 0x0; + @%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd12 + 0 ]; + @!%p1 mov.u32 %r34, %r4; + @!%p1 mov.u32 %r35, %r4; + @!%p1 mov.u32 %r36, %r4; + @!%p1 mov.u32 %r37, %r4; + mov.b32 %f15, %r34; + mov.b32 %f16, %r35; + mov.b32 %f17, %r36; + mov.b32 %f18, %r37; + .loc 1 37 18 + mul.f32 %f19, %f1, %f5; + mul.f32 %f20, %f2, %f6; + mul.f32 %f21, %f3, %f7; + mul.f32 %f22, %f4, %f8; +$L__tmp1: + .loc 2 233 15 + fma.rn.f32 %f23, %f1, %f5, %f20; + fma.rn.f32 %f24, %f3, %f7, %f23; + fma.rn.f32 %f25, %f4, %f8, %f24; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r82, %f25; + shfl.sync.bfly.b32 %r83, %r82, 16, 31, -1; + mov.b32 %f26, %r83; +$L__tmp3: + .loc 2 233 15 + add.f32 %f27, %f25, %f26; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r84, %f27; + shfl.sync.bfly.b32 %r85, %r84, 8, 31, -1; + mov.b32 %f28, %r85; +$L__tmp5: + .loc 2 233 15 + add.f32 %f29, %f27, %f28; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r86, %f29; + shfl.sync.bfly.b32 %r87, %r86, 4, 31, -1; + mov.b32 %f30, %r87; +$L__tmp7: + .loc 2 233 15 + add.f32 %f31, %f29, %f30; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r88, %f31; + shfl.sync.bfly.b32 %r89, %r88, 2, 31, -1; + mov.b32 %f32, %r89; +$L__tmp9: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r90, %f33; + shfl.sync.bfly.b32 %r91, %r90, 1, 31, -1; + mov.b32 %f34, %r91; +$L__tmp11: + .loc 2 233 15 + add.f32 %f35, %f33, %f34; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p27, %r77, 0; + shr.u32 %r92, %r76, 3; + and.b32 %r93, %r92, 4; + mov.u32 %r94, global_smem; + add.s32 %r42, %r94, %r93; + mov.b32 %r43, %f35; + @%p27 st.shared.b32 [ %r42 + 0 ], %r43; + bar.sync 0; + setp.lt.s32 %p28, %r76, 2; + add.s32 %r45, %r94, %r78; + @%p28 ld.shared.b32 %r44, [ %r45 + 0 ]; + mov.b32 %f36, %r44; + shfl.sync.bfly.b32 %r95, %r44, 1, 31, -1; + mov.b32 %f37, %r95; +$L__tmp13: + .loc 2 233 15 + add.f32 %f38, %f36, %f37; +$L__tmp14: + .loc 2 243 36 + and.b32 %r96, %r76, 1; + setp.eq.b32 %p35, %r96, 1; + not.pred %p36, %p35; + and.pred %p29, %p28, %p36; + mov.b32 %r47, %f38; + @%p29 st.shared.b32 [ %r45 + 0 ], %r47; + bar.sync 0; + ld.shared.f32 %f39, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f40, %f39, 0f00000000; +$L__tmp16: + .loc 1 41 19 + sub.f32 %f41, %f9, %f13; + sub.f32 %f42, %f10, %f13; + sub.f32 %f43, %f11, %f13; + sub.f32 %f44, %f12, %f13; + .loc 1 42 20 + mul.f32 %f45, %f41, %f14; + mul.f32 %f46, %f42, %f14; + mul.f32 %f47, %f43, %f14; + mul.f32 %f48, %f44, %f14; + .loc 1 43 19 + mul.f32 %f49, %f20, %f46; +$L__tmp17: + .loc 2 243 
36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f50, %f19, %f45, %f49; + fma.rn.f32 %f51, %f21, %f47, %f50; + fma.rn.f32 %f52, %f22, %f48, %f51; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r97, %f52; + shfl.sync.bfly.b32 %r98, %r97, 16, 31, -1; + mov.b32 %f53, %r98; +$L__tmp20: + .loc 2 233 15 + add.f32 %f54, %f52, %f53; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r99, %f54; + shfl.sync.bfly.b32 %r100, %r99, 8, 31, -1; + mov.b32 %f55, %r100; +$L__tmp22: + .loc 2 233 15 + add.f32 %f56, %f54, %f55; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r101, %f56; + shfl.sync.bfly.b32 %r102, %r101, 4, 31, -1; + mov.b32 %f57, %r102; +$L__tmp24: + .loc 2 233 15 + add.f32 %f58, %f56, %f57; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r103, %f58; + shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1; + mov.b32 %f59, %r104; +$L__tmp26: + .loc 2 233 15 + add.f32 %f60, %f58, %f59; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r105, %f60; + shfl.sync.bfly.b32 %r106, %r105, 1, 31, -1; + mov.b32 %f61, %r106; +$L__tmp28: + .loc 2 233 15 + add.f32 %f62, %f60, %f61; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r49, %f62; + @%p27 st.shared.b32 [ %r42 + 0 ], %r49; + bar.sync 0; + @%p28 ld.shared.b32 %r50, [ %r45 + 0 ]; + mov.b32 %f63, %r50; + shfl.sync.bfly.b32 %r107, %r50, 1, 31, -1; + mov.b32 %f64, %r107; +$L__tmp30: + .loc 2 233 15 + add.f32 %f65, %f63, %f64; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r53, %f65; + @%p29 st.shared.b32 [ %r45 + 0 ], %r53; + bar.sync 0; + ld.shared.f32 %f66, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f67, %f66, 0f00000000; + mov.b32 %r56, 1132462080; +$L__tmp33: + .loc 1 48 20 + div.full.f32 %r54, %r55, %r56; + mov.b32 %f68, %r54; + .loc 1 50 20 + neg.f32 %f69, %f40; + fma.rn.f32 %f70, %f19, 0f43800000, %f69; + fma.rn.f32 %f71, %f20, 0f43800000, %f69; + fma.rn.f32 %f72, %f21, 0f43800000, %f69; + fma.rn.f32 %f73, %f22, 0f43800000, %f69; + .loc 1 52 20 + neg.f32 %f74, %f45; + fma.rn.f32 %f75, %f74, %f67, %f70; + neg.f32 %f76, %f46; + fma.rn.f32 %f77, %f76, %f67, %f71; + neg.f32 %f78, %f47; + fma.rn.f32 %f79, %f78, %f67, %f72; + neg.f32 %f80, %f48; + fma.rn.f32 %f81, %f80, %f67, %f73; + .loc 1 54 20 + fma.rn.f32 %f82, %f68, %f75, %f15; + fma.rn.f32 %f83, %f68, %f77, %f16; + fma.rn.f32 %f84, %f68, %f79, %f17; + fma.rn.f32 %f85, %f68, %f81, %f18; + .loc 1 56 51 + mov.b32 %r66, %f82; + mov.b32 %r67, %f83; + mov.b32 %r68, %f84; + mov.b32 %r69, %f85; + @%p1 st.global.v4.b32 [ %rd12 + 0 ], { %r66, %r67, %r68, %r69 }; + .loc 1 57 25 + add.s64 %rd14, %rd21, %rd22; + .loc 1 57 48 + cvt.rn.bf16.f32 %rs5, %r66; + cvt.rn.bf16.f32 %rs6, %r67; + cvt.rn.bf16.f32 %rs7, %r68; + cvt.rn.bf16.f32 %rs8, %r69; + mov.b32 %r108, {%rs5, %rs6}; + mov.b32 %r109, {%rs7, %rs8}; + @%p1 st.global.v2.b32 [ %rd14 + 0 ], { %r108, %r109 }; + .loc 1 57 4 + ret; +$L__tmp34: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/sn/csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 
+.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 403 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 115 +.b8 110 +.b8 101 +.b8 100 +.b8 52 +.b8 104 +.b8 121 +.b8 120 +.b8 112 +.b8 103 +.b8 119 +.b8 117 +.b8 53 +.b8 116 +.b8 116 +.b8 117 +.b8 98 +.b8 115 +.b8 51 +.b8 114 +.b8 55 +.b8 117 +.b8 120 +.b8 107 +.b8 106 +.b8 113 +.b8 53 +.b8 121 +.b8 102 +.b8 108 +.b8 51 +.b8 122 +.b8 104 +.b8 54 +.b8 99 +.b8 50 +.b8 115 +.b8 111 +.b8 122 +.b8 111 +.b8 98 +.b8 116 +.b8 107 +.b8 101 +.b8 107 +.b8 50 +.b8 117 +.b8 122 +.b8 102 +.b8 99 +.b8 118 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 115 +.b8 110 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 101 +.b8 56 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 101 +.b8 56 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 40 +.b8 57 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 40 +.b8 57 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 40 +.b8 44 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 46 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 46 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 46 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 407 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 101 +.b8 56 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 407 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttgir b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5d5a0f08fc13b7c6dfa6263958334def1d6b4661 --- /dev/null +++ b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttgir @@ -0,0 +1,73 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, 
"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked> + %cst_1 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %10 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %16 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %17 = tt.splat %16 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %19 = tt.addptr %arg5, %0 : !tt.ptr, i32 + %20 = tt.splat %19 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %22 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %23 = tt.addptr %22, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %24 = tt.load %23, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %25 = arith.mulf %9, %12 : tensor<256xf32, #blocked> + %26 = arith.select %2, %25, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %27 = "tt.reduce"(%26) <{axis = 0 : i32}> ({ + ^bb0(%arg9: f32, %arg10: f32): + %50 = arith.addf %arg9, %arg10 : f32 + tt.reduce.return %50 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %28 = arith.addf %27, %cst_1 : f32 + %29 = tt.broadcast %18 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %30 = arith.subf %15, %29 : tensor<256xf32, #blocked> + %31 = tt.broadcast %21 : (tensor<1xf32, #blocked>) -> 
tensor<256xf32, #blocked> + %32 = arith.mulf %30, %31 : tensor<256xf32, #blocked> + %33 = arith.mulf %25, %32 : tensor<256xf32, #blocked> + %34 = arith.select %2, %33, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({ + ^bb0(%arg9: f32, %arg10: f32): + %50 = arith.addf %arg9, %arg10 : f32 + tt.reduce.return %50 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %36 = arith.addf %35, %cst_1 : f32 + %37 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked> + %38 = arith.mulf %25, %cst_3 : tensor<256xf32, #blocked> + %39 = tt.splat %28 : (f32) -> tensor<256xf32, #blocked> + %40 = arith.subf %38, %39 : tensor<256xf32, #blocked> + %41 = tt.splat %36 : (f32) -> tensor<256xf32, #blocked> + %42 = arith.mulf %32, %41 : tensor<256xf32, #blocked> + %43 = arith.subf %40, %42 : tensor<256xf32, #blocked> + %44 = tt.broadcast %37 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %45 = arith.mulf %44, %43 : tensor<256xf32, #blocked> + %46 = arith.addf %24, %45 : tensor<256xf32, #blocked> + tt.store %23, %46, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %47 = tt.splat %arg6 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %48 = tt.addptr %47, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %49 = arith.truncf %46 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0f80420eebcc22a5a99ebb4cc6417788c7064252 --- /dev/null +++ b/.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir @@ -0,0 +1,72 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_2 = arith.constant dense<2.560000e+02> : tensor<256xf32> + %cst_3 = arith.constant dense<2.560000e+02> : tensor<1xf32> + %cst_4 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %9 = arith.extf %8 : tensor<256xbf16> to tensor<256xf32> + %10 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %12 = tt.load %11, %2, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : 
tensor<256xf32> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %16 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %17 = tt.splat %16 : (!tt.ptr) -> tensor<1x!tt.ptr> + %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %19 = tt.addptr %arg5, %0 : !tt.ptr, i32 + %20 = tt.splat %19 : (!tt.ptr) -> tensor<1x!tt.ptr> + %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %22 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %23 = tt.addptr %22, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %24 = tt.load %23, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %25 = arith.mulf %9, %12 : tensor<256xf32> + %26 = arith.select %2, %25, %cst_1 : tensor<256xi1>, tensor<256xf32> + %27 = "tt.reduce"(%26) <{axis = 0 : i32}> ({ + ^bb0(%arg9: f32, %arg10: f32): + %50 = arith.addf %arg9, %arg10 : f32 + tt.reduce.return %50 : f32 + }) : (tensor<256xf32>) -> f32 + %28 = arith.addf %27, %cst_0 : f32 + %29 = tt.broadcast %18 : (tensor<1xf32>) -> tensor<256xf32> + %30 = arith.subf %15, %29 : tensor<256xf32> + %31 = tt.broadcast %21 : (tensor<1xf32>) -> tensor<256xf32> + %32 = arith.mulf %30, %31 : tensor<256xf32> + %33 = arith.mulf %25, %32 : tensor<256xf32> + %34 = arith.select %2, %33, %cst_1 : tensor<256xi1>, tensor<256xf32> + %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({ + ^bb0(%arg9: f32, %arg10: f32): + %50 = arith.addf %arg9, %arg10 : f32 + tt.reduce.return %50 : f32 + }) : (tensor<256xf32>) -> f32 + %36 = arith.addf %35, %cst_0 : f32 + %37 = arith.divf %21, %cst_3 : tensor<1xf32> + %38 = arith.mulf %25, %cst_2 : tensor<256xf32> + %39 = tt.splat %28 : (f32) -> tensor<256xf32> + %40 = arith.subf %38, %39 : tensor<256xf32> + %41 = tt.splat %36 : (f32) -> tensor<256xf32> + %42 = arith.mulf %32, %41 : tensor<256xf32> + %43 = arith.subf %40, %42 : tensor<256xf32> + %44 = tt.broadcast %37 : (tensor<1xf32>) -> tensor<256xf32> + %45 = arith.mulf %44, %43 : tensor<256xf32> + %46 = arith.addf %24, %45 : tensor<256xf32> + tt.store %23, %46, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %47 = tt.splat %arg6 : (!tt.ptr) -> tensor<256x!tt.ptr> + %48 = tt.addptr %47, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %49 = arith.truncf %46 : tensor<256xf32> to tensor<256xbf16> + tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> + tt.return + } +} diff --git a/.triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ttir b/.triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a3ae107784895496638baeb1ae5d61c3be208225 --- /dev/null +++ b/.triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ttir @@ -0,0 +1,98 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x1xf32> + %c50257_i32 = 
arith.constant 50257 : i32 + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<50257> : tensor<64x1xi64> + %cst_2 = arith.constant dense<50257> : tensor<1x64xi64> + %c64_i64 = arith.constant 64 : i64 + %cst_3 = arith.constant dense<-1> : tensor<64x1xi64> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> + %0 = tt.get_program_id x : i32 + %1 = arith.extsi %0 : i32 to i64 + %2 = arith.muli %1, %c64_i64 : i64 + %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %5 = arith.extsi %4 : tensor<64x1xi32> to tensor<64x1xi64> + %6 = tt.splat %2 : (i64) -> tensor<64x1xi64> + %7 = arith.addi %6, %5 : tensor<64x1xi64> + %8 = tt.expand_dims %3 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32> + %9 = arith.extsi %8 : tensor<1x64xi32> to tensor<1x64xi64> + %10 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %11 = tt.addptr %10, %7 : tensor<64x1x!tt.ptr>, tensor<64x1xi64> + %12 = tt.load %11 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64> + %13 = tt.addptr %arg2, %c0_i32 : !tt.ptr, i32 + %14 = tt.load %13 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32 + %15 = tt.addptr %arg3, %c0_i32 : !tt.ptr, i32 + %16 = tt.load %15 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32 + %17 = arith.muli %7, %cst_1 : tensor<64x1xi64> + %18 = tt.broadcast %17 : (tensor<64x1xi64>) -> tensor<64x64xi64> + %19 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %20 = arith.cmpi ne, %12, %cst_3 : tensor<64x1xi64> + %21 = arith.divf %14, %16 : f32 + %22 = tt.splat %21 : (f32) -> tensor<64x1xf32> + %23 = arith.select %20, %22, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32> + %24 = tt.broadcast %23 : (tensor<64x1xf32>) -> tensor<64x64xf32> + %25 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c64_i32 iter_args(%arg10 = %cst_4) -> (tensor<64x64xf32>) : i32 { + %40 = arith.extsi %arg9 : i32 to i64 + %41 = tt.splat %40 : (i64) -> tensor<1x64xi64> + %42 = arith.addi %41, %9 : tensor<1x64xi64> + %43 = arith.cmpi slt, %42, %cst_2 : tensor<1x64xi64> + %44 = tt.broadcast %42 : (tensor<1x64xi64>) -> tensor<64x64xi64> + %45 = arith.addi %44, %18 : tensor<64x64xi64> + %46 = tt.addptr %19, %45 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> + %47 = tt.broadcast %43 : (tensor<1x64xi1>) -> tensor<64x64xi1> + %48 = tt.load %46, %47, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32> + %49 = arith.mulf %48, %24 : tensor<64x64xf32> + %50 = arith.addf %arg10, %49 : tensor<64x64xf32> + %51 = arith.select %47, %50, %arg10 : tensor<64x64xi1>, tensor<64x64xf32> + scf.yield %51 : tensor<64x64xf32> + } + %26 = "tt.reduce"(%25) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32, %arg10: f32): + %40 = arith.addf %arg9, %arg10 : f32 + tt.reduce.return %40 : f32 + }) : (tensor<64x64xf32>) -> tensor<64xf32> + %27 = tt.expand_dims %26 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %28 = arith.muli %7, %cst_1 : tensor<64x1xi64> + %29 = tt.broadcast %28 : (tensor<64x1xi64>) -> tensor<64x64xi64> + %30 = tt.splat %arg4 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %31 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %32 = tt.splat %arg5 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %33 = arith.cmpi ne, %12, %cst_3 : tensor<64x1xi64> + %34 = arith.divf %14, %16 : f32 + %35 = tt.splat %34 : (f32) -> tensor<64x1xf32> + %36 = arith.select %33, %35, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32> + %37 = tt.broadcast %36 : 
(tensor<64x1xf32>) -> tensor<64x64xf32> + %38 = tt.broadcast %27 : (tensor<64x1xf32>) -> tensor<64x64xf32> + %39 = tt.splat %arg6 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + scf.for %arg9 = %c0_i32 to %c50257_i32 step %c64_i32 : i32 { + %40 = arith.extsi %arg9 : i32 to i64 + %41 = tt.splat %40 : (i64) -> tensor<1x64xi64> + %42 = arith.addi %41, %9 : tensor<1x64xi64> + %43 = arith.cmpi slt, %42, %cst_2 : tensor<1x64xi64> + %44 = tt.broadcast %42 : (tensor<1x64xi64>) -> tensor<64x64xi64> + %45 = arith.addi %44, %29 : tensor<64x64xi64> + %46 = tt.addptr %30, %45 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> + %47 = tt.broadcast %43 : (tensor<1x64xi1>) -> tensor<64x64xi1> + %48 = tt.load %46, %47, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16> + %49 = arith.extf %48 : tensor<64x64xbf16> to tensor<64x64xf32> + %50 = tt.addptr %31, %45 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> + %51 = tt.load %50, %47, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32> + %52 = tt.addptr %32, %45 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> + %53 = tt.load %52, %47, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16> + %54 = arith.extf %53 : tensor<64x64xbf16> to tensor<64x64xf32> + %55 = arith.mulf %51, %37 : tensor<64x64xf32> + %56 = math.exp %54 : tensor<64x64xf32> + %57 = arith.mulf %56, %38 : tensor<64x64xf32> + %58 = arith.subf %55, %57 : tensor<64x64xf32> + %59 = arith.addf %49, %58 : tensor<64x64xf32> + %60 = tt.addptr %39, %45 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> + %61 = arith.truncf %59 : tensor<64x64xf32> to tensor<64x64xbf16> + tt.store %60, %61, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16> + } + tt.return + } +} diff --git a/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.cubin b/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..e3aaa8b0fc66c66f6d7ef712b39b80d68a9c306a Binary files /dev/null and b/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.cubin differ diff --git a/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ptx b/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ca9a894ba0839abaf31c56d84eca5575c9a50afc --- /dev/null +++ b/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ptx @@ -0,0 +1,1054 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6de7de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 
assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6de7de( + .param .u64 triton__0d1d2d3d4d5d6de7de_param_0, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_1, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_2, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_3, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_4, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_5, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_6, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_7 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<56>; + .reg .b16 %rs<13>; + .reg .b32 %r<185>; + .reg .f32 %f<169>; + .reg .b64 %rd<59>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_4]; + ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_1]; + ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6de7de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6de7de_param_2]; + ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_3]; + bfe.u32 %r3, %r1, 6, 1; + and.b32 %r4, %r1, 1; + .loc 1 24 33 + bfe.u32 %r5, %r1, 5, 1; + shl.b32 %r31, %r1, 2; + and.b32 %r6, %r31, 252; + shl.b32 %r32, %r1, 1; + and.b32 %r7, %r32, 254; + .loc 1 21 28 + mov.u32 %r14, %ctaid.x; + .loc 1 21 33 + shl.b32 %r33, %r14, 1; + .loc 1 22 23 + or.b32 %r34, %r33, %r3; + or.b32 %r35, %r33, %r4; + .loc 1 26 30 + mul.wide.s32 %rd25, %r34, 8; + add.s64 %rd11, %rd22, %rd25; + mul.wide.s32 %rd26, %r35, 8; + add.s64 %rd19, %rd22, %rd26; + mov.pred %p50, -1; + .loc 1 26 35 + mov.u64 %rd10, 0x0; + @%p50 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd11 + 0 ]; + mov.u64 %rd12, 0x0; + @%p50 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd11 + 0 ]; + mov.u64 %rd14, 0x0; + @%p50 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd11 + 0 ]; + mov.u64 %rd16, 0x0; + @%p50 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd11 + 0 ]; + mov.u64 %rd18, 0x0; + @%p50 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ]; + .loc 1 27 18 + bfe.s32 %r36, %r14, 30, 1; + shr.u32 %r37, %r36, 23; + add.s32 %r38, %r34, %r37; + and.b32 %r39, %r38, 16776704; + sub.s32 %r40, %r34, %r39; + .loc 1 35 44 + shl.b32 %r41, %r40, 8; + .loc 1 35 40 + or.b32 %r42, %r41, %r6; + .loc 1 35 34 + mul.wide.s32 %rd27, %r42, 4; + add.s64 %rd38, %rd23, %rd27; + mov.b32 %r155, 0; + .loc 1 35 50 + mov.u32 %r15, 0x0; + mov.u32 %r16, 0x0; + mov.u32 %r17, 0x0; + mov.u32 %r18, 0x0; + @%p50 ld.global.L1::evict_last.v4.b32 { %r15, %r16, %r17, %r18 }, [ %rd38 + 0 ]; + @!%p50 mov.u32 %r15, %r155; + @!%p50 mov.u32 %r16, %r155; + @!%p50 mov.u32 %r17, %r155; + @!%p50 mov.u32 %r18, %r155; + mov.b32 %f2, %r15; + mov.b32 %f1, %r16; + mov.b32 %f3, %r17; + mov.b32 %f4, %r18; + .loc 1 36 44 + shl.b32 %r43, %r34, 8; + .loc 1 36 40 + or.b32 %r44, %r43, %r6; + .loc 1 36 34 + mul.wide.s32 %rd28, %r44, 2; + add.s64 %rd39, %rd24, %rd28; + .loc 1 36 50 + mov.u32 %r23, 0x0; + mov.u32 %r24, 0x0; + @%p50 ld.global.L1::evict_last.v2.b32 { %r23, %r24 }, [ %rd39 + 0 ]; + @!%p50 mov.u32 %r23, %r155; + @!%p50 mov.u32 %r24, %r155; + 
cvt.u16.u32 %rs1, %r23; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r23; } + cvt.u16.u32 %rs3, %r24; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r24; } + .loc 1 36 101 + cvt.f32.bf16 %r27, %rs1; + mov.b32 %f5, %r27; + cvt.f32.bf16 %r28, %rs2; + mov.b32 %f6, %r28; + cvt.f32.bf16 %r29, %rs3; + mov.b32 %f7, %r29; + cvt.f32.bf16 %r30, %rs4; + mov.b32 %f8, %r30; + .loc 1 37 22 + add.s64 %rd29, %rd18, 50257; + .loc 1 38 22 + setp.lt.s64 %p14, %rd18, 0; + .loc 1 39 36 + selp.b64 %rd5, %rd29, %rd18, %p14; + .loc 1 40 40 + setp.lt.u64 %p15, %rd5, 50257; + mov.b32 %r184, 883; + mov.u64 %rd58, 1; + .loc 1 40 55 + @%p15 bra $L__BB0_2; + mov.u64 %rd30, assertMessage_0; + cvta.global.u64 %rd31, %rd30; + mov.u64 %rd32, assertFile_0; + cvta.global.u64 %rd33, %rd32; + mov.u64 %rd34, assertFunc_0; + cvta.global.u64 %rd35, %rd34; + { // callseq 4, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd31; + .param .b64 param1; + st.param.b64 [param1+0], %rd33; + .param .b32 param2; + st.param.b32 [param2+0], %r184; + .param .b64 param3; + st.param.b64 [param3+0], %rd35; + .param .b64 param4; + st.param.b64 [param4+0], %rd58; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 4 +$L__BB0_2: + .loc 1 0 55 + ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_5]; + cvt.s64.s32 %rd3, %r44; + .loc 1 38 22 + setp.lt.s64 %p42, %rd10, 0; + .loc 1 41 44 + shl.b64 %rd41, %rd10, 8; + add.s64 %rd42, %rd41, 12865792; + selp.b64 %rd43, %rd42, %rd41, %p42; + cvt.u64.u32 %rd44, %r6; + .loc 1 41 40 + or.b64 %rd45, %rd43, %rd44; + .loc 1 41 34 + shl.b64 %rd46, %rd45, 2; + add.s64 %rd55, %rd7, %rd46; + .loc 1 41 52 + mov.u32 %r46, 0x0; + mov.u32 %r47, 0x0; + mov.u32 %r48, 0x0; + mov.u32 %r49, 0x0; + @%p50 ld.global.L1::evict_last.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd55 + 0 ]; + @!%p50 mov.u32 %r46, %r155; + @!%p50 mov.u32 %r47, %r155; + @!%p50 mov.u32 %r48, %r155; + @!%p50 mov.u32 %r49, %r155; + mov.b32 %f15, %r48; + mov.b32 %f16, %r49; + .loc 1 42 22 + add.f32 %f17, %f3, %f15; + add.f32 %f18, %f4, %f16; + .loc 1 44 22 + add.f32 %f19, %f7, %f17; + add.f32 %f20, %f8, %f18; + .loc 1 41 52 + mov.b32 %f21, %r46; + mov.b32 %f22, %r47; + .loc 1 42 22 + add.f32 %f23, %f1, %f22; + add.f32 %f24, %f2, %f21; + .loc 1 44 22 + add.f32 %f25, %f5, %f24; + add.f32 %f26, %f6, %f23; +$L__tmp1: + .loc 2 98 22 + add.f32 %f27, %f26, 0f00000000; + add.f32 %f28, %f25, 0f00000000; + add.f32 %f29, %f19, 0f00000000; + add.f32 %f30, %f20, 0f00000000; + .loc 2 101 30 + sub.f32 %f31, %f25, %f28; + sub.f32 %f32, %f26, %f27; + sub.f32 %f33, %f19, %f29; + sub.f32 %f34, %f20, %f30; + .loc 2 101 13 + fma.rn.f32 %f35, %f25, %f31, 0f00000000; + fma.rn.f32 %f36, %f26, %f32, 0f00000000; + fma.rn.f32 %f37, %f19, %f33, 0f00000000; + fma.rn.f32 %f38, %f20, %f34, 0f00000000; +$L__tmp2: + .loc 2 108 21 + sub.f32 %f39, %f27, %f28; + mov.b32 %r55, 1065353216; + mov.b32 %r56, 1073741824; + .loc 2 110 60 + div.full.f32 %r54, %r55, %r56; + mov.b32 %f40, %r54; + .loc 2 112 17 + fma.rn.f32 %f41, %f40, %f39, %f28; + .loc 2 113 15 + add.f32 %f42, %f35, %f36; + .loc 2 113 30 + mul.f32 %f43, %f39, %f39; + .loc 2 113 22 + fma.rn.f32 %f44, %f40, %f43, %f42; + .loc 2 108 21 + sub.f32 %f45, %f29, %f41; + mov.b32 %r59, 1077936128; + .loc 2 110 60 + div.full.f32 %r57, %r55, %r59; + mov.b32 %f46, %r57; + .loc 2 112 17 + fma.rn.f32 %f47, %f46, %f45, %f41; + .loc 2 113 15 + add.f32 %f48, %f37, %f44; + .loc 2 113 30 + mul.f32 %f49, %f45, %f45; + .loc 2 113 38 + fma.rn.f32 %f50, %f45, %f45, %f49; + .loc 2 113 22 + 
fma.rn.f32 %f51, %f46, %f50, %f48; + .loc 2 108 21 + sub.f32 %f52, %f30, %f47; + mov.b32 %r62, 1082130432; + .loc 2 110 60 + div.full.f32 %r60, %r55, %r62; + mov.b32 %f53, %r60; + .loc 2 112 17 + fma.rn.f32 %f54, %f53, %f52, %f47; + .loc 2 113 15 + add.f32 %f55, %f38, %f51; + .loc 2 113 30 + mul.f32 %f56, %f52, %f52; + .loc 2 113 38 + mul.f32 %f57, %f56, 0f40400000; + .loc 2 113 22 + fma.rn.f32 %f58, %f53, %f57, %f55; +$L__tmp3: + .loc 2 120 46 + mov.b32 %r119, %f54; + shfl.sync.bfly.b32 %r120, %r119, 16, 31, -1; + mov.b32 %f59, %r120; + mov.b32 %r121, %f58; + shfl.sync.bfly.b32 %r122, %r121, 16, 31, -1; + mov.b32 %f60, %r122; + shfl.sync.bfly.b32 %r64, %r62, 16, 31, -1; + mov.b32 %f61, %r64; +$L__tmp4: + .loc 2 108 21 + sub.f32 %f62, %f59, %f54; + .loc 2 109 28 + add.f32 %f63, %f61, 0f40800000; + .loc 2 110 39 + setp.eq.f32 %p43, %f63, 0f00000000; + .loc 2 110 60 + mov.b32 %r65, %f63; + div.full.f32 %r63, %r64, %r65; + mov.b32 %f64, %r63; + .loc 2 110 49 + selp.f32 %f65, 0f00000000, %f64, %p43; + .loc 2 112 17 + fma.rn.f32 %f66, %f65, %f62, %f54; + .loc 2 113 15 + add.f32 %f67, %f58, %f60; + .loc 2 113 30 + mul.f32 %f68, %f62, %f62; + .loc 2 113 38 + mul.f32 %f69, %f68, 0f40800000; + .loc 2 113 22 + fma.rn.f32 %f70, %f65, %f69, %f67; +$L__tmp5: + .loc 2 120 46 + mov.b32 %r123, %f66; + shfl.sync.bfly.b32 %r124, %r123, 8, 31, -1; + mov.b32 %f71, %r124; + mov.b32 %r125, %f70; + shfl.sync.bfly.b32 %r126, %r125, 8, 31, -1; + mov.b32 %f72, %r126; + shfl.sync.bfly.b32 %r67, %r65, 8, 31, -1; + mov.b32 %f73, %r67; +$L__tmp6: + .loc 2 108 21 + sub.f32 %f74, %f71, %f66; + .loc 2 109 28 + add.f32 %f75, %f63, %f73; + .loc 2 110 39 + setp.eq.f32 %p44, %f75, 0f00000000; + .loc 2 110 60 + mov.b32 %r68, %f75; + div.full.f32 %r66, %r67, %r68; + mov.b32 %f76, %r66; + .loc 2 110 49 + selp.f32 %f77, 0f00000000, %f76, %p44; + .loc 2 112 17 + fma.rn.f32 %f78, %f77, %f74, %f66; + .loc 2 113 15 + add.f32 %f79, %f70, %f72; + .loc 2 113 30 + mul.f32 %f80, %f74, %f74; + .loc 2 113 38 + mul.f32 %f81, %f63, %f80; + .loc 2 113 22 + fma.rn.f32 %f82, %f77, %f81, %f79; +$L__tmp7: + .loc 2 120 46 + mov.b32 %r127, %f78; + shfl.sync.bfly.b32 %r128, %r127, 4, 31, -1; + mov.b32 %f83, %r128; + mov.b32 %r129, %f82; + shfl.sync.bfly.b32 %r130, %r129, 4, 31, -1; + mov.b32 %f84, %r130; + shfl.sync.bfly.b32 %r70, %r68, 4, 31, -1; + mov.b32 %f85, %r70; +$L__tmp8: + .loc 2 108 21 + sub.f32 %f86, %f83, %f78; + .loc 2 109 28 + add.f32 %f87, %f75, %f85; + .loc 2 110 39 + setp.eq.f32 %p45, %f87, 0f00000000; + .loc 2 110 60 + mov.b32 %r71, %f87; + div.full.f32 %r69, %r70, %r71; + mov.b32 %f88, %r69; + .loc 2 110 49 + selp.f32 %f89, 0f00000000, %f88, %p45; + .loc 2 112 17 + fma.rn.f32 %f90, %f89, %f86, %f78; + .loc 2 113 15 + add.f32 %f91, %f82, %f84; + .loc 2 113 30 + mul.f32 %f92, %f86, %f86; + .loc 2 113 38 + mul.f32 %f93, %f75, %f92; + .loc 2 113 22 + fma.rn.f32 %f94, %f89, %f93, %f91; +$L__tmp9: + .loc 2 120 46 + mov.b32 %r131, %f90; + shfl.sync.bfly.b32 %r132, %r131, 2, 31, -1; + mov.b32 %f95, %r132; + mov.b32 %r133, %f94; + shfl.sync.bfly.b32 %r134, %r133, 2, 31, -1; + mov.b32 %f96, %r134; + shfl.sync.bfly.b32 %r73, %r71, 2, 31, -1; + mov.b32 %f97, %r73; +$L__tmp10: + .loc 2 108 21 + sub.f32 %f98, %f95, %f90; + .loc 2 109 28 + add.f32 %f99, %f87, %f97; + .loc 2 110 39 + setp.eq.f32 %p46, %f99, 0f00000000; + .loc 2 110 60 + mov.b32 %r74, %f99; + div.full.f32 %r72, %r73, %r74; + mov.b32 %f100, %r72; + .loc 2 110 49 + selp.f32 %f101, 0f00000000, %f100, %p46; + .loc 2 112 17 + fma.rn.f32 %f102, %f101, %f98, %f90; + .loc 2 113 15 + 
add.f32 %f103, %f94, %f96; + .loc 2 113 30 + mul.f32 %f104, %f98, %f98; + .loc 2 113 38 + mul.f32 %f105, %f87, %f104; + .loc 2 113 22 + fma.rn.f32 %f106, %f101, %f105, %f103; +$L__tmp11: + .loc 2 120 46 + mov.b32 %r135, %f102; + shfl.sync.bfly.b32 %r136, %r135, 1, 31, -1; + mov.b32 %f107, %r136; + mov.b32 %r137, %f106; + shfl.sync.bfly.b32 %r138, %r137, 1, 31, -1; + mov.b32 %f108, %r138; + shfl.sync.bfly.b32 %r76, %r74, 1, 31, -1; + mov.b32 %f109, %r76; +$L__tmp12: + .loc 2 108 21 + sub.f32 %f110, %f107, %f102; + .loc 2 109 28 + add.f32 %f111, %f99, %f109; + .loc 2 110 39 + setp.eq.f32 %p47, %f111, 0f00000000; + .loc 2 110 60 + mov.b32 %r77, %f111; + div.full.f32 %r75, %r76, %r77; + mov.b32 %f112, %r75; + .loc 2 110 49 + selp.f32 %f113, 0f00000000, %f112, %p47; + .loc 2 112 17 + fma.rn.f32 %f114, %f113, %f110, %f102; + .loc 2 113 15 + add.f32 %f115, %f106, %f108; + .loc 2 113 30 + mul.f32 %f116, %f110, %f110; + .loc 2 113 38 + mul.f32 %f117, %f99, %f116; + .loc 2 113 22 + fma.rn.f32 %f118, %f113, %f117, %f115; +$L__tmp13: + .loc 2 120 46 + setp.eq.s32 %p21, %r2, 0; + shl.b32 %r139, %r5, 2; + shl.b32 %r140, %r3, 3; + or.b32 %r141, %r140, %r139; + mov.u32 %r142, global_smem; + add.s32 %r78, %r142, %r141; + mov.b32 %r79, %f114; + @%p21 st.shared.b32 [ %r78 + 0 ], %r79; + add.s32 %r143, %r142, 16; + add.s32 %r80, %r143, %r141; + mov.b32 %r81, %f118; + @%p21 st.shared.b32 [ %r80 + 0 ], %r81; + add.s32 %r144, %r142, 32; + add.s32 %r82, %r144, %r141; + @%p21 st.shared.b32 [ %r82 + 0 ], %r77; + bar.sync 0; + setp.lt.s32 %p24, %r1, 4; + add.s32 %r85, %r142, %r31; + @%p24 ld.shared.b32 %r84, [ %r85 + 0 ]; + mov.b32 %f119, %r84; + add.s32 %r87, %r143, %r31; + @%p24 ld.shared.b32 %r86, [ %r87 + 0 ]; + mov.b32 %f120, %r86; + add.s32 %r89, %r144, %r31; + @%p24 ld.shared.b32 %r88, [ %r89 + 0 ]; + mov.b32 %f121, %r88; + shfl.sync.bfly.b32 %r146, %r84, 1, 31, -1; + mov.b32 %f122, %r146; + shfl.sync.bfly.b32 %r147, %r86, 1, 31, -1; + mov.b32 %f123, %r147; + shfl.sync.bfly.b32 %r91, %r88, 1, 31, -1; + mov.b32 %f124, %r91; +$L__tmp14: + .loc 2 108 21 + sub.f32 %f125, %f122, %f119; + .loc 2 109 28 + add.f32 %f126, %f121, %f124; + .loc 2 110 39 + setp.eq.f32 %p48, %f126, 0f00000000; + .loc 2 110 60 + mov.b32 %r92, %f126; + div.full.f32 %r90, %r91, %r92; + mov.b32 %f127, %r90; + .loc 2 110 49 + selp.f32 %f128, 0f00000000, %f127, %p48; + .loc 2 112 17 + fma.rn.f32 %f129, %f125, %f128, %f119; + .loc 2 113 15 + add.f32 %f130, %f120, %f123; + .loc 2 113 30 + mul.f32 %f131, %f125, %f125; + .loc 2 113 38 + mul.f32 %f132, %f121, %f131; + .loc 2 113 22 + fma.rn.f32 %f133, %f132, %f128, %f130; +$L__tmp15: + .loc 2 120 46 + setp.eq.s32 %p49, %r4, 0; + and.pred %p27, %p24, %p49; + mov.b32 %r94, %f129; + @%p27 st.shared.b32 [ %r85 + 0 ], %r94; + mov.b32 %r96, %f133; + @%p27 st.shared.b32 [ %r87 + 0 ], %r96; + @%p27 st.shared.b32 [ %r89 + 0 ], %r92; + bar.sync 0; + add.s32 %r148, %r142, %r140; + ld.shared.f32 %f9, [%r148]; + add.s32 %r149, %r143, %r140; + ld.shared.f32 %f10, [%r149]; +$L__tmp16: + .loc 1 62 51 + mov.u32 %r99, 0x0; + mov.u32 %r100, 0x0; + mov.u32 %r101, 0x0; + mov.u32 %r102, 0x0; + @%p50 ld.global.L1::evict_last.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd38 + 0 ]; + @!%p50 mov.u32 %r99, %r155; + @!%p50 mov.u32 %r100, %r155; + @!%p50 mov.u32 %r101, %r155; + @!%p50 mov.u32 %r102, %r155; + .loc 1 63 51 + mov.u32 %r107, 0x0; + mov.u32 %r108, 0x0; + @%p50 ld.global.L1::evict_first.v2.b32 { %r107, %r108 }, [ %rd39 + 0 ]; + @!%p50 mov.u32 %r107, %r155; + @!%p50 mov.u32 %r108, %r155; + cvt.u16.u32 %rs5, %r107; + { 
.reg .b16 tmp; mov.b32 {tmp, %rs6}, %r107; } + cvt.u16.u32 %rs7, %r108; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r108; } + .loc 1 63 103 + cvt.f32.bf16 %r111, %rs5; + mov.b32 %f11, %r111; + cvt.f32.bf16 %r112, %rs6; + mov.b32 %f12, %r112; + cvt.f32.bf16 %r113, %rs7; + mov.b32 %f13, %r113; + cvt.f32.bf16 %r114, %rs8; + mov.b32 %f14, %r114; + .loc 1 64 35 + mul.wide.u32 %rd47, %r7, 4; + add.s64 %rd40, %rd8, %rd47; + .loc 1 64 40 + mov.u32 %r115, 0x0; + mov.u32 %r116, 0x0; + @%p50 ld.global.L1::evict_last.v2.b32 { %r115, %r116 }, [ %rd40 + 0 ]; + @!%p50 mov.u32 %r115, %r155; + @!%p50 mov.u32 %r116, %r155; + .loc 1 68 57 + @%p15 bra $L__BB0_4; + mov.u64 %rd48, assertMessage_1; + cvta.global.u64 %rd49, %rd48; + mov.u64 %rd50, assertFile_1; + cvta.global.u64 %rd51, %rd50; + mov.u64 %rd52, assertFunc_1; + cvta.global.u64 %rd53, %rd52; + { // callseq 5, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd49; + .param .b64 param1; + st.param.b64 [param1+0], %rd51; + .param .b32 param2; + st.param.b32 [param2+0], %r184; + .param .b64 param3; + st.param.b64 [param3+0], %rd53; + .param .b64 param4; + st.param.b64 [param4+0], %rd58; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 5 +$L__BB0_4: + .loc 1 69 54 + mov.u32 %r151, 0x0; + mov.u32 %r152, 0x0; + mov.u32 %r153, 0x0; + mov.u32 %r154, 0x0; + @%p50 ld.global.L1::evict_first.v4.b32 { %r151, %r152, %r153, %r154 }, [ %rd55 + 0 ]; + @!%p50 mov.u32 %r151, %r155; + @!%p50 mov.u32 %r152, %r155; + @!%p50 mov.u32 %r153, %r155; + @!%p50 mov.u32 %r154, %r155; + .loc 1 75 24 + mov.b32 %r160, %f10; + mov.b32 %r161, 1132462080; + div.full.f32 %r159, %r160, %r161; + mov.b32 %f134, %r159; + .loc 1 77 24 + add.f32 %f135, %f134, 0f3727C5AC; + .loc 1 78 30 + rsqrt.approx.ftz.f32 %f136, %f135; + .loc 1 69 54 + mov.b32 %f137, %r154; + .loc 1 62 51 + mov.b32 %f138, %r102; + .loc 1 70 24 + add.f32 %f139, %f138, %f137; + .loc 1 72 24 + add.f32 %f140, %f14, %f139; + .loc 1 73 24 + sub.f32 %f141, %f140, %f9; + .loc 1 69 54 + mov.b32 %f142, %r153; + .loc 1 62 51 + mov.b32 %f143, %r101; + .loc 1 70 24 + add.f32 %f144, %f143, %f142; + .loc 1 72 24 + add.f32 %f145, %f13, %f144; + .loc 1 73 24 + sub.f32 %f146, %f145, %f9; + .loc 1 69 54 + mov.b32 %f147, %r152; + .loc 1 62 51 + mov.b32 %f148, %r100; + .loc 1 70 24 + add.f32 %f149, %f148, %f147; + .loc 1 72 24 + add.f32 %f150, %f12, %f149; + .loc 1 73 24 + sub.f32 %f151, %f150, %f9; + .loc 1 69 54 + mov.b32 %f152, %r151; + .loc 1 62 51 + mov.b32 %f153, %r99; + .loc 1 70 24 + add.f32 %f154, %f153, %f152; + .loc 1 72 24 + add.f32 %f155, %f11, %f154; + .loc 1 73 24 + sub.f32 %f156, %f155, %f9; + .loc 1 79 24 + mul.f32 %f157, %f156, %f136; + mul.f32 %f158, %f151, %f136; + mul.f32 %f159, %f146, %f136; + mul.f32 %f160, %f141, %f136; + .loc 1 80 24 + bar.sync 0; + shl.b32 %r177, %r7, 2; + add.s32 %r179, %r142, %r177; + st.shared.v2.u32 [%r179], {%r115, %r116}; + bar.sync 0; + shl.b32 %r180, %r6, 2; + add.s32 %r181, %r142, %r180; + ld.shared.v4.f32 {%f161, %f162, %f163, %f164}, [%r181]; + mul.f32 %f165, %f157, %f161; + mul.f32 %f166, %f158, %f162; + mul.f32 %f167, %f159, %f163; + mul.f32 %f168, %f160, %f164; + .loc 1 82 29 + shl.b64 %rd57, %rd3, 1; + add.s64 %rd56, %rd9, %rd57; + .loc 1 82 52 + mov.b32 %r171, %f165; + cvt.rn.bf16.f32 %rs9, %r171; + mov.b32 %r172, %f166; + cvt.rn.bf16.f32 %rs10, %r172; + mov.b32 %r173, %f167; + cvt.rn.bf16.f32 %rs11, %r173; + mov.b32 %r174, %f168; + cvt.rn.bf16.f32 %rs12, %r174; + mov.b32 %r182, {%rs9, %rs10}; + mov.b32 
%r183, {%rs11, %rs12}; + @%p50 st.global.v2.b32 [ %rd56 + 0 ], { %r182, %r183 }; + .loc 1 58 4 + ret; +$L__tmp17: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 302 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 112 +.b8 110 +.b8 51 +.b8 108 +.b8 97 +.b8 119 +.b8 103 +.b8 54 +.b8 53 +.b8 108 +.b8 112 +.b8 105 +.b8 54 +.b8 51 +.b8 103 +.b8 118 +.b8 54 +.b8 99 +.b8 54 +.b8 112 +.b8 110 +.b8 52 +.b8 111 +.b8 105 +.b8 107 +.b8 104 +.b8 103 +.b8 54 +.b8 113 +.b8 118 +.b8 97 +.b8 50 +.b8 104 +.b8 50 +.b8 113 +.b8 106 +.b8 100 +.b8 112 +.b8 120 +.b8 101 +.b8 54 +.b8 113 +.b8 106 +.b8 52 +.b8 108 +.b8 118 +.b8 116 +.b8 116 +.b8 119 +.b8 101 +.b8 122 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 112 +.b8 110 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp2 +.b8 2 +.b8 47 +.b8 41 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 53 +.b8 44 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 4 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp16 +.b8 2 +.b8 53 +.b8 44 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: 
+ } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/510522bb05917b836ed253751364fcad/triton_.cubin b/.triton/dump/510522bb05917b836ed253751364fcad/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a807e0f784418f9585cf71d6acba378f6c52e502 Binary files /dev/null and b/.triton/dump/510522bb05917b836ed253751364fcad/triton_.cubin differ diff --git a/.triton/dump/510522bb05917b836ed253751364fcad/triton_.llir b/.triton/dump/510522bb05917b836ed253751364fcad/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..000d202ce76cdac20b0cf40cd4c302f1f5d40032 --- /dev/null +++ b/.triton/dump/510522bb05917b836ed253751364fcad/triton_.llir @@ -0,0 +1,1211 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" +@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257" +@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" +@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr + +define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %9 = lshr i32 %8, 3, !dbg !10 + %10 = and i32 %9, 31, !dbg !10 + %11 = and i32 %8, 63, !dbg !10 + %12 = shl i32 %8, 3, !dbg !11 + %13 = and i32 %12, 56, !dbg !11 + %14 = or i32 %13, 4, !dbg !11 + %15 = lshr i32 %8, 6, !dbg !12 + %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13 + %17 = shl i32 %16, 6, !dbg !14 + %18 = or i32 %17, %10, !dbg !15 + %19 = or i32 %18, 32, !dbg !15 + %20 = or i32 %17, %11, !dbg !15 + %21 = sext i32 %18 to i64, !dbg !16 + %22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !16 + %23 = sext i32 %19 to i64, !dbg !16 + %24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !16 + %25 = sext i32 %20 to i64, !dbg !16 + %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !16 + %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1
+ 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17 + %44 = srem i32 %18, 512, !dbg !18 + %45 = srem i32 %19, 512, !dbg !18 + %46 = shl nsw i32 %44, 8, !dbg !19 + %47 = shl nsw i32 %45, 8, !dbg !19 + %48 = add i64 %43, 50257, !dbg !20 + %49 = icmp slt i64 %27, 0, !dbg !21 + %50 = icmp slt i64 %35, 0, !dbg !21 + %51 = icmp slt i64 %43, 0, !dbg !21 + %52 = select i1 %51, i64 %48, i64 %43, !dbg !22 + %53 = icmp ugt i64 %52, 50256, !dbg !23 + %54 = shl i64 %27, 8, !dbg !24 + %55 = add i64 %54, 12865792, !dbg !24 + %56 = select i1 %49, i64 %55, i64 %54, !dbg !24 + %57 = shl i64 %35, 8, !dbg !24 + %58 = add i64 %57, 12865792, !dbg !24 + %59 = select i1 %50, i64 %58, i64 %57, !dbg !24 + %60 = getelementptr float, ptr addrspace(1) %1, i64 %56 + %61 = getelementptr float, ptr addrspace(1) %1, i64 %59 + br label %62, !dbg !12 + +62: ; preds = %7, %179 + %63 = phi float [ 0.000000e+00, %7 ], [ %254, %179 ] + %64 = phi float [ 0.000000e+00, %7 ], [ %255, %179 ] + %65 = phi float [ 0.000000e+00, %7 ], [ %256, %179 ] + %66 = phi float [ 0.000000e+00, %7 ], [ %257, %179 ] + %67 = phi float [ 0.000000e+00, %7 ], [ %258, %179 ] + %68 = phi float [ 0.000000e+00, %7 ], [ %259, %179 ] + %69 = phi float [ 0.000000e+00, %7 ], [ %260, %179 ] + %70 = phi float [ 0.000000e+00, %7 ], [ %261, %179 ] + %71 = phi float [ 0.000000e+00, %7 ], [ %262, %179 ] + %72 = phi float [ 0.000000e+00, %7 ], [ %263, %179 ] + %73 = phi float [ 0.000000e+00, %7 ], [ %264, %179 ] + %74 = phi float [ 0.000000e+00, %7 ], [ %265, %179 ] + %75 = phi float [ 0.000000e+00, %7 ], [ %266, %179 ] + %76 = phi float [ 0.000000e+00, %7 ], [ %267, %179 ] + %77 = phi float [ 0.000000e+00, %7 ], [ %268, %179 ] + %78 = phi float 
[ 0.000000e+00, %7 ], [ %269, %179 ] + %79 = phi float [ 0.000000e+00, %7 ], [ %270, %179 ] + %80 = phi float [ 0.000000e+00, %7 ], [ %271, %179 ] + %81 = phi float [ 0.000000e+00, %7 ], [ %272, %179 ] + %82 = phi float [ 0.000000e+00, %7 ], [ %273, %179 ] + %83 = phi float [ 0.000000e+00, %7 ], [ %274, %179 ] + %84 = phi float [ 0.000000e+00, %7 ], [ %275, %179 ] + %85 = phi float [ 0.000000e+00, %7 ], [ %276, %179 ] + %86 = phi float [ 0.000000e+00, %7 ], [ %277, %179 ] + %87 = phi float [ 0.000000e+00, %7 ], [ %278, %179 ] + %88 = phi float [ 0.000000e+00, %7 ], [ %279, %179 ] + %89 = phi float [ 0.000000e+00, %7 ], [ %280, %179 ] + %90 = phi float [ 0.000000e+00, %7 ], [ %281, %179 ] + %91 = phi float [ 0.000000e+00, %7 ], [ %282, %179 ] + %92 = phi float [ 0.000000e+00, %7 ], [ %283, %179 ] + %93 = phi float [ 0.000000e+00, %7 ], [ %284, %179 ] + %94 = phi float [ 0.000000e+00, %7 ], [ %285, %179 ] + %95 = phi float [ 0.000000e+00, %7 ], [ %350, %179 ] + %96 = phi float [ 0.000000e+00, %7 ], [ %351, %179 ] + %97 = phi float [ 0.000000e+00, %7 ], [ %352, %179 ] + %98 = phi float [ 0.000000e+00, %7 ], [ %353, %179 ] + %99 = phi float [ 0.000000e+00, %7 ], [ %354, %179 ] + %100 = phi float [ 0.000000e+00, %7 ], [ %355, %179 ] + %101 = phi float [ 0.000000e+00, %7 ], [ %356, %179 ] + %102 = phi float [ 0.000000e+00, %7 ], [ %357, %179 ] + %103 = phi float [ 0.000000e+00, %7 ], [ %358, %179 ] + %104 = phi float [ 0.000000e+00, %7 ], [ %359, %179 ] + %105 = phi float [ 0.000000e+00, %7 ], [ %360, %179 ] + %106 = phi float [ 0.000000e+00, %7 ], [ %361, %179 ] + %107 = phi float [ 0.000000e+00, %7 ], [ %362, %179 ] + %108 = phi float [ 0.000000e+00, %7 ], [ %363, %179 ] + %109 = phi float [ 0.000000e+00, %7 ], [ %364, %179 ] + %110 = phi float [ 0.000000e+00, %7 ], [ %365, %179 ] + %111 = phi float [ 0.000000e+00, %7 ], [ %302, %179 ] + %112 = phi float [ 0.000000e+00, %7 ], [ %303, %179 ] + %113 = phi float [ 0.000000e+00, %7 ], [ %304, %179 ] + %114 = phi float [ 0.000000e+00, %7 ], [ %305, %179 ] + %115 = phi float [ 0.000000e+00, %7 ], [ %306, %179 ] + %116 = phi float [ 0.000000e+00, %7 ], [ %307, %179 ] + %117 = phi float [ 0.000000e+00, %7 ], [ %308, %179 ] + %118 = phi float [ 0.000000e+00, %7 ], [ %309, %179 ] + %119 = phi float [ 0.000000e+00, %7 ], [ %310, %179 ] + %120 = phi float [ 0.000000e+00, %7 ], [ %311, %179 ] + %121 = phi float [ 0.000000e+00, %7 ], [ %312, %179 ] + %122 = phi float [ 0.000000e+00, %7 ], [ %313, %179 ] + %123 = phi float [ 0.000000e+00, %7 ], [ %314, %179 ] + %124 = phi float [ 0.000000e+00, %7 ], [ %315, %179 ] + %125 = phi float [ 0.000000e+00, %7 ], [ %316, %179 ] + %126 = phi float [ 0.000000e+00, %7 ], [ %317, %179 ] + %127 = phi i32 [ 0, %7 ], [ %366, %179 ] + %128 = or i32 %127, %13, !dbg !25 + %129 = or i32 %127, %14, !dbg !25 + %130 = add i32 %128, %46, !dbg !26 + %131 = add i32 %129, %46, !dbg !26 + %132 = add i32 %128, %47, !dbg !26 + %133 = add i32 %129, %47, !dbg !26 + %134 = sext i32 %130 to i64, !dbg !27 + %135 = getelementptr float, ptr addrspace(1) %2, i64 %134, !dbg !27 + %136 = sext i32 %131 to i64, !dbg !27 + %137 = getelementptr float, ptr addrspace(1) %2, i64 %136, !dbg !27 + %138 = sext i32 %132 to i64, !dbg !27 + %139 = getelementptr float, ptr addrspace(1) %2, i64 %138, !dbg !27 + %140 = sext i32 %133 to i64, !dbg !27 + %141 = getelementptr float, ptr addrspace(1) %2, i64 %140, !dbg !27 + %142 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 
0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %135, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28 + %143 = extractvalue { i32, i32, i32, i32 } %142, 0, !dbg !28 + %144 = extractvalue { i32, i32, i32, i32 } %142, 1, !dbg !28 + %145 = extractvalue { i32, i32, i32, i32 } %142, 2, !dbg !28 + %146 = extractvalue { i32, i32, i32, i32 } %142, 3, !dbg !28 + %147 = bitcast i32 %143 to float, !dbg !28 + %148 = bitcast i32 %144 to float, !dbg !28 + %149 = bitcast i32 %145 to float, !dbg !28 + %150 = bitcast i32 %146 to float, !dbg !28 + %151 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %137, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28 + %152 = extractvalue { i32, i32, i32, i32 } %151, 0, !dbg !28 + %153 = extractvalue { i32, i32, i32, i32 } %151, 1, !dbg !28 + %154 = extractvalue { i32, i32, i32, i32 } %151, 2, !dbg !28 + %155 = extractvalue { i32, i32, i32, i32 } %151, 3, !dbg !28 + %156 = bitcast i32 %152 to float, !dbg !28 + %157 = bitcast i32 %153 to float, !dbg !28 + %158 = bitcast i32 %154 to float, !dbg !28 + %159 = bitcast i32 %155 to float, !dbg !28 + %160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %139, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28 + %161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !28 + %162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !28 + %163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !28 + %164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !28 + %165 = bitcast i32 %161 to float, !dbg !28 + %166 = bitcast i32 %162 to float, !dbg !28 + %167 = bitcast i32 %163 to float, !dbg !28 + %168 = bitcast i32 %164 to float, !dbg !28 + %169 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %141, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28 + %170 = extractvalue { i32, i32, i32, i32 } %169, 0, !dbg !28 + %171 = extractvalue { i32, i32, i32, i32 } %169, 1, !dbg !28 + %172 = extractvalue { i32, i32, i32, i32 } %169, 2, !dbg !28 + %173 = extractvalue { i32, i32, i32, i32 } %169, 3, !dbg !28 + %174 = bitcast i32 %170 to float, !dbg !28 + %175 = bitcast i32 %171 to float, !dbg !28 + %176 = bitcast i32 %172 to float, !dbg !28 + %177 = bitcast i32 %173 to float, !dbg !28 + br i1 %53, label %178, label %179, !dbg !29 + +178: ; preds = %62 + tail call void 
@__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !29 + br label %179, !dbg !29 + +179: ; preds = %178, %62 + %180 = zext nneg i32 %128 to i64, !dbg !30 + %181 = zext nneg i32 %129 to i64, !dbg !30 + %182 = getelementptr float, ptr addrspace(1) %60, i64 %180, !dbg !31 + %183 = getelementptr float, ptr addrspace(1) %60, i64 %181, !dbg !31 + %184 = getelementptr float, ptr addrspace(1) %61, i64 %180, !dbg !31 + %185 = getelementptr float, ptr addrspace(1) %61, i64 %181, !dbg !31 + %186 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %182, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %187 = extractvalue { i32, i32, i32, i32 } %186, 0, !dbg !32 + %188 = extractvalue { i32, i32, i32, i32 } %186, 1, !dbg !32 + %189 = extractvalue { i32, i32, i32, i32 } %186, 2, !dbg !32 + %190 = extractvalue { i32, i32, i32, i32 } %186, 3, !dbg !32 + %191 = bitcast i32 %187 to float, !dbg !32 + %192 = bitcast i32 %188 to float, !dbg !32 + %193 = bitcast i32 %189 to float, !dbg !32 + %194 = bitcast i32 %190 to float, !dbg !32 + %195 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %183, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %196 = extractvalue { i32, i32, i32, i32 } %195, 0, !dbg !32 + %197 = extractvalue { i32, i32, i32, i32 } %195, 1, !dbg !32 + %198 = extractvalue { i32, i32, i32, i32 } %195, 2, !dbg !32 + %199 = extractvalue { i32, i32, i32, i32 } %195, 3, !dbg !32 + %200 = bitcast i32 %196 to float, !dbg !32 + %201 = bitcast i32 %197 to float, !dbg !32 + %202 = bitcast i32 %198 to float, !dbg !32 + %203 = bitcast i32 %199 to float, !dbg !32 + %204 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %184, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %205 = extractvalue { i32, i32, i32, i32 } %204, 0, !dbg !32 + %206 = extractvalue { i32, i32, i32, i32 } %204, 1, !dbg !32 + %207 = extractvalue { i32, i32, i32, i32 } %204, 2, !dbg !32 + %208 = extractvalue { i32, i32, i32, i32 } %204, 3, !dbg !32 + %209 = bitcast i32 %205 to float, !dbg !32 + %210 = bitcast i32 %206 to float, !dbg !32 + %211 = bitcast i32 %207 to float, !dbg !32 + %212 = bitcast i32 %208 to float, !dbg !32 + %213 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 
mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %185, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %214 = extractvalue { i32, i32, i32, i32 } %213, 0, !dbg !32 + %215 = extractvalue { i32, i32, i32, i32 } %213, 1, !dbg !32 + %216 = extractvalue { i32, i32, i32, i32 } %213, 2, !dbg !32 + %217 = extractvalue { i32, i32, i32, i32 } %213, 3, !dbg !32 + %218 = bitcast i32 %214 to float, !dbg !32 + %219 = bitcast i32 %215 to float, !dbg !32 + %220 = bitcast i32 %216 to float, !dbg !32 + %221 = bitcast i32 %217 to float, !dbg !32 + %222 = fadd float %147, %191, !dbg !33 + %223 = fadd float %148, %192, !dbg !33 + %224 = fadd float %149, %193, !dbg !33 + %225 = fadd float %150, %194, !dbg !33 + %226 = fadd float %156, %200, !dbg !33 + %227 = fadd float %157, %201, !dbg !33 + %228 = fadd float %158, %202, !dbg !33 + %229 = fadd float %159, %203, !dbg !33 + %230 = fadd float %165, %209, !dbg !33 + %231 = fadd float %166, %210, !dbg !33 + %232 = fadd float %167, %211, !dbg !33 + %233 = fadd float %168, %212, !dbg !33 + %234 = fadd float %174, %218, !dbg !33 + %235 = fadd float %175, %219, !dbg !33 + %236 = fadd float %176, %220, !dbg !33 + %237 = fadd float %177, %221, !dbg !33 + %238 = fsub float %222, %111, !dbg !34 + %239 = fsub float %223, %112, !dbg !34 + %240 = fsub float %224, %113, !dbg !34 + %241 = fsub float %225, %114, !dbg !34 + %242 = fsub float %226, %115, !dbg !34 + %243 = fsub float %227, %116, !dbg !34 + %244 = fsub float %228, %117, !dbg !34 + %245 = fsub float %229, %118, !dbg !34 + %246 = fsub float %230, %119, !dbg !34 + %247 = fsub float %231, %120, !dbg !34 + %248 = fsub float %232, %121, !dbg !34 + %249 = fsub float %233, %122, !dbg !34 + %250 = fsub float %234, %123, !dbg !34 + %251 = fsub float %235, %124, !dbg !34 + %252 = fsub float %236, %125, !dbg !34 + %253 = fsub float %237, %126, !dbg !34 + %254 = fadd float %63, 1.000000e+00, !dbg !38 + %255 = fadd float %64, 1.000000e+00, !dbg !38 + %256 = fadd float %65, 1.000000e+00, !dbg !38 + %257 = fadd float %66, 1.000000e+00, !dbg !38 + %258 = fadd float %67, 1.000000e+00, !dbg !38 + %259 = fadd float %68, 1.000000e+00, !dbg !38 + %260 = fadd float %69, 1.000000e+00, !dbg !38 + %261 = fadd float %70, 1.000000e+00, !dbg !38 + %262 = fadd float %71, 1.000000e+00, !dbg !38 + %263 = fadd float %72, 1.000000e+00, !dbg !38 + %264 = fadd float %73, 1.000000e+00, !dbg !38 + %265 = fadd float %74, 1.000000e+00, !dbg !38 + %266 = fadd float %75, 1.000000e+00, !dbg !38 + %267 = fadd float %76, 1.000000e+00, !dbg !38 + %268 = fadd float %77, 1.000000e+00, !dbg !38 + %269 = fadd float %78, 1.000000e+00, !dbg !38 + %270 = fadd float %79, 1.000000e+00, !dbg !38 + %271 = fadd float %80, 1.000000e+00, !dbg !38 + %272 = fadd float %81, 1.000000e+00, !dbg !38 + %273 = fadd float %82, 1.000000e+00, !dbg !38 + %274 = fadd float %83, 1.000000e+00, !dbg !38 + %275 = fadd float %84, 1.000000e+00, !dbg !38 + %276 = fadd float %85, 1.000000e+00, !dbg !38 + %277 = fadd float %86, 1.000000e+00, !dbg !38 + %278 = fadd float %87, 1.000000e+00, !dbg !38 + %279 = fadd float %88, 1.000000e+00, !dbg !38 + %280 = fadd float %89, 1.000000e+00, !dbg !38 + %281 = fadd float %90, 1.000000e+00, !dbg !38 + %282 = fadd float %91, 1.000000e+00, !dbg !38 + %283 = fadd float %92, 1.000000e+00, !dbg !38 + %284 = fadd float %93, 1.000000e+00, !dbg !38 + %285 = fadd float %94, 1.000000e+00, !dbg !38 + %286 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %238, float %254) #6, !dbg !39 + %287 = 
tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %239, float %255) #6, !dbg !39 + %288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %240, float %256) #6, !dbg !39 + %289 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %241, float %257) #6, !dbg !39 + %290 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %242, float %258) #6, !dbg !39 + %291 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %243, float %259) #6, !dbg !39 + %292 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %244, float %260) #6, !dbg !39 + %293 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float %261) #6, !dbg !39 + %294 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %246, float %262) #6, !dbg !39 + %295 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %247, float %263) #6, !dbg !39 + %296 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %248, float %264) #6, !dbg !39 + %297 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %249, float %265) #6, !dbg !39 + %298 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %250, float %266) #6, !dbg !39 + %299 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %251, float %267) #6, !dbg !39 + %300 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %252, float %268) #6, !dbg !39 + %301 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %253, float %269) #6, !dbg !39 + %302 = fadd float %111, %286, !dbg !40 + %303 = fadd float %112, %287, !dbg !40 + %304 = fadd float %113, %288, !dbg !40 + %305 = fadd float %114, %289, !dbg !40 + %306 = fadd float %115, %290, !dbg !40 + %307 = fadd float %116, %291, !dbg !40 + %308 = fadd float %117, %292, !dbg !40 + %309 = fadd float %118, %293, !dbg !40 + %310 = fadd float %119, %294, !dbg !40 + %311 = fadd float %120, %295, !dbg !40 + %312 = fadd float %121, %296, !dbg !40 + %313 = fadd float %122, %297, !dbg !40 + %314 = fadd float %123, %298, !dbg !40 + %315 = fadd float %124, %299, !dbg !40 + %316 = fadd float %125, %300, !dbg !40 + %317 = fadd float %126, %301, !dbg !40 + %318 = fsub float %222, %302, !dbg !41 + %319 = fsub float %223, %303, !dbg !41 + %320 = fsub float %224, %304, !dbg !41 + %321 = fsub float %225, %305, !dbg !41 + %322 = fsub float %226, %306, !dbg !41 + %323 = fsub float %227, %307, !dbg !41 + %324 = fsub float %228, %308, !dbg !41 + %325 = fsub float %229, %309, !dbg !41 + %326 = fsub float %230, %310, !dbg !41 + %327 = fsub float %231, %311, !dbg !41 + %328 = fsub float %232, %312, !dbg !41 + %329 = fsub float %233, %313, !dbg !41 + %330 = fsub float %234, %314, !dbg !41 + %331 = fsub float %235, %315, !dbg !41 + %332 = fsub float %236, %316, !dbg !41 + %333 = fsub float %237, %317, !dbg !41 + %334 = fmul float %238, %318, !dbg !42 + %335 = fmul float %239, %319, !dbg !42 + %336 = fmul float %240, %320, !dbg !42 + %337 = fmul float %241, %321, !dbg !42 + %338 = fmul float %242, %322, !dbg !42 + %339 = fmul float %243, %323, !dbg !42 + %340 = fmul float %244, %324, !dbg !42 + %341 = fmul float %245, %325, !dbg !42 + %342 = fmul float %246, %326, !dbg !42 + %343 = fmul float %247, %327, !dbg !42 + %344 = fmul float %248, %328, !dbg !42 + %345 = fmul float %249, %329, !dbg !42 + %346 = fmul float %250, %330, !dbg !42 + %347 = fmul float %251, %331, !dbg !42 + %348 = fmul float %252, %332, !dbg !42 + %349 = fmul float %253, %333, !dbg !42 + %350 = fadd float %95, %334, !dbg !43 + %351 = 
fadd float %96, %335, !dbg !43 + %352 = fadd float %97, %336, !dbg !43 + %353 = fadd float %98, %337, !dbg !43 + %354 = fadd float %99, %338, !dbg !43 + %355 = fadd float %100, %339, !dbg !43 + %356 = fadd float %101, %340, !dbg !43 + %357 = fadd float %102, %341, !dbg !43 + %358 = fadd float %103, %342, !dbg !43 + %359 = fadd float %104, %343, !dbg !43 + %360 = fadd float %105, %344, !dbg !43 + %361 = fadd float %106, %345, !dbg !43 + %362 = fadd float %107, %346, !dbg !43 + %363 = fadd float %108, %347, !dbg !43 + %364 = fadd float %109, %348, !dbg !43 + %365 = fadd float %110, %349, !dbg !43 + %366 = add nuw nsw i32 %127, 64, !dbg !12 + %367 = icmp ult i32 %127, 192, !dbg !12 + br i1 %367, label %62, label %368, !dbg !12 + +368: ; preds = %179 + %369 = and i32 %15, 3, !dbg !12 + %370 = mul nuw nsw i32 %369, 72, !dbg !12 + %371 = add nuw nsw i32 %370, %11, !dbg !12 + %372 = zext nneg i32 %371 to i64, !dbg !12 + %373 = getelementptr float, ptr addrspace(3) @global_smem, i64 %372, !dbg !12 + %374 = insertelement <1 x float> undef, float %270, i64 0, !dbg !12 + store <1 x float> %374, ptr addrspace(3) %373, align 4, !dbg !12 + %375 = add nuw nsw i32 %11, 288, !dbg !12 + %376 = add nuw nsw i32 %375, %370, !dbg !12 + %377 = zext nneg i32 %376 to i64, !dbg !12 + %378 = getelementptr float, ptr addrspace(3) @global_smem, i64 %377, !dbg !12 + %379 = insertelement <1 x float> undef, float %271, i64 0, !dbg !12 + store <1 x float> %379, ptr addrspace(3) %378, align 4, !dbg !12 + %380 = or i32 %11, 576, !dbg !12 + %381 = add nuw nsw i32 %380, %370, !dbg !12 + %382 = zext nneg i32 %381 to i64, !dbg !12 + %383 = getelementptr float, ptr addrspace(3) @global_smem, i64 %382, !dbg !12 + %384 = insertelement <1 x float> undef, float %272, i64 0, !dbg !12 + store <1 x float> %384, ptr addrspace(3) %383, align 4, !dbg !12 + %385 = add nuw nsw i32 %11, 864, !dbg !12 + %386 = add nuw nsw i32 %385, %370, !dbg !12 + %387 = zext nneg i32 %386 to i64, !dbg !12 + %388 = getelementptr float, ptr addrspace(3) @global_smem, i64 %387, !dbg !12 + %389 = insertelement <1 x float> undef, float %273, i64 0, !dbg !12 + store <1 x float> %389, ptr addrspace(3) %388, align 4, !dbg !12 + %390 = or i32 %11, 1152, !dbg !12 + %391 = add nuw nsw i32 %390, %370, !dbg !12 + %392 = zext nneg i32 %391 to i64, !dbg !12 + %393 = getelementptr float, ptr addrspace(3) @global_smem, i64 %392, !dbg !12 + %394 = insertelement <1 x float> undef, float %274, i64 0, !dbg !12 + store <1 x float> %394, ptr addrspace(3) %393, align 4, !dbg !12 + %395 = add nuw nsw i32 %11, 1440, !dbg !12 + %396 = add nuw nsw i32 %395, %370, !dbg !12 + %397 = zext nneg i32 %396 to i64, !dbg !12 + %398 = getelementptr float, ptr addrspace(3) @global_smem, i64 %397, !dbg !12 + %399 = insertelement <1 x float> undef, float %275, i64 0, !dbg !12 + store <1 x float> %399, ptr addrspace(3) %398, align 4, !dbg !12 + %400 = or i32 %11, 1728, !dbg !12 + %401 = add nuw nsw i32 %400, %370, !dbg !12 + %402 = zext nneg i32 %401 to i64, !dbg !12 + %403 = getelementptr float, ptr addrspace(3) @global_smem, i64 %402, !dbg !12 + %404 = insertelement <1 x float> undef, float %276, i64 0, !dbg !12 + store <1 x float> %404, ptr addrspace(3) %403, align 4, !dbg !12 + %405 = add nuw nsw i32 %11, 2016, !dbg !12 + %406 = add nuw nsw i32 %405, %370, !dbg !12 + %407 = zext nneg i32 %406 to i64, !dbg !12 + %408 = getelementptr float, ptr addrspace(3) @global_smem, i64 %407, !dbg !12 + %409 = insertelement <1 x float> undef, float %277, i64 0, !dbg !12 + store <1 x float> %409, ptr 
addrspace(3) %408, align 4, !dbg !12 + tail call void @llvm.nvvm.barrier0(), !dbg !12 + %410 = mul nuw nsw i32 %10, 72, !dbg !12 + %411 = add nuw nsw i32 %410, %13, !dbg !12 + %412 = zext nneg i32 %411 to i64, !dbg !12 + %413 = getelementptr float, ptr addrspace(3) @global_smem, i64 %412, !dbg !12 + %414 = load float, ptr addrspace(3) %413, align 32, !dbg !12 + %415 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 1, !dbg !12 + %416 = load float, ptr addrspace(3) %415, align 4, !dbg !12 + %417 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 2, !dbg !12 + %418 = load float, ptr addrspace(3) %417, align 8, !dbg !12 + %419 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 3, !dbg !12 + %420 = load float, ptr addrspace(3) %419, align 4, !dbg !12 + %421 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 4, !dbg !12 + %422 = load float, ptr addrspace(3) %421, align 16, !dbg !12 + %423 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 5, !dbg !12 + %424 = load float, ptr addrspace(3) %423, align 4, !dbg !12 + %425 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 6, !dbg !12 + %426 = load float, ptr addrspace(3) %425, align 8, !dbg !12 + %427 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 7, !dbg !12 + %428 = load float, ptr addrspace(3) %427, align 4, !dbg !12 + tail call void @llvm.nvvm.barrier0(), !dbg !12 + %429 = insertelement <1 x float> undef, float %278, i64 0, !dbg !12 + store <1 x float> %429, ptr addrspace(3) %373, align 4, !dbg !12 + %430 = insertelement <1 x float> undef, float %279, i64 0, !dbg !12 + store <1 x float> %430, ptr addrspace(3) %378, align 4, !dbg !12 + %431 = insertelement <1 x float> undef, float %280, i64 0, !dbg !12 + store <1 x float> %431, ptr addrspace(3) %383, align 4, !dbg !12 + %432 = insertelement <1 x float> undef, float %281, i64 0, !dbg !12 + store <1 x float> %432, ptr addrspace(3) %388, align 4, !dbg !12 + %433 = insertelement <1 x float> undef, float %282, i64 0, !dbg !12 + store <1 x float> %433, ptr addrspace(3) %393, align 4, !dbg !12 + %434 = insertelement <1 x float> undef, float %283, i64 0, !dbg !12 + store <1 x float> %434, ptr addrspace(3) %398, align 4, !dbg !12 + %435 = insertelement <1 x float> undef, float %284, i64 0, !dbg !12 + store <1 x float> %435, ptr addrspace(3) %403, align 4, !dbg !12 + %436 = insertelement <1 x float> undef, float %285, i64 0, !dbg !12 + store <1 x float> %436, ptr addrspace(3) %408, align 4, !dbg !12 + tail call void @llvm.nvvm.barrier0(), !dbg !12 + %437 = load float, ptr addrspace(3) %413, align 32, !dbg !12 + %438 = load float, ptr addrspace(3) %415, align 4, !dbg !12 + %439 = load float, ptr addrspace(3) %417, align 8, !dbg !12 + %440 = load float, ptr addrspace(3) %419, align 4, !dbg !12 + %441 = load float, ptr addrspace(3) %421, align 16, !dbg !12 + %442 = load float, ptr addrspace(3) %423, align 4, !dbg !12 + %443 = load float, ptr addrspace(3) %425, align 8, !dbg !12 + %444 = load float, ptr addrspace(3) %427, align 4, !dbg !12 + %445 = fsub float %303, %302, !dbg !44 + %446 = fadd float %414, %416, !dbg !48 + %447 = fcmp oeq float %446, 0.000000e+00, !dbg !49 + %448 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %416, float %446) #6, !dbg !50 + %449 = select i1 %447, float 0.000000e+00, float %448, !dbg !51 + %450 = fmul float %445, %449, !dbg !52 + %451 = fadd float %302, %450, !dbg !53 + %452 = fadd float %350, %351, !dbg !54 + 
%453 = fmul float %445, %445, !dbg !55 + %454 = fmul float %453, %414, !dbg !56 + %455 = fmul float %454, %449, !dbg !57 + %456 = fadd float %452, %455, !dbg !58 + %457 = fsub float %304, %451, !dbg !44 + %458 = fadd float %418, %446, !dbg !48 + %459 = fcmp oeq float %458, 0.000000e+00, !dbg !49 + %460 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %418, float %458) #6, !dbg !50 + %461 = select i1 %459, float 0.000000e+00, float %460, !dbg !51 + %462 = fmul float %461, %457, !dbg !52 + %463 = fadd float %451, %462, !dbg !53 + %464 = fadd float %352, %456, !dbg !54 + %465 = fmul float %457, %457, !dbg !55 + %466 = fmul float %446, %465, !dbg !56 + %467 = fmul float %461, %466, !dbg !57 + %468 = fadd float %464, %467, !dbg !58 + %469 = fsub float %305, %463, !dbg !44 + %470 = fadd float %420, %458, !dbg !48 + %471 = fcmp oeq float %470, 0.000000e+00, !dbg !49 + %472 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %420, float %470) #6, !dbg !50 + %473 = select i1 %471, float 0.000000e+00, float %472, !dbg !51 + %474 = fmul float %473, %469, !dbg !52 + %475 = fadd float %463, %474, !dbg !53 + %476 = fadd float %353, %468, !dbg !54 + %477 = fmul float %469, %469, !dbg !55 + %478 = fmul float %458, %477, !dbg !56 + %479 = fmul float %473, %478, !dbg !57 + %480 = fadd float %476, %479, !dbg !58 + %481 = fsub float %306, %475, !dbg !44 + %482 = fadd float %422, %470, !dbg !48 + %483 = fcmp oeq float %482, 0.000000e+00, !dbg !49 + %484 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %422, float %482) #6, !dbg !50 + %485 = select i1 %483, float 0.000000e+00, float %484, !dbg !51 + %486 = fmul float %485, %481, !dbg !52 + %487 = fadd float %475, %486, !dbg !53 + %488 = fadd float %354, %480, !dbg !54 + %489 = fmul float %481, %481, !dbg !55 + %490 = fmul float %470, %489, !dbg !56 + %491 = fmul float %485, %490, !dbg !57 + %492 = fadd float %488, %491, !dbg !58 + %493 = fsub float %307, %487, !dbg !44 + %494 = fadd float %424, %482, !dbg !48 + %495 = fcmp oeq float %494, 0.000000e+00, !dbg !49 + %496 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %424, float %494) #6, !dbg !50 + %497 = select i1 %495, float 0.000000e+00, float %496, !dbg !51 + %498 = fmul float %497, %493, !dbg !52 + %499 = fadd float %487, %498, !dbg !53 + %500 = fadd float %355, %492, !dbg !54 + %501 = fmul float %493, %493, !dbg !55 + %502 = fmul float %482, %501, !dbg !56 + %503 = fmul float %497, %502, !dbg !57 + %504 = fadd float %500, %503, !dbg !58 + %505 = fsub float %308, %499, !dbg !44 + %506 = fadd float %426, %494, !dbg !48 + %507 = fcmp oeq float %506, 0.000000e+00, !dbg !49 + %508 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %426, float %506) #6, !dbg !50 + %509 = select i1 %507, float 0.000000e+00, float %508, !dbg !51 + %510 = fmul float %509, %505, !dbg !52 + %511 = fadd float %499, %510, !dbg !53 + %512 = fadd float %356, %504, !dbg !54 + %513 = fmul float %505, %505, !dbg !55 + %514 = fmul float %494, %513, !dbg !56 + %515 = fmul float %509, %514, !dbg !57 + %516 = fadd float %512, %515, !dbg !58 + %517 = fsub float %309, %511, !dbg !44 + %518 = fadd float %428, %506, !dbg !48 + %519 = fcmp oeq float %518, 0.000000e+00, !dbg !49 + %520 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float %518) #6, !dbg !50 + %521 = select i1 %519, float 0.000000e+00, float %520, !dbg !51 + %522 = fmul float %521, %517, !dbg !52 + %523 = fadd float %511, %522, !dbg !53 + %524 = fadd float %357, %516, !dbg !54 + %525 = 
fmul float %517, %517, !dbg !55 + %526 = fmul float %506, %525, !dbg !56 + %527 = fmul float %521, %526, !dbg !57 + %528 = fadd float %524, %527, !dbg !58 + %529 = fsub float %311, %310, !dbg !44 + %530 = fadd float %437, %438, !dbg !48 + %531 = fcmp oeq float %530, 0.000000e+00, !dbg !49 + %532 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %438, float %530) #6, !dbg !50 + %533 = select i1 %531, float 0.000000e+00, float %532, !dbg !51 + %534 = fmul float %529, %533, !dbg !52 + %535 = fadd float %310, %534, !dbg !53 + %536 = fadd float %358, %359, !dbg !54 + %537 = fmul float %529, %529, !dbg !55 + %538 = fmul float %537, %437, !dbg !56 + %539 = fmul float %538, %533, !dbg !57 + %540 = fadd float %536, %539, !dbg !58 + %541 = fsub float %312, %535, !dbg !44 + %542 = fadd float %439, %530, !dbg !48 + %543 = fcmp oeq float %542, 0.000000e+00, !dbg !49 + %544 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %439, float %542) #6, !dbg !50 + %545 = select i1 %543, float 0.000000e+00, float %544, !dbg !51 + %546 = fmul float %545, %541, !dbg !52 + %547 = fadd float %535, %546, !dbg !53 + %548 = fadd float %360, %540, !dbg !54 + %549 = fmul float %541, %541, !dbg !55 + %550 = fmul float %530, %549, !dbg !56 + %551 = fmul float %545, %550, !dbg !57 + %552 = fadd float %548, %551, !dbg !58 + %553 = fsub float %313, %547, !dbg !44 + %554 = fadd float %440, %542, !dbg !48 + %555 = fcmp oeq float %554, 0.000000e+00, !dbg !49 + %556 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %440, float %554) #6, !dbg !50 + %557 = select i1 %555, float 0.000000e+00, float %556, !dbg !51 + %558 = fmul float %557, %553, !dbg !52 + %559 = fadd float %547, %558, !dbg !53 + %560 = fadd float %361, %552, !dbg !54 + %561 = fmul float %553, %553, !dbg !55 + %562 = fmul float %542, %561, !dbg !56 + %563 = fmul float %557, %562, !dbg !57 + %564 = fadd float %560, %563, !dbg !58 + %565 = fsub float %314, %559, !dbg !44 + %566 = fadd float %441, %554, !dbg !48 + %567 = fcmp oeq float %566, 0.000000e+00, !dbg !49 + %568 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %441, float %566) #6, !dbg !50 + %569 = select i1 %567, float 0.000000e+00, float %568, !dbg !51 + %570 = fmul float %569, %565, !dbg !52 + %571 = fadd float %559, %570, !dbg !53 + %572 = fadd float %362, %564, !dbg !54 + %573 = fmul float %565, %565, !dbg !55 + %574 = fmul float %554, %573, !dbg !56 + %575 = fmul float %569, %574, !dbg !57 + %576 = fadd float %572, %575, !dbg !58 + %577 = fsub float %315, %571, !dbg !44 + %578 = fadd float %442, %566, !dbg !48 + %579 = fcmp oeq float %578, 0.000000e+00, !dbg !49 + %580 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %442, float %578) #6, !dbg !50 + %581 = select i1 %579, float 0.000000e+00, float %580, !dbg !51 + %582 = fmul float %581, %577, !dbg !52 + %583 = fadd float %571, %582, !dbg !53 + %584 = fadd float %363, %576, !dbg !54 + %585 = fmul float %577, %577, !dbg !55 + %586 = fmul float %566, %585, !dbg !56 + %587 = fmul float %581, %586, !dbg !57 + %588 = fadd float %584, %587, !dbg !58 + %589 = fsub float %316, %583, !dbg !44 + %590 = fadd float %443, %578, !dbg !48 + %591 = fcmp oeq float %590, 0.000000e+00, !dbg !49 + %592 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %443, float %590) #6, !dbg !50 + %593 = select i1 %591, float 0.000000e+00, float %592, !dbg !51 + %594 = fmul float %593, %589, !dbg !52 + %595 = fadd float %583, %594, !dbg !53 + %596 = fadd float %364, %588, !dbg !54 + %597 = fmul 
float %589, %589, !dbg !55 + %598 = fmul float %578, %597, !dbg !56 + %599 = fmul float %593, %598, !dbg !57 + %600 = fadd float %596, %599, !dbg !58 + %601 = fsub float %317, %595, !dbg !44 + %602 = fadd float %444, %590, !dbg !48 + %603 = fcmp oeq float %602, 0.000000e+00, !dbg !49 + %604 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %444, float %602) #6, !dbg !50 + %605 = select i1 %603, float 0.000000e+00, float %604, !dbg !51 + %606 = fmul float %605, %601, !dbg !52 + %607 = fadd float %595, %606, !dbg !53 + %608 = fadd float %365, %600, !dbg !54 + %609 = fmul float %601, %601, !dbg !55 + %610 = fmul float %590, %609, !dbg !56 + %611 = fmul float %605, %610, !dbg !57 + %612 = fadd float %608, %611, !dbg !58 + %613 = bitcast float %523 to i32, !dbg !59 + %614 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %613, i32 4, i32 31), !dbg !59 + %615 = bitcast i32 %614 to float, !dbg !59 + %616 = bitcast float %528 to i32, !dbg !59 + %617 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %616, i32 4, i32 31), !dbg !59 + %618 = bitcast i32 %617 to float, !dbg !59 + %619 = bitcast float %518 to i32, !dbg !59 + %620 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %619, i32 4, i32 31), !dbg !59 + %621 = bitcast i32 %620 to float, !dbg !59 + %622 = fsub float %615, %523, !dbg !44 + %623 = fadd float %518, %621, !dbg !48 + %624 = fcmp oeq float %623, 0.000000e+00, !dbg !49 + %625 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %621, float %623) #6, !dbg !50 + %626 = select i1 %624, float 0.000000e+00, float %625, !dbg !51 + %627 = fmul float %626, %622, !dbg !52 + %628 = fadd float %523, %627, !dbg !53 + %629 = fadd float %528, %618, !dbg !54 + %630 = fmul float %622, %622, !dbg !55 + %631 = fmul float %518, %630, !dbg !56 + %632 = fmul float %626, %631, !dbg !57 + %633 = fadd float %629, %632, !dbg !58 + %634 = bitcast float %628 to i32, !dbg !59 + %635 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %634, i32 2, i32 31), !dbg !59 + %636 = bitcast i32 %635 to float, !dbg !59 + %637 = bitcast float %633 to i32, !dbg !59 + %638 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %637, i32 2, i32 31), !dbg !59 + %639 = bitcast i32 %638 to float, !dbg !59 + %640 = bitcast float %623 to i32, !dbg !59 + %641 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %640, i32 2, i32 31), !dbg !59 + %642 = bitcast i32 %641 to float, !dbg !59 + %643 = fsub float %636, %628, !dbg !44 + %644 = fadd float %623, %642, !dbg !48 + %645 = fcmp oeq float %644, 0.000000e+00, !dbg !49 + %646 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %642, float %644) #6, !dbg !50 + %647 = select i1 %645, float 0.000000e+00, float %646, !dbg !51 + %648 = fmul float %647, %643, !dbg !52 + %649 = fadd float %628, %648, !dbg !53 + %650 = fadd float %633, %639, !dbg !54 + %651 = fmul float %643, %643, !dbg !55 + %652 = fmul float %623, %651, !dbg !56 + %653 = fmul float %647, %652, !dbg !57 + %654 = fadd float %650, %653, !dbg !58 + %655 = bitcast float %649 to i32, !dbg !59 + %656 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %655, i32 1, i32 31), !dbg !59 + %657 = bitcast i32 %656 to float, !dbg !59 + %658 = bitcast float %654 to i32, !dbg !59 + %659 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %658, i32 1, i32 31), !dbg !59 + %660 = bitcast i32 %659 to float, !dbg !59 + %661 = bitcast float %644 to i32, !dbg !59 + %662 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %661, i32 1, i32 31), !dbg !59 + 
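+ ; Cross-lane reduction: each shfl.sync.bfly round (offsets 4, 2, 1) swaps
+ ; the running (mean, m2, weight) triple with a partner lane -- three
+ ; shuffles per round, one per component -- and re-applies the same Welford
+ ; combine, collapsing the 8 participating lanes in log2(8) = 3 steps.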
%663 = bitcast i32 %662 to float, !dbg !59 + %664 = fsub float %657, %649, !dbg !44 + %665 = fadd float %644, %663, !dbg !48 + %666 = fcmp oeq float %665, 0.000000e+00, !dbg !49 + %667 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %663, float %665) #6, !dbg !50 + %668 = select i1 %666, float 0.000000e+00, float %667, !dbg !51 + %669 = fmul float %664, %668, !dbg !52 + %670 = fadd float %649, %669, !dbg !53 + %671 = fadd float %654, %660, !dbg !54 + %672 = fmul float %664, %664, !dbg !55 + %673 = fmul float %644, %672, !dbg !56 + %674 = fmul float %668, %673, !dbg !57 + %675 = fadd float %671, %674, !dbg !58 + %676 = bitcast float %607 to i32, !dbg !59 + %677 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %676, i32 4, i32 31), !dbg !59 + %678 = bitcast i32 %677 to float, !dbg !59 + %679 = bitcast float %612 to i32, !dbg !59 + %680 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %679, i32 4, i32 31), !dbg !59 + %681 = bitcast i32 %680 to float, !dbg !59 + %682 = bitcast float %602 to i32, !dbg !59 + %683 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %682, i32 4, i32 31), !dbg !59 + %684 = bitcast i32 %683 to float, !dbg !59 + %685 = fsub float %678, %607, !dbg !44 + %686 = fadd float %602, %684, !dbg !48 + %687 = fcmp oeq float %686, 0.000000e+00, !dbg !49 + %688 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %684, float %686) #6, !dbg !50 + %689 = select i1 %687, float 0.000000e+00, float %688, !dbg !51 + %690 = fmul float %685, %689, !dbg !52 + %691 = fadd float %607, %690, !dbg !53 + %692 = fadd float %612, %681, !dbg !54 + %693 = fmul float %685, %685, !dbg !55 + %694 = fmul float %602, %693, !dbg !56 + %695 = fmul float %694, %689, !dbg !57 + %696 = fadd float %692, %695, !dbg !58 + %697 = bitcast float %691 to i32, !dbg !59 + %698 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %697, i32 2, i32 31), !dbg !59 + %699 = bitcast i32 %698 to float, !dbg !59 + %700 = bitcast float %696 to i32, !dbg !59 + %701 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %700, i32 2, i32 31), !dbg !59 + %702 = bitcast i32 %701 to float, !dbg !59 + %703 = bitcast float %686 to i32, !dbg !59 + %704 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %703, i32 2, i32 31), !dbg !59 + %705 = bitcast i32 %704 to float, !dbg !59 + %706 = fsub float %699, %691, !dbg !44 + %707 = fadd float %686, %705, !dbg !48 + %708 = fcmp oeq float %707, 0.000000e+00, !dbg !49 + %709 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %705, float %707) #6, !dbg !50 + %710 = select i1 %708, float 0.000000e+00, float %709, !dbg !51 + %711 = fmul float %706, %710, !dbg !52 + %712 = fadd float %691, %711, !dbg !53 + %713 = fadd float %696, %702, !dbg !54 + %714 = fmul float %706, %706, !dbg !55 + %715 = fmul float %686, %714, !dbg !56 + %716 = fmul float %710, %715, !dbg !57 + %717 = fadd float %713, %716, !dbg !58 + %718 = bitcast float %712 to i32, !dbg !59 + %719 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %718, i32 1, i32 31), !dbg !59 + %720 = bitcast i32 %719 to float, !dbg !59 + %721 = bitcast float %717 to i32, !dbg !59 + %722 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %721, i32 1, i32 31), !dbg !59 + %723 = bitcast i32 %722 to float, !dbg !59 + %724 = bitcast float %707 to i32, !dbg !59 + %725 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %724, i32 1, i32 31), !dbg !59 + %726 = bitcast i32 %725 to float, !dbg !59 + %727 = fsub float %720, %712, !dbg !44 + %728 = fadd float 
%707, %726, !dbg !48 + %729 = fcmp oeq float %728, 0.000000e+00, !dbg !49 + %730 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %726, float %728) #6, !dbg !50 + %731 = select i1 %729, float 0.000000e+00, float %730, !dbg !51 + %732 = fmul float %727, %731, !dbg !52 + %733 = fadd float %712, %732, !dbg !53 + %734 = fadd float %717, %723, !dbg !54 + %735 = fmul float %727, %727, !dbg !55 + %736 = fmul float %707, %735, !dbg !56 + %737 = fmul float %731, %736, !dbg !57 + %738 = fadd float %734, %737, !dbg !58 + %739 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %740 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %741 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %742 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %743 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %744 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %745 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %746 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %747 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %748 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %749 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %750 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %751 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %752 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %753 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %754 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %755 = fadd float %739, 0x3EE4F8B580000000, !dbg !62 + %756 = fadd float %747, 0x3EE4F8B580000000, !dbg !62 + %757 = shl i32 %18, 8, !dbg !63 + %758 = shl i32 %19, 8, !dbg !63 + br label %759, !dbg !64 + +759: ; preds = %368, %__nv_rsqrtf.exit25 + %760 = phi i32 [ 0, %368 ], [ %1009, %__nv_rsqrtf.exit25 ] + %761 = or i32 %760, %13, !dbg !65 + %762 = or i32 %760, %14, !dbg !65 + %763 = add i32 %761, %46, !dbg !66 + %764 = add i32 %762, %46, !dbg !66 + %765 = add i32 %761, %47, !dbg !66 + %766 = add i32 %762, %47, !dbg !66 + %767 = sext i32 %763 to i64, !dbg !67 + %768 = getelementptr float, ptr addrspace(1) %2, i64 %767, !dbg !67 + %769 = sext i32 %764 to i64, !dbg !67 + %770 = getelementptr float, ptr addrspace(1) %2, i64 %769, !dbg !67 + %771 = sext i32 %765 to i64, !dbg !67 + %772 = getelementptr float, ptr addrspace(1) %2, i64 %771, !dbg !67 + %773 = sext i32 %766 to i64, !dbg !67 + %774 = getelementptr float, ptr addrspace(1) %2, i64 %773, !dbg !67 + %775 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 
mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %768, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 + %776 = extractvalue { i32, i32, i32, i32 } %775, 0, !dbg !68 + %777 = extractvalue { i32, i32, i32, i32 } %775, 1, !dbg !68 + %778 = extractvalue { i32, i32, i32, i32 } %775, 2, !dbg !68 + %779 = extractvalue { i32, i32, i32, i32 } %775, 3, !dbg !68 + %780 = bitcast i32 %776 to float, !dbg !68 + %781 = bitcast i32 %777 to float, !dbg !68 + %782 = bitcast i32 %778 to float, !dbg !68 + %783 = bitcast i32 %779 to float, !dbg !68 + %784 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %770, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 + %785 = extractvalue { i32, i32, i32, i32 } %784, 0, !dbg !68 + %786 = extractvalue { i32, i32, i32, i32 } %784, 1, !dbg !68 + %787 = extractvalue { i32, i32, i32, i32 } %784, 2, !dbg !68 + %788 = extractvalue { i32, i32, i32, i32 } %784, 3, !dbg !68 + %789 = bitcast i32 %785 to float, !dbg !68 + %790 = bitcast i32 %786 to float, !dbg !68 + %791 = bitcast i32 %787 to float, !dbg !68 + %792 = bitcast i32 %788 to float, !dbg !68 + %793 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %772, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 + %794 = extractvalue { i32, i32, i32, i32 } %793, 0, !dbg !68 + %795 = extractvalue { i32, i32, i32, i32 } %793, 1, !dbg !68 + %796 = extractvalue { i32, i32, i32, i32 } %793, 2, !dbg !68 + %797 = extractvalue { i32, i32, i32, i32 } %793, 3, !dbg !68 + %798 = bitcast i32 %794 to float, !dbg !68 + %799 = bitcast i32 %795 to float, !dbg !68 + %800 = bitcast i32 %796 to float, !dbg !68 + %801 = bitcast i32 %797 to float, !dbg !68 + %802 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %774, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 + %803 = extractvalue { i32, i32, i32, i32 } %802, 0, !dbg !68 + %804 = extractvalue { i32, i32, i32, i32 } %802, 1, !dbg !68 + %805 = extractvalue { i32, i32, i32, i32 } %802, 2, !dbg !68 + %806 = extractvalue { i32, i32, i32, i32 } %802, 3, !dbg !68 + %807 = bitcast i32 %803 to float, !dbg !68 + %808 = bitcast i32 %804 to float, !dbg !68 + %809 = bitcast i32 %805 to float, !dbg !68 + %810 = bitcast i32 %806 to float, !dbg !68 + %811 = zext nneg i32 %761 to i64, !dbg !69 + %812 = getelementptr float, ptr addrspace(1) %3, i64 %811, !dbg !69 + %813 = zext nneg i32 %762 to i64, !dbg !69 + %814 = getelementptr float, ptr addrspace(1) %3, i64 %813, !dbg !69 + 
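+ ; The ld.global.L1::evict_last.v4.b32 inline-asm blocks are Triton's masked
+ ; 16-byte vector loads: the leading movs zero-seed the destination
+ ; registers and the @!$N guarded movs substitute the masked-off `other`
+ ; value; with every guard an i1 true here, each just loads directly.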
%815 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %812, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70 + %816 = extractvalue { i32, i32, i32, i32 } %815, 0, !dbg !70 + %817 = extractvalue { i32, i32, i32, i32 } %815, 1, !dbg !70 + %818 = extractvalue { i32, i32, i32, i32 } %815, 2, !dbg !70 + %819 = extractvalue { i32, i32, i32, i32 } %815, 3, !dbg !70 + %820 = bitcast i32 %816 to float, !dbg !70 + %821 = bitcast i32 %817 to float, !dbg !70 + %822 = bitcast i32 %818 to float, !dbg !70 + %823 = bitcast i32 %819 to float, !dbg !70 + %824 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %814, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70 + %825 = extractvalue { i32, i32, i32, i32 } %824, 0, !dbg !70 + %826 = extractvalue { i32, i32, i32, i32 } %824, 1, !dbg !70 + %827 = extractvalue { i32, i32, i32, i32 } %824, 2, !dbg !70 + %828 = extractvalue { i32, i32, i32, i32 } %824, 3, !dbg !70 + %829 = bitcast i32 %825 to float, !dbg !70 + %830 = bitcast i32 %826 to float, !dbg !70 + %831 = bitcast i32 %827 to float, !dbg !70 + %832 = bitcast i32 %828 to float, !dbg !70 + br i1 %53, label %833, label %834, !dbg !71 + +833: ; preds = %759 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71 + br label %834, !dbg !71 + +834: ; preds = %833, %759 + %835 = getelementptr float, ptr addrspace(1) %60, i64 %811, !dbg !72 + %836 = getelementptr float, ptr addrspace(1) %60, i64 %813, !dbg !72 + %837 = getelementptr float, ptr addrspace(1) %61, i64 %811, !dbg !72 + %838 = getelementptr float, ptr addrspace(1) %61, i64 %813, !dbg !72 + %839 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %835, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73 + %840 = extractvalue { i32, i32, i32, i32 } %839, 0, !dbg !73 + %841 = extractvalue { i32, i32, i32, i32 } %839, 1, !dbg !73 + %842 = extractvalue { i32, i32, i32, i32 } %839, 2, !dbg !73 + %843 = extractvalue { i32, i32, i32, i32 } %839, 3, !dbg !73 + %844 = bitcast i32 %840 to float, !dbg !73 + %845 = bitcast i32 %841 to float, !dbg !73 + %846 = bitcast i32 %842 to float, !dbg !73 + %847 = bitcast i32 %843 to float, !dbg !73 + %848 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 
mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %836, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73 + %849 = extractvalue { i32, i32, i32, i32 } %848, 0, !dbg !73 + %850 = extractvalue { i32, i32, i32, i32 } %848, 1, !dbg !73 + %851 = extractvalue { i32, i32, i32, i32 } %848, 2, !dbg !73 + %852 = extractvalue { i32, i32, i32, i32 } %848, 3, !dbg !73 + %853 = bitcast i32 %849 to float, !dbg !73 + %854 = bitcast i32 %850 to float, !dbg !73 + %855 = bitcast i32 %851 to float, !dbg !73 + %856 = bitcast i32 %852 to float, !dbg !73 + %857 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %837, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73 + %858 = extractvalue { i32, i32, i32, i32 } %857, 0, !dbg !73 + %859 = extractvalue { i32, i32, i32, i32 } %857, 1, !dbg !73 + %860 = extractvalue { i32, i32, i32, i32 } %857, 2, !dbg !73 + %861 = extractvalue { i32, i32, i32, i32 } %857, 3, !dbg !73 + %862 = bitcast i32 %858 to float, !dbg !73 + %863 = bitcast i32 %859 to float, !dbg !73 + %864 = bitcast i32 %860 to float, !dbg !73 + %865 = bitcast i32 %861 to float, !dbg !73 + %866 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %838, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73 + %867 = extractvalue { i32, i32, i32, i32 } %866, 0, !dbg !73 + %868 = extractvalue { i32, i32, i32, i32 } %866, 1, !dbg !73 + %869 = extractvalue { i32, i32, i32, i32 } %866, 2, !dbg !73 + %870 = extractvalue { i32, i32, i32, i32 } %866, 3, !dbg !73 + %871 = bitcast i32 %867 to float, !dbg !73 + %872 = bitcast i32 %868 to float, !dbg !73 + %873 = bitcast i32 %869 to float, !dbg !73 + %874 = bitcast i32 %870 to float, !dbg !73 + %875 = fadd float %780, %844, !dbg !74 + %876 = fadd float %781, %845, !dbg !74 + %877 = fadd float %782, %846, !dbg !74 + %878 = fadd float %783, %847, !dbg !74 + %879 = fadd float %789, %853, !dbg !74 + %880 = fadd float %790, %854, !dbg !74 + %881 = fadd float %791, %855, !dbg !74 + %882 = fadd float %792, %856, !dbg !74 + %883 = fadd float %798, %862, !dbg !74 + %884 = fadd float %799, %863, !dbg !74 + %885 = fadd float %800, %864, !dbg !74 + %886 = fadd float %801, %865, !dbg !74 + %887 = fadd float %807, %871, !dbg !74 + %888 = fadd float %808, %872, !dbg !74 + %889 = fadd float %809, %873, !dbg !74 + %890 = fadd float %810, %874, !dbg !74 + %891 = fsub float %875, %670, !dbg !75 + %892 = fsub float %876, %670, !dbg !75 + %893 = fsub float %877, %670, !dbg !75 + %894 = fsub float %878, %670, !dbg !75 + %895 = fsub float %879, %670, !dbg !75 + %896 = fsub float %880, %670, !dbg !75 + %897 = fsub float %881, %670, !dbg !75 + %898 = fsub float %882, %670, !dbg !75 + %899 = fsub float %883, %733, !dbg !75 + %900 = fsub float %884, %733, !dbg !75 + %901 = fsub float %885, %733, !dbg !75 + %902 = fsub float %886, %733, !dbg !75 + %903 = 
fsub float %887, %733, !dbg !75 + %904 = fsub float %888, %733, !dbg !75 + %905 = fsub float %889, %733, !dbg !75 + %906 = fsub float %890, %733, !dbg !75 + %907 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %.not.i = icmp eq i32 %907, 0, !dbg !76 + br i1 %.not.i, label %910, label %908, !dbg !76 + +908: ; preds = %834 + %909 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %755), !dbg !76 + br label %__nv_rsqrtf.exit, !dbg !76 + +910: ; preds = %834 + %911 = tail call float @llvm.nvvm.rsqrt.approx.f(float %755), !dbg !76 + br label %__nv_rsqrtf.exit, !dbg !76 + +__nv_rsqrtf.exit: ; preds = %908, %910 + %.0.i = phi float [ %909, %908 ], [ %911, %910 ], !dbg !76 + %912 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %913 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %914 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %915 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %916 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %917 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %918 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %919 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %.not.i23 = icmp eq i32 %919, 0, !dbg !76 + br i1 %.not.i23, label %922, label %920, !dbg !76 + +920: ; preds = %__nv_rsqrtf.exit + %921 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %756), !dbg !76 + br label %__nv_rsqrtf.exit25, !dbg !76 + +922: ; preds = %__nv_rsqrtf.exit + %923 = tail call float @llvm.nvvm.rsqrt.approx.f(float %756), !dbg !76 + br label %__nv_rsqrtf.exit25, !dbg !76 + +__nv_rsqrtf.exit25: ; preds = %920, %922 + %.0.i24 = phi float [ %921, %920 ], [ %923, %922 ], !dbg !76 + %924 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %925 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %926 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %927 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %928 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %929 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %930 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %931 = fmul float %891, %.0.i, !dbg !77 + %932 = fmul float %892, %.0.i, !dbg !77 + %933 = fmul float %893, %.0.i, !dbg !77 + %934 = fmul float %894, %.0.i, !dbg !77 + %935 = fmul float %895, %.0.i, !dbg !77 + %936 = fmul float %896, %.0.i, !dbg !77 + %937 = fmul float %897, %.0.i, !dbg !77 + %938 = fmul float %898, %.0.i, !dbg !77 + %939 = fmul float %899, %.0.i24, !dbg !77 + %940 = fmul float %900, %.0.i24, !dbg !77 + %941 = fmul float %901, %.0.i24, !dbg !77 + %942 = fmul float %902, %.0.i24, !dbg !77 + %943 = fmul float %903, %.0.i24, !dbg !77 + %944 = fmul float %904, %.0.i24, !dbg !77 + %945 = fmul float %905, %.0.i24, !dbg !77 + %946 = fmul float %906, %.0.i24, !dbg !77 + %947 = fmul float %931, %820, !dbg !78 + %948 = fmul float %932, %821, !dbg !78 + %949 = fmul float %933, %822, !dbg !78 + %950 = fmul float %934, %823, !dbg !78 + %951 = fmul float %935, %829, !dbg !78 + %952 = fmul float %936, %830, !dbg !78 + %953 = fmul float %937, %831, !dbg !78 + %954 = fmul float %938, %832, !dbg !78 + %955 = fmul float %939, %820, !dbg !78 + %956 = fmul float %940, %821, !dbg !78 + %957 = fmul float %941, %822, !dbg !78 + %958 = fmul float %942, %823, !dbg !78 + %959 = fmul float %943, %829, !dbg !78 + %960 = fmul float %944, %830, !dbg !78 + %961 = fmul float %945, 
%831, !dbg !78 + %962 = fmul float %946, %832, !dbg !78 + %963 = add i32 %761, %757, !dbg !79 + %964 = add i32 %761, %758, !dbg !79 + %965 = sext i32 %963 to i64, !dbg !80 + %966 = getelementptr i16, ptr addrspace(1) %4, i64 %965, !dbg !80 + %967 = sext i32 %964 to i64, !dbg !80 + %968 = getelementptr i16, ptr addrspace(1) %4, i64 %967, !dbg !80 + %969 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %947) #6, !dbg !81 + %970 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %948) #6, !dbg !81 + %971 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %949) #6, !dbg !81 + %972 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %950) #6, !dbg !81 + %973 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %951) #6, !dbg !81 + %974 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %952) #6, !dbg !81 + %975 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %953) #6, !dbg !81 + %976 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %954) #6, !dbg !81 + %977 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %955) #6, !dbg !81 + %978 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %956) #6, !dbg !81 + %979 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %957) #6, !dbg !81 + %980 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %958) #6, !dbg !81 + %981 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %959) #6, !dbg !81 + %982 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %960) #6, !dbg !81 + %983 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %961) #6, !dbg !81 + %984 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %962) #6, !dbg !81 + %985 = insertelement <2 x i16> undef, i16 %969, i64 0, !dbg !81 + %986 = insertelement <2 x i16> %985, i16 %970, i64 1, !dbg !81 + %987 = bitcast <2 x i16> %986 to i32, !dbg !81 + %988 = insertelement <2 x i16> undef, i16 %971, i64 0, !dbg !81 + %989 = insertelement <2 x i16> %988, i16 %972, i64 1, !dbg !81 + %990 = bitcast <2 x i16> %989 to i32, !dbg !81 + %991 = insertelement <2 x i16> undef, i16 %973, i64 0, !dbg !81 + %992 = insertelement <2 x i16> %991, i16 %974, i64 1, !dbg !81 + %993 = bitcast <2 x i16> %992 to i32, !dbg !81 + %994 = insertelement <2 x i16> undef, i16 %975, i64 0, !dbg !81 + %995 = insertelement <2 x i16> %994, i16 %976, i64 1, !dbg !81 + %996 = bitcast <2 x i16> %995 to i32, !dbg !81 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %987, i32 %990, i32 %993, i32 %996, ptr addrspace(1) %966, i1 true) #6, !dbg !81 + %997 = insertelement <2 x i16> undef, i16 %977, i64 0, !dbg !81 + %998 = insertelement <2 x i16> %997, i16 %978, i64 1, !dbg !81 + %999 = bitcast <2 x i16> %998 to i32, !dbg !81 + %1000 = insertelement <2 x i16> undef, i16 %979, i64 0, !dbg !81 + %1001 = insertelement <2 x i16> %1000, i16 %980, i64 1, !dbg !81 + %1002 = bitcast <2 x i16> %1001 to i32, !dbg !81 + %1003 = insertelement <2 x i16> undef, i16 %981, i64 0, !dbg !81 + %1004 = insertelement <2 x i16> %1003, i16 %982, i64 1, !dbg !81 + %1005 = bitcast <2 x i16> %1004 to i32, !dbg !81 + %1006 = insertelement <2 x i16> undef, i16 %983, i64 0, !dbg !81 + %1007 = insertelement <2 x i16> %1006, i16 %984, i64 1, !dbg !81 + %1008 = bitcast <2 x i16> %1007 to i32, !dbg !81 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %999, i32 %1002, i32 %1005, i32 %1008, ptr addrspace(1) %968, i1 true) #6, !dbg 
!81 + %1009 = add nuw nsw i32 %760, 64, !dbg !64 + %1010 = icmp ult i32 %760, 192, !dbg !64 + br i1 %1010, label %759, label %1011, !dbg !64 + +1011: ; preds = %__nv_rsqrtf.exit25 + ret void, !dbg !82 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx") +!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 22, column: 44, scope: !7) +!11 = !DILocation(line: 24, column: 33, scope: !7) +!12 = !DILocation(line: 31, column: 36, scope: !7) +!13 = !DILocation(line: 21, column: 28, scope: !7) +!14 = !DILocation(line: 21, column: 33, scope: !7) +!15 = !DILocation(line: 22, column: 23, 
scope: !7) +!16 = !DILocation(line: 26, column: 30, scope: !7) +!17 = !DILocation(line: 26, column: 35, scope: !7) +!18 = !DILocation(line: 27, column: 18, scope: !7) +!19 = !DILocation(line: 35, column: 44, scope: !7) +!20 = !DILocation(line: 36, column: 22, scope: !7) +!21 = !DILocation(line: 37, column: 22, scope: !7) +!22 = !DILocation(line: 38, column: 36, scope: !7) +!23 = !DILocation(line: 39, column: 40, scope: !7) +!24 = !DILocation(line: 40, column: 44, scope: !7) +!25 = !DILocation(line: 32, column: 27, scope: !7) +!26 = !DILocation(line: 35, column: 40, scope: !7) +!27 = !DILocation(line: 35, column: 34, scope: !7) +!28 = !DILocation(line: 35, column: 50, scope: !7) +!29 = !DILocation(line: 39, column: 55, scope: !7) +!30 = !DILocation(line: 40, column: 40, scope: !7) +!31 = !DILocation(line: 40, column: 34, scope: !7) +!32 = !DILocation(line: 40, column: 52, scope: !7) +!33 = !DILocation(line: 41, column: 22, scope: !7) +!34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37) +!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0) +!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!37 = !DILocation(line: 44, column: 38, scope: !35) +!38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37) +!39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37) +!40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37) +!41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37) +!42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37) +!43 = !DILocation(line: 47, column: 48, scope: !7) +!44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46) +!45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0) +!46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47) +!47 = !DILocation(line: 50, column: 41, scope: !45) +!48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46) +!49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46) +!50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46) +!51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46) +!52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46) +!53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46) +!54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46) +!55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46) +!56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46) +!57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46) +!58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46) +!59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60) +!60 = !DILocation(line: 50, column: 41, scope: !35) +!61 = !DILocation(line: 69, column: 23, scope: !7) +!62 = !DILocation(line: 71, column: 24, scope: !7) +!63 = !DILocation(line: 76, column: 39, scope: !7) +!64 = !DILocation(line: 55, column: 36, scope: !7) +!65 = !DILocation(line: 56, column: 27, scope: !7) +!66 = !DILocation(line: 59, column: 41, scope: !7) +!67 = !DILocation(line: 59, column: 35, scope: !7) +!68 = !DILocation(line: 59, column: 51, scope: !7) +!69 = !DILocation(line: 60, column: 35, scope: !7) +!70 = !DILocation(line: 60, column: 40, scope: !7) +!71 = !DILocation(line: 64, column: 57, scope: !7) +!72 = !DILocation(line: 65, column: 35, scope: !7) +!73 = !DILocation(line: 65, column: 54, scope: !7) +!74 = !DILocation(line: 66, column: 24, scope: !7) 
+!75 = !DILocation(line: 67, column: 24, scope: !7) +!76 = !DILocation(line: 72, column: 30, scope: !7) +!77 = !DILocation(line: 73, column: 24, scope: !7) +!78 = !DILocation(line: 74, column: 24, scope: !7) +!79 = !DILocation(line: 76, column: 35, scope: !7) +!80 = !DILocation(line: 76, column: 29, scope: !7) +!81 = !DILocation(line: 76, column: 52, scope: !7) +!82 = !DILocation(line: 55, column: 4, scope: !7) diff --git a/.triton/dump/510522bb05917b836ed253751364fcad/triton_.ptx b/.triton/dump/510522bb05917b836ed253751364fcad/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..20a9f5d54612d3dc3aa30027e1fe420499ffcb4e --- /dev/null +++ b/.triton/dump/510522bb05917b836ed253751364fcad/triton_.ptx @@ -0,0 +1,1810 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5de6de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5de6de( + .param .u64 triton__0d1d2d3d4d5de6de_param_0, + .param .u64 triton__0d1d2d3d4d5de6de_param_1, + .param .u64 triton__0d1d2d3d4d5de6de_param_2, + .param .u64 triton__0d1d2d3d4d5de6de_param_3, + .param .u64 triton__0d1d2d3d4d5de6de_param_4, + .param .u32 triton__0d1d2d3d4d5de6de_param_5, + .param .u32 triton__0d1d2d3d4d5de6de_param_6 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<137>; + .reg .b16 %rs<17>; + .reg .b32 %r<408>; + .reg .f32 %f<614>; + .reg .b64 %rd<107>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd13, [triton__0d1d2d3d4d5de6de_param_4]; + ld.param.u64 %rd12, [triton__0d1d2d3d4d5de6de_param_3]; + ld.param.u64 %rd49, [triton__0d1d2d3d4d5de6de_param_0]; + ld.param.u64 %rd50, [triton__0d1d2d3d4d5de6de_param_1]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r13, %tid.x; + ld.param.u64 %rd51, [triton__0d1d2d3d4d5de6de_param_2]; + bfe.u32 %r1, %r13, 3, 5; + and.b32 %r2, %r13, 63; + .loc 1 24 33 + shl.b32 %r14, %r13, 3; + and.b32 %r3, %r14, 56; + .loc 1 31 36 + shr.u32 %r4, %r13, 6; + .loc 1 21 28 + mov.u32 %r11, %ctaid.x; + .loc 1 21 33 + shl.b32 
%r15, %r11, 6; + .loc 1 22 23 + or.b32 %r16, %r15, %r1; + or.b32 %r17, %r16, 32; + or.b32 %r18, %r15, %r2; + .loc 1 26 30 + mul.wide.s32 %rd52, %r16, 8; + add.s64 %rd15, %rd49, %rd52; + add.s64 %rd31, %rd15, 256; + mul.wide.s32 %rd53, %r18, 8; + add.s64 %rd47, %rd49, %rd53; + mov.pred %p1, -1; + .loc 1 26 35 + mov.u64 %rd14, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd15 + 0 ]; + mov.u64 %rd16, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd15 + 0 ]; + mov.u64 %rd18, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd15 + 0 ]; + mov.u64 %rd20, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd15 + 0 ]; + mov.u64 %rd22, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd15 + 0 ]; + mov.u64 %rd24, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd15 + 0 ]; + mov.u64 %rd26, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd15 + 0 ]; + mov.u64 %rd28, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd28 }, [ %rd15 + 0 ]; + mov.u64 %rd30, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd30 }, [ %rd31 + 0 ]; + mov.u64 %rd32, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd32 }, [ %rd31 + 0 ]; + mov.u64 %rd34, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd34 }, [ %rd31 + 0 ]; + mov.u64 %rd36, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd36 }, [ %rd31 + 0 ]; + mov.u64 %rd38, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd38 }, [ %rd31 + 0 ]; + mov.u64 %rd40, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd40 }, [ %rd31 + 0 ]; + mov.u64 %rd42, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd42 }, [ %rd31 + 0 ]; + mov.u64 %rd44, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd44 }, [ %rd31 + 0 ]; + mov.u64 %rd46, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd46 }, [ %rd47 + 0 ]; + .loc 1 27 18 + bfe.s32 %r19, %r11, 25, 1; + shr.u32 %r20, %r19, 23; + add.s32 %r21, %r16, %r20; + and.b32 %r22, %r21, 16776704; + sub.s32 %r23, %r16, %r22; + add.s32 %r24, %r17, %r20; + and.b32 %r25, %r24, 16776704; + sub.s32 %r26, %r17, %r25; + .loc 1 35 44 + shl.b32 %r27, %r23, 8; + shl.b32 %r28, %r26, 8; + .loc 1 36 22 + add.s64 %rd54, %rd46, 50257; + .loc 1 37 22 + setp.lt.s64 %p18, %rd14, 0; + setp.lt.s64 %p19, %rd30, 0; + setp.lt.s64 %p20, %rd46, 0; + .loc 1 38 36 + selp.b64 %rd1, %rd54, %rd46, %p20; + .loc 1 40 44 + shl.b64 %rd55, %rd14, 8; + add.s64 %rd56, %rd55, 12865792; + selp.b64 %rd57, %rd56, %rd55, %p18; + shl.b64 %rd58, %rd30, 8; + add.s64 %rd59, %rd58, 12865792; + selp.b64 %rd60, %rd59, %rd58, %p19; + .loc 1 31 36 + and.b32 %r29, %r13, 7; + mul.wide.u32 %rd2, %r29, 32; + shl.b64 %rd61, %rd60, 2; + or.b64 %rd62, %rd2, %rd61; + add.s64 %rd3, %rd50, %rd62; + shl.b64 %rd63, %rd57, 2; + or.b64 %rd64, %rd2, %rd63; + add.s64 %rd4, %rd50, %rd64; + or.b32 %r30, %r28, %r3; + mul.wide.s32 %rd65, %r30, 4; + add.s64 %rd5, %rd51, %rd65; + or.b32 %r31, %r27, %r3; + mul.wide.s32 %rd66, %r31, 4; + add.s64 %rd6, %rd51, %rd66; + mov.f32 %f550, 0f00000000; + mov.u64 %rd105, 0; + mov.b32 %r406, -64; + mov.f32 %f551, %f550; + mov.f32 %f552, %f550; + mov.f32 %f553, %f550; + mov.f32 %f554, %f550; + mov.f32 %f555, %f550; + mov.f32 %f556, %f550; + mov.f32 %f557, %f550; + mov.f32 %f558, %f550; + mov.f32 %f559, %f550; + mov.f32 %f560, %f550; + mov.f32 %f561, %f550; + mov.f32 %f562, %f550; + mov.f32 %f563, %f550; + mov.f32 %f564, %f550; + mov.f32 %f565, %f550; + mov.f32 %f566, %f550; + mov.f32 %f567, %f550; + mov.f32 %f568, %f550; + mov.f32 %f569, %f550; + mov.f32 %f570, %f550; + mov.f32 %f571, %f550; + mov.f32 %f572, %f550; + mov.f32 %f573, %f550; + mov.f32 %f574, %f550; + mov.f32 %f575, %f550; 
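+ // %f550-%f613 appear to be the zero-initialized Welford accumulators kept
+ // in registers across the main loop: per-element weights (%f550-%f581),
+ // running m2 terms (%f582-%f597), and running means (%f598-%f613);
+ // %r406 = -64 and %rd105 = 0 drive the 4-trip loop over the 256 columns.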
+ mov.f32 %f576, %f550; + mov.f32 %f577, %f550; + mov.f32 %f578, %f550; + mov.f32 %f579, %f550; + mov.f32 %f580, %f550; + mov.f32 %f581, %f550; + mov.f32 %f582, %f550; + mov.f32 %f583, %f550; + mov.f32 %f584, %f550; + mov.f32 %f585, %f550; + mov.f32 %f586, %f550; + mov.f32 %f587, %f550; + mov.f32 %f588, %f550; + mov.f32 %f589, %f550; + mov.f32 %f590, %f550; + mov.f32 %f591, %f550; + mov.f32 %f592, %f550; + mov.f32 %f593, %f550; + mov.f32 %f594, %f550; + mov.f32 %f595, %f550; + mov.f32 %f596, %f550; + mov.f32 %f597, %f550; + mov.f32 %f598, %f550; + mov.f32 %f599, %f550; + mov.f32 %f600, %f550; + mov.f32 %f601, %f550; + mov.f32 %f602, %f550; + mov.f32 %f603, %f550; + mov.f32 %f604, %f550; + mov.f32 %f605, %f550; + mov.f32 %f606, %f550; + mov.f32 %f607, %f550; + mov.f32 %f608, %f550; + mov.f32 %f609, %f550; + mov.f32 %f610, %f550; + mov.f32 %f611, %f550; + mov.f32 %f612, %f550; + mov.f32 %f613, %f550; + bra.uni $L__BB0_1; +$L__BB0_3: + .loc 1 40 40 + add.s64 %rd78, %rd4, %rd105; + .loc 1 40 34 + add.s64 %rd79, %rd78, 16; + add.s64 %rd80, %rd3, %rd105; + .loc 1 40 52 + add.s64 %rd81, %rd80, 16; + mov.u32 %r65, 0x0; + mov.u32 %r66, 0x0; + mov.u32 %r67, 0x0; + mov.u32 %r68, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r65, %r66, %r67, %r68 }, [ %rd78 + 0 ]; + @!%p1 mov.u32 %r65, %r342; + @!%p1 mov.u32 %r66, %r342; + @!%p1 mov.u32 %r67, %r342; + @!%p1 mov.u32 %r68, %r342; + mov.b32 %f174, %r65; + mov.b32 %f175, %r66; + mov.b32 %f176, %r67; + mov.b32 %f177, %r68; + mov.u32 %r73, 0x0; + mov.u32 %r74, 0x0; + mov.u32 %r75, 0x0; + mov.u32 %r76, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r73, %r74, %r75, %r76 }, [ %rd79 + 0 ]; + @!%p1 mov.u32 %r73, %r342; + @!%p1 mov.u32 %r74, %r342; + @!%p1 mov.u32 %r75, %r342; + @!%p1 mov.u32 %r76, %r342; + mov.b32 %f178, %r73; + mov.b32 %f179, %r74; + mov.b32 %f180, %r75; + mov.b32 %f181, %r76; + mov.u32 %r81, 0x0; + mov.u32 %r82, 0x0; + mov.u32 %r83, 0x0; + mov.u32 %r84, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r81, %r82, %r83, %r84 }, [ %rd80 + 0 ]; + @!%p1 mov.u32 %r81, %r342; + @!%p1 mov.u32 %r82, %r342; + @!%p1 mov.u32 %r83, %r342; + @!%p1 mov.u32 %r84, %r342; + mov.b32 %f182, %r81; + mov.b32 %f183, %r82; + mov.b32 %f184, %r83; + mov.b32 %f185, %r84; + mov.u32 %r89, 0x0; + mov.u32 %r90, 0x0; + mov.u32 %r91, 0x0; + mov.u32 %r92, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r89, %r90, %r91, %r92 }, [ %rd81 + 0 ]; + @!%p1 mov.u32 %r89, %r342; + @!%p1 mov.u32 %r90, %r342; + @!%p1 mov.u32 %r91, %r342; + @!%p1 mov.u32 %r92, %r342; + mov.b32 %f186, %r89; + mov.b32 %f187, %r90; + mov.b32 %f188, %r91; + mov.b32 %f189, %r92; + .loc 1 41 22 + add.f32 %f190, %f65, %f174; + add.f32 %f191, %f66, %f175; + add.f32 %f192, %f67, %f176; + add.f32 %f193, %f68, %f177; + add.f32 %f194, %f69, %f178; + add.f32 %f195, %f70, %f179; + add.f32 %f196, %f71, %f180; + add.f32 %f197, %f72, %f181; + add.f32 %f198, %f73, %f182; + add.f32 %f199, %f74, %f183; + add.f32 %f200, %f75, %f184; + add.f32 %f201, %f76, %f185; + add.f32 %f202, %f77, %f186; + add.f32 %f203, %f78, %f187; + add.f32 %f204, %f79, %f188; + add.f32 %f205, %f80, %f189; +$L__tmp1: + .loc 2 96 20 + sub.f32 %f206, %f190, %f598; + sub.f32 %f207, %f191, %f599; + sub.f32 %f208, %f192, %f600; + sub.f32 %f209, %f193, %f601; + sub.f32 %f210, %f194, %f602; + sub.f32 %f211, %f195, %f603; + sub.f32 %f212, %f196, %f604; + sub.f32 %f213, %f197, %f605; + sub.f32 %f214, %f198, %f606; + sub.f32 %f215, %f199, %f607; + sub.f32 %f216, %f200, %f608; + sub.f32 %f217, %f201, %f609; + sub.f32 %f218, %f202, %f610; + sub.f32 %f219, %f203, 
%f611; + sub.f32 %f220, %f204, %f612; + sub.f32 %f221, %f205, %f613; + .loc 2 97 26 + add.f32 %f550, %f550, 0f3F800000; + add.f32 %f551, %f551, 0f3F800000; + add.f32 %f552, %f552, 0f3F800000; + add.f32 %f553, %f553, 0f3F800000; + add.f32 %f554, %f554, 0f3F800000; + add.f32 %f555, %f555, 0f3F800000; + add.f32 %f556, %f556, 0f3F800000; + add.f32 %f557, %f557, 0f3F800000; + add.f32 %f558, %f558, 0f3F800000; + add.f32 %f559, %f559, 0f3F800000; + add.f32 %f560, %f560, 0f3F800000; + add.f32 %f561, %f561, 0f3F800000; + add.f32 %f562, %f562, 0f3F800000; + add.f32 %f563, %f563, 0f3F800000; + add.f32 %f564, %f564, 0f3F800000; + add.f32 %f565, %f565, 0f3F800000; + add.f32 %f566, %f566, 0f3F800000; + add.f32 %f567, %f567, 0f3F800000; + add.f32 %f568, %f568, 0f3F800000; + add.f32 %f569, %f569, 0f3F800000; + add.f32 %f570, %f570, 0f3F800000; + add.f32 %f571, %f571, 0f3F800000; + add.f32 %f572, %f572, 0f3F800000; + add.f32 %f573, %f573, 0f3F800000; + add.f32 %f574, %f574, 0f3F800000; + add.f32 %f575, %f575, 0f3F800000; + add.f32 %f576, %f576, 0f3F800000; + add.f32 %f577, %f577, 0f3F800000; + add.f32 %f578, %f578, 0f3F800000; + add.f32 %f579, %f579, 0f3F800000; + add.f32 %f580, %f580, 0f3F800000; + add.f32 %f581, %f581, 0f3F800000; + .loc 2 98 30 + mov.b32 %r98, %f206; + mov.b32 %r99, %f550; + div.full.f32 %r97, %r98, %r99; + mov.b32 %f222, %r97; + mov.b32 %r101, %f207; + mov.b32 %r102, %f551; + div.full.f32 %r100, %r101, %r102; + mov.b32 %f223, %r100; + mov.b32 %r104, %f208; + mov.b32 %r105, %f552; + div.full.f32 %r103, %r104, %r105; + mov.b32 %f224, %r103; + mov.b32 %r107, %f209; + mov.b32 %r108, %f553; + div.full.f32 %r106, %r107, %r108; + mov.b32 %f225, %r106; + mov.b32 %r110, %f210; + mov.b32 %r111, %f554; + div.full.f32 %r109, %r110, %r111; + mov.b32 %f226, %r109; + mov.b32 %r113, %f211; + mov.b32 %r114, %f555; + div.full.f32 %r112, %r113, %r114; + mov.b32 %f227, %r112; + mov.b32 %r116, %f212; + mov.b32 %r117, %f556; + div.full.f32 %r115, %r116, %r117; + mov.b32 %f228, %r115; + mov.b32 %r119, %f213; + mov.b32 %r120, %f557; + div.full.f32 %r118, %r119, %r120; + mov.b32 %f229, %r118; + mov.b32 %r122, %f214; + mov.b32 %r123, %f558; + div.full.f32 %r121, %r122, %r123; + mov.b32 %f230, %r121; + mov.b32 %r125, %f215; + mov.b32 %r126, %f559; + div.full.f32 %r124, %r125, %r126; + mov.b32 %f231, %r124; + mov.b32 %r128, %f216; + mov.b32 %r129, %f560; + div.full.f32 %r127, %r128, %r129; + mov.b32 %f232, %r127; + mov.b32 %r131, %f217; + mov.b32 %r132, %f561; + div.full.f32 %r130, %r131, %r132; + mov.b32 %f233, %r130; + mov.b32 %r134, %f218; + mov.b32 %r135, %f562; + div.full.f32 %r133, %r134, %r135; + mov.b32 %f234, %r133; + mov.b32 %r137, %f219; + mov.b32 %r138, %f563; + div.full.f32 %r136, %r137, %r138; + mov.b32 %f235, %r136; + mov.b32 %r140, %f220; + mov.b32 %r141, %f564; + div.full.f32 %r139, %r140, %r141; + mov.b32 %f236, %r139; + mov.b32 %r143, %f221; + mov.b32 %r144, %f565; + div.full.f32 %r142, %r143, %r144; + mov.b32 %f237, %r142; + .loc 2 98 22 + add.f32 %f598, %f598, %f222; + add.f32 %f599, %f599, %f223; + add.f32 %f600, %f600, %f224; + add.f32 %f601, %f601, %f225; + add.f32 %f602, %f602, %f226; + add.f32 %f603, %f603, %f227; + add.f32 %f604, %f604, %f228; + add.f32 %f605, %f605, %f229; + add.f32 %f606, %f606, %f230; + add.f32 %f607, %f607, %f231; + add.f32 %f608, %f608, %f232; + add.f32 %f609, %f609, %f233; + add.f32 %f610, %f610, %f234; + add.f32 %f611, %f611, %f235; + add.f32 %f612, %f612, %f236; + add.f32 %f613, %f613, %f237; + .loc 2 101 30 + sub.f32 %f238, %f190, %f598; + sub.f32 %f239, %f191, 
%f599; + sub.f32 %f240, %f192, %f600; + sub.f32 %f241, %f193, %f601; + sub.f32 %f242, %f194, %f602; + sub.f32 %f243, %f195, %f603; + sub.f32 %f244, %f196, %f604; + sub.f32 %f245, %f197, %f605; + sub.f32 %f246, %f198, %f606; + sub.f32 %f247, %f199, %f607; + sub.f32 %f248, %f200, %f608; + sub.f32 %f249, %f201, %f609; + sub.f32 %f250, %f202, %f610; + sub.f32 %f251, %f203, %f611; + sub.f32 %f252, %f204, %f612; + sub.f32 %f253, %f205, %f613; +$L__tmp2: + .loc 1 47 48 + fma.rn.f32 %f582, %f206, %f238, %f582; + fma.rn.f32 %f583, %f207, %f239, %f583; + fma.rn.f32 %f584, %f208, %f240, %f584; + fma.rn.f32 %f585, %f209, %f241, %f585; + fma.rn.f32 %f586, %f210, %f242, %f586; + fma.rn.f32 %f587, %f211, %f243, %f587; + fma.rn.f32 %f588, %f212, %f244, %f588; + fma.rn.f32 %f589, %f213, %f245, %f589; + fma.rn.f32 %f590, %f214, %f246, %f590; + fma.rn.f32 %f591, %f215, %f247, %f591; + fma.rn.f32 %f592, %f216, %f248, %f592; + fma.rn.f32 %f593, %f217, %f249, %f593; + fma.rn.f32 %f594, %f218, %f250, %f594; + fma.rn.f32 %f595, %f219, %f251, %f595; + fma.rn.f32 %f596, %f220, %f252, %f596; + fma.rn.f32 %f597, %f221, %f253, %f597; + .loc 1 31 36 + add.s64 %rd105, %rd105, 256; + add.s32 %r406, %r406, 64; + setp.lt.u32 %p62, %r406, 192; + @%p62 bra $L__BB0_1; + bra.uni $L__BB0_4; +$L__BB0_1: + .loc 1 39 40 + setp.lt.u64 %p41, %rd1, 50257; + .loc 1 35 34 + add.s64 %rd67, %rd6, %rd105; + add.s64 %rd68, %rd67, 16; + add.s64 %rd69, %rd5, %rd105; + .loc 1 35 50 + add.s64 %rd70, %rd69, 16; + mov.b32 %r342, 0; + mov.u32 %r32, 0x0; + mov.u32 %r33, 0x0; + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r32, %r33, %r34, %r35 }, [ %rd67 + 0 ]; + @!%p1 mov.u32 %r32, %r342; + @!%p1 mov.u32 %r33, %r342; + @!%p1 mov.u32 %r34, %r342; + @!%p1 mov.u32 %r35, %r342; + mov.b32 %f65, %r32; + mov.b32 %f66, %r33; + mov.b32 %f67, %r34; + mov.b32 %f68, %r35; + mov.u32 %r40, 0x0; + mov.u32 %r41, 0x0; + mov.u32 %r42, 0x0; + mov.u32 %r43, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r40, %r41, %r42, %r43 }, [ %rd68 + 0 ]; + @!%p1 mov.u32 %r40, %r342; + @!%p1 mov.u32 %r41, %r342; + @!%p1 mov.u32 %r42, %r342; + @!%p1 mov.u32 %r43, %r342; + mov.b32 %f69, %r40; + mov.b32 %f70, %r41; + mov.b32 %f71, %r42; + mov.b32 %f72, %r43; + mov.u32 %r48, 0x0; + mov.u32 %r49, 0x0; + mov.u32 %r50, 0x0; + mov.u32 %r51, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r48, %r49, %r50, %r51 }, [ %rd69 + 0 ]; + @!%p1 mov.u32 %r48, %r342; + @!%p1 mov.u32 %r49, %r342; + @!%p1 mov.u32 %r50, %r342; + @!%p1 mov.u32 %r51, %r342; + mov.b32 %f73, %r48; + mov.b32 %f74, %r49; + mov.b32 %f75, %r50; + mov.b32 %f76, %r51; + mov.u32 %r56, 0x0; + mov.u32 %r57, 0x0; + mov.u32 %r58, 0x0; + mov.u32 %r59, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r56, %r57, %r58, %r59 }, [ %rd70 + 0 ]; + @!%p1 mov.u32 %r56, %r342; + @!%p1 mov.u32 %r57, %r342; + @!%p1 mov.u32 %r58, %r342; + @!%p1 mov.u32 %r59, %r342; + mov.b32 %f77, %r56; + mov.b32 %f78, %r57; + mov.b32 %f79, %r58; + mov.b32 %f80, %r59; + mov.b32 %r405, 883; + mov.u64 %rd104, 1; + .loc 1 39 55 + @%p41 bra $L__BB0_3; + mov.u64 %rd71, assertMessage_0; + cvta.global.u64 %rd72, %rd71; + mov.u64 %rd73, assertFile_0; + cvta.global.u64 %rd74, %rd73; + mov.u64 %rd75, assertFunc_0; + cvta.global.u64 %rd76, %rd75; + { // callseq 6, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd72; + .param .b64 param1; + st.param.b64 [param1+0], %rd74; + .param .b32 param2; + st.param.b32 [param2+0], %r405; + .param .b64 param3; + st.param.b64 [param3+0], %rd76; + .param .b64 param4; + 
st.param.b64 [param4+0], %rd104; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 6 + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 31 36 + and.b32 %r254, %r4, 3; + mad.lo.s32 %r255, %r254, 72, %r2; + shl.b32 %r256, %r255, 2; + mov.u32 %r257, global_smem; + add.s32 %r258, %r257, %r256; + st.shared.f32 [%r258], %f566; + st.shared.f32 [%r258+1152], %f567; + st.shared.f32 [%r258+2304], %f568; + st.shared.f32 [%r258+3456], %f569; + st.shared.f32 [%r258+4608], %f570; + st.shared.f32 [%r258+5760], %f571; + st.shared.f32 [%r258+6912], %f572; + st.shared.f32 [%r258+8064], %f573; + bar.sync 0; + mad.lo.s32 %r259, %r1, 72, %r3; + shl.b32 %r260, %r259, 2; + add.s32 %r261, %r257, %r260; + ld.shared.v4.f32 {%f254, %f255, %f256, %f257}, [%r261]; + ld.shared.v4.f32 {%f258, %f259, %f260, %f261}, [%r261+16]; + bar.sync 0; + st.shared.f32 [%r258], %f574; + st.shared.f32 [%r258+1152], %f575; + st.shared.f32 [%r258+2304], %f576; + st.shared.f32 [%r258+3456], %f577; + st.shared.f32 [%r258+4608], %f578; + st.shared.f32 [%r258+5760], %f579; + st.shared.f32 [%r258+6912], %f580; + st.shared.f32 [%r258+8064], %f581; + bar.sync 0; + ld.shared.v4.f32 {%f262, %f263, %f264, %f265}, [%r261]; + ld.shared.v4.f32 {%f266, %f267, %f268, %f269}, [%r261+16]; +$L__tmp3: + .loc 2 108 21 + sub.f32 %f270, %f599, %f598; + .loc 2 109 28 + add.f32 %f271, %f254, %f255; + .loc 2 110 39 + setp.eq.f32 %p63, %f271, 0f00000000; + .loc 2 110 60 + mov.b32 %r146, %f255; + mov.b32 %r147, %f271; + div.full.f32 %r145, %r146, %r147; + mov.b32 %f272, %r145; + .loc 2 110 49 + selp.f32 %f273, 0f00000000, %f272, %p63; + .loc 2 112 17 + fma.rn.f32 %f274, %f270, %f273, %f598; + .loc 2 113 15 + add.f32 %f275, %f582, %f583; + .loc 2 113 30 + mul.f32 %f276, %f270, %f270; + .loc 2 113 38 + mul.f32 %f277, %f276, %f254; + .loc 2 113 22 + fma.rn.f32 %f278, %f277, %f273, %f275; + .loc 2 108 21 + sub.f32 %f279, %f600, %f274; + .loc 2 109 28 + add.f32 %f280, %f256, %f271; + .loc 2 110 39 + setp.eq.f32 %p64, %f280, 0f00000000; + .loc 2 110 60 + mov.b32 %r150, %f280; + mov.b32 %r149, %f256; + div.full.f32 %r148, %r149, %r150; + mov.b32 %f281, %r148; + .loc 2 110 49 + selp.f32 %f282, 0f00000000, %f281, %p64; + .loc 2 112 17 + fma.rn.f32 %f283, %f282, %f279, %f274; + .loc 2 113 15 + add.f32 %f284, %f584, %f278; + .loc 2 113 30 + mul.f32 %f285, %f279, %f279; + .loc 2 113 38 + mul.f32 %f286, %f271, %f285; + .loc 2 113 22 + fma.rn.f32 %f287, %f282, %f286, %f284; + .loc 2 108 21 + sub.f32 %f288, %f601, %f283; + .loc 2 109 28 + add.f32 %f289, %f257, %f280; + .loc 2 110 39 + setp.eq.f32 %p65, %f289, 0f00000000; + .loc 2 110 60 + mov.b32 %r153, %f289; + mov.b32 %r152, %f257; + div.full.f32 %r151, %r152, %r153; + mov.b32 %f290, %r151; + .loc 2 110 49 + selp.f32 %f291, 0f00000000, %f290, %p65; + .loc 2 112 17 + fma.rn.f32 %f292, %f291, %f288, %f283; + .loc 2 113 15 + add.f32 %f293, %f585, %f287; + .loc 2 113 30 + mul.f32 %f294, %f288, %f288; + .loc 2 113 38 + mul.f32 %f295, %f280, %f294; + .loc 2 113 22 + fma.rn.f32 %f296, %f291, %f295, %f293; + .loc 2 108 21 + sub.f32 %f297, %f602, %f292; + .loc 2 109 28 + add.f32 %f298, %f258, %f289; + .loc 2 110 39 + setp.eq.f32 %p66, %f298, 0f00000000; + .loc 2 110 60 + mov.b32 %r156, %f298; + mov.b32 %r155, %f258; + div.full.f32 %r154, %r155, %r156; + mov.b32 %f299, %r154; + .loc 2 110 49 + selp.f32 %f300, 0f00000000, %f299, %p66; + .loc 2 112 17 + fma.rn.f32 %f301, %f300, %f297, %f292; + .loc 2 113 15 + add.f32 %f302, %f586, %f296; + .loc 2 113 30 + mul.f32 %f303, %f297, %f297; + .loc 2 113 
38 + mul.f32 %f304, %f289, %f303; + .loc 2 113 22 + fma.rn.f32 %f305, %f300, %f304, %f302; + .loc 2 108 21 + sub.f32 %f306, %f603, %f301; + .loc 2 109 28 + add.f32 %f307, %f259, %f298; + .loc 2 110 39 + setp.eq.f32 %p67, %f307, 0f00000000; + .loc 2 110 60 + mov.b32 %r159, %f307; + mov.b32 %r158, %f259; + div.full.f32 %r157, %r158, %r159; + mov.b32 %f308, %r157; + .loc 2 110 49 + selp.f32 %f309, 0f00000000, %f308, %p67; + .loc 2 112 17 + fma.rn.f32 %f310, %f309, %f306, %f301; + .loc 2 113 15 + add.f32 %f311, %f587, %f305; + .loc 2 113 30 + mul.f32 %f312, %f306, %f306; + .loc 2 113 38 + mul.f32 %f313, %f298, %f312; + .loc 2 113 22 + fma.rn.f32 %f314, %f309, %f313, %f311; + .loc 2 108 21 + sub.f32 %f315, %f604, %f310; + .loc 2 109 28 + add.f32 %f316, %f260, %f307; + .loc 2 110 39 + setp.eq.f32 %p68, %f316, 0f00000000; + .loc 2 110 60 + mov.b32 %r162, %f316; + mov.b32 %r161, %f260; + div.full.f32 %r160, %r161, %r162; + mov.b32 %f317, %r160; + .loc 2 110 49 + selp.f32 %f318, 0f00000000, %f317, %p68; + .loc 2 112 17 + fma.rn.f32 %f319, %f318, %f315, %f310; + .loc 2 113 15 + add.f32 %f320, %f588, %f314; + .loc 2 113 30 + mul.f32 %f321, %f315, %f315; + .loc 2 113 38 + mul.f32 %f322, %f307, %f321; + .loc 2 113 22 + fma.rn.f32 %f323, %f318, %f322, %f320; + .loc 2 108 21 + sub.f32 %f324, %f605, %f319; + .loc 2 109 28 + add.f32 %f325, %f261, %f316; + .loc 2 110 39 + setp.eq.f32 %p69, %f325, 0f00000000; + .loc 2 110 60 + mov.b32 %r165, %f325; + mov.b32 %r164, %f261; + div.full.f32 %r163, %r164, %r165; + mov.b32 %f326, %r163; + .loc 2 110 49 + selp.f32 %f327, 0f00000000, %f326, %p69; + .loc 2 112 17 + fma.rn.f32 %f328, %f327, %f324, %f319; + .loc 2 113 15 + add.f32 %f329, %f589, %f323; + .loc 2 113 30 + mul.f32 %f330, %f324, %f324; + .loc 2 113 38 + mul.f32 %f331, %f316, %f330; + .loc 2 113 22 + fma.rn.f32 %f332, %f327, %f331, %f329; + .loc 2 108 21 + sub.f32 %f333, %f607, %f606; + .loc 2 109 28 + add.f32 %f334, %f262, %f263; + .loc 2 110 39 + setp.eq.f32 %p70, %f334, 0f00000000; + .loc 2 110 60 + mov.b32 %r167, %f263; + mov.b32 %r168, %f334; + div.full.f32 %r166, %r167, %r168; + mov.b32 %f335, %r166; + .loc 2 110 49 + selp.f32 %f336, 0f00000000, %f335, %p70; + .loc 2 112 17 + fma.rn.f32 %f337, %f333, %f336, %f606; + .loc 2 113 15 + add.f32 %f338, %f590, %f591; + .loc 2 113 30 + mul.f32 %f339, %f333, %f333; + .loc 2 113 38 + mul.f32 %f340, %f339, %f262; + .loc 2 113 22 + fma.rn.f32 %f341, %f340, %f336, %f338; + .loc 2 108 21 + sub.f32 %f342, %f608, %f337; + .loc 2 109 28 + add.f32 %f343, %f264, %f334; + .loc 2 110 39 + setp.eq.f32 %p71, %f343, 0f00000000; + .loc 2 110 60 + mov.b32 %r171, %f343; + mov.b32 %r170, %f264; + div.full.f32 %r169, %r170, %r171; + mov.b32 %f344, %r169; + .loc 2 110 49 + selp.f32 %f345, 0f00000000, %f344, %p71; + .loc 2 112 17 + fma.rn.f32 %f346, %f345, %f342, %f337; + .loc 2 113 15 + add.f32 %f347, %f592, %f341; + .loc 2 113 30 + mul.f32 %f348, %f342, %f342; + .loc 2 113 38 + mul.f32 %f349, %f334, %f348; + .loc 2 113 22 + fma.rn.f32 %f350, %f345, %f349, %f347; + .loc 2 108 21 + sub.f32 %f351, %f609, %f346; + .loc 2 109 28 + add.f32 %f352, %f265, %f343; + .loc 2 110 39 + setp.eq.f32 %p72, %f352, 0f00000000; + .loc 2 110 60 + mov.b32 %r174, %f352; + mov.b32 %r173, %f265; + div.full.f32 %r172, %r173, %r174; + mov.b32 %f353, %r172; + .loc 2 110 49 + selp.f32 %f354, 0f00000000, %f353, %p72; + .loc 2 112 17 + fma.rn.f32 %f355, %f354, %f351, %f346; + .loc 2 113 15 + add.f32 %f356, %f593, %f350; + .loc 2 113 30 + mul.f32 %f357, %f351, %f351; + .loc 2 113 38 + mul.f32 %f358, %f343, 
%f357; + .loc 2 113 22 + fma.rn.f32 %f359, %f354, %f358, %f356; + .loc 2 108 21 + sub.f32 %f360, %f610, %f355; + .loc 2 109 28 + add.f32 %f361, %f266, %f352; + .loc 2 110 39 + setp.eq.f32 %p73, %f361, 0f00000000; + .loc 2 110 60 + mov.b32 %r177, %f361; + mov.b32 %r176, %f266; + div.full.f32 %r175, %r176, %r177; + mov.b32 %f362, %r175; + .loc 2 110 49 + selp.f32 %f363, 0f00000000, %f362, %p73; + .loc 2 112 17 + fma.rn.f32 %f364, %f363, %f360, %f355; + .loc 2 113 15 + add.f32 %f365, %f594, %f359; + .loc 2 113 30 + mul.f32 %f366, %f360, %f360; + .loc 2 113 38 + mul.f32 %f367, %f352, %f366; + .loc 2 113 22 + fma.rn.f32 %f368, %f363, %f367, %f365; + .loc 2 108 21 + sub.f32 %f369, %f611, %f364; + .loc 2 109 28 + add.f32 %f370, %f267, %f361; + .loc 2 110 39 + setp.eq.f32 %p74, %f370, 0f00000000; + .loc 2 110 60 + mov.b32 %r180, %f370; + mov.b32 %r179, %f267; + div.full.f32 %r178, %r179, %r180; + mov.b32 %f371, %r178; + .loc 2 110 49 + selp.f32 %f372, 0f00000000, %f371, %p74; + .loc 2 112 17 + fma.rn.f32 %f373, %f372, %f369, %f364; + .loc 2 113 15 + add.f32 %f374, %f595, %f368; + .loc 2 113 30 + mul.f32 %f375, %f369, %f369; + .loc 2 113 38 + mul.f32 %f376, %f361, %f375; + .loc 2 113 22 + fma.rn.f32 %f377, %f372, %f376, %f374; + .loc 2 108 21 + sub.f32 %f378, %f612, %f373; + .loc 2 109 28 + add.f32 %f379, %f268, %f370; + .loc 2 110 39 + setp.eq.f32 %p75, %f379, 0f00000000; + .loc 2 110 60 + mov.b32 %r183, %f379; + mov.b32 %r182, %f268; + div.full.f32 %r181, %r182, %r183; + mov.b32 %f380, %r181; + .loc 2 110 49 + selp.f32 %f381, 0f00000000, %f380, %p75; + .loc 2 112 17 + fma.rn.f32 %f382, %f381, %f378, %f373; + .loc 2 113 15 + add.f32 %f383, %f596, %f377; + .loc 2 113 30 + mul.f32 %f384, %f378, %f378; + .loc 2 113 38 + mul.f32 %f385, %f370, %f384; + .loc 2 113 22 + fma.rn.f32 %f386, %f381, %f385, %f383; + .loc 2 108 21 + sub.f32 %f387, %f613, %f382; + .loc 2 109 28 + add.f32 %f388, %f269, %f379; + .loc 2 110 39 + setp.eq.f32 %p76, %f388, 0f00000000; + .loc 2 110 60 + mov.b32 %r186, %f388; + mov.b32 %r185, %f269; + div.full.f32 %r184, %r185, %r186; + mov.b32 %f389, %r184; + .loc 2 110 49 + selp.f32 %f390, 0f00000000, %f389, %p76; + .loc 2 112 17 + fma.rn.f32 %f391, %f390, %f387, %f382; + .loc 2 113 15 + add.f32 %f392, %f597, %f386; + .loc 2 113 30 + mul.f32 %f393, %f387, %f387; + .loc 2 113 38 + mul.f32 %f394, %f379, %f393; + .loc 2 113 22 + fma.rn.f32 %f395, %f390, %f394, %f392; +$L__tmp4: + .loc 2 120 46 + mov.b32 %r262, %f328; + shfl.sync.bfly.b32 %r263, %r262, 4, 31, -1; + mov.b32 %f396, %r263; + mov.b32 %r264, %f332; + shfl.sync.bfly.b32 %r265, %r264, 4, 31, -1; + mov.b32 %f397, %r265; + shfl.sync.bfly.b32 %r188, %r165, 4, 31, -1; + mov.b32 %f398, %r188; +$L__tmp5: + .loc 2 108 21 + sub.f32 %f399, %f396, %f328; + .loc 2 109 28 + add.f32 %f400, %f325, %f398; + .loc 2 110 39 + setp.eq.f32 %p77, %f400, 0f00000000; + .loc 2 110 60 + mov.b32 %r189, %f400; + div.full.f32 %r187, %r188, %r189; + mov.b32 %f401, %r187; + .loc 2 110 49 + selp.f32 %f402, 0f00000000, %f401, %p77; + .loc 2 112 17 + fma.rn.f32 %f403, %f402, %f399, %f328; + .loc 2 113 15 + add.f32 %f404, %f332, %f397; + .loc 2 113 30 + mul.f32 %f405, %f399, %f399; + .loc 2 113 38 + mul.f32 %f406, %f325, %f405; + .loc 2 113 22 + fma.rn.f32 %f407, %f402, %f406, %f404; +$L__tmp6: + .loc 2 120 46 + mov.b32 %r266, %f403; + shfl.sync.bfly.b32 %r267, %r266, 2, 31, -1; + mov.b32 %f408, %r267; + mov.b32 %r268, %f407; + shfl.sync.bfly.b32 %r269, %r268, 2, 31, -1; + mov.b32 %f409, %r269; + shfl.sync.bfly.b32 %r191, %r189, 2, 31, -1; + mov.b32 %f410, 
%r191; +$L__tmp7: + .loc 2 108 21 + sub.f32 %f411, %f408, %f403; + .loc 2 109 28 + add.f32 %f412, %f400, %f410; + .loc 2 110 39 + setp.eq.f32 %p78, %f412, 0f00000000; + .loc 2 110 60 + mov.b32 %r192, %f412; + div.full.f32 %r190, %r191, %r192; + mov.b32 %f413, %r190; + .loc 2 110 49 + selp.f32 %f414, 0f00000000, %f413, %p78; + .loc 2 112 17 + fma.rn.f32 %f415, %f414, %f411, %f403; + .loc 2 113 15 + add.f32 %f416, %f407, %f409; + .loc 2 113 30 + mul.f32 %f417, %f411, %f411; + .loc 2 113 38 + mul.f32 %f418, %f400, %f417; + .loc 2 113 22 + fma.rn.f32 %f419, %f414, %f418, %f416; +$L__tmp8: + .loc 2 120 46 + mov.b32 %r270, %f415; + shfl.sync.bfly.b32 %r271, %r270, 1, 31, -1; + mov.b32 %f420, %r271; + mov.b32 %r272, %f419; + shfl.sync.bfly.b32 %r273, %r272, 1, 31, -1; + mov.b32 %f421, %r273; + shfl.sync.bfly.b32 %r194, %r192, 1, 31, -1; + mov.b32 %f422, %r194; +$L__tmp9: + .loc 2 108 21 + sub.f32 %f423, %f420, %f415; + .loc 2 109 28 + add.f32 %f424, %f412, %f422; + .loc 2 110 39 + setp.eq.f32 %p79, %f424, 0f00000000; + .loc 2 110 60 + mov.b32 %r195, %f424; + div.full.f32 %r193, %r194, %r195; + mov.b32 %f425, %r193; + .loc 2 110 49 + selp.f32 %f426, 0f00000000, %f425, %p79; + .loc 2 112 17 + fma.rn.f32 %f145, %f423, %f426, %f415; + .loc 2 113 15 + add.f32 %f427, %f419, %f421; + .loc 2 113 30 + mul.f32 %f428, %f423, %f423; + .loc 2 113 38 + mul.f32 %f429, %f412, %f428; + .loc 2 113 22 + fma.rn.f32 %f430, %f426, %f429, %f427; +$L__tmp10: + .loc 2 120 46 + mov.b32 %r274, %f391; + shfl.sync.bfly.b32 %r275, %r274, 4, 31, -1; + mov.b32 %f431, %r275; + mov.b32 %r276, %f395; + shfl.sync.bfly.b32 %r277, %r276, 4, 31, -1; + mov.b32 %f432, %r277; + shfl.sync.bfly.b32 %r197, %r186, 4, 31, -1; + mov.b32 %f433, %r197; +$L__tmp11: + .loc 2 108 21 + sub.f32 %f434, %f431, %f391; + .loc 2 109 28 + add.f32 %f435, %f388, %f433; + .loc 2 110 39 + setp.eq.f32 %p80, %f435, 0f00000000; + .loc 2 110 60 + mov.b32 %r198, %f435; + div.full.f32 %r196, %r197, %r198; + mov.b32 %f436, %r196; + .loc 2 110 49 + selp.f32 %f437, 0f00000000, %f436, %p80; + .loc 2 112 17 + fma.rn.f32 %f438, %f434, %f437, %f391; + .loc 2 113 15 + add.f32 %f439, %f395, %f432; + .loc 2 113 30 + mul.f32 %f440, %f434, %f434; + .loc 2 113 38 + mul.f32 %f441, %f388, %f440; + .loc 2 113 22 + fma.rn.f32 %f442, %f441, %f437, %f439; +$L__tmp12: + .loc 2 120 46 + mov.b32 %r278, %f438; + shfl.sync.bfly.b32 %r279, %r278, 2, 31, -1; + mov.b32 %f443, %r279; + mov.b32 %r280, %f442; + shfl.sync.bfly.b32 %r281, %r280, 2, 31, -1; + mov.b32 %f444, %r281; + shfl.sync.bfly.b32 %r200, %r198, 2, 31, -1; + mov.b32 %f445, %r200; +$L__tmp13: + .loc 2 108 21 + sub.f32 %f446, %f443, %f438; + .loc 2 109 28 + add.f32 %f447, %f435, %f445; + .loc 2 110 39 + setp.eq.f32 %p81, %f447, 0f00000000; + .loc 2 110 60 + mov.b32 %r201, %f447; + div.full.f32 %r199, %r200, %r201; + mov.b32 %f448, %r199; + .loc 2 110 49 + selp.f32 %f449, 0f00000000, %f448, %p81; + .loc 2 112 17 + fma.rn.f32 %f450, %f446, %f449, %f438; + .loc 2 113 15 + add.f32 %f451, %f442, %f444; + .loc 2 113 30 + mul.f32 %f452, %f446, %f446; + .loc 2 113 38 + mul.f32 %f453, %f435, %f452; + .loc 2 113 22 + fma.rn.f32 %f454, %f449, %f453, %f451; +$L__tmp14: + .loc 2 120 46 + mov.b32 %r282, %f450; + shfl.sync.bfly.b32 %r283, %r282, 1, 31, -1; + mov.b32 %f455, %r283; + mov.b32 %r284, %f454; + shfl.sync.bfly.b32 %r285, %r284, 1, 31, -1; + mov.b32 %f456, %r285; + shfl.sync.bfly.b32 %r203, %r201, 1, 31, -1; + mov.b32 %f457, %r203; +$L__tmp15: + .loc 2 108 21 + sub.f32 %f458, %f455, %f450; + .loc 2 109 28 + add.f32 %f459, %f447, %f457; 
+ .loc 2 110 39 + setp.eq.f32 %p82, %f459, 0f00000000; + .loc 2 110 60 + mov.b32 %r204, %f459; + div.full.f32 %r202, %r203, %r204; + mov.b32 %f460, %r202; + .loc 2 110 49 + selp.f32 %f461, 0f00000000, %f460, %p82; + .loc 2 112 17 + fma.rn.f32 %f146, %f458, %f461, %f450; + .loc 2 113 15 + add.f32 %f462, %f454, %f456; + .loc 2 113 30 + mul.f32 %f463, %f458, %f458; + .loc 2 113 38 + mul.f32 %f464, %f447, %f463; + .loc 2 113 22 + fma.rn.f32 %f465, %f461, %f464, %f462; +$L__tmp16: + .loc 1 69 23 + mov.b32 %r206, %f430; + mov.b32 %r207, 1132462080; + div.full.f32 %r205, %r206, %r207; + mov.b32 %f466, %r205; + mov.b32 %r230, %f465; + div.full.f32 %r229, %r230, %r207; + mov.b32 %f467, %r229; + .loc 1 71 24 + add.f32 %f147, %f466, 0f3727C5AC; + add.f32 %f148, %f467, 0f3727C5AC; + .loc 1 55 36 + add.s64 %rd9, %rd12, %rd2; + shl.b32 %r286, %r11, 14; + shl.b32 %r287, %r1, 8; + or.b32 %r288, %r286, %r287; + or.b32 %r8, %r288, %r3; + mov.u64 %rd106, 0; + mov.b32 %r407, -64; + rsqrt.approx.ftz.f32 %f516, %f147; + rsqrt.approx.ftz.f32 %f517, %f148; + bra.uni $L__BB0_5; +$L__BB0_7: + .loc 1 65 35 + add.s64 %rd96, %rd4, %rd106; + add.s64 %rd97, %rd96, 16; + add.s64 %rd98, %rd3, %rd106; + .loc 1 65 54 + add.s64 %rd99, %rd98, 16; + mov.u32 %r338, 0x0; + mov.u32 %r339, 0x0; + mov.u32 %r340, 0x0; + mov.u32 %r341, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r338, %r339, %r340, %r341 }, [ %rd96 + 0 ]; + @!%p1 mov.u32 %r338, %r342; + @!%p1 mov.u32 %r339, %r342; + @!%p1 mov.u32 %r340, %r342; + @!%p1 mov.u32 %r341, %r342; + mov.b32 %f468, %r338; + mov.b32 %f469, %r339; + mov.b32 %f470, %r340; + mov.b32 %f471, %r341; + mov.u32 %r346, 0x0; + mov.u32 %r347, 0x0; + mov.u32 %r348, 0x0; + mov.u32 %r349, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r346, %r347, %r348, %r349 }, [ %rd97 + 0 ]; + @!%p1 mov.u32 %r346, %r342; + @!%p1 mov.u32 %r347, %r342; + @!%p1 mov.u32 %r348, %r342; + @!%p1 mov.u32 %r349, %r342; + mov.b32 %f472, %r346; + mov.b32 %f473, %r347; + mov.b32 %f474, %r348; + mov.b32 %f475, %r349; + mov.u32 %r354, 0x0; + mov.u32 %r355, 0x0; + mov.u32 %r356, 0x0; + mov.u32 %r357, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r354, %r355, %r356, %r357 }, [ %rd98 + 0 ]; + @!%p1 mov.u32 %r354, %r342; + @!%p1 mov.u32 %r355, %r342; + @!%p1 mov.u32 %r356, %r342; + @!%p1 mov.u32 %r357, %r342; + mov.b32 %f476, %r354; + mov.b32 %f477, %r355; + mov.b32 %f478, %r356; + mov.b32 %f479, %r357; + mov.u32 %r362, 0x0; + mov.u32 %r363, 0x0; + mov.u32 %r364, 0x0; + mov.u32 %r365, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r362, %r363, %r364, %r365 }, [ %rd99 + 0 ]; + @!%p1 mov.u32 %r362, %r342; + @!%p1 mov.u32 %r363, %r342; + @!%p1 mov.u32 %r364, %r342; + @!%p1 mov.u32 %r365, %r342; + mov.b32 %f480, %r362; + mov.b32 %f481, %r363; + mov.b32 %f482, %r364; + mov.b32 %f483, %r365; + .loc 1 66 24 + add.f32 %f484, %f149, %f468; + add.f32 %f485, %f150, %f469; + add.f32 %f486, %f151, %f470; + add.f32 %f487, %f152, %f471; + add.f32 %f488, %f153, %f472; + add.f32 %f489, %f154, %f473; + add.f32 %f490, %f155, %f474; + add.f32 %f491, %f156, %f475; + add.f32 %f492, %f157, %f476; + add.f32 %f493, %f158, %f477; + add.f32 %f494, %f159, %f478; + add.f32 %f495, %f160, %f479; + add.f32 %f496, %f161, %f480; + add.f32 %f497, %f162, %f481; + add.f32 %f498, %f163, %f482; + add.f32 %f499, %f164, %f483; + .loc 1 67 24 + sub.f32 %f500, %f484, %f145; + sub.f32 %f501, %f485, %f145; + sub.f32 %f502, %f486, %f145; + sub.f32 %f503, %f487, %f145; + sub.f32 %f504, %f488, %f145; + sub.f32 %f505, %f489, %f145; + sub.f32 %f506, %f490, %f145; + sub.f32 %f507, 
%f491, %f145; + sub.f32 %f508, %f492, %f146; + sub.f32 %f509, %f493, %f146; + sub.f32 %f510, %f494, %f146; + sub.f32 %f511, %f495, %f146; + sub.f32 %f512, %f496, %f146; + sub.f32 %f513, %f497, %f146; + sub.f32 %f514, %f498, %f146; + sub.f32 %f515, %f499, %f146; + .loc 1 73 24 + mul.f32 %f518, %f500, %f516; + mul.f32 %f519, %f501, %f516; + mul.f32 %f520, %f502, %f516; + mul.f32 %f521, %f503, %f516; + mul.f32 %f522, %f504, %f516; + mul.f32 %f523, %f505, %f516; + mul.f32 %f524, %f506, %f516; + mul.f32 %f525, %f507, %f516; + mul.f32 %f526, %f508, %f517; + mul.f32 %f527, %f509, %f517; + mul.f32 %f528, %f510, %f517; + mul.f32 %f529, %f511, %f517; + mul.f32 %f530, %f512, %f517; + mul.f32 %f531, %f513, %f517; + mul.f32 %f532, %f514, %f517; + mul.f32 %f533, %f515, %f517; + .loc 1 74 24 + mul.f32 %f534, %f518, %f165; + mul.f32 %f535, %f519, %f166; + mul.f32 %f536, %f520, %f167; + mul.f32 %f537, %f521, %f168; + mul.f32 %f538, %f522, %f169; + mul.f32 %f539, %f523, %f170; + mul.f32 %f540, %f524, %f171; + mul.f32 %f541, %f525, %f172; + mul.f32 %f542, %f526, %f165; + mul.f32 %f543, %f527, %f166; + mul.f32 %f544, %f528, %f167; + mul.f32 %f545, %f529, %f168; + mul.f32 %f546, %f530, %f169; + mul.f32 %f547, %f531, %f170; + mul.f32 %f548, %f532, %f171; + mul.f32 %f549, %f533, %f172; + .loc 1 76 35 + add.s32 %r394, %r8, %r407; + add.s32 %r395, %r394, 64; + .loc 1 76 29 + add.s32 %r396, %r394, 8256; + mul.wide.s32 %rd102, %r395, 2; + add.s64 %rd100, %rd13, %rd102; + mul.wide.s32 %rd103, %r396, 2; + add.s64 %rd101, %rd13, %rd103; + .loc 1 76 52 + mov.b32 %r370, %f534; + cvt.rn.bf16.f32 %rs1, %r370; + mov.b32 %r371, %f535; + cvt.rn.bf16.f32 %rs2, %r371; + mov.b32 %r372, %f536; + cvt.rn.bf16.f32 %rs3, %r372; + mov.b32 %r373, %f537; + cvt.rn.bf16.f32 %rs4, %r373; + mov.b32 %r374, %f538; + cvt.rn.bf16.f32 %rs5, %r374; + mov.b32 %r375, %f539; + cvt.rn.bf16.f32 %rs6, %r375; + mov.b32 %r376, %f540; + cvt.rn.bf16.f32 %rs7, %r376; + mov.b32 %r377, %f541; + cvt.rn.bf16.f32 %rs8, %r377; + mov.b32 %r378, %f542; + cvt.rn.bf16.f32 %rs9, %r378; + mov.b32 %r379, %f543; + cvt.rn.bf16.f32 %rs10, %r379; + mov.b32 %r380, %f544; + cvt.rn.bf16.f32 %rs11, %r380; + mov.b32 %r381, %f545; + cvt.rn.bf16.f32 %rs12, %r381; + mov.b32 %r382, %f546; + cvt.rn.bf16.f32 %rs13, %r382; + mov.b32 %r383, %f547; + cvt.rn.bf16.f32 %rs14, %r383; + mov.b32 %r384, %f548; + cvt.rn.bf16.f32 %rs15, %r384; + mov.b32 %r385, %f549; + cvt.rn.bf16.f32 %rs16, %r385; + mov.b32 %r397, {%rs1, %rs2}; + mov.b32 %r398, {%rs3, %rs4}; + mov.b32 %r399, {%rs5, %rs6}; + mov.b32 %r400, {%rs7, %rs8}; + @%p1 st.global.v4.b32 [ %rd100 + 0 ], { %r397, %r398, %r399, %r400 }; + mov.b32 %r401, {%rs9, %rs10}; + mov.b32 %r402, {%rs11, %rs12}; + mov.b32 %r403, {%rs13, %rs14}; + mov.b32 %r404, {%rs15, %rs16}; + @%p1 st.global.v4.b32 [ %rd101 + 0 ], { %r401, %r402, %r403, %r404 }; + .loc 1 55 36 + add.s64 %rd106, %rd106, 256; + add.s32 %r407, %r407, 64; + setp.lt.u32 %p136, %r407, 192; + @%p136 bra $L__BB0_5; + bra.uni $L__BB0_8; +$L__BB0_5: + .loc 1 59 35 + add.s64 %rd83, %rd6, %rd106; + add.s64 %rd84, %rd83, 16; + add.s64 %rd85, %rd5, %rd106; + .loc 1 59 51 + add.s64 %rd86, %rd85, 16; + mov.u32 %r289, 0x0; + mov.u32 %r290, 0x0; + mov.u32 %r291, 0x0; + mov.u32 %r292, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r289, %r290, %r291, %r292 }, [ %rd83 + 0 ]; + @!%p1 mov.u32 %r289, %r342; + @!%p1 mov.u32 %r290, %r342; + @!%p1 mov.u32 %r291, %r342; + @!%p1 mov.u32 %r292, %r342; + mov.b32 %f149, %r289; + mov.b32 %f150, %r290; + mov.b32 %f151, %r291; + mov.b32 %f152, %r292; + mov.u32 %r297, 
0x0; + mov.u32 %r298, 0x0; + mov.u32 %r299, 0x0; + mov.u32 %r300, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r297, %r298, %r299, %r300 }, [ %rd84 + 0 ]; + @!%p1 mov.u32 %r297, %r342; + @!%p1 mov.u32 %r298, %r342; + @!%p1 mov.u32 %r299, %r342; + @!%p1 mov.u32 %r300, %r342; + mov.b32 %f153, %r297; + mov.b32 %f154, %r298; + mov.b32 %f155, %r299; + mov.b32 %f156, %r300; + mov.u32 %r305, 0x0; + mov.u32 %r306, 0x0; + mov.u32 %r307, 0x0; + mov.u32 %r308, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r305, %r306, %r307, %r308 }, [ %rd85 + 0 ]; + @!%p1 mov.u32 %r305, %r342; + @!%p1 mov.u32 %r306, %r342; + @!%p1 mov.u32 %r307, %r342; + @!%p1 mov.u32 %r308, %r342; + mov.b32 %f157, %r305; + mov.b32 %f158, %r306; + mov.b32 %f159, %r307; + mov.b32 %f160, %r308; + mov.u32 %r313, 0x0; + mov.u32 %r314, 0x0; + mov.u32 %r315, 0x0; + mov.u32 %r316, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r313, %r314, %r315, %r316 }, [ %rd86 + 0 ]; + @!%p1 mov.u32 %r313, %r342; + @!%p1 mov.u32 %r314, %r342; + @!%p1 mov.u32 %r315, %r342; + @!%p1 mov.u32 %r316, %r342; + mov.b32 %f161, %r313; + mov.b32 %f162, %r314; + mov.b32 %f163, %r315; + mov.b32 %f164, %r316; + .loc 1 60 35 + add.s64 %rd87, %rd9, %rd106; + .loc 1 60 40 + add.s64 %rd88, %rd87, 16; + mov.u32 %r321, 0x0; + mov.u32 %r322, 0x0; + mov.u32 %r323, 0x0; + mov.u32 %r324, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r321, %r322, %r323, %r324 }, [ %rd87 + 0 ]; + @!%p1 mov.u32 %r321, %r342; + @!%p1 mov.u32 %r322, %r342; + @!%p1 mov.u32 %r323, %r342; + @!%p1 mov.u32 %r324, %r342; + mov.b32 %f165, %r321; + mov.b32 %f166, %r322; + mov.b32 %f167, %r323; + mov.b32 %f168, %r324; + mov.u32 %r329, 0x0; + mov.u32 %r330, 0x0; + mov.u32 %r331, 0x0; + mov.u32 %r332, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r329, %r330, %r331, %r332 }, [ %rd88 + 0 ]; + @!%p1 mov.u32 %r329, %r342; + @!%p1 mov.u32 %r330, %r342; + @!%p1 mov.u32 %r331, %r342; + @!%p1 mov.u32 %r332, %r342; + mov.b32 %f169, %r329; + mov.b32 %f170, %r330; + mov.b32 %f171, %r331; + mov.b32 %f172, %r332; + .loc 1 64 57 + @%p41 bra $L__BB0_7; + mov.u64 %rd89, assertMessage_1; + cvta.global.u64 %rd90, %rd89; + mov.u64 %rd91, assertFile_1; + cvta.global.u64 %rd92, %rd91; + mov.u64 %rd93, assertFunc_1; + cvta.global.u64 %rd94, %rd93; + { // callseq 7, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd90; + .param .b64 param1; + st.param.b64 [param1+0], %rd92; + .param .b32 param2; + st.param.b32 [param2+0], %r405; + .param .b64 param3; + st.param.b64 [param3+0], %rd94; + .param .b64 param4; + st.param.b64 [param4+0], %rd104; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 7 + bra.uni $L__BB0_7; +$L__BB0_8: + .loc 1 55 4 + ret; +$L__tmp17: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 
+.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 298 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 103 +.b8 120 +.b8 53 +.b8 108 +.b8 120 +.b8 112 +.b8 117 +.b8 101 +.b8 120 +.b8 112 +.b8 105 +.b8 110 +.b8 100 +.b8 106 +.b8 52 +.b8 100 +.b8 115 +.b8 109 +.b8 106 +.b8 122 +.b8 53 +.b8 120 +.b8 52 +.b8 50 +.b8 117 +.b8 104 +.b8 121 +.b8 121 +.b8 55 +.b8 105 +.b8 115 +.b8 107 +.b8 101 +.b8 118 +.b8 113 +.b8 55 +.b8 111 +.b8 118 +.b8 122 +.b8 112 +.b8 119 +.b8 97 +.b8 103 +.b8 98 +.b8 51 +.b8 116 +.b8 53 +.b8 112 +.b8 111 +.b8 119 +.b8 106 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 103 +.b8 120 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp2 +.b8 2 +.b8 44 +.b8 38 +.b8 5 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp16 +.b8 2 +.b8 50 +.b8 41 +.b8 4 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp16 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 4 +.b32 125 +.b64 $L__tmp4 +.b64 $L__tmp15 +.b8 2 +.b8 50 +.b8 41 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 302 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 302 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttir b/.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..fade03c9b7b8213cece19d92fb4462dff4c94d95 --- /dev/null +++ b/.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttir @@ -0,0 +1,137 @@ +module { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) 
attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x64xf32> + %c256_i32 = arith.constant 256 : i32 + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<256> : tensor<64x1xi64> + %cst_2 = arith.constant dense<0> : tensor<64x1xi64> + %cst_3 = arith.constant dense<50257> : tensor<64x1xi64> + %cst_4 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32> + %cst_5 = arith.constant dense<2.560000e+02> : tensor<64x1xf32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> + %cst_8 = arith.constant dense<256> : tensor<64x1xi32> + %cst_9 = arith.constant dense<256> : tensor<1x64xi32> + %cst_10 = arith.constant dense<512> : tensor<64x1xi32> + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32> + %7 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %8 = tt.addptr %7, %5 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + %9 = tt.load %8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64> + %10 = arith.remsi %5, %cst_10 : tensor<64x1xi32> + %11 = arith.muli %10, %cst_8 : tensor<64x1xi32> + %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x64xi32> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %14 = arith.addi %9, %cst_3 : tensor<64x1xi64> + %15 = arith.cmpi slt, %9, %cst_2 : tensor<64x1xi64> + %16 = arith.select %15, %14, %9 : tensor<64x1xi1>, tensor<64x1xi64> + %17 = arith.cmpi sge, %16, %cst_2 : tensor<64x1xi64> + %18 = arith.cmpi slt, %16, %cst_3 : tensor<64x1xi64> + %19 = arith.andi %17, %18 : tensor<64x1xi1> + %20 = arith.muli %16, %cst_1 : tensor<64x1xi64> + %21 = tt.broadcast %20 : (tensor<64x1xi64>) -> tensor<64x64xi64> + %22 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %23:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) : i32 { + %46 = tt.splat %arg7 : (i32) -> tensor<1x64xi32> + %47 = arith.addi %46, %6 : tensor<1x64xi32> + %48 = arith.cmpi slt, %47, %cst_9 : tensor<1x64xi32> + %49 = tt.broadcast %47 : (tensor<1x64xi32>) -> tensor<64x64xi32> + %50 = arith.addi %49, %12 : tensor<64x64xi32> + %51 = tt.addptr %13, %50 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> + %52 = tt.broadcast %48 : (tensor<1x64xi1>) -> tensor<64x64xi1> + %53 = tt.load %51, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32> + tt.assert %19, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1> + %54 = arith.extsi %47 : tensor<1x64xi32> to tensor<1x64xi64> + %55 = tt.broadcast %54 : (tensor<1x64xi64>) -> tensor<64x64xi64> + %56 = arith.addi %55, %21 : tensor<64x64xi64> + %57 = tt.addptr %22, %56 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> + %58 = tt.load %57, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32> + %59 = arith.addf %58, %53 : tensor<64x64xf32> + %60 = arith.subf %59, %arg8 : tensor<64x64xf32> + %61 = arith.addf %arg10, %cst_0 : tensor<64x64xf32> + %62 = arith.divf %60, %61 : 
tensor<64x64xf32> + %63 = arith.addf %arg8, %62 : tensor<64x64xf32> + %64 = arith.subf %59, %63 : tensor<64x64xf32> + %65 = arith.mulf %60, %64 : tensor<64x64xf32> + %66 = arith.addf %arg9, %65 : tensor<64x64xf32> + %67 = arith.select %52, %63, %arg8 : tensor<64x64xi1>, tensor<64x64xf32> + %68 = arith.select %52, %66, %arg9 : tensor<64x64xi1>, tensor<64x64xf32> + %69 = arith.select %52, %61, %arg10 : tensor<64x64xi1>, tensor<64x64xf32> + scf.yield %67, %68, %69 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32> + } + %24:3 = "tt.reduce"(%23#0, %23#1, %23#2) <{axis = 1 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): + %46 = arith.subf %arg10, %arg7 : f32 + %47 = arith.addf %arg9, %arg12 : f32 + %48 = arith.cmpf oeq, %47, %cst : f32 + %49 = arith.divf %arg12, %47 : f32 + %50 = arith.select %48, %cst, %49 : f32 + %51 = arith.mulf %46, %50 : f32 + %52 = arith.addf %arg7, %51 : f32 + %53 = arith.addf %arg8, %arg11 : f32 + %54 = arith.mulf %46, %46 : f32 + %55 = arith.mulf %54, %arg9 : f32 + %56 = arith.mulf %55, %50 : f32 + %57 = arith.addf %53, %56 : f32 + tt.reduce.return %52, %57, %47 : f32, f32, f32 + }) : (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) + %25 = tt.expand_dims %24#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %26 = tt.expand_dims %24#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %27 = arith.muli %10, %cst_8 : tensor<64x1xi32> + %28 = tt.broadcast %27 : (tensor<64x1xi32>) -> tensor<64x64xi32> + %29 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %30 = tt.splat %arg3 : (!tt.ptr) -> tensor<1x64x!tt.ptr> + %31 = arith.addi %9, %cst_3 : tensor<64x1xi64> + %32 = arith.cmpi slt, %9, %cst_2 : tensor<64x1xi64> + %33 = arith.select %32, %31, %9 : tensor<64x1xi1>, tensor<64x1xi64> + %34 = arith.cmpi sge, %33, %cst_2 : tensor<64x1xi64> + %35 = arith.cmpi slt, %33, %cst_3 : tensor<64x1xi64> + %36 = arith.andi %34, %35 : tensor<64x1xi1> + %37 = arith.muli %33, %cst_1 : tensor<64x1xi64> + %38 = tt.broadcast %37 : (tensor<64x1xi64>) -> tensor<64x64xi64> + %39 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %40 = tt.broadcast %25 : (tensor<64x1xf32>) -> tensor<64x64xf32> + %41 = arith.divf %26, %cst_5 : tensor<64x1xf32> + %42 = arith.addf %41, %cst_4 : tensor<64x1xf32> + %43 = arith.muli %5, %cst_8 : tensor<64x1xi32> + %44 = tt.broadcast %43 : (tensor<64x1xi32>) -> tensor<64x64xi32> + %45 = tt.splat %arg4 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 : i32 { + %46 = tt.splat %arg7 : (i32) -> tensor<1x64xi32> + %47 = arith.addi %46, %6 : tensor<1x64xi32> + %48 = arith.cmpi slt, %47, %cst_9 : tensor<1x64xi32> + %49 = tt.broadcast %47 : (tensor<1x64xi32>) -> tensor<64x64xi32> + %50 = arith.addi %49, %28 : tensor<64x64xi32> + %51 = tt.addptr %29, %50 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> + %52 = tt.broadcast %48 : (tensor<1x64xi1>) -> tensor<64x64xi1> + %53 = tt.load %51, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32> + %54 = tt.addptr %30, %47 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> + %55 = tt.load %54, %48, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32> + tt.assert %36, "index out of bounds: 0 <= tmp13 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1> + %56 = arith.extsi %47 : tensor<1x64xi32> to tensor<1x64xi64> + %57 = tt.broadcast %56 : (tensor<1x64xi64>) -> tensor<64x64xi64> + %58 = arith.addi 
%57, %38 : tensor<64x64xi64> + %59 = tt.addptr %39, %58 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> + %60 = tt.load %59, %52, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32> + %61 = arith.addf %60, %53 : tensor<64x64xf32> + %62 = arith.subf %61, %40 : tensor<64x64xf32> + %63 = tt.extern_elementwise %42 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> + %64 = tt.broadcast %63 : (tensor<64x1xf32>) -> tensor<64x64xf32> + %65 = arith.mulf %62, %64 : tensor<64x64xf32> + %66 = tt.broadcast %55 : (tensor<1x64xf32>) -> tensor<64x64xf32> + %67 = arith.mulf %65, %66 : tensor<64x64xf32> + %68 = arith.addi %49, %44 : tensor<64x64xi32> + %69 = tt.addptr %45, %68 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> + %70 = arith.truncf %67 : tensor<64x64xf32> to tensor<64x64xbf16> + tt.store %69, %70, %52 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16> + } + tt.return + } +} diff --git a/.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.llir b/.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..ae7fe6323220f73b0ce0d75b758ebcfa9d3a692a --- /dev/null +++ b/.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.llir @@ -0,0 +1,1360 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_1 = internal constant [38 x i8] c"" +@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257" +@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_0 = internal constant [38 x i8] c"" +@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr + +define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %10 = lshr i32 %9, 3, !dbg !10 + %11 = and i32 %10, 31, !dbg !10 + %12 = and i32 %9, 63, !dbg !10 + %13 = shl i32 %9, 3, !dbg !11 + %14 = and i32 %13, 56, !dbg !11 + %15 = or i32 %14, 4, !dbg !11 + %16 = lshr i32 %9, 6, !dbg !12 + %17 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13 + %18 = shl i32 %17, 6, !dbg !14 + %19 = or i32 %18, %11, !dbg !15 + %20 = or i32 %19, 32, !dbg !15 + %21 = or i32 %18, %12, !dbg !15 + %22 = sext i32 %19 to i64, !dbg !16 + %23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !16 + %24 = sext i32 %20 to i64, !dbg !16 + %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !16 + %26 = sext i32 %21 to i64, !dbg !16 + %27 = getelementptr i64, ptr addrspace(1) %0, i64 %26, !dbg !16 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 
0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17 + %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17 + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17 + %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17 + %44 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #6, !dbg !17 + %45 = srem i32 %19, 512, !dbg !18 + %46 = srem i32 %20, 512, !dbg !18 + %47 = shl nsw i32 %45, 8, !dbg !19 + %48 = shl nsw i32 %46, 8, !dbg !19 + %49 = shl i32 %19, 8, !dbg !20 + %50 = shl i32 %20, 8, !dbg !20 + %51 = add i64 %44, 50257, !dbg !21 + %52 = icmp slt i64 %28, 0, !dbg !22 + %53 = icmp slt i64 %36, 0, !dbg !22 + %54 = icmp slt i64 %44, 0, !dbg !22 + %55 = select i1 %54, i64 %51, i64 %44, !dbg !23 + %56 = icmp ugt i64 %55, 50256, !dbg !24 + %57 = shl i64 %28, 8, !dbg !25 + %58 = add i64 %57, 12865792, !dbg !25 + %59 = select i1 %52, i64 %58, i64 %57, !dbg !25 + %60 = shl i64 %36, 8, !dbg !25 + %61 = add i64 %60, 12865792, !dbg !25 + %62 = select i1 %53, i64 %61, i64 %60, !dbg !25 + %63 = getelementptr float, ptr addrspace(1) %1, i64 %59 + %64 = getelementptr float, ptr addrspace(1) %1, i64 %62 + br label %65, !dbg !12 + +65: ; preds = %8, %230 + %66 = phi float [ 0.000000e+00, %8 ], [ %321, %230 ] + %67 = phi float [ 0.000000e+00, %8 ], [ %322, %230 ] + %68 = phi float [ 0.000000e+00, %8 ], [ %323, %230 ] + %69 = phi float [ 0.000000e+00, %8 ], [ %324, 
%230 ] + %70 = phi float [ 0.000000e+00, %8 ], [ %325, %230 ] + %71 = phi float [ 0.000000e+00, %8 ], [ %326, %230 ] + %72 = phi float [ 0.000000e+00, %8 ], [ %327, %230 ] + %73 = phi float [ 0.000000e+00, %8 ], [ %328, %230 ] + %74 = phi float [ 0.000000e+00, %8 ], [ %329, %230 ] + %75 = phi float [ 0.000000e+00, %8 ], [ %330, %230 ] + %76 = phi float [ 0.000000e+00, %8 ], [ %331, %230 ] + %77 = phi float [ 0.000000e+00, %8 ], [ %332, %230 ] + %78 = phi float [ 0.000000e+00, %8 ], [ %333, %230 ] + %79 = phi float [ 0.000000e+00, %8 ], [ %334, %230 ] + %80 = phi float [ 0.000000e+00, %8 ], [ %335, %230 ] + %81 = phi float [ 0.000000e+00, %8 ], [ %336, %230 ] + %82 = phi float [ 0.000000e+00, %8 ], [ %337, %230 ] + %83 = phi float [ 0.000000e+00, %8 ], [ %338, %230 ] + %84 = phi float [ 0.000000e+00, %8 ], [ %339, %230 ] + %85 = phi float [ 0.000000e+00, %8 ], [ %340, %230 ] + %86 = phi float [ 0.000000e+00, %8 ], [ %341, %230 ] + %87 = phi float [ 0.000000e+00, %8 ], [ %342, %230 ] + %88 = phi float [ 0.000000e+00, %8 ], [ %343, %230 ] + %89 = phi float [ 0.000000e+00, %8 ], [ %344, %230 ] + %90 = phi float [ 0.000000e+00, %8 ], [ %345, %230 ] + %91 = phi float [ 0.000000e+00, %8 ], [ %346, %230 ] + %92 = phi float [ 0.000000e+00, %8 ], [ %347, %230 ] + %93 = phi float [ 0.000000e+00, %8 ], [ %348, %230 ] + %94 = phi float [ 0.000000e+00, %8 ], [ %349, %230 ] + %95 = phi float [ 0.000000e+00, %8 ], [ %350, %230 ] + %96 = phi float [ 0.000000e+00, %8 ], [ %351, %230 ] + %97 = phi float [ 0.000000e+00, %8 ], [ %352, %230 ] + %98 = phi float [ 0.000000e+00, %8 ], [ %417, %230 ] + %99 = phi float [ 0.000000e+00, %8 ], [ %418, %230 ] + %100 = phi float [ 0.000000e+00, %8 ], [ %419, %230 ] + %101 = phi float [ 0.000000e+00, %8 ], [ %420, %230 ] + %102 = phi float [ 0.000000e+00, %8 ], [ %421, %230 ] + %103 = phi float [ 0.000000e+00, %8 ], [ %422, %230 ] + %104 = phi float [ 0.000000e+00, %8 ], [ %423, %230 ] + %105 = phi float [ 0.000000e+00, %8 ], [ %424, %230 ] + %106 = phi float [ 0.000000e+00, %8 ], [ %425, %230 ] + %107 = phi float [ 0.000000e+00, %8 ], [ %426, %230 ] + %108 = phi float [ 0.000000e+00, %8 ], [ %427, %230 ] + %109 = phi float [ 0.000000e+00, %8 ], [ %428, %230 ] + %110 = phi float [ 0.000000e+00, %8 ], [ %429, %230 ] + %111 = phi float [ 0.000000e+00, %8 ], [ %430, %230 ] + %112 = phi float [ 0.000000e+00, %8 ], [ %431, %230 ] + %113 = phi float [ 0.000000e+00, %8 ], [ %432, %230 ] + %114 = phi float [ 0.000000e+00, %8 ], [ %369, %230 ] + %115 = phi float [ 0.000000e+00, %8 ], [ %370, %230 ] + %116 = phi float [ 0.000000e+00, %8 ], [ %371, %230 ] + %117 = phi float [ 0.000000e+00, %8 ], [ %372, %230 ] + %118 = phi float [ 0.000000e+00, %8 ], [ %373, %230 ] + %119 = phi float [ 0.000000e+00, %8 ], [ %374, %230 ] + %120 = phi float [ 0.000000e+00, %8 ], [ %375, %230 ] + %121 = phi float [ 0.000000e+00, %8 ], [ %376, %230 ] + %122 = phi float [ 0.000000e+00, %8 ], [ %377, %230 ] + %123 = phi float [ 0.000000e+00, %8 ], [ %378, %230 ] + %124 = phi float [ 0.000000e+00, %8 ], [ %379, %230 ] + %125 = phi float [ 0.000000e+00, %8 ], [ %380, %230 ] + %126 = phi float [ 0.000000e+00, %8 ], [ %381, %230 ] + %127 = phi float [ 0.000000e+00, %8 ], [ %382, %230 ] + %128 = phi float [ 0.000000e+00, %8 ], [ %383, %230 ] + %129 = phi float [ 0.000000e+00, %8 ], [ %384, %230 ] + %130 = phi i32 [ 0, %8 ], [ %433, %230 ] + %131 = or i32 %130, %14, !dbg !26 + %132 = or i32 %130, %15, !dbg !26 + %133 = add i32 %131, %47, !dbg !27 + %134 = add i32 %132, %47, !dbg !27 + %135 = add i32 %131, %48, 
!dbg !27 + %136 = add i32 %132, %48, !dbg !27 + %137 = sext i32 %133 to i64, !dbg !28 + %138 = getelementptr float, ptr addrspace(1) %2, i64 %137, !dbg !28 + %139 = sext i32 %134 to i64, !dbg !28 + %140 = getelementptr float, ptr addrspace(1) %2, i64 %139, !dbg !28 + %141 = sext i32 %135 to i64, !dbg !28 + %142 = getelementptr float, ptr addrspace(1) %2, i64 %141, !dbg !28 + %143 = sext i32 %136 to i64, !dbg !28 + %144 = getelementptr float, ptr addrspace(1) %2, i64 %143, !dbg !28 + %145 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %138, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29 + %146 = extractvalue { i32, i32, i32, i32 } %145, 0, !dbg !29 + %147 = extractvalue { i32, i32, i32, i32 } %145, 1, !dbg !29 + %148 = extractvalue { i32, i32, i32, i32 } %145, 2, !dbg !29 + %149 = extractvalue { i32, i32, i32, i32 } %145, 3, !dbg !29 + %150 = bitcast i32 %146 to float, !dbg !29 + %151 = bitcast i32 %147 to float, !dbg !29 + %152 = bitcast i32 %148 to float, !dbg !29 + %153 = bitcast i32 %149 to float, !dbg !29 + %154 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %140, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29 + %155 = extractvalue { i32, i32, i32, i32 } %154, 0, !dbg !29 + %156 = extractvalue { i32, i32, i32, i32 } %154, 1, !dbg !29 + %157 = extractvalue { i32, i32, i32, i32 } %154, 2, !dbg !29 + %158 = extractvalue { i32, i32, i32, i32 } %154, 3, !dbg !29 + %159 = bitcast i32 %155 to float, !dbg !29 + %160 = bitcast i32 %156 to float, !dbg !29 + %161 = bitcast i32 %157 to float, !dbg !29 + %162 = bitcast i32 %158 to float, !dbg !29 + %163 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %142, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29 + %164 = extractvalue { i32, i32, i32, i32 } %163, 0, !dbg !29 + %165 = extractvalue { i32, i32, i32, i32 } %163, 1, !dbg !29 + %166 = extractvalue { i32, i32, i32, i32 } %163, 2, !dbg !29 + %167 = extractvalue { i32, i32, i32, i32 } %163, 3, !dbg !29 + %168 = bitcast i32 %164 to float, !dbg !29 + %169 = bitcast i32 %165 to float, !dbg !29 + %170 = bitcast i32 %166 to float, !dbg !29 + %171 = bitcast i32 %167 to float, !dbg !29 + %172 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr 
addrspace(1) %144, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29 + %173 = extractvalue { i32, i32, i32, i32 } %172, 0, !dbg !29 + %174 = extractvalue { i32, i32, i32, i32 } %172, 1, !dbg !29 + %175 = extractvalue { i32, i32, i32, i32 } %172, 2, !dbg !29 + %176 = extractvalue { i32, i32, i32, i32 } %172, 3, !dbg !29 + %177 = bitcast i32 %173 to float, !dbg !29 + %178 = bitcast i32 %174 to float, !dbg !29 + %179 = bitcast i32 %175 to float, !dbg !29 + %180 = bitcast i32 %176 to float, !dbg !29 + %181 = add i32 %131, %49, !dbg !30 + %182 = add i32 %131, %50, !dbg !30 + %183 = sext i32 %181 to i64, !dbg !31 + %184 = getelementptr i16, ptr addrspace(1) %3, i64 %183, !dbg !31 + %185 = sext i32 %182 to i64, !dbg !31 + %186 = getelementptr i16, ptr addrspace(1) %3, i64 %185, !dbg !31 + %187 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %184, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %188 = extractvalue { i32, i32, i32, i32 } %187, 0, !dbg !32 + %189 = extractvalue { i32, i32, i32, i32 } %187, 1, !dbg !32 + %190 = extractvalue { i32, i32, i32, i32 } %187, 2, !dbg !32 + %191 = extractvalue { i32, i32, i32, i32 } %187, 3, !dbg !32 + %192 = trunc i32 %188 to i16, !dbg !32 + %extelt.offset9 = lshr i32 %188, 16, !dbg !32 + %193 = trunc i32 %extelt.offset9 to i16, !dbg !32 + %194 = trunc i32 %189 to i16, !dbg !32 + %extelt.offset10 = lshr i32 %189, 16, !dbg !32 + %195 = trunc i32 %extelt.offset10 to i16, !dbg !32 + %196 = trunc i32 %190 to i16, !dbg !32 + %extelt.offset11 = lshr i32 %190, 16, !dbg !32 + %197 = trunc i32 %extelt.offset11 to i16, !dbg !32 + %198 = trunc i32 %191 to i16, !dbg !32 + %extelt.offset12 = lshr i32 %191, 16, !dbg !32 + %199 = trunc i32 %extelt.offset12 to i16, !dbg !32 + %200 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %186, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %201 = extractvalue { i32, i32, i32, i32 } %200, 0, !dbg !32 + %202 = extractvalue { i32, i32, i32, i32 } %200, 1, !dbg !32 + %203 = extractvalue { i32, i32, i32, i32 } %200, 2, !dbg !32 + %204 = extractvalue { i32, i32, i32, i32 } %200, 3, !dbg !32 + %205 = trunc i32 %201 to i16, !dbg !32 + %extelt.offset13 = lshr i32 %201, 16, !dbg !32 + %206 = trunc i32 %extelt.offset13 to i16, !dbg !32 + %207 = trunc i32 %202 to i16, !dbg !32 + %extelt.offset14 = lshr i32 %202, 16, !dbg !32 + %208 = trunc i32 %extelt.offset14 to i16, !dbg !32 + %209 = trunc i32 %203 to i16, !dbg !32 + %extelt.offset15 = lshr i32 %203, 16, !dbg !32 + %210 = trunc i32 %extelt.offset15 to i16, !dbg !32 + %211 = trunc i32 %204 to i16, !dbg !32 + %extelt.offset16 = lshr i32 %204, 16, !dbg !32 + %212 = trunc i32 %extelt.offset16 to i16, !dbg !32 + %213 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %192) #6, !dbg !33 + %214 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 
%193) #6, !dbg !33 + %215 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %194) #6, !dbg !33 + %216 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %195) #6, !dbg !33 + %217 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %196) #6, !dbg !33 + %218 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %197) #6, !dbg !33 + %219 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %198) #6, !dbg !33 + %220 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %199) #6, !dbg !33 + %221 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %205) #6, !dbg !33 + %222 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %206) #6, !dbg !33 + %223 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %207) #6, !dbg !33 + %224 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %208) #6, !dbg !33 + %225 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %209) #6, !dbg !33 + %226 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %210) #6, !dbg !33 + %227 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %211) #6, !dbg !33 + %228 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %212) #6, !dbg !33 + br i1 %56, label %229, label %230, !dbg !34 + +229: ; preds = %65 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !34 + br label %230, !dbg !34 + +230: ; preds = %229, %65 + %231 = zext nneg i32 %131 to i64, !dbg !35 + %232 = zext nneg i32 %132 to i64, !dbg !35 + %233 = getelementptr float, ptr addrspace(1) %63, i64 %231, !dbg !36 + %234 = getelementptr float, ptr addrspace(1) %63, i64 %232, !dbg !36 + %235 = getelementptr float, ptr addrspace(1) %64, i64 %231, !dbg !36 + %236 = getelementptr float, ptr addrspace(1) %64, i64 %232, !dbg !36 + %237 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %233, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37 + %238 = extractvalue { i32, i32, i32, i32 } %237, 0, !dbg !37 + %239 = extractvalue { i32, i32, i32, i32 } %237, 1, !dbg !37 + %240 = extractvalue { i32, i32, i32, i32 } %237, 2, !dbg !37 + %241 = extractvalue { i32, i32, i32, i32 } %237, 3, !dbg !37 + %242 = bitcast i32 %238 to float, !dbg !37 + %243 = bitcast i32 %239 to float, !dbg !37 + %244 = bitcast i32 %240 to float, !dbg !37 + %245 = bitcast i32 %241 to float, !dbg !37 + %246 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %234, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37 + %247 = extractvalue { i32, i32, i32, i32 } %246, 0, !dbg !37 + %248 = extractvalue { i32, i32, i32, i32 } %246, 1, !dbg !37 + %249 = extractvalue { i32, i32, i32, i32 } %246, 2, !dbg !37 + %250 = extractvalue { i32, i32, i32, i32 } %246, 3, !dbg !37 + %251 = bitcast i32 %247 to float, !dbg !37 + %252 = bitcast i32 %248 to float, !dbg !37 + %253 = bitcast i32 %249 to 
float, !dbg !37 + %254 = bitcast i32 %250 to float, !dbg !37 + %255 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %235, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37 + %256 = extractvalue { i32, i32, i32, i32 } %255, 0, !dbg !37 + %257 = extractvalue { i32, i32, i32, i32 } %255, 1, !dbg !37 + %258 = extractvalue { i32, i32, i32, i32 } %255, 2, !dbg !37 + %259 = extractvalue { i32, i32, i32, i32 } %255, 3, !dbg !37 + %260 = bitcast i32 %256 to float, !dbg !37 + %261 = bitcast i32 %257 to float, !dbg !37 + %262 = bitcast i32 %258 to float, !dbg !37 + %263 = bitcast i32 %259 to float, !dbg !37 + %264 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %236, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37 + %265 = extractvalue { i32, i32, i32, i32 } %264, 0, !dbg !37 + %266 = extractvalue { i32, i32, i32, i32 } %264, 1, !dbg !37 + %267 = extractvalue { i32, i32, i32, i32 } %264, 2, !dbg !37 + %268 = extractvalue { i32, i32, i32, i32 } %264, 3, !dbg !37 + %269 = bitcast i32 %265 to float, !dbg !37 + %270 = bitcast i32 %266 to float, !dbg !37 + %271 = bitcast i32 %267 to float, !dbg !37 + %272 = bitcast i32 %268 to float, !dbg !37 + %273 = fadd float %150, %242, !dbg !38 + %274 = fadd float %151, %243, !dbg !38 + %275 = fadd float %152, %244, !dbg !38 + %276 = fadd float %153, %245, !dbg !38 + %277 = fadd float %159, %251, !dbg !38 + %278 = fadd float %160, %252, !dbg !38 + %279 = fadd float %161, %253, !dbg !38 + %280 = fadd float %162, %254, !dbg !38 + %281 = fadd float %168, %260, !dbg !38 + %282 = fadd float %169, %261, !dbg !38 + %283 = fadd float %170, %262, !dbg !38 + %284 = fadd float %171, %263, !dbg !38 + %285 = fadd float %177, %269, !dbg !38 + %286 = fadd float %178, %270, !dbg !38 + %287 = fadd float %179, %271, !dbg !38 + %288 = fadd float %180, %272, !dbg !38 + %289 = fadd float %213, %273, !dbg !39 + %290 = fadd float %214, %274, !dbg !39 + %291 = fadd float %215, %275, !dbg !39 + %292 = fadd float %216, %276, !dbg !39 + %293 = fadd float %217, %277, !dbg !39 + %294 = fadd float %218, %278, !dbg !39 + %295 = fadd float %219, %279, !dbg !39 + %296 = fadd float %220, %280, !dbg !39 + %297 = fadd float %221, %281, !dbg !39 + %298 = fadd float %222, %282, !dbg !39 + %299 = fadd float %223, %283, !dbg !39 + %300 = fadd float %224, %284, !dbg !39 + %301 = fadd float %225, %285, !dbg !39 + %302 = fadd float %226, %286, !dbg !39 + %303 = fadd float %227, %287, !dbg !39 + %304 = fadd float %228, %288, !dbg !39 + %305 = fsub float %289, %114, !dbg !40 + %306 = fsub float %290, %115, !dbg !40 + %307 = fsub float %291, %116, !dbg !40 + %308 = fsub float %292, %117, !dbg !40 + %309 = fsub float %293, %118, !dbg !40 + %310 = fsub float %294, %119, !dbg !40 + %311 = fsub float %295, %120, !dbg !40 + %312 = fsub float %296, %121, !dbg !40 + %313 = fsub float %297, %122, !dbg !40 + %314 = fsub 
float %298, %123, !dbg !40 + %315 = fsub float %299, %124, !dbg !40 + %316 = fsub float %300, %125, !dbg !40 + %317 = fsub float %301, %126, !dbg !40 + %318 = fsub float %302, %127, !dbg !40 + %319 = fsub float %303, %128, !dbg !40 + %320 = fsub float %304, %129, !dbg !40 + %321 = fadd float %66, 1.000000e+00, !dbg !44 + %322 = fadd float %67, 1.000000e+00, !dbg !44 + %323 = fadd float %68, 1.000000e+00, !dbg !44 + %324 = fadd float %69, 1.000000e+00, !dbg !44 + %325 = fadd float %70, 1.000000e+00, !dbg !44 + %326 = fadd float %71, 1.000000e+00, !dbg !44 + %327 = fadd float %72, 1.000000e+00, !dbg !44 + %328 = fadd float %73, 1.000000e+00, !dbg !44 + %329 = fadd float %74, 1.000000e+00, !dbg !44 + %330 = fadd float %75, 1.000000e+00, !dbg !44 + %331 = fadd float %76, 1.000000e+00, !dbg !44 + %332 = fadd float %77, 1.000000e+00, !dbg !44 + %333 = fadd float %78, 1.000000e+00, !dbg !44 + %334 = fadd float %79, 1.000000e+00, !dbg !44 + %335 = fadd float %80, 1.000000e+00, !dbg !44 + %336 = fadd float %81, 1.000000e+00, !dbg !44 + %337 = fadd float %82, 1.000000e+00, !dbg !44 + %338 = fadd float %83, 1.000000e+00, !dbg !44 + %339 = fadd float %84, 1.000000e+00, !dbg !44 + %340 = fadd float %85, 1.000000e+00, !dbg !44 + %341 = fadd float %86, 1.000000e+00, !dbg !44 + %342 = fadd float %87, 1.000000e+00, !dbg !44 + %343 = fadd float %88, 1.000000e+00, !dbg !44 + %344 = fadd float %89, 1.000000e+00, !dbg !44 + %345 = fadd float %90, 1.000000e+00, !dbg !44 + %346 = fadd float %91, 1.000000e+00, !dbg !44 + %347 = fadd float %92, 1.000000e+00, !dbg !44 + %348 = fadd float %93, 1.000000e+00, !dbg !44 + %349 = fadd float %94, 1.000000e+00, !dbg !44 + %350 = fadd float %95, 1.000000e+00, !dbg !44 + %351 = fadd float %96, 1.000000e+00, !dbg !44 + %352 = fadd float %97, 1.000000e+00, !dbg !44 + %353 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %305, float %321) #6, !dbg !45 + %354 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %306, float %322) #6, !dbg !45 + %355 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %307, float %323) #6, !dbg !45 + %356 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %308, float %324) #6, !dbg !45 + %357 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %309, float %325) #6, !dbg !45 + %358 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %310, float %326) #6, !dbg !45 + %359 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %311, float %327) #6, !dbg !45 + %360 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %312, float %328) #6, !dbg !45 + %361 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %313, float %329) #6, !dbg !45 + %362 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %314, float %330) #6, !dbg !45 + %363 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %315, float %331) #6, !dbg !45 + %364 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %316, float %332) #6, !dbg !45 + %365 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %317, float %333) #6, !dbg !45 + %366 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %318, float %334) #6, !dbg !45 + %367 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %319, float %335) #6, !dbg !45 + %368 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %320, float %336) #6, !dbg !45 + %369 = fadd float %114, %353, !dbg !46 + %370 = fadd float %115, %354, !dbg !46 + %371 = fadd 
float %116, %355, !dbg !46 + %372 = fadd float %117, %356, !dbg !46 + %373 = fadd float %118, %357, !dbg !46 + %374 = fadd float %119, %358, !dbg !46 + %375 = fadd float %120, %359, !dbg !46 + %376 = fadd float %121, %360, !dbg !46 + %377 = fadd float %122, %361, !dbg !46 + %378 = fadd float %123, %362, !dbg !46 + %379 = fadd float %124, %363, !dbg !46 + %380 = fadd float %125, %364, !dbg !46 + %381 = fadd float %126, %365, !dbg !46 + %382 = fadd float %127, %366, !dbg !46 + %383 = fadd float %128, %367, !dbg !46 + %384 = fadd float %129, %368, !dbg !46 + %385 = fsub float %289, %369, !dbg !47 + %386 = fsub float %290, %370, !dbg !47 + %387 = fsub float %291, %371, !dbg !47 + %388 = fsub float %292, %372, !dbg !47 + %389 = fsub float %293, %373, !dbg !47 + %390 = fsub float %294, %374, !dbg !47 + %391 = fsub float %295, %375, !dbg !47 + %392 = fsub float %296, %376, !dbg !47 + %393 = fsub float %297, %377, !dbg !47 + %394 = fsub float %298, %378, !dbg !47 + %395 = fsub float %299, %379, !dbg !47 + %396 = fsub float %300, %380, !dbg !47 + %397 = fsub float %301, %381, !dbg !47 + %398 = fsub float %302, %382, !dbg !47 + %399 = fsub float %303, %383, !dbg !47 + %400 = fsub float %304, %384, !dbg !47 + %401 = fmul float %305, %385, !dbg !48 + %402 = fmul float %306, %386, !dbg !48 + %403 = fmul float %307, %387, !dbg !48 + %404 = fmul float %308, %388, !dbg !48 + %405 = fmul float %309, %389, !dbg !48 + %406 = fmul float %310, %390, !dbg !48 + %407 = fmul float %311, %391, !dbg !48 + %408 = fmul float %312, %392, !dbg !48 + %409 = fmul float %313, %393, !dbg !48 + %410 = fmul float %314, %394, !dbg !48 + %411 = fmul float %315, %395, !dbg !48 + %412 = fmul float %316, %396, !dbg !48 + %413 = fmul float %317, %397, !dbg !48 + %414 = fmul float %318, %398, !dbg !48 + %415 = fmul float %319, %399, !dbg !48 + %416 = fmul float %320, %400, !dbg !48 + %417 = fadd float %98, %401, !dbg !49 + %418 = fadd float %99, %402, !dbg !49 + %419 = fadd float %100, %403, !dbg !49 + %420 = fadd float %101, %404, !dbg !49 + %421 = fadd float %102, %405, !dbg !49 + %422 = fadd float %103, %406, !dbg !49 + %423 = fadd float %104, %407, !dbg !49 + %424 = fadd float %105, %408, !dbg !49 + %425 = fadd float %106, %409, !dbg !49 + %426 = fadd float %107, %410, !dbg !49 + %427 = fadd float %108, %411, !dbg !49 + %428 = fadd float %109, %412, !dbg !49 + %429 = fadd float %110, %413, !dbg !49 + %430 = fadd float %111, %414, !dbg !49 + %431 = fadd float %112, %415, !dbg !49 + %432 = fadd float %113, %416, !dbg !49 + %433 = add nuw nsw i32 %130, 64, !dbg !12 + %434 = icmp ult i32 %130, 192, !dbg !12 + br i1 %434, label %65, label %435, !dbg !12 + +435: ; preds = %230 + %436 = and i32 %16, 3, !dbg !12 + %437 = mul nuw nsw i32 %436, 72, !dbg !12 + %438 = add nuw nsw i32 %437, %12, !dbg !12 + %439 = zext nneg i32 %438 to i64, !dbg !12 + %440 = getelementptr float, ptr addrspace(3) @global_smem, i64 %439, !dbg !12 + %441 = insertelement <1 x float> undef, float %337, i64 0, !dbg !12 + store <1 x float> %441, ptr addrspace(3) %440, align 4, !dbg !12 + %442 = add nuw nsw i32 %12, 288, !dbg !12 + %443 = add nuw nsw i32 %442, %437, !dbg !12 + %444 = zext nneg i32 %443 to i64, !dbg !12 + %445 = getelementptr float, ptr addrspace(3) @global_smem, i64 %444, !dbg !12 + %446 = insertelement <1 x float> undef, float %338, i64 0, !dbg !12 + store <1 x float> %446, ptr addrspace(3) %445, align 4, !dbg !12 + %447 = or i32 %12, 576, !dbg !12 + %448 = add nuw nsw i32 %447, %437, !dbg !12 + %449 = zext nneg i32 %448 to i64, !dbg !12 + %450 = 
getelementptr float, ptr addrspace(3) @global_smem, i64 %449, !dbg !12 + %451 = insertelement <1 x float> undef, float %339, i64 0, !dbg !12 + store <1 x float> %451, ptr addrspace(3) %450, align 4, !dbg !12 + %452 = add nuw nsw i32 %12, 864, !dbg !12 + %453 = add nuw nsw i32 %452, %437, !dbg !12 + %454 = zext nneg i32 %453 to i64, !dbg !12 + %455 = getelementptr float, ptr addrspace(3) @global_smem, i64 %454, !dbg !12 + %456 = insertelement <1 x float> undef, float %340, i64 0, !dbg !12 + store <1 x float> %456, ptr addrspace(3) %455, align 4, !dbg !12 + %457 = or i32 %12, 1152, !dbg !12 + %458 = add nuw nsw i32 %457, %437, !dbg !12 + %459 = zext nneg i32 %458 to i64, !dbg !12 + %460 = getelementptr float, ptr addrspace(3) @global_smem, i64 %459, !dbg !12 + %461 = insertelement <1 x float> undef, float %341, i64 0, !dbg !12 + store <1 x float> %461, ptr addrspace(3) %460, align 4, !dbg !12 + %462 = add nuw nsw i32 %12, 1440, !dbg !12 + %463 = add nuw nsw i32 %462, %437, !dbg !12 + %464 = zext nneg i32 %463 to i64, !dbg !12 + %465 = getelementptr float, ptr addrspace(3) @global_smem, i64 %464, !dbg !12 + %466 = insertelement <1 x float> undef, float %342, i64 0, !dbg !12 + store <1 x float> %466, ptr addrspace(3) %465, align 4, !dbg !12 + %467 = or i32 %12, 1728, !dbg !12 + %468 = add nuw nsw i32 %467, %437, !dbg !12 + %469 = zext nneg i32 %468 to i64, !dbg !12 + %470 = getelementptr float, ptr addrspace(3) @global_smem, i64 %469, !dbg !12 + %471 = insertelement <1 x float> undef, float %343, i64 0, !dbg !12 + store <1 x float> %471, ptr addrspace(3) %470, align 4, !dbg !12 + %472 = add nuw nsw i32 %12, 2016, !dbg !12 + %473 = add nuw nsw i32 %472, %437, !dbg !12 + %474 = zext nneg i32 %473 to i64, !dbg !12 + %475 = getelementptr float, ptr addrspace(3) @global_smem, i64 %474, !dbg !12 + %476 = insertelement <1 x float> undef, float %344, i64 0, !dbg !12 + store <1 x float> %476, ptr addrspace(3) %475, align 4, !dbg !12 + tail call void @llvm.nvvm.barrier0(), !dbg !12 + %477 = mul nuw nsw i32 %11, 72, !dbg !12 + %478 = add nuw nsw i32 %477, %14, !dbg !12 + %479 = zext nneg i32 %478 to i64, !dbg !12 + %480 = getelementptr float, ptr addrspace(3) @global_smem, i64 %479, !dbg !12 + %481 = load float, ptr addrspace(3) %480, align 32, !dbg !12 + %482 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 1, !dbg !12 + %483 = load float, ptr addrspace(3) %482, align 4, !dbg !12 + %484 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 2, !dbg !12 + %485 = load float, ptr addrspace(3) %484, align 8, !dbg !12 + %486 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 3, !dbg !12 + %487 = load float, ptr addrspace(3) %486, align 4, !dbg !12 + %488 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 4, !dbg !12 + %489 = load float, ptr addrspace(3) %488, align 16, !dbg !12 + %490 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 5, !dbg !12 + %491 = load float, ptr addrspace(3) %490, align 4, !dbg !12 + %492 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 6, !dbg !12 + %493 = load float, ptr addrspace(3) %492, align 8, !dbg !12 + %494 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 7, !dbg !12 + %495 = load float, ptr addrspace(3) %494, align 4, !dbg !12 + tail call void @llvm.nvvm.barrier0(), !dbg !12 + %496 = insertelement <1 x float> undef, float %345, i64 0, !dbg !12 + store <1 x float> %496, ptr addrspace(3) %440, align 4, !dbg !12 + %497 = 
insertelement <1 x float> undef, float %346, i64 0, !dbg !12 + store <1 x float> %497, ptr addrspace(3) %445, align 4, !dbg !12 + %498 = insertelement <1 x float> undef, float %347, i64 0, !dbg !12 + store <1 x float> %498, ptr addrspace(3) %450, align 4, !dbg !12 + %499 = insertelement <1 x float> undef, float %348, i64 0, !dbg !12 + store <1 x float> %499, ptr addrspace(3) %455, align 4, !dbg !12 + %500 = insertelement <1 x float> undef, float %349, i64 0, !dbg !12 + store <1 x float> %500, ptr addrspace(3) %460, align 4, !dbg !12 + %501 = insertelement <1 x float> undef, float %350, i64 0, !dbg !12 + store <1 x float> %501, ptr addrspace(3) %465, align 4, !dbg !12 + %502 = insertelement <1 x float> undef, float %351, i64 0, !dbg !12 + store <1 x float> %502, ptr addrspace(3) %470, align 4, !dbg !12 + %503 = insertelement <1 x float> undef, float %352, i64 0, !dbg !12 + store <1 x float> %503, ptr addrspace(3) %475, align 4, !dbg !12 + tail call void @llvm.nvvm.barrier0(), !dbg !12 + %504 = load float, ptr addrspace(3) %480, align 32, !dbg !12 + %505 = load float, ptr addrspace(3) %482, align 4, !dbg !12 + %506 = load float, ptr addrspace(3) %484, align 8, !dbg !12 + %507 = load float, ptr addrspace(3) %486, align 4, !dbg !12 + %508 = load float, ptr addrspace(3) %488, align 16, !dbg !12 + %509 = load float, ptr addrspace(3) %490, align 4, !dbg !12 + %510 = load float, ptr addrspace(3) %492, align 8, !dbg !12 + %511 = load float, ptr addrspace(3) %494, align 4, !dbg !12 + %512 = fsub float %370, %369, !dbg !50 + %513 = fadd float %481, %483, !dbg !54 + %514 = fcmp oeq float %513, 0.000000e+00, !dbg !55 + %515 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %483, float %513) #6, !dbg !56 + %516 = select i1 %514, float 0.000000e+00, float %515, !dbg !57 + %517 = fmul float %512, %516, !dbg !58 + %518 = fadd float %369, %517, !dbg !59 + %519 = fadd float %417, %418, !dbg !60 + %520 = fmul float %512, %512, !dbg !61 + %521 = fmul float %520, %481, !dbg !62 + %522 = fmul float %521, %516, !dbg !63 + %523 = fadd float %519, %522, !dbg !64 + %524 = fsub float %371, %518, !dbg !50 + %525 = fadd float %485, %513, !dbg !54 + %526 = fcmp oeq float %525, 0.000000e+00, !dbg !55 + %527 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %485, float %525) #6, !dbg !56 + %528 = select i1 %526, float 0.000000e+00, float %527, !dbg !57 + %529 = fmul float %528, %524, !dbg !58 + %530 = fadd float %518, %529, !dbg !59 + %531 = fadd float %419, %523, !dbg !60 + %532 = fmul float %524, %524, !dbg !61 + %533 = fmul float %513, %532, !dbg !62 + %534 = fmul float %528, %533, !dbg !63 + %535 = fadd float %531, %534, !dbg !64 + %536 = fsub float %372, %530, !dbg !50 + %537 = fadd float %487, %525, !dbg !54 + %538 = fcmp oeq float %537, 0.000000e+00, !dbg !55 + %539 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %487, float %537) #6, !dbg !56 + %540 = select i1 %538, float 0.000000e+00, float %539, !dbg !57 + %541 = fmul float %540, %536, !dbg !58 + %542 = fadd float %530, %541, !dbg !59 + %543 = fadd float %420, %535, !dbg !60 + %544 = fmul float %536, %536, !dbg !61 + %545 = fmul float %525, %544, !dbg !62 + %546 = fmul float %540, %545, !dbg !63 + %547 = fadd float %543, %546, !dbg !64 + %548 = fsub float %373, %542, !dbg !50 + %549 = fadd float %489, %537, !dbg !54 + %550 = fcmp oeq float %549, 0.000000e+00, !dbg !55 + %551 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %489, float %549) #6, !dbg !56 + %552 = select i1 %550, float 0.000000e+00, 
float %551, !dbg !57 + %553 = fmul float %552, %548, !dbg !58 + %554 = fadd float %542, %553, !dbg !59 + %555 = fadd float %421, %547, !dbg !60 + %556 = fmul float %548, %548, !dbg !61 + %557 = fmul float %537, %556, !dbg !62 + %558 = fmul float %552, %557, !dbg !63 + %559 = fadd float %555, %558, !dbg !64 + %560 = fsub float %374, %554, !dbg !50 + %561 = fadd float %491, %549, !dbg !54 + %562 = fcmp oeq float %561, 0.000000e+00, !dbg !55 + %563 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %491, float %561) #6, !dbg !56 + %564 = select i1 %562, float 0.000000e+00, float %563, !dbg !57 + %565 = fmul float %564, %560, !dbg !58 + %566 = fadd float %554, %565, !dbg !59 + %567 = fadd float %422, %559, !dbg !60 + %568 = fmul float %560, %560, !dbg !61 + %569 = fmul float %549, %568, !dbg !62 + %570 = fmul float %564, %569, !dbg !63 + %571 = fadd float %567, %570, !dbg !64 + %572 = fsub float %375, %566, !dbg !50 + %573 = fadd float %493, %561, !dbg !54 + %574 = fcmp oeq float %573, 0.000000e+00, !dbg !55 + %575 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %493, float %573) #6, !dbg !56 + %576 = select i1 %574, float 0.000000e+00, float %575, !dbg !57 + %577 = fmul float %576, %572, !dbg !58 + %578 = fadd float %566, %577, !dbg !59 + %579 = fadd float %423, %571, !dbg !60 + %580 = fmul float %572, %572, !dbg !61 + %581 = fmul float %561, %580, !dbg !62 + %582 = fmul float %576, %581, !dbg !63 + %583 = fadd float %579, %582, !dbg !64 + %584 = fsub float %376, %578, !dbg !50 + %585 = fadd float %495, %573, !dbg !54 + %586 = fcmp oeq float %585, 0.000000e+00, !dbg !55 + %587 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %495, float %585) #6, !dbg !56 + %588 = select i1 %586, float 0.000000e+00, float %587, !dbg !57 + %589 = fmul float %588, %584, !dbg !58 + %590 = fadd float %578, %589, !dbg !59 + %591 = fadd float %424, %583, !dbg !60 + %592 = fmul float %584, %584, !dbg !61 + %593 = fmul float %573, %592, !dbg !62 + %594 = fmul float %588, %593, !dbg !63 + %595 = fadd float %591, %594, !dbg !64 + %596 = fsub float %378, %377, !dbg !50 + %597 = fadd float %504, %505, !dbg !54 + %598 = fcmp oeq float %597, 0.000000e+00, !dbg !55 + %599 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %505, float %597) #6, !dbg !56 + %600 = select i1 %598, float 0.000000e+00, float %599, !dbg !57 + %601 = fmul float %596, %600, !dbg !58 + %602 = fadd float %377, %601, !dbg !59 + %603 = fadd float %425, %426, !dbg !60 + %604 = fmul float %596, %596, !dbg !61 + %605 = fmul float %604, %504, !dbg !62 + %606 = fmul float %605, %600, !dbg !63 + %607 = fadd float %603, %606, !dbg !64 + %608 = fsub float %379, %602, !dbg !50 + %609 = fadd float %506, %597, !dbg !54 + %610 = fcmp oeq float %609, 0.000000e+00, !dbg !55 + %611 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %506, float %609) #6, !dbg !56 + %612 = select i1 %610, float 0.000000e+00, float %611, !dbg !57 + %613 = fmul float %612, %608, !dbg !58 + %614 = fadd float %602, %613, !dbg !59 + %615 = fadd float %427, %607, !dbg !60 + %616 = fmul float %608, %608, !dbg !61 + %617 = fmul float %597, %616, !dbg !62 + %618 = fmul float %612, %617, !dbg !63 + %619 = fadd float %615, %618, !dbg !64 + %620 = fsub float %380, %614, !dbg !50 + %621 = fadd float %507, %609, !dbg !54 + %622 = fcmp oeq float %621, 0.000000e+00, !dbg !55 + %623 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %507, float %621) #6, !dbg !56 + %624 = select i1 %622, float 0.000000e+00, float 
%623, !dbg !57 + %625 = fmul float %624, %620, !dbg !58 + %626 = fadd float %614, %625, !dbg !59 + %627 = fadd float %428, %619, !dbg !60 + %628 = fmul float %620, %620, !dbg !61 + %629 = fmul float %609, %628, !dbg !62 + %630 = fmul float %624, %629, !dbg !63 + %631 = fadd float %627, %630, !dbg !64 + %632 = fsub float %381, %626, !dbg !50 + %633 = fadd float %508, %621, !dbg !54 + %634 = fcmp oeq float %633, 0.000000e+00, !dbg !55 + %635 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %508, float %633) #6, !dbg !56 + %636 = select i1 %634, float 0.000000e+00, float %635, !dbg !57 + %637 = fmul float %636, %632, !dbg !58 + %638 = fadd float %626, %637, !dbg !59 + %639 = fadd float %429, %631, !dbg !60 + %640 = fmul float %632, %632, !dbg !61 + %641 = fmul float %621, %640, !dbg !62 + %642 = fmul float %636, %641, !dbg !63 + %643 = fadd float %639, %642, !dbg !64 + %644 = fsub float %382, %638, !dbg !50 + %645 = fadd float %509, %633, !dbg !54 + %646 = fcmp oeq float %645, 0.000000e+00, !dbg !55 + %647 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %509, float %645) #6, !dbg !56 + %648 = select i1 %646, float 0.000000e+00, float %647, !dbg !57 + %649 = fmul float %648, %644, !dbg !58 + %650 = fadd float %638, %649, !dbg !59 + %651 = fadd float %430, %643, !dbg !60 + %652 = fmul float %644, %644, !dbg !61 + %653 = fmul float %633, %652, !dbg !62 + %654 = fmul float %648, %653, !dbg !63 + %655 = fadd float %651, %654, !dbg !64 + %656 = fsub float %383, %650, !dbg !50 + %657 = fadd float %510, %645, !dbg !54 + %658 = fcmp oeq float %657, 0.000000e+00, !dbg !55 + %659 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %510, float %657) #6, !dbg !56 + %660 = select i1 %658, float 0.000000e+00, float %659, !dbg !57 + %661 = fmul float %660, %656, !dbg !58 + %662 = fadd float %650, %661, !dbg !59 + %663 = fadd float %431, %655, !dbg !60 + %664 = fmul float %656, %656, !dbg !61 + %665 = fmul float %645, %664, !dbg !62 + %666 = fmul float %660, %665, !dbg !63 + %667 = fadd float %663, %666, !dbg !64 + %668 = fsub float %384, %662, !dbg !50 + %669 = fadd float %511, %657, !dbg !54 + %670 = fcmp oeq float %669, 0.000000e+00, !dbg !55 + %671 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %511, float %669) #6, !dbg !56 + %672 = select i1 %670, float 0.000000e+00, float %671, !dbg !57 + %673 = fmul float %672, %668, !dbg !58 + %674 = fadd float %662, %673, !dbg !59 + %675 = fadd float %432, %667, !dbg !60 + %676 = fmul float %668, %668, !dbg !61 + %677 = fmul float %657, %676, !dbg !62 + %678 = fmul float %672, %677, !dbg !63 + %679 = fadd float %675, %678, !dbg !64 + %680 = bitcast float %590 to i32, !dbg !65 + %681 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %680, i32 4, i32 31), !dbg !65 + %682 = bitcast i32 %681 to float, !dbg !65 + %683 = bitcast float %595 to i32, !dbg !65 + %684 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %683, i32 4, i32 31), !dbg !65 + %685 = bitcast i32 %684 to float, !dbg !65 + %686 = bitcast float %585 to i32, !dbg !65 + %687 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %686, i32 4, i32 31), !dbg !65 + %688 = bitcast i32 %687 to float, !dbg !65 + %689 = fsub float %682, %590, !dbg !50 + %690 = fadd float %585, %688, !dbg !54 + %691 = fcmp oeq float %690, 0.000000e+00, !dbg !55 + %692 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %688, float %690) #6, !dbg !56 + %693 = select i1 %691, float 0.000000e+00, float %692, !dbg !57 + %694 = fmul float %693, 
%689, !dbg !58 + %695 = fadd float %590, %694, !dbg !59 + %696 = fadd float %595, %685, !dbg !60 + %697 = fmul float %689, %689, !dbg !61 + %698 = fmul float %585, %697, !dbg !62 + %699 = fmul float %693, %698, !dbg !63 + %700 = fadd float %696, %699, !dbg !64 + %701 = bitcast float %695 to i32, !dbg !65 + %702 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %701, i32 2, i32 31), !dbg !65 + %703 = bitcast i32 %702 to float, !dbg !65 + %704 = bitcast float %700 to i32, !dbg !65 + %705 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %704, i32 2, i32 31), !dbg !65 + %706 = bitcast i32 %705 to float, !dbg !65 + %707 = bitcast float %690 to i32, !dbg !65 + %708 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %707, i32 2, i32 31), !dbg !65 + %709 = bitcast i32 %708 to float, !dbg !65 + %710 = fsub float %703, %695, !dbg !50 + %711 = fadd float %690, %709, !dbg !54 + %712 = fcmp oeq float %711, 0.000000e+00, !dbg !55 + %713 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %709, float %711) #6, !dbg !56 + %714 = select i1 %712, float 0.000000e+00, float %713, !dbg !57 + %715 = fmul float %714, %710, !dbg !58 + %716 = fadd float %695, %715, !dbg !59 + %717 = fadd float %700, %706, !dbg !60 + %718 = fmul float %710, %710, !dbg !61 + %719 = fmul float %690, %718, !dbg !62 + %720 = fmul float %714, %719, !dbg !63 + %721 = fadd float %717, %720, !dbg !64 + %722 = bitcast float %716 to i32, !dbg !65 + %723 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %722, i32 1, i32 31), !dbg !65 + %724 = bitcast i32 %723 to float, !dbg !65 + %725 = bitcast float %721 to i32, !dbg !65 + %726 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %725, i32 1, i32 31), !dbg !65 + %727 = bitcast i32 %726 to float, !dbg !65 + %728 = bitcast float %711 to i32, !dbg !65 + %729 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %728, i32 1, i32 31), !dbg !65 + %730 = bitcast i32 %729 to float, !dbg !65 + %731 = fsub float %724, %716, !dbg !50 + %732 = fadd float %711, %730, !dbg !54 + %733 = fcmp oeq float %732, 0.000000e+00, !dbg !55 + %734 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %730, float %732) #6, !dbg !56 + %735 = select i1 %733, float 0.000000e+00, float %734, !dbg !57 + %736 = fmul float %731, %735, !dbg !58 + %737 = fadd float %716, %736, !dbg !59 + %738 = fadd float %721, %727, !dbg !60 + %739 = fmul float %731, %731, !dbg !61 + %740 = fmul float %711, %739, !dbg !62 + %741 = fmul float %735, %740, !dbg !63 + %742 = fadd float %738, %741, !dbg !64 + %743 = bitcast float %674 to i32, !dbg !65 + %744 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %743, i32 4, i32 31), !dbg !65 + %745 = bitcast i32 %744 to float, !dbg !65 + %746 = bitcast float %679 to i32, !dbg !65 + %747 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %746, i32 4, i32 31), !dbg !65 + %748 = bitcast i32 %747 to float, !dbg !65 + %749 = bitcast float %669 to i32, !dbg !65 + %750 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %749, i32 4, i32 31), !dbg !65 + %751 = bitcast i32 %750 to float, !dbg !65 + %752 = fsub float %745, %674, !dbg !50 + %753 = fadd float %669, %751, !dbg !54 + %754 = fcmp oeq float %753, 0.000000e+00, !dbg !55 + %755 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %751, float %753) #6, !dbg !56 + %756 = select i1 %754, float 0.000000e+00, float %755, !dbg !57 + %757 = fmul float %752, %756, !dbg !58 + %758 = fadd float %674, %757, !dbg !59 + %759 = fadd float %679, %748, !dbg !60 + %760 = 
fmul float %752, %752, !dbg !61 + %761 = fmul float %669, %760, !dbg !62 + %762 = fmul float %761, %756, !dbg !63 + %763 = fadd float %759, %762, !dbg !64 + %764 = bitcast float %758 to i32, !dbg !65 + %765 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %764, i32 2, i32 31), !dbg !65 + %766 = bitcast i32 %765 to float, !dbg !65 + %767 = bitcast float %763 to i32, !dbg !65 + %768 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %767, i32 2, i32 31), !dbg !65 + %769 = bitcast i32 %768 to float, !dbg !65 + %770 = bitcast float %753 to i32, !dbg !65 + %771 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %770, i32 2, i32 31), !dbg !65 + %772 = bitcast i32 %771 to float, !dbg !65 + %773 = fsub float %766, %758, !dbg !50 + %774 = fadd float %753, %772, !dbg !54 + %775 = fcmp oeq float %774, 0.000000e+00, !dbg !55 + %776 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %772, float %774) #6, !dbg !56 + %777 = select i1 %775, float 0.000000e+00, float %776, !dbg !57 + %778 = fmul float %773, %777, !dbg !58 + %779 = fadd float %758, %778, !dbg !59 + %780 = fadd float %763, %769, !dbg !60 + %781 = fmul float %773, %773, !dbg !61 + %782 = fmul float %753, %781, !dbg !62 + %783 = fmul float %777, %782, !dbg !63 + %784 = fadd float %780, %783, !dbg !64 + %785 = bitcast float %779 to i32, !dbg !65 + %786 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %785, i32 1, i32 31), !dbg !65 + %787 = bitcast i32 %786 to float, !dbg !65 + %788 = bitcast float %784 to i32, !dbg !65 + %789 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %788, i32 1, i32 31), !dbg !65 + %790 = bitcast i32 %789 to float, !dbg !65 + %791 = bitcast float %774 to i32, !dbg !65 + %792 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %791, i32 1, i32 31), !dbg !65 + %793 = bitcast i32 %792 to float, !dbg !65 + %794 = fsub float %787, %779, !dbg !50 + %795 = fadd float %774, %793, !dbg !54 + %796 = fcmp oeq float %795, 0.000000e+00, !dbg !55 + %797 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %793, float %795) #6, !dbg !56 + %798 = select i1 %796, float 0.000000e+00, float %797, !dbg !57 + %799 = fmul float %794, %798, !dbg !58 + %800 = fadd float %779, %799, !dbg !59 + %801 = fadd float %784, %790, !dbg !60 + %802 = fmul float %794, %794, !dbg !61 + %803 = fmul float %774, %802, !dbg !62 + %804 = fmul float %798, %803, !dbg !63 + %805 = fadd float %801, %804, !dbg !64 + %806 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67 + %807 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67 + %808 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67 + %809 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67 + %810 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67 + %811 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67 + %812 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67 + %813 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67 + %814 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67 + %815 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, 
!dbg !67 + %816 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67 + %817 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67 + %818 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67 + %819 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67 + %820 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67 + %821 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67 + %822 = fadd float %806, 0x3EE4F8B580000000, !dbg !68 + %823 = fadd float %814, 0x3EE4F8B580000000, !dbg !68 + br label %824, !dbg !69 + +824: ; preds = %435, %__nv_rsqrtf.exit40 + %825 = phi i32 [ 0, %435 ], [ %1134, %__nv_rsqrtf.exit40 ] + %826 = or i32 %825, %14, !dbg !70 + %827 = or i32 %825, %15, !dbg !70 + %828 = add i32 %826, %47, !dbg !71 + %829 = add i32 %827, %47, !dbg !71 + %830 = add i32 %826, %48, !dbg !71 + %831 = add i32 %827, %48, !dbg !71 + %832 = sext i32 %828 to i64, !dbg !72 + %833 = getelementptr float, ptr addrspace(1) %2, i64 %832, !dbg !72 + %834 = sext i32 %829 to i64, !dbg !72 + %835 = getelementptr float, ptr addrspace(1) %2, i64 %834, !dbg !72 + %836 = sext i32 %830 to i64, !dbg !72 + %837 = getelementptr float, ptr addrspace(1) %2, i64 %836, !dbg !72 + %838 = sext i32 %831 to i64, !dbg !72 + %839 = getelementptr float, ptr addrspace(1) %2, i64 %838, !dbg !72 + %840 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %833, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73 + %841 = extractvalue { i32, i32, i32, i32 } %840, 0, !dbg !73 + %842 = extractvalue { i32, i32, i32, i32 } %840, 1, !dbg !73 + %843 = extractvalue { i32, i32, i32, i32 } %840, 2, !dbg !73 + %844 = extractvalue { i32, i32, i32, i32 } %840, 3, !dbg !73 + %845 = bitcast i32 %841 to float, !dbg !73 + %846 = bitcast i32 %842 to float, !dbg !73 + %847 = bitcast i32 %843 to float, !dbg !73 + %848 = bitcast i32 %844 to float, !dbg !73 + %849 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %835, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73 + %850 = extractvalue { i32, i32, i32, i32 } %849, 0, !dbg !73 + %851 = extractvalue { i32, i32, i32, i32 } %849, 1, !dbg !73 + %852 = extractvalue { i32, i32, i32, i32 } %849, 2, !dbg !73 + %853 = extractvalue { i32, i32, i32, i32 } %849, 3, !dbg !73 + %854 = bitcast i32 %850 to float, !dbg !73 + %855 = bitcast i32 %851 to float, !dbg !73 + %856 = bitcast i32 %852 to float, !dbg !73 + %857 = bitcast i32 %853 to float, !dbg !73 + %858 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 
ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %837, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73 + %859 = extractvalue { i32, i32, i32, i32 } %858, 0, !dbg !73 + %860 = extractvalue { i32, i32, i32, i32 } %858, 1, !dbg !73 + %861 = extractvalue { i32, i32, i32, i32 } %858, 2, !dbg !73 + %862 = extractvalue { i32, i32, i32, i32 } %858, 3, !dbg !73 + %863 = bitcast i32 %859 to float, !dbg !73 + %864 = bitcast i32 %860 to float, !dbg !73 + %865 = bitcast i32 %861 to float, !dbg !73 + %866 = bitcast i32 %862 to float, !dbg !73 + %867 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %839, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73 + %868 = extractvalue { i32, i32, i32, i32 } %867, 0, !dbg !73 + %869 = extractvalue { i32, i32, i32, i32 } %867, 1, !dbg !73 + %870 = extractvalue { i32, i32, i32, i32 } %867, 2, !dbg !73 + %871 = extractvalue { i32, i32, i32, i32 } %867, 3, !dbg !73 + %872 = bitcast i32 %868 to float, !dbg !73 + %873 = bitcast i32 %869 to float, !dbg !73 + %874 = bitcast i32 %870 to float, !dbg !73 + %875 = bitcast i32 %871 to float, !dbg !73 + %876 = add i32 %826, %49, !dbg !74 + %877 = add i32 %826, %50, !dbg !74 + %878 = sext i32 %876 to i64, !dbg !75 + %879 = getelementptr i16, ptr addrspace(1) %3, i64 %878, !dbg !75 + %880 = sext i32 %877 to i64, !dbg !75 + %881 = getelementptr i16, ptr addrspace(1) %3, i64 %880, !dbg !75 + %882 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %879, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76 + %883 = extractvalue { i32, i32, i32, i32 } %882, 0, !dbg !76 + %884 = extractvalue { i32, i32, i32, i32 } %882, 1, !dbg !76 + %885 = extractvalue { i32, i32, i32, i32 } %882, 2, !dbg !76 + %886 = extractvalue { i32, i32, i32, i32 } %882, 3, !dbg !76 + %887 = trunc i32 %883 to i16, !dbg !76 + %extelt.offset = lshr i32 %883, 16, !dbg !76 + %888 = trunc i32 %extelt.offset to i16, !dbg !76 + %889 = trunc i32 %884 to i16, !dbg !76 + %extelt.offset2 = lshr i32 %884, 16, !dbg !76 + %890 = trunc i32 %extelt.offset2 to i16, !dbg !76 + %891 = trunc i32 %885 to i16, !dbg !76 + %extelt.offset3 = lshr i32 %885, 16, !dbg !76 + %892 = trunc i32 %extelt.offset3 to i16, !dbg !76 + %893 = trunc i32 %886 to i16, !dbg !76 + %extelt.offset4 = lshr i32 %886, 16, !dbg !76 + %894 = trunc i32 %extelt.offset4 to i16, !dbg !76 + %895 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", 
"=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %881, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76 + %896 = extractvalue { i32, i32, i32, i32 } %895, 0, !dbg !76 + %897 = extractvalue { i32, i32, i32, i32 } %895, 1, !dbg !76 + %898 = extractvalue { i32, i32, i32, i32 } %895, 2, !dbg !76 + %899 = extractvalue { i32, i32, i32, i32 } %895, 3, !dbg !76 + %900 = trunc i32 %896 to i16, !dbg !76 + %extelt.offset5 = lshr i32 %896, 16, !dbg !76 + %901 = trunc i32 %extelt.offset5 to i16, !dbg !76 + %902 = trunc i32 %897 to i16, !dbg !76 + %extelt.offset6 = lshr i32 %897, 16, !dbg !76 + %903 = trunc i32 %extelt.offset6 to i16, !dbg !76 + %904 = trunc i32 %898 to i16, !dbg !76 + %extelt.offset7 = lshr i32 %898, 16, !dbg !76 + %905 = trunc i32 %extelt.offset7 to i16, !dbg !76 + %906 = trunc i32 %899 to i16, !dbg !76 + %extelt.offset8 = lshr i32 %899, 16, !dbg !76 + %907 = trunc i32 %extelt.offset8 to i16, !dbg !76 + %908 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %887) #6, !dbg !77 + %909 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %888) #6, !dbg !77 + %910 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %889) #6, !dbg !77 + %911 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %890) #6, !dbg !77 + %912 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %891) #6, !dbg !77 + %913 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %892) #6, !dbg !77 + %914 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %893) #6, !dbg !77 + %915 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %894) #6, !dbg !77 + %916 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %900) #6, !dbg !77 + %917 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %901) #6, !dbg !77 + %918 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %902) #6, !dbg !77 + %919 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %903) #6, !dbg !77 + %920 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %904) #6, !dbg !77 + %921 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %905) #6, !dbg !77 + %922 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %906) #6, !dbg !77 + %923 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %907) #6, !dbg !77 + %924 = zext nneg i32 %826 to i64, !dbg !78 + %925 = getelementptr float, ptr addrspace(1) %4, i64 %924, !dbg !78 + %926 = zext nneg i32 %827 to i64, !dbg !78 + %927 = getelementptr float, ptr addrspace(1) %4, i64 %926, !dbg !78 + %928 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %925, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79 + %929 = extractvalue { i32, i32, i32, i32 } %928, 0, !dbg !79 + %930 = extractvalue { i32, i32, i32, i32 } %928, 1, !dbg !79 + %931 = extractvalue { i32, i32, i32, i32 } %928, 2, !dbg !79 + %932 = extractvalue { i32, i32, i32, i32 } %928, 3, !dbg !79 + %933 = bitcast i32 %929 to float, !dbg !79 + %934 = bitcast i32 %930 to float, !dbg !79 + %935 = bitcast i32 %931 to float, !dbg !79 + %936 = bitcast i32 %932 to float, !dbg !79 + %937 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 
0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %927, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79 + %938 = extractvalue { i32, i32, i32, i32 } %937, 0, !dbg !79 + %939 = extractvalue { i32, i32, i32, i32 } %937, 1, !dbg !79 + %940 = extractvalue { i32, i32, i32, i32 } %937, 2, !dbg !79 + %941 = extractvalue { i32, i32, i32, i32 } %937, 3, !dbg !79 + %942 = bitcast i32 %938 to float, !dbg !79 + %943 = bitcast i32 %939 to float, !dbg !79 + %944 = bitcast i32 %940 to float, !dbg !79 + %945 = bitcast i32 %941 to float, !dbg !79 + br i1 %56, label %946, label %947, !dbg !80 + +946: ; preds = %824 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80 + br label %947, !dbg !80 + +947: ; preds = %946, %824 + %948 = getelementptr float, ptr addrspace(1) %63, i64 %924, !dbg !81 + %949 = getelementptr float, ptr addrspace(1) %63, i64 %926, !dbg !81 + %950 = getelementptr float, ptr addrspace(1) %64, i64 %924, !dbg !81 + %951 = getelementptr float, ptr addrspace(1) %64, i64 %926, !dbg !81 + %952 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %948, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82 + %953 = extractvalue { i32, i32, i32, i32 } %952, 0, !dbg !82 + %954 = extractvalue { i32, i32, i32, i32 } %952, 1, !dbg !82 + %955 = extractvalue { i32, i32, i32, i32 } %952, 2, !dbg !82 + %956 = extractvalue { i32, i32, i32, i32 } %952, 3, !dbg !82 + %957 = bitcast i32 %953 to float, !dbg !82 + %958 = bitcast i32 %954 to float, !dbg !82 + %959 = bitcast i32 %955 to float, !dbg !82 + %960 = bitcast i32 %956 to float, !dbg !82 + %961 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %949, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82 + %962 = extractvalue { i32, i32, i32, i32 } %961, 0, !dbg !82 + %963 = extractvalue { i32, i32, i32, i32 } %961, 1, !dbg !82 + %964 = extractvalue { i32, i32, i32, i32 } %961, 2, !dbg !82 + %965 = extractvalue { i32, i32, i32, i32 } %961, 3, !dbg !82 + %966 = bitcast i32 %962 to float, !dbg !82 + %967 = bitcast i32 %963 to float, !dbg !82 + %968 = bitcast i32 %964 to float, !dbg !82 + %969 = bitcast i32 %965 to float, !dbg !82 + %970 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %950, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 
0, i1 true) #6, !dbg !82 + %971 = extractvalue { i32, i32, i32, i32 } %970, 0, !dbg !82 + %972 = extractvalue { i32, i32, i32, i32 } %970, 1, !dbg !82 + %973 = extractvalue { i32, i32, i32, i32 } %970, 2, !dbg !82 + %974 = extractvalue { i32, i32, i32, i32 } %970, 3, !dbg !82 + %975 = bitcast i32 %971 to float, !dbg !82 + %976 = bitcast i32 %972 to float, !dbg !82 + %977 = bitcast i32 %973 to float, !dbg !82 + %978 = bitcast i32 %974 to float, !dbg !82 + %979 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %951, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82 + %980 = extractvalue { i32, i32, i32, i32 } %979, 0, !dbg !82 + %981 = extractvalue { i32, i32, i32, i32 } %979, 1, !dbg !82 + %982 = extractvalue { i32, i32, i32, i32 } %979, 2, !dbg !82 + %983 = extractvalue { i32, i32, i32, i32 } %979, 3, !dbg !82 + %984 = bitcast i32 %980 to float, !dbg !82 + %985 = bitcast i32 %981 to float, !dbg !82 + %986 = bitcast i32 %982 to float, !dbg !82 + %987 = bitcast i32 %983 to float, !dbg !82 + %988 = fadd float %845, %957, !dbg !83 + %989 = fadd float %846, %958, !dbg !83 + %990 = fadd float %847, %959, !dbg !83 + %991 = fadd float %848, %960, !dbg !83 + %992 = fadd float %854, %966, !dbg !83 + %993 = fadd float %855, %967, !dbg !83 + %994 = fadd float %856, %968, !dbg !83 + %995 = fadd float %857, %969, !dbg !83 + %996 = fadd float %863, %975, !dbg !83 + %997 = fadd float %864, %976, !dbg !83 + %998 = fadd float %865, %977, !dbg !83 + %999 = fadd float %866, %978, !dbg !83 + %1000 = fadd float %872, %984, !dbg !83 + %1001 = fadd float %873, %985, !dbg !83 + %1002 = fadd float %874, %986, !dbg !83 + %1003 = fadd float %875, %987, !dbg !83 + %1004 = fadd float %908, %988, !dbg !84 + %1005 = fadd float %909, %989, !dbg !84 + %1006 = fadd float %910, %990, !dbg !84 + %1007 = fadd float %911, %991, !dbg !84 + %1008 = fadd float %912, %992, !dbg !84 + %1009 = fadd float %913, %993, !dbg !84 + %1010 = fadd float %914, %994, !dbg !84 + %1011 = fadd float %915, %995, !dbg !84 + %1012 = fadd float %916, %996, !dbg !84 + %1013 = fadd float %917, %997, !dbg !84 + %1014 = fadd float %918, %998, !dbg !84 + %1015 = fadd float %919, %999, !dbg !84 + %1016 = fadd float %920, %1000, !dbg !84 + %1017 = fadd float %921, %1001, !dbg !84 + %1018 = fadd float %922, %1002, !dbg !84 + %1019 = fadd float %923, %1003, !dbg !84 + %1020 = fsub float %1004, %737, !dbg !85 + %1021 = fsub float %1005, %737, !dbg !85 + %1022 = fsub float %1006, %737, !dbg !85 + %1023 = fsub float %1007, %737, !dbg !85 + %1024 = fsub float %1008, %737, !dbg !85 + %1025 = fsub float %1009, %737, !dbg !85 + %1026 = fsub float %1010, %737, !dbg !85 + %1027 = fsub float %1011, %737, !dbg !85 + %1028 = fsub float %1012, %800, !dbg !85 + %1029 = fsub float %1013, %800, !dbg !85 + %1030 = fsub float %1014, %800, !dbg !85 + %1031 = fsub float %1015, %800, !dbg !85 + %1032 = fsub float %1016, %800, !dbg !85 + %1033 = fsub float %1017, %800, !dbg !85 + %1034 = fsub float %1018, %800, !dbg !85 + %1035 = fsub float %1019, %800, !dbg !85 + %1036 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %.not.i = icmp eq i32 %1036, 0, !dbg !86 + br i1 %.not.i, label %1039, label %1037, !dbg !86 + 
+1037: ; preds = %947 + %1038 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %822), !dbg !86 + br label %__nv_rsqrtf.exit, !dbg !86 + +1039: ; preds = %947 + %1040 = tail call float @llvm.nvvm.rsqrt.approx.f(float %822), !dbg !86 + br label %__nv_rsqrtf.exit, !dbg !86 + +__nv_rsqrtf.exit: ; preds = %1037, %1039 + %.0.i = phi float [ %1038, %1037 ], [ %1040, %1039 ], !dbg !86 + %1041 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1042 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1043 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1044 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1045 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1046 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1047 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1048 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %.not.i38 = icmp eq i32 %1048, 0, !dbg !86 + br i1 %.not.i38, label %1051, label %1049, !dbg !86 + +1049: ; preds = %__nv_rsqrtf.exit + %1050 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %823), !dbg !86 + br label %__nv_rsqrtf.exit40, !dbg !86 + +1051: ; preds = %__nv_rsqrtf.exit + %1052 = tail call float @llvm.nvvm.rsqrt.approx.f(float %823), !dbg !86 + br label %__nv_rsqrtf.exit40, !dbg !86 + +__nv_rsqrtf.exit40: ; preds = %1049, %1051 + %.0.i39 = phi float [ %1050, %1049 ], [ %1052, %1051 ], !dbg !86 + %1053 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1054 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1055 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1056 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1057 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1058 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1059 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %1060 = fmul float %1020, %.0.i, !dbg !87 + %1061 = fmul float %1021, %.0.i, !dbg !87 + %1062 = fmul float %1022, %.0.i, !dbg !87 + %1063 = fmul float %1023, %.0.i, !dbg !87 + %1064 = fmul float %1024, %.0.i, !dbg !87 + %1065 = fmul float %1025, %.0.i, !dbg !87 + %1066 = fmul float %1026, %.0.i, !dbg !87 + %1067 = fmul float %1027, %.0.i, !dbg !87 + %1068 = fmul float %1028, %.0.i39, !dbg !87 + %1069 = fmul float %1029, %.0.i39, !dbg !87 + %1070 = fmul float %1030, %.0.i39, !dbg !87 + %1071 = fmul float %1031, %.0.i39, !dbg !87 + %1072 = fmul float %1032, %.0.i39, !dbg !87 + %1073 = fmul float %1033, %.0.i39, !dbg !87 + %1074 = fmul float %1034, %.0.i39, !dbg !87 + %1075 = fmul float %1035, %.0.i39, !dbg !87 + %1076 = fmul float %1060, %933, !dbg !88 + %1077 = fmul float %1061, %934, !dbg !88 + %1078 = fmul float %1062, %935, !dbg !88 + %1079 = fmul float %1063, %936, !dbg !88 + %1080 = fmul float %1064, %942, !dbg !88 + %1081 = fmul float %1065, %943, !dbg !88 + %1082 = fmul float %1066, %944, !dbg !88 + %1083 = fmul float %1067, %945, !dbg !88 + %1084 = fmul float %1068, %933, !dbg !88 + %1085 = fmul float %1069, %934, !dbg !88 + %1086 = fmul float %1070, %935, !dbg !88 + %1087 = fmul float %1071, %936, !dbg !88 + %1088 = fmul float %1072, %942, !dbg !88 + %1089 = fmul float %1073, %943, !dbg !88 + %1090 = fmul float %1074, %944, !dbg !88 + %1091 = fmul float %1075, %945, !dbg !88 + %1092 = getelementptr i16, ptr addrspace(1) %5, i64 %878, !dbg !89 + %1093 = getelementptr i16, ptr addrspace(1) %5, i64 %880, !dbg !89 + %1094 = tail call i16 
asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1076) #6, !dbg !90 + %1095 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1077) #6, !dbg !90 + %1096 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1078) #6, !dbg !90 + %1097 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1079) #6, !dbg !90 + %1098 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1080) #6, !dbg !90 + %1099 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1081) #6, !dbg !90 + %1100 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1082) #6, !dbg !90 + %1101 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1083) #6, !dbg !90 + %1102 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1084) #6, !dbg !90 + %1103 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1085) #6, !dbg !90 + %1104 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1086) #6, !dbg !90 + %1105 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1087) #6, !dbg !90 + %1106 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1088) #6, !dbg !90 + %1107 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1089) #6, !dbg !90 + %1108 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1090) #6, !dbg !90 + %1109 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1091) #6, !dbg !90 + %1110 = insertelement <2 x i16> undef, i16 %1094, i64 0, !dbg !90 + %1111 = insertelement <2 x i16> %1110, i16 %1095, i64 1, !dbg !90 + %1112 = bitcast <2 x i16> %1111 to i32, !dbg !90 + %1113 = insertelement <2 x i16> undef, i16 %1096, i64 0, !dbg !90 + %1114 = insertelement <2 x i16> %1113, i16 %1097, i64 1, !dbg !90 + %1115 = bitcast <2 x i16> %1114 to i32, !dbg !90 + %1116 = insertelement <2 x i16> undef, i16 %1098, i64 0, !dbg !90 + %1117 = insertelement <2 x i16> %1116, i16 %1099, i64 1, !dbg !90 + %1118 = bitcast <2 x i16> %1117 to i32, !dbg !90 + %1119 = insertelement <2 x i16> undef, i16 %1100, i64 0, !dbg !90 + %1120 = insertelement <2 x i16> %1119, i16 %1101, i64 1, !dbg !90 + %1121 = bitcast <2 x i16> %1120 to i32, !dbg !90 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1112, i32 %1115, i32 %1118, i32 %1121, ptr addrspace(1) %1092, i1 true) #6, !dbg !90 + %1122 = insertelement <2 x i16> undef, i16 %1102, i64 0, !dbg !90 + %1123 = insertelement <2 x i16> %1122, i16 %1103, i64 1, !dbg !90 + %1124 = bitcast <2 x i16> %1123 to i32, !dbg !90 + %1125 = insertelement <2 x i16> undef, i16 %1104, i64 0, !dbg !90 + %1126 = insertelement <2 x i16> %1125, i16 %1105, i64 1, !dbg !90 + %1127 = bitcast <2 x i16> %1126 to i32, !dbg !90 + %1128 = insertelement <2 x i16> undef, i16 %1106, i64 0, !dbg !90 + %1129 = insertelement <2 x i16> %1128, i16 %1107, i64 1, !dbg !90 + %1130 = bitcast <2 x i16> %1129 to i32, !dbg !90 + %1131 = insertelement <2 x i16> undef, i16 %1108, i64 0, !dbg !90 + %1132 = insertelement <2 x i16> %1131, i16 %1109, i64 1, !dbg !90 + %1133 = bitcast <2 x i16> %1132 to i32, !dbg !90 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1124, i32 %1127, i32 %1130, i32 %1133, ptr addrspace(1) %1093, i1 true) #6, !dbg !90 + %1134 = add nuw nsw i32 %825, 64, !dbg !69 + %1135 = icmp ult i32 %825, 192, !dbg !69 + br i1 %1135, label %824, label %1136, !dbg !69 + +1136: ; preds = %__nv_rsqrtf.exit40 + ret void, !dbg !91 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable 
willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn") +!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 22, column: 44, scope: !7) +!11 = !DILocation(line: 24, column: 33, scope: !7) +!12 = !DILocation(line: 31, column: 36, scope: !7) +!13 = !DILocation(line: 21, column: 28, scope: !7) +!14 = !DILocation(line: 21, column: 33, scope: !7) +!15 = !DILocation(line: 22, column: 23, scope: !7) +!16 = !DILocation(line: 26, column: 30, scope: !7) +!17 = !DILocation(line: 26, column: 35, scope: !7) +!18 = !DILocation(line: 27, column: 18, scope: !7) +!19 = !DILocation(line: 35, column: 44, scope: !7) +!20 = !DILocation(line: 36, column: 44, scope: !7) +!21 = 
!DILocation(line: 37, column: 22, scope: !7) +!22 = !DILocation(line: 38, column: 22, scope: !7) +!23 = !DILocation(line: 39, column: 36, scope: !7) +!24 = !DILocation(line: 40, column: 40, scope: !7) +!25 = !DILocation(line: 41, column: 44, scope: !7) +!26 = !DILocation(line: 32, column: 27, scope: !7) +!27 = !DILocation(line: 35, column: 40, scope: !7) +!28 = !DILocation(line: 35, column: 34, scope: !7) +!29 = !DILocation(line: 35, column: 50, scope: !7) +!30 = !DILocation(line: 36, column: 40, scope: !7) +!31 = !DILocation(line: 36, column: 34, scope: !7) +!32 = !DILocation(line: 36, column: 50, scope: !7) +!33 = !DILocation(line: 36, column: 101, scope: !7) +!34 = !DILocation(line: 40, column: 55, scope: !7) +!35 = !DILocation(line: 41, column: 40, scope: !7) +!36 = !DILocation(line: 41, column: 34, scope: !7) +!37 = !DILocation(line: 41, column: 52, scope: !7) +!38 = !DILocation(line: 42, column: 22, scope: !7) +!39 = !DILocation(line: 44, column: 22, scope: !7) +!40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43) +!41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0) +!42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!43 = !DILocation(line: 47, column: 41, scope: !41) +!44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43) +!45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43) +!46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43) +!47 = !DILocation(line: 101, column: 30, scope: !41, inlinedAt: !43) +!48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43) +!49 = !DILocation(line: 50, column: 50, scope: !7) +!50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52) +!51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0) +!52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53) +!53 = !DILocation(line: 53, column: 44, scope: !51) +!54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52) +!55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52) +!56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52) +!57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52) +!58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52) +!59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52) +!60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52) +!61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52) +!62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52) +!63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52) +!64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52) +!65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66) +!66 = !DILocation(line: 53, column: 44, scope: !41) +!67 = !DILocation(line: 75, column: 24, scope: !7) +!68 = !DILocation(line: 77, column: 24, scope: !7) +!69 = !DILocation(line: 58, column: 36, scope: !7) +!70 = !DILocation(line: 59, column: 27, scope: !7) +!71 = !DILocation(line: 62, column: 41, scope: !7) +!72 = !DILocation(line: 62, column: 35, scope: !7) +!73 = !DILocation(line: 62, column: 51, scope: !7) +!74 = !DILocation(line: 63, column: 41, scope: !7) +!75 = !DILocation(line: 63, column: 35, scope: !7) +!76 = !DILocation(line: 63, column: 51, scope: !7) +!77 = !DILocation(line: 63, column: 103, scope: !7) +!78 = !DILocation(line: 64, column: 35, scope: !7) +!79 = !DILocation(line: 64, column: 40, scope: !7) +!80 = 
!DILocation(line: 68, column: 57, scope: !7) +!81 = !DILocation(line: 69, column: 35, scope: !7) +!82 = !DILocation(line: 69, column: 54, scope: !7) +!83 = !DILocation(line: 70, column: 24, scope: !7) +!84 = !DILocation(line: 72, column: 24, scope: !7) +!85 = !DILocation(line: 73, column: 24, scope: !7) +!86 = !DILocation(line: 78, column: 30, scope: !7) +!87 = !DILocation(line: 79, column: 24, scope: !7) +!88 = !DILocation(line: 80, column: 24, scope: !7) +!89 = !DILocation(line: 82, column: 29, scope: !7) +!90 = !DILocation(line: 82, column: 52, scope: !7) +!91 = !DILocation(line: 58, column: 4, scope: !7) diff --git a/.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttir b/.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..abf505c4535e94e854eed877e6ccce0c3c218c4b --- /dev/null +++ b/.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttir @@ -0,0 +1,151 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x64xf32> + %c256_i32 = arith.constant 256 : i32 + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_2 = arith.constant dense<256> : tensor<64x1xi64> + %cst_3 = arith.constant dense<0> : tensor<64x1xi64> + %cst_4 = arith.constant dense<50257> : tensor<64x1xi64> + %cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32> + %cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> + %cst_9 = arith.constant dense<256> : tensor<64x1xi32> + %cst_10 = arith.constant dense<256> : tensor<1x64xi32> + %cst_11 = arith.constant dense<512> : tensor<64x1xi32> + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32> + %7 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %8 = tt.addptr %7, %5 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + %9 = tt.load %8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64> + %10 = arith.remsi %5, %cst_11 : tensor<64x1xi32> + %11 = arith.muli %10, %cst_9 : tensor<64x1xi32> + %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x64xi32> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %14 = arith.muli %5, %cst_9 : tensor<64x1xi32> + %15 = tt.broadcast %14 : (tensor<64x1xi32>) -> tensor<64x64xi32> + %16 = tt.splat %arg3 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %17 = arith.addi %9, %cst_4 : tensor<64x1xi64> + %18 = arith.cmpi slt, %9, %cst_3 : tensor<64x1xi64> + %19 = arith.select %18, %17, %9 : tensor<64x1xi1>, tensor<64x1xi64> + %20 = arith.cmpi 
sge, %19, %cst_3 : tensor<64x1xi64> + %21 = arith.cmpi slt, %19, %cst_4 : tensor<64x1xi64> + %22 = arith.andi %20, %21 : tensor<64x1xi1> + %23 = arith.muli %19, %cst_2 : tensor<64x1xi64> + %24 = tt.broadcast %23 : (tensor<64x1xi64>) -> tensor<64x64xi64> + %25 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %26:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) : i32 { + %50 = tt.splat %arg8 : (i32) -> tensor<1x64xi32> + %51 = arith.addi %50, %6 : tensor<1x64xi32> + %52 = arith.cmpi slt, %51, %cst_10 : tensor<1x64xi32> + %53 = tt.broadcast %51 : (tensor<1x64xi32>) -> tensor<64x64xi32> + %54 = arith.addi %53, %12 : tensor<64x64xi32> + %55 = tt.addptr %13, %54 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> + %56 = tt.broadcast %52 : (tensor<1x64xi1>) -> tensor<64x64xi1> + %57 = tt.load %55, %56, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32> + %58 = arith.addi %53, %15 : tensor<64x64xi32> + %59 = tt.addptr %16, %58 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> + %60 = tt.load %59, %56, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xbf16> + %61 = arith.extf %60 : tensor<64x64xbf16> to tensor<64x64xf32> + tt.assert %22, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1> + %62 = arith.extsi %51 : tensor<1x64xi32> to tensor<1x64xi64> + %63 = tt.broadcast %62 : (tensor<1x64xi64>) -> tensor<64x64xi64> + %64 = arith.addi %63, %24 : tensor<64x64xi64> + %65 = tt.addptr %25, %64 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> + %66 = tt.load %65, %56, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32> + %67 = arith.addf %66, %57 : tensor<64x64xf32> + %68 = arith.addf %67, %61 : tensor<64x64xf32> + %69 = arith.subf %68, %arg9 : tensor<64x64xf32> + %70 = arith.addf %arg11, %cst_1 : tensor<64x64xf32> + %71 = arith.divf %69, %70 : tensor<64x64xf32> + %72 = arith.addf %arg9, %71 : tensor<64x64xf32> + %73 = arith.subf %68, %72 : tensor<64x64xf32> + %74 = arith.mulf %69, %73 : tensor<64x64xf32> + %75 = arith.addf %arg10, %74 : tensor<64x64xf32> + %76 = arith.select %56, %72, %arg9 : tensor<64x64xi1>, tensor<64x64xf32> + %77 = arith.select %56, %75, %arg10 : tensor<64x64xi1>, tensor<64x64xf32> + %78 = arith.select %56, %70, %arg11 : tensor<64x64xi1>, tensor<64x64xf32> + scf.yield %76, %77, %78 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32> + } + %27:3 = "tt.reduce"(%26#0, %26#1, %26#2) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): + %50 = arith.subf %arg11, %arg8 : f32 + %51 = arith.addf %arg10, %arg13 : f32 + %52 = arith.cmpf oeq, %51, %cst_0 : f32 + %53 = arith.divf %arg13, %51 : f32 + %54 = arith.select %52, %cst_0, %53 : f32 + %55 = arith.mulf %50, %54 : f32 + %56 = arith.addf %arg8, %55 : f32 + %57 = arith.addf %arg9, %arg12 : f32 + %58 = arith.mulf %50, %50 : f32 + %59 = arith.mulf %58, %arg10 : f32 + %60 = arith.mulf %59, %54 : f32 + %61 = arith.addf %57, %60 : f32 + tt.reduce.return %56, %61, %51 : f32, f32, f32 + }) : (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) + %28 = tt.expand_dims %27#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %29 = tt.expand_dims %27#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %30 = arith.muli %10, %cst_9 : tensor<64x1xi32> + %31 = tt.broadcast %30 : 
(tensor<64x1xi32>) -> tensor<64x64xi32> + %32 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %33 = arith.muli %5, %cst_9 : tensor<64x1xi32> + %34 = tt.broadcast %33 : (tensor<64x1xi32>) -> tensor<64x64xi32> + %35 = tt.splat %arg3 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %36 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x64x!tt.ptr> + %37 = arith.addi %9, %cst_4 : tensor<64x1xi64> + %38 = arith.cmpi slt, %9, %cst_3 : tensor<64x1xi64> + %39 = arith.select %38, %37, %9 : tensor<64x1xi1>, tensor<64x1xi64> + %40 = arith.cmpi sge, %39, %cst_3 : tensor<64x1xi64> + %41 = arith.cmpi slt, %39, %cst_4 : tensor<64x1xi64> + %42 = arith.andi %40, %41 : tensor<64x1xi1> + %43 = arith.muli %39, %cst_2 : tensor<64x1xi64> + %44 = tt.broadcast %43 : (tensor<64x1xi64>) -> tensor<64x64xi64> + %45 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %46 = tt.broadcast %28 : (tensor<64x1xf32>) -> tensor<64x64xf32> + %47 = arith.divf %29, %cst_6 : tensor<64x1xf32> + %48 = arith.addf %47, %cst_5 : tensor<64x1xf32> + %49 = tt.splat %arg5 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 : i32 { + %50 = tt.splat %arg8 : (i32) -> tensor<1x64xi32> + %51 = arith.addi %50, %6 : tensor<1x64xi32> + %52 = arith.cmpi slt, %51, %cst_10 : tensor<1x64xi32> + %53 = tt.broadcast %51 : (tensor<1x64xi32>) -> tensor<64x64xi32> + %54 = arith.addi %53, %31 : tensor<64x64xi32> + %55 = tt.addptr %32, %54 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> + %56 = tt.broadcast %52 : (tensor<1x64xi1>) -> tensor<64x64xi1> + %57 = tt.load %55, %56, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32> + %58 = arith.addi %53, %34 : tensor<64x64xi32> + %59 = tt.addptr %35, %58 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> + %60 = tt.load %59, %56, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16> + %61 = arith.extf %60 : tensor<64x64xbf16> to tensor<64x64xf32> + %62 = tt.addptr %36, %51 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> + %63 = tt.load %62, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32> + tt.assert %42, "index out of bounds: 0 <= tmp16 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1> + %64 = arith.extsi %51 : tensor<1x64xi32> to tensor<1x64xi64> + %65 = tt.broadcast %64 : (tensor<1x64xi64>) -> tensor<64x64xi64> + %66 = arith.addi %65, %44 : tensor<64x64xi64> + %67 = tt.addptr %45, %66 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> + %68 = tt.load %67, %56, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32> + %69 = arith.addf %68, %57 : tensor<64x64xf32> + %70 = arith.addf %69, %61 : tensor<64x64xf32> + %71 = arith.subf %70, %46 : tensor<64x64xf32> + %72 = tt.extern_elementwise %48 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> + %73 = tt.broadcast %72 : (tensor<64x1xf32>) -> tensor<64x64xf32> + %74 = arith.mulf %71, %73 : tensor<64x64xf32> + %75 = tt.broadcast %63 : (tensor<1x64xf32>) -> tensor<64x64xf32> + %76 = arith.mulf %74, %75 : tensor<64x64xf32> + %77 = tt.addptr %49, %58 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> + %78 = arith.truncf %76 : tensor<64x64xf32> to tensor<64x64xbf16> + tt.store %77, %78, %56 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16> + } + tt.return + } +} diff --git a/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.cubin 
b/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..85eb5315e1d3686bd64eed3fe17ff7382f07f4f4 Binary files /dev/null and b/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.cubin differ diff --git a/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.llir b/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..3568d41cc1e48f843eeba028636791c34d835b56 --- /dev/null +++ b/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.llir @@ -0,0 +1,269 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 31, !dbg !8 + %8 = lshr i32 %6, 5, !dbg !8 + %9 = shl i32 %6, 2, !dbg !8 + %10 = and i32 %9, 12, !dbg !8 + %11 = and i32 %6, 15, !dbg !8 + %12 = and i32 %8, 7, !dbg !9 + %13 = lshr i32 %7, 2, !dbg !9 + %14 = shl nuw nsw i32 %12, 3, !dbg !9 + %15 = or i32 %14, %13, !dbg !9 + %16 = or i32 %15, 64, !dbg !9 + %17 = or i32 %10, 1, !dbg !10 + %18 = or i32 %10, 2, !dbg !10 + %19 = or i32 %10, 3, !dbg !10 + %20 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !14 + %21 = shl i32 %20, 4, !dbg !15 + %22 = or i32 %21, %10, !dbg !16 + %23 = or i32 %21, %11, !dbg !16 + %24 = icmp ult i32 %16, 120, !dbg !17 + %25 = shl nuw nsw i32 %15, 17, !dbg !18 + %26 = shl nuw nsw i32 %16, 17, !dbg !18 + %27 = add i32 %22, %25, !dbg !19 + %28 = add i32 %22, %26, !dbg !19 + %29 = sext i32 %27 to i64, !dbg !20 + %30 = getelementptr float, ptr addrspace(1) %0, i64 %29, !dbg !20 + %31 = sext i32 %28 to i64, !dbg !20 + %32 = getelementptr float, ptr addrspace(1) %0, i64 %31, !dbg !20 + %33 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21 + %34 = extractvalue { i32, i32, i32, i32 } %33, 0, !dbg !21 + %35 = extractvalue { i32, i32, i32, i32 } %33, 1, !dbg !21 + %36 = extractvalue { i32, i32, i32, i32 } %33, 2, !dbg !21 + %37 = extractvalue { i32, i32, i32, i32 } %33, 3, !dbg !21 + %38 = bitcast i32 %34 to float, !dbg !21 + %39 = bitcast i32 %35 to float, !dbg !21 + %40 = bitcast i32 %36 to float, !dbg !21 + %41 = bitcast i32 %37 to float, !dbg !21 + %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %32, i1 %24, i32 0, i1 %24, i32 0, i1 %24, i32 0, i1 %24, i32 0, i1 %24) #3, !dbg !21 + %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !21 + %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !21 + %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !21 + %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !21 + %47 = bitcast i32 %43 
to float, !dbg !21 + %48 = bitcast i32 %44 to float, !dbg !21 + %49 = bitcast i32 %45 to float, !dbg !21 + %50 = bitcast i32 %46 to float, !dbg !21 + %51 = fadd float %38, 0.000000e+00, !dbg !22 + %52 = fadd float %39, 0.000000e+00, !dbg !22 + %53 = fadd float %40, 0.000000e+00, !dbg !22 + %54 = fadd float %41, 0.000000e+00, !dbg !22 + %55 = fadd float %47, 0.000000e+00, !dbg !22 + %56 = fadd float %48, 0.000000e+00, !dbg !22 + %57 = fadd float %49, 0.000000e+00, !dbg !22 + %58 = fadd float %50, 0.000000e+00, !dbg !22 + %59 = select i1 %24, float %55, float 0.000000e+00, !dbg !23 + %60 = select i1 %24, float %56, float 0.000000e+00, !dbg !23 + %61 = select i1 %24, float %57, float 0.000000e+00, !dbg !23 + %62 = select i1 %24, float %58, float 0.000000e+00, !dbg !23 + %63 = fadd float %51, %59, !dbg !24 + %64 = fadd float %52, %60, !dbg !24 + %65 = fadd float %53, %61, !dbg !24 + %66 = fadd float %54, %62, !dbg !24 + %67 = bitcast float %63 to i32, !dbg !10 + %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !10 + %69 = bitcast i32 %68 to float, !dbg !10 + %70 = fadd float %63, %69, !dbg !24 + %71 = bitcast float %70 to i32, !dbg !10 + %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 8, i32 31), !dbg !10 + %73 = bitcast i32 %72 to float, !dbg !10 + %74 = fadd float %70, %73, !dbg !24 + %75 = bitcast float %74 to i32, !dbg !10 + %76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 4, i32 31), !dbg !10 + %77 = bitcast i32 %76 to float, !dbg !10 + %78 = fadd float %74, %77, !dbg !24 + %79 = bitcast float %64 to i32, !dbg !10 + %80 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %79, i32 16, i32 31), !dbg !10 + %81 = bitcast i32 %80 to float, !dbg !10 + %82 = fadd float %64, %81, !dbg !24 + %83 = bitcast float %82 to i32, !dbg !10 + %84 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %83, i32 8, i32 31), !dbg !10 + %85 = bitcast i32 %84 to float, !dbg !10 + %86 = fadd float %82, %85, !dbg !24 + %87 = bitcast float %86 to i32, !dbg !10 + %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 4, i32 31), !dbg !10 + %89 = bitcast i32 %88 to float, !dbg !10 + %90 = fadd float %86, %89, !dbg !24 + %91 = bitcast float %65 to i32, !dbg !10 + %92 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %91, i32 16, i32 31), !dbg !10 + %93 = bitcast i32 %92 to float, !dbg !10 + %94 = fadd float %65, %93, !dbg !24 + %95 = bitcast float %94 to i32, !dbg !10 + %96 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %95, i32 8, i32 31), !dbg !10 + %97 = bitcast i32 %96 to float, !dbg !10 + %98 = fadd float %94, %97, !dbg !24 + %99 = bitcast float %98 to i32, !dbg !10 + %100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 4, i32 31), !dbg !10 + %101 = bitcast i32 %100 to float, !dbg !10 + %102 = fadd float %98, %101, !dbg !24 + %103 = bitcast float %66 to i32, !dbg !10 + %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 16, i32 31), !dbg !10 + %105 = bitcast i32 %104 to float, !dbg !10 + %106 = fadd float %66, %105, !dbg !24 + %107 = bitcast float %106 to i32, !dbg !10 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 8, i32 31), !dbg !10 + %109 = bitcast i32 %108 to float, !dbg !10 + %110 = fadd float %106, %109, !dbg !24 + %111 = bitcast float %110 to i32, !dbg !10 + %112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 4, i32 31), !dbg !10 + %113 = bitcast i32 %112 to float, !dbg !10 + %114 = fadd 
float %110, %113, !dbg !24 + %115 = icmp ult i32 %7, 4, !dbg !10 + %116 = shl nuw nsw i32 %10, 3, !dbg !10 + %117 = or i32 %116, %12, !dbg !10 + %118 = zext nneg i32 %117 to i64, !dbg !10 + %119 = getelementptr float, ptr addrspace(3) @global_smem, i64 %118, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %119, float %78, i1 %115) #3, !dbg !10 + %120 = shl nuw nsw i32 %17, 3, !dbg !10 + %121 = or i32 %120, %12, !dbg !10 + %122 = zext nneg i32 %121 to i64, !dbg !10 + %123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %90, i1 %115) #3, !dbg !10 + %124 = shl nuw nsw i32 %18, 3, !dbg !10 + %125 = or i32 %124, %12, !dbg !10 + %126 = zext nneg i32 %125 to i64, !dbg !10 + %127 = getelementptr float, ptr addrspace(3) @global_smem, i64 %126, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %127, float %102, i1 %115) #3, !dbg !10 + %128 = shl nuw nsw i32 %19, 3, !dbg !10 + %129 = or i32 %128, %12, !dbg !10 + %130 = zext nneg i32 %129 to i64, !dbg !10 + %131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %131, float %114, i1 %115) #3, !dbg !10 + tail call void @llvm.nvvm.barrier0(), !dbg !10 + %132 = icmp slt i32 %6, 128, !dbg !10 + %133 = sext i32 %6 to i64, !dbg !10 + %134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !10 + %135 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %134, i1 %132) #3, !dbg !10 + %136 = bitcast float %135 to i32, !dbg !10 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 4, i32 31), !dbg !10 + %138 = bitcast i32 %137 to float, !dbg !10 + %139 = fadd float %135, %138, !dbg !24 + %140 = bitcast float %139 to i32, !dbg !10 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 2, i32 31), !dbg !10 + %142 = bitcast i32 %141 to float, !dbg !10 + %143 = fadd float %139, %142, !dbg !24 + %144 = bitcast float %143 to i32, !dbg !10 + %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 1, i32 31), !dbg !10 + %146 = bitcast i32 %145 to float, !dbg !10 + %147 = fadd float %143, %146, !dbg !24 + %148 = and i32 %6, 7, !dbg !10 + %149 = icmp eq i32 %148, 0, !dbg !10 + %150 = and i1 %132, %149, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %134, float %147, i1 %150) #3, !dbg !10 + tail call void @llvm.nvvm.barrier0(), !dbg !10 + %151 = zext nneg i32 %116 to i64, !dbg !10 + %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !10 + %153 = load float, ptr addrspace(3) %152, align 4, !dbg !10 + %154 = zext nneg i32 %120 to i64, !dbg !10 + %155 = getelementptr float, ptr addrspace(3) @global_smem, i64 %154, !dbg !10 + %156 = load float, ptr addrspace(3) %155, align 4, !dbg !10 + %157 = zext nneg i32 %124 to i64, !dbg !10 + %158 = getelementptr float, ptr addrspace(3) @global_smem, i64 %157, !dbg !10 + %159 = load float, ptr addrspace(3) %158, align 4, !dbg !10 + %160 = zext nneg i32 %128 to i64, !dbg !10 + %161 = getelementptr float, ptr addrspace(3) @global_smem, i64 %160, !dbg !10 + %162 = load float, ptr addrspace(3) %161, align 4, !dbg !10 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %163 = zext nneg i32 %10 to i64, 
!dbg !28 + %164 = getelementptr float, ptr addrspace(3) @global_smem, i64 %163, !dbg !28 + %165 = insertelement <1 x float> undef, float %153, i64 0, !dbg !28 + store <1 x float> %165, ptr addrspace(3) %164, align 4, !dbg !28 + %166 = zext nneg i32 %17 to i64, !dbg !28 + %167 = getelementptr float, ptr addrspace(3) @global_smem, i64 %166, !dbg !28 + %168 = insertelement <1 x float> undef, float %156, i64 0, !dbg !28 + store <1 x float> %168, ptr addrspace(3) %167, align 4, !dbg !28 + %169 = zext nneg i32 %18 to i64, !dbg !28 + %170 = getelementptr float, ptr addrspace(3) @global_smem, i64 %169, !dbg !28 + %171 = insertelement <1 x float> undef, float %159, i64 0, !dbg !28 + store <1 x float> %171, ptr addrspace(3) %170, align 4, !dbg !28 + %172 = zext nneg i32 %19 to i64, !dbg !28 + %173 = getelementptr float, ptr addrspace(3) @global_smem, i64 %172, !dbg !28 + %174 = insertelement <1 x float> undef, float %162, i64 0, !dbg !28 + store <1 x float> %174, ptr addrspace(3) %173, align 4, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %175 = zext nneg i32 %11 to i64, !dbg !28 + %176 = getelementptr float, ptr addrspace(3) @global_smem, i64 %175, !dbg !28 + %177 = load <1 x float>, ptr addrspace(3) %176, align 4, !dbg !28 + %.frozen = freeze i32 %23 + %178 = sdiv i32 %.frozen, 256, !dbg !29 + %179 = mul i32 %178, 256 + %.decomposed = sub i32 %.frozen, %179 + %180 = sext i32 %178 to i64, !dbg !30 + %181 = getelementptr i64, ptr addrspace(1) %1, i64 %180, !dbg !30 + %182 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %181, i1 true) #3, !dbg !31 + %183 = lshr i64 %182, 54, !dbg !32 + %184 = and i64 %183, 512, !dbg !32 + %185 = add i64 %184, %182, !dbg !32 + %186 = shl i64 %185, 8, !dbg !33 + %187 = sext i32 %.decomposed to i64, !dbg !34 + %188 = getelementptr float, ptr addrspace(1) %2, i64 %186, !dbg !35 + %189 = getelementptr float, ptr addrspace(1) %188, i64 %187, !dbg !35 + %190 = lshr i32 %7, 4, !dbg !36 + %191 = shl nuw nsw i32 %12, 1, !dbg !36 + %192 = or i32 %191, %190, !dbg !36 + %193 = icmp eq i32 %192, 0, !dbg !36 + %194 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %189, <1 x float> %177, i1 %193) #3, !dbg !36 + ret void, !dbg !37 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i") +!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3de4e, 
!"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 243, column: 36, scope: !11, inlinedAt: !13) +!11 = distinct !DILexicalBlockFile(scope: !5, file: !12, discriminator: 0) +!12 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!13 = !DILocation(line: 35, column: 25, scope: !11) +!14 = !DILocation(line: 21, column: 28, scope: !5) +!15 = !DILocation(line: 21, column: 33, scope: !5) +!16 = !DILocation(line: 22, column: 23, scope: !5) +!17 = !DILocation(line: 29, column: 25, scope: !5) +!18 = !DILocation(line: 31, column: 47, scope: !5) +!19 = !DILocation(line: 31, column: 40, scope: !5) +!20 = !DILocation(line: 31, column: 34, scope: !5) +!21 = !DILocation(line: 31, column: 53, scope: !5) +!22 = !DILocation(line: 33, column: 23, scope: !5) +!23 = !DILocation(line: 34, column: 38, scope: !5) +!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26) +!25 = distinct !DILexicalBlockFile(scope: !11, file: !12, discriminator: 0) +!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27) +!27 = !DILocation(line: 35, column: 25, scope: !25) +!28 = !DILocation(line: 35, column: 28, scope: !5) +!29 = !DILocation(line: 36, column: 20, scope: !5) +!30 = !DILocation(line: 38, column: 30, scope: !5) +!31 = !DILocation(line: 38, column: 35, scope: !5) +!32 = !DILocation(line: 41, column: 32, scope: !5) +!33 = !DILocation(line: 45, column: 40, scope: !5) +!34 = !DILocation(line: 45, column: 36, scope: !5) +!35 = !DILocation(line: 45, column: 30, scope: !5) +!36 = !DILocation(line: 45, column: 55, scope: !5) +!37 = !DILocation(line: 45, column: 4, scope: !5) diff --git a/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ptx b/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..7ef9f5d6208d9997c791c9331f3073423998a5cf --- /dev/null +++ b/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ptx @@ -0,0 +1,642 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3de4e +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3de4e( + .param .u64 triton__0d1d2d3de4e_param_0, + .param .u64 triton__0d1d2d3de4e_param_1, + .param .u64 triton__0d1d2d3de4e_param_2, + .param .u32 triton__0d1d2d3de4e_param_3, + .param .u32 triton__0d1d2d3de4e_param_4 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<20>; + .reg .b32 %r<107>; + .reg .f32 %f<60>; + .reg .b64 %rd<18>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd6, [triton__0d1d2d3de4e_param_0]; + ld.param.u64 %rd7, [triton__0d1d2d3de4e_param_1]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r32, %tid.x; + and.b32 %r33, %r32, 31; + ld.param.u64 %rd8, [triton__0d1d2d3de4e_param_2]; + shl.b32 %r34, %r32, 2; + and.b32 %r35, %r34, 12; + and.b32 %r36, %r32, 15; + .loc 1 24 33 + bfe.u32 %r37, %r32, 5, 3; + bfe.u32 %r38, %r32, 2, 3; + shl.b32 %r39, %r37, 3; + or.b32 %r40, %r39, %r38; + or.b32 %r41, %r40, 64; + .loc 1 21 28 + mov.u32 %r1, %ctaid.x; + .loc 1 21 33 + shl.b32 %r42, %r1, 4; + .loc 1 22 23 + or.b32 %r43, %r42, %r35; + or.b32 %r44, %r42, %r36; + .loc 1 29 25 
+ setp.lt.u32 %p6, %r41, 120; + .loc 1 31 47 + shl.b32 %r45, %r40, 17; + shl.b32 %r46, %r41, 17; + .loc 1 31 40 + add.s32 %r47, %r43, %r45; + add.s32 %r48, %r43, %r46; + .loc 1 31 34 + mul.wide.s32 %rd9, %r47, 4; + add.s64 %rd1, %rd6, %rd9; + mul.wide.s32 %rd10, %r48, 4; + add.s64 %rd2, %rd6, %rd10; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 31 53 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r2; + mov.b32 %f2, %r3; + mov.b32 %f3, %r4; + mov.b32 %f4, %r5; + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + @%p6 ld.global.L1::evict_first.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ]; + @!%p6 mov.u32 %r10, %r6; + @!%p6 mov.u32 %r11, %r6; + @!%p6 mov.u32 %r12, %r6; + @!%p6 mov.u32 %r13, %r6; + mov.b32 %f5, %r10; + mov.b32 %f6, %r11; + mov.b32 %f7, %r12; + mov.b32 %f8, %r13; + .loc 1 33 23 + add.f32 %f9, %f1, 0f00000000; + add.f32 %f10, %f2, 0f00000000; + add.f32 %f11, %f3, 0f00000000; + add.f32 %f12, %f4, 0f00000000; + add.f32 %f13, %f5, 0f00000000; + add.f32 %f14, %f6, 0f00000000; + add.f32 %f15, %f7, 0f00000000; + add.f32 %f16, %f8, 0f00000000; + .loc 1 34 38 + selp.f32 %f17, %f13, 0f00000000, %p6; + selp.f32 %f18, %f14, 0f00000000, %p6; + selp.f32 %f19, %f15, 0f00000000, %p6; + selp.f32 %f20, %f16, 0f00000000, %p6; +$L__tmp1: + .loc 2 233 15 + add.f32 %f21, %f9, %f17; + add.f32 %f22, %f10, %f18; + add.f32 %f23, %f11, %f19; + add.f32 %f24, %f12, %f20; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r49, %f21; + shfl.sync.bfly.b32 %r50, %r49, 16, 31, -1; + mov.b32 %f25, %r50; +$L__tmp3: + .loc 2 233 15 + add.f32 %f26, %f21, %f25; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r51, %f26; + shfl.sync.bfly.b32 %r52, %r51, 8, 31, -1; + mov.b32 %f27, %r52; +$L__tmp5: + .loc 2 233 15 + add.f32 %f28, %f26, %f27; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r53, %f28; + shfl.sync.bfly.b32 %r54, %r53, 4, 31, -1; + mov.b32 %f29, %r54; +$L__tmp7: + .loc 2 233 15 + add.f32 %f30, %f28, %f29; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r55, %f22; + shfl.sync.bfly.b32 %r56, %r55, 16, 31, -1; + mov.b32 %f31, %r56; +$L__tmp9: + .loc 2 233 15 + add.f32 %f32, %f22, %f31; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r57, %f32; + shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1; + mov.b32 %f33, %r58; +$L__tmp11: + .loc 2 233 15 + add.f32 %f34, %f32, %f33; +$L__tmp12: + .loc 2 243 36 + mov.b32 %r59, %f34; + shfl.sync.bfly.b32 %r60, %r59, 4, 31, -1; + mov.b32 %f35, %r60; +$L__tmp13: + .loc 2 233 15 + add.f32 %f36, %f34, %f35; +$L__tmp14: + .loc 2 243 36 + mov.b32 %r61, %f23; + shfl.sync.bfly.b32 %r62, %r61, 16, 31, -1; + mov.b32 %f37, %r62; +$L__tmp15: + .loc 2 233 15 + add.f32 %f38, %f23, %f37; +$L__tmp16: + .loc 2 243 36 + mov.b32 %r63, %f38; + shfl.sync.bfly.b32 %r64, %r63, 8, 31, -1; + mov.b32 %f39, %r64; +$L__tmp17: + .loc 2 233 15 + add.f32 %f40, %f38, %f39; +$L__tmp18: + .loc 2 243 36 + mov.b32 %r65, %f40; + shfl.sync.bfly.b32 %r66, %r65, 4, 31, -1; + mov.b32 %f41, %r66; +$L__tmp19: + .loc 2 233 15 + add.f32 %f42, %f40, %f41; +$L__tmp20: + .loc 2 243 36 + mov.b32 %r67, %f24; + shfl.sync.bfly.b32 %r68, %r67, 16, 31, -1; + mov.b32 %f43, %r68; +$L__tmp21: + .loc 2 233 15 + add.f32 %f44, %f24, %f43; +$L__tmp22: + .loc 2 243 36 + mov.b32 %r69, %f44; + shfl.sync.bfly.b32 %r70, %r69, 8, 31, -1; + mov.b32 %f45, %r70; +$L__tmp23: + .loc 2 233 15 + add.f32 %f46, %f44, %f45; +$L__tmp24: + .loc 2 
243 36 + mov.b32 %r71, %f46; + shfl.sync.bfly.b32 %r72, %r71, 4, 31, -1; + mov.b32 %f47, %r72; +$L__tmp25: + .loc 2 233 15 + add.f32 %f48, %f46, %f47; +$L__tmp26: + .loc 2 243 36 + setp.lt.u32 %p11, %r33, 4; + shl.b32 %r73, %r37, 2; + shl.b32 %r74, %r35, 5; + or.b32 %r75, %r74, %r73; + mov.u32 %r76, global_smem; + add.s32 %r18, %r76, %r75; + mov.b32 %r19, %f30; + @%p11 st.shared.b32 [ %r18 + 0 ], %r19; + or.b32 %r77, %r74, 32; + or.b32 %r78, %r77, %r73; + add.s32 %r20, %r76, %r78; + mov.b32 %r21, %f36; + @%p11 st.shared.b32 [ %r20 + 0 ], %r21; + or.b32 %r79, %r74, 64; + or.b32 %r80, %r79, %r73; + add.s32 %r22, %r76, %r80; + mov.b32 %r23, %f42; + @%p11 st.shared.b32 [ %r22 + 0 ], %r23; + or.b32 %r81, %r74, 96; + or.b32 %r82, %r81, %r73; + add.s32 %r24, %r76, %r82; + mov.b32 %r25, %f48; + @%p11 st.shared.b32 [ %r24 + 0 ], %r25; + bar.sync 0; + setp.lt.s32 %p15, %r32, 128; + add.s32 %r27, %r76, %r34; + @%p15 ld.shared.b32 %r26, [ %r27 + 0 ]; + mov.b32 %f49, %r26; + shfl.sync.bfly.b32 %r83, %r26, 4, 31, -1; + mov.b32 %f50, %r83; +$L__tmp27: + .loc 2 233 15 + add.f32 %f51, %f49, %f50; +$L__tmp28: + .loc 2 243 36 + mov.b32 %r84, %f51; + shfl.sync.bfly.b32 %r85, %r84, 2, 31, -1; + mov.b32 %f52, %r85; +$L__tmp29: + .loc 2 233 15 + add.f32 %f53, %f51, %f52; +$L__tmp30: + .loc 2 243 36 + mov.b32 %r86, %f53; + shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1; + mov.b32 %f54, %r87; +$L__tmp31: + .loc 2 233 15 + add.f32 %f55, %f53, %f54; +$L__tmp32: + .loc 2 243 36 + and.b32 %r88, %r32, 7; + setp.eq.s32 %p19, %r88, 0; + and.pred %p16, %p15, %p19; + mov.b32 %r29, %f55; + @%p16 st.shared.b32 [ %r27 + 0 ], %r29; + bar.sync 0; + add.s32 %r89, %r76, %r74; + ld.shared.f32 %f56, [%r89]; + add.s32 %r90, %r76, %r77; + ld.shared.f32 %f57, [%r90]; + add.s32 %r91, %r76, %r79; + ld.shared.f32 %f58, [%r91]; + add.s32 %r92, %r76, %r81; + ld.shared.f32 %f59, [%r92]; +$L__tmp33: + .loc 1 35 28 + bar.sync 0; + shl.b32 %r93, %r35, 2; + add.s32 %r94, %r76, %r93; + st.shared.f32 [%r94], %f56; + st.shared.f32 [%r94+4], %f57; + st.shared.f32 [%r94+8], %f58; + st.shared.f32 [%r94+12], %f59; + bar.sync 0; + shl.b32 %r95, %r36, 2; + add.s32 %r96, %r76, %r95; + .loc 1 36 20 + shr.s32 %r98, %r44, 31; + shr.u32 %r99, %r98, 24; + add.s32 %r100, %r44, %r99; + shr.s32 %r101, %r100, 8; + and.b32 %r102, %r100, -256; + sub.s32 %r103, %r44, %r102; + .loc 1 38 30 + mul.wide.s32 %rd11, %r101, 8; + add.s64 %rd4, %rd7, %rd11; + .loc 1 45 55 + ld.shared.u32 %r31, [%r96]; + .loc 1 38 35 + mov.u64 %rd3, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd3 }, [ %rd4 + 0 ]; + .loc 1 41 32 + shr.u64 %rd12, %rd3, 54; + and.b64 %rd13, %rd12, 512; + add.s64 %rd14, %rd13, %rd3; + .loc 1 45 30 + shl.b64 %rd15, %rd14, 10; + add.s64 %rd16, %rd8, %rd15; + mul.wide.s32 %rd17, %r103, 4; + add.s64 %rd5, %rd16, %rd17; + .loc 1 45 55 + bfe.u32 %r104, %r32, 4, 1; + shl.b32 %r105, %r37, 1; + or.b32 %r106, %r105, %r104; + setp.eq.s32 %p18, %r106, 0; + mov.u32 %r30, 0x0; + @%p18 atom.global.gpu.acq_rel.add.f32 %r30, [ %rd5 + 0 ], %r31; + .loc 1 45 4 + ret; +$L__tmp34: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 
0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 264 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 105 +.b8 107 +.b8 53 +.b8 118 +.b8 120 +.b8 55 +.b8 112 +.b8 50 +.b8 50 +.b8 102 +.b8 112 +.b8 107 +.b8 52 +.b8 100 +.b8 99 +.b8 118 +.b8 104 +.b8 53 +.b8 53 +.b8 122 +.b8 105 +.b8 109 +.b8 119 +.b8 52 +.b8 116 +.b8 53 +.b8 110 +.b8 114 +.b8 53 +.b8 122 +.b8 110 +.b8 50 +.b8 98 +.b8 55 +.b8 105 +.b8 110 +.b8 117 +.b8 106 +.b8 120 +.b8 106 +.b8 97 +.b8 117 +.b8 120 +.b8 115 +.b8 104 +.b8 108 +.b8 106 +.b8 117 +.b8 109 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp32 +.b8 2 +.b8 35 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp32 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp33 +.b8 2 +.b8 35 +.b8 25 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttgir b/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8954ee9cc3b1e80a7e0c7bb572a27be1d96f7f75 --- /dev/null +++ b/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttgir @@ -0,0 +1,60 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr 
{tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<16x1xi64, #blocked> + %cst_0 = arith.constant dense<0> : tensor<16x1xi64, #blocked> + %cst_1 = arith.constant dense<512> : tensor<16x1xi64, #blocked> + %cst_2 = arith.constant dense<256> : tensor<16x1xi32, #blocked> + %cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1> + %cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1> + %cst_5 = arith.constant dense<0.000000e+00> : tensor<16x128xf32, #blocked1> + %cst_6 = arith.constant dense<true> : tensor<16x1xi1, #blocked> + %c16_i32 = arith.constant 16 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c16_i32 : i32 + %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked> + %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1> + %7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked> + %8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked1> + %9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked> + %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1> + %12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1> + %13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1> + %14 = tt.broadcast %8 : (tensor<16x1xi32, #blocked1>) -> tensor<16x128xi32, #blocked1> + %15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<16x128xi32, #blocked1> + %16 = arith.addi %14, %15 : tensor<16x128xi32, #blocked1> + %17 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x128x!tt.ptr, #blocked1> + %18 = tt.addptr %17, %16 : tensor<16x128x!tt.ptr, #blocked1>, tensor<16x128xi32, #blocked1> + %19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<16x128xi1, #blocked1> + %20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32, #blocked1> + %21 = arith.addf %20, %cst_5 : tensor<16x128xf32, #blocked1> + %22 = arith.select %19, %21, %cst_5 : tensor<16x128xi1, #blocked1>, tensor<16x128xf32, #blocked1> + %23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %40 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %40 : f32 + }) : (tensor<16x128xf32, #blocked1>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %24 = triton_gpu.convert_layout %23 : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> + %26 = arith.divsi %9, %cst_2 : tensor<16x1xi32, #blocked> + %27 = arith.remsi %9, %cst_2 : tensor<16x1xi32, #blocked> + %28 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x1x!tt.ptr, #blocked> + 
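// Annotation (hedged sketch; not part of the generated dump): %26/%27 split the flat
+    // row id into (row, col) = (id / 256, id % 256). The i64 index loaded through %arg1
+    // is wrapped by +512 when negative, and each per-row partial sum in %25 is then
+    // accumulated into %arg2 with an unconditional atomic fadd at offset idx * 256 + col
+    // -- roughly out[wrap(index[id // 256]), id % 256] += rowsum[id] in NumPy-style
+    // pseudocode, where wrap() adds 512 to negative indices. + 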
%29 = tt.addptr %28, %26 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> + %30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked> + %31 = arith.addi %30, %cst_1 : tensor<16x1xi64, #blocked> + %32 = arith.cmpi slt, %30, %cst_0 : tensor<16x1xi64, #blocked> + %33 = arith.select %32, %31, %30 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked> + %34 = arith.muli %33, %cst : tensor<16x1xi64, #blocked> + %35 = arith.extsi %27 : tensor<16x1xi32, #blocked> to tensor<16x1xi64, #blocked> + %36 = arith.addi %35, %34 : tensor<16x1xi64, #blocked> + %37 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x1x!tt.ptr, #blocked> + %38 = tt.addptr %37, %36 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi64, #blocked> + %39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xf32, #blocked>, tensor<16x1xi1, #blocked>) -> tensor<16x1xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir b/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..746b65f12455b8baa9865625d37b4f09f8d4737f --- /dev/null +++ b/.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir @@ -0,0 +1,53 @@ +module { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<16x1xi64> + %cst_0 = arith.constant dense<0> : tensor<16x1xi64> + %cst_1 = arith.constant dense<512> : tensor<16x1xi64> + %cst_2 = arith.constant dense<true> : tensor<16x1xi1> + %cst_3 = arith.constant dense<256> : tensor<16x1xi32> + %cst_4 = arith.constant dense<131072> : tensor<1x128xi32> + %cst_5 = arith.constant dense<120> : tensor<1x128xi32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<16x128xf32> + %c16_i32 = arith.constant 16 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c16_i32 : i32 + %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<16x1xi32> + %5 = arith.addi %4, %3 : tensor<16x1xi32> + %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32> + %8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32> + %9 = arith.muli %7, %cst_4 : tensor<1x128xi32> + %10 = tt.broadcast %5 : (tensor<16x1xi32>) -> tensor<16x128xi32> + %11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<16x128xi32> + %12 = arith.addi %10, %11 : tensor<16x128xi32> + %13 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %14 = tt.addptr %13, %12 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<16x128xi1> + %16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32> + %17 = arith.addf %16, %cst_6 : tensor<16x128xf32> + %18 = arith.select %15, %17, %cst_6 : tensor<16x128xi1>, tensor<16x128xf32> + %19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %35 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %35 : f32 + }) : (tensor<16x128xf32>) -> tensor<16xf32> + 
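// Annotation (hedged; not part of the generated dump): the region above is the binary
+  // combiner that "tt.reduce" folds along axis = 1, so %19 collapses the masked 16x128
+  // f32 tile to a tensor<16xf32> of row sums; the ops below re-expand it to 16x1 and
+  // feed the same index-wrap-and-atomic-add pattern as the ttgir form of this kernel. + 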
%20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> + %21 = arith.divsi %5, %cst_3 : tensor<16x1xi32> + %22 = arith.remsi %5, %cst_3 : tensor<16x1xi32> + %23 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x1x!tt.ptr> + %24 = tt.addptr %23, %21 : tensor<16x1x!tt.ptr>, tensor<16x1xi32> + %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64> + %26 = arith.addi %25, %cst_1 : tensor<16x1xi64> + %27 = arith.cmpi slt, %25, %cst_0 : tensor<16x1xi64> + %28 = arith.select %27, %26, %25 : tensor<16x1xi1>, tensor<16x1xi64> + %29 = arith.muli %28, %cst : tensor<16x1xi64> + %30 = arith.extsi %22 : tensor<16x1xi32> to tensor<16x1xi64> + %31 = arith.addi %30, %29 : tensor<16x1xi64> + %32 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x1x!tt.ptr> + %33 = tt.addptr %32, %31 : tensor<16x1x!tt.ptr>, tensor<16x1xi64> + %34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<16x1x!tt.ptr>, tensor<16x1xf32>, tensor<16x1xi1>) -> tensor<16x1xf32> + tt.return + } +} diff --git a/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir b/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..30b3e01578542f25f42cefbfd2799f6e4511693e --- /dev/null +++ b/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir @@ -0,0 +1,66 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked> + %cst_1 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %10 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict 
= 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %16 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %17 = tt.addptr %16, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %18 = tt.load %17, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %19 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %20 = tt.splat %19 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %22 = arith.mulf %9, %12 : tensor<256xf32, #blocked> + %23 = arith.select %2, %22, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %24 = "tt.reduce"(%23) <{axis = 0 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32): + %43 = arith.addf %arg8, %arg9 : f32 + tt.reduce.return %43 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %25 = arith.addf %24, %cst_1 : f32 + %26 = arith.mulf %22, %15 : tensor<256xf32, #blocked> + %27 = arith.select %2, %26, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32): + %43 = arith.addf %arg8, %arg9 : f32 + tt.reduce.return %43 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %29 = arith.addf %28, %cst_1 : f32 + %30 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked> + %31 = arith.mulf %22, %cst_3 : tensor<256xf32, #blocked> + %32 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked> + %33 = arith.subf %31, %32 : tensor<256xf32, #blocked> + %34 = tt.splat %29 : (f32) -> tensor<256xf32, #blocked> + %35 = arith.mulf %15, %34 : tensor<256xf32, #blocked> + %36 = arith.subf %33, %35 : tensor<256xf32, #blocked> + %37 = tt.broadcast %30 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %38 = arith.mulf %37, %36 : tensor<256xf32, #blocked> + %39 = arith.addf %18, %38 : tensor<256xf32, #blocked> + tt.store %17, %39, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %40 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %41 = tt.addptr %40, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %42 = arith.truncf %39 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %41, %42, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..d701d6c1da1048e08005c64ddf8b0fe629dc1499 Binary files /dev/null and b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin differ diff --git a/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.llir b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..267330755ab03a7b1202f0954855f0bd41d54f7b --- /dev/null +++ b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.llir @@ -0,0 +1,45 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 { + %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %4 = shl i32 %3, 2, !dbg !8 + %5 = and i32 %4, 508, !dbg !8 + %6 = tail call 
i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9 + %7 = shl i32 %6, 10, !dbg !10 + %8 = or i32 %7, %5, !dbg !11 + %9 = or i32 %8, 512, !dbg !11 + %10 = sext i32 %8 to i64, !dbg !12 + %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12 + %12 = sext i32 %9 to i64, !dbg !12 + %13 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !12 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %11, i1 true) #1, !dbg !13 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %13, i1 true) #1, !dbg !13 + ret void, !dbg !14 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c7w5r66fcggm6aokktzwmg24mlevq2hqdw2bgwzwlovrel6re5ym.py", directory: "/tmp/torchinductor_root/7w") +!3 = !{ptr @triton__0d1de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1de, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 36, scope: !5) +!9 = !DILocation(line: 20, column: 28, scope: !5) +!10 = !DILocation(line: 20, column: 33, scope: !5) +!11 = !DILocation(line: 21, column: 23, scope: !5) +!12 = !DILocation(line: 25, column: 25, scope: !5) +!13 = !DILocation(line: 25, column: 36, scope: !5) +!14 = !DILocation(line: 25, column: 4, scope: !5) diff --git a/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ptx b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..6b18866900964ecfbb7f84ae68f7051e22021b00 --- /dev/null +++ b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ptx @@ -0,0 +1,279 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1de + +.visible .entry triton__0d1de( + .param .u64 triton__0d1de_param_0, + .param .u32 triton__0d1de_param_1 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<3>; + .reg .b32 %r<15>; + .reg .b64 %rd<5>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1de_param_0]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r10, %tid.x; + shl.b32 %r11, %r10, 2; + and.b32 %r12, %r11, 508; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r13, %r1, 10; + .loc 1 21 23 + or.b32 %r14, %r13, %r12; + .loc 1 25 25 + mul.wide.s32 %rd4, %r14, 4; + add.s64 %rd1, %rd3, %rd4; + add.s64 %rd2, %rd1, 2048; + mov.b32 %r2, 0; + mov.pred %p1, -1; + .loc 1 25 36 + @%p1 st.global.v4.b32 [ %rd1 + 0 ], { %r2, %r2, %r2, %r2 }; + @%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r2, %r2, %r2, %r2 }; + .loc 1 25 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/7w/c7w5r66fcggm6aokktzwmg24mlevq2hqdw2bgwzwlovrel6re5ym.py" + 
.section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 172 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 55 +.b8 119 +.b8 53 +.b8 114 +.b8 54 +.b8 54 +.b8 102 +.b8 99 +.b8 103 +.b8 103 +.b8 109 +.b8 54 +.b8 97 +.b8 111 +.b8 107 +.b8 107 +.b8 116 +.b8 122 +.b8 119 +.b8 109 +.b8 103 +.b8 50 +.b8 52 +.b8 109 +.b8 108 +.b8 101 +.b8 118 +.b8 113 +.b8 50 +.b8 104 +.b8 113 +.b8 100 +.b8 119 +.b8 50 +.b8 98 +.b8 103 +.b8 119 +.b8 122 +.b8 119 +.b8 108 +.b8 111 +.b8 118 +.b8 114 +.b8 101 +.b8 108 +.b8 54 +.b8 114 +.b8 101 +.b8 53 +.b8 121 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 55 +.b8 119 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 176 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 176 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttgir b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..4c08ff099f6a5e5ae194eb35f3f5357242962977 --- /dev/null +++ b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttgir @@ -0,0 +1,16 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + 
tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..516330e5c027335adcb5b898329b1a17a0146ec2 --- /dev/null +++ b/.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir @@ -0,0 +1,15 @@ +module { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32> + %4 = arith.addi %3, %2 : tensor<1024xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32> + tt.return + } +} diff --git a/.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ttgir b/.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..d3858d7765afd912afa77d470883f2e2c5b428d4 --- /dev/null +++ b/.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ttgir @@ -0,0 +1,110 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6e7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i64 {tt.max_divisibility = 8 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<7680> : tensor<1x2048xi64, #blocked> + %cst_0 = arith.constant dense<7680> : tensor<1x2048xi64, #blocked1> + %cst_1 = arith.constant dense<50257> : tensor<1x2048xi64, #blocked> + %c385973760_i64 = arith.constant 385973760 : i64 + %c7680_i64 = arith.constant 7680 : i64 + %c8_i64 = arith.constant 8 : i64 + %cst_2 = arith.constant dense<-1> : tensor<1x2048xi64, #blocked> + %cst_3 = arith.constant dense<0> : tensor<1x2048xi64, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked1> + %cst_5 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c7680_i32 = arith.constant 7680 : i32 + %c2048_i32 = arith.constant 2048 : i32 + %0 = 
tt.get_program_id x : i32 + %1 = arith.extsi %0 : i32 to i64 + %2 = arith.cmpi slt, %1, %c8_i64 : i64 + %3 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %4 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %5 = tt.expand_dims %3 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x2048xi32, #blocked> + %6 = tt.expand_dims %4 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x2048xi32, #blocked1> + %7 = arith.extsi %5 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> + %8 = arith.extsi %6 : tensor<1x2048xi32, #blocked1> to tensor<1x2048xi64, #blocked1> + %9 = arith.muli %1, %c7680_i64 : i64 + %10 = tt.splat %9 : (i64) -> tensor<1x2048xi64, #blocked> + %11 = tt.splat %arg0 : (!tt.ptr) -> tensor<1x2048x!tt.ptr, #blocked> + %12 = tt.splat %2 : (i1) -> tensor<1x2048xi1, #blocked> + %13 = tt.splat %2 : (i1) -> tensor<1x2048xi1, #blocked1> + %14 = tt.splat %arg2 : (!tt.ptr) -> tensor<1x2048x!tt.ptr, #blocked> + %15 = tt.splat %arg3 : (!tt.ptr) -> tensor<1x2048x!tt.ptr, #blocked> + %16 = arith.muli %1, %c385973760_i64 : i64 + %17 = tt.splat %16 : (i64) -> tensor<1x2048xi64, #blocked> + %18 = tt.splat %arg1 : (!tt.ptr) -> tensor<1x2048x!tt.ptr, #blocked> + %19:2 = scf.for %arg8 = %c0_i32 to %c7680_i32 step %c2048_i32 iter_args(%arg9 = %cst_4, %arg10 = %cst_3) -> (tensor<1x2048xf32, #blocked1>, tensor<1x2048xi64, #blocked>) : i32 { + %30 = arith.extsi %arg8 : i32 to i64 + %31 = tt.splat %30 : (i64) -> tensor<1x2048xi64, #blocked> + %32 = tt.splat %30 : (i64) -> tensor<1x2048xi64, #blocked1> + %33 = arith.addi %31, %7 : tensor<1x2048xi64, #blocked> + %34 = arith.addi %32, %8 : tensor<1x2048xi64, #blocked1> + %35 = arith.cmpi slt, %33, %cst : tensor<1x2048xi64, #blocked> + %36 = arith.cmpi slt, %34, %cst_0 : tensor<1x2048xi64, #blocked1> + %37 = arith.addi %33, %10 : tensor<1x2048xi64, #blocked> + %38 = tt.addptr %11, %37 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> + %39 = arith.andi %35, %12 : tensor<1x2048xi1, #blocked> + %40 = arith.andi %36, %13 : tensor<1x2048xi1, #blocked1> + %41 = tt.load %38, %39, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xi64, #blocked> + %42 = tt.addptr %14, %37 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> + %43 = tt.load %42, %39, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked> + %44 = triton_gpu.convert_layout %43 : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked1> + %45 = tt.addptr %15, %37 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> + %46 = tt.load %45, %39, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked> + %47 = arith.cmpi ne, %41, %cst_2 : tensor<1x2048xi64, #blocked> + %48 = triton_gpu.convert_layout %47 : (tensor<1x2048xi1, #blocked>) -> tensor<1x2048xi1, #blocked1> + %49 = arith.select %47, %41, %cst_3 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> + %50 = arith.addi %49, %cst_1 : tensor<1x2048xi64, #blocked> + %51 = arith.cmpi slt, %49, %cst_3 : tensor<1x2048xi64, #blocked> + %52 = arith.select %51, %50, %49 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> + %53 = arith.cmpi sge, %52, %cst_3 : tensor<1x2048xi64, #blocked> + %54 = arith.cmpi slt, %52, %cst_1 : tensor<1x2048xi64, #blocked> + 
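// %49-%54 wrap negative token ids Python-style (idx < 0 ? idx + 50257 : idx) and
+ // rebuild the 0 <= idx < 50257 range test consumed by the tt.assert just below;
+ // this is the device-side bounds check inductor emits for a vocab-sized gather.
+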
%55 = arith.andi %53, %54 : tensor<1x2048xi1, #blocked> + %56 = triton_gpu.convert_layout %55 : (tensor<1x2048xi1, #blocked>) -> tensor<1x2048xi1, #blocked2> + tt.assert %56, "index out of bounds: 0 <= tmp7 < 50257", "", "_call_with_frames_removed", 883 : tensor<1x2048xi1, #blocked2> + %57 = arith.muli %33, %cst_1 : tensor<1x2048xi64, #blocked> + %58 = arith.addi %52, %57 : tensor<1x2048xi64, #blocked> + %59 = arith.addi %58, %17 : tensor<1x2048xi64, #blocked> + %60 = tt.addptr %18, %59 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> + %61 = triton_gpu.convert_layout %60 : (tensor<1x2048x!tt.ptr, #blocked>) -> tensor<1x2048x!tt.ptr, #blocked1> + %62 = tt.load %61, %40, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked1> + %63 = arith.extf %62 : tensor<1x2048xbf16, #blocked1> to tensor<1x2048xf32, #blocked1> + %64 = arith.subf %63, %44 : tensor<1x2048xf32, #blocked1> + %65 = math.log %46 : tensor<1x2048xf32, #blocked> + %66 = triton_gpu.convert_layout %65 : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked1> + %67 = arith.subf %64, %66 : tensor<1x2048xf32, #blocked1> + %68 = arith.subf %cst_4, %67 : tensor<1x2048xf32, #blocked1> + %69 = arith.select %48, %68, %cst_4 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1> + %70 = arith.addf %arg9, %69 : tensor<1x2048xf32, #blocked1> + %71 = arith.select %40, %70, %arg9 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1> + %72 = arith.extui %47 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked> + %73 = arith.addi %arg10, %72 : tensor<1x2048xi64, #blocked> + %74 = arith.select %39, %73, %arg10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> + scf.yield %71, %74 : tensor<1x2048xf32, #blocked1>, tensor<1x2048xi64, #blocked> + } + %20 = "tt.reduce"(%19#0) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32): + %30 = arith.addf %arg8, %arg9 : f32 + tt.reduce.return %30 : f32 + }) : (tensor<1x2048xf32, #blocked1>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %21 = tt.expand_dims %20 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xf32, #blocked1> + %22 = tt.addptr %arg4, %1 : !tt.ptr, i64 + %23 = tt.splat %22 : (!tt.ptr) -> tensor<1x1x!tt.ptr, #blocked1> + %24 = tt.splat %2 : (i1) -> tensor<1x1xi1, #blocked1> + tt.store %23, %21, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked1> + %25 = "tt.reduce"(%19#1) <{axis = 1 : i32}> ({ + ^bb0(%arg8: i64, %arg9: i64): + %30 = arith.addi %arg8, %arg9 : i64 + tt.reduce.return %30 : i64 + }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %26 = triton_gpu.convert_layout %25 : (tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %27 = tt.expand_dims %26 {axis = 1 : i32} : (tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xi64, #blocked1> + %28 = tt.addptr %arg5, %1 : !tt.ptr, i64 + %29 = tt.splat %28 : (!tt.ptr) -> tensor<1x1x!tt.ptr, #blocked1> + tt.store %29, %27, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked1> + tt.return + } +} diff --git a/.triton/dump/962d1809855a53123762906133b1d960/triton_.cubin b/.triton/dump/962d1809855a53123762906133b1d960/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..6309ac46068b6c01e73157eb9b21cad780a64a8e Binary files /dev/null and 
b/.triton/dump/962d1809855a53123762906133b1d960/triton_.cubin differ diff --git a/.triton/dump/962d1809855a53123762906133b1d960/triton_.llir b/.triton/dump/962d1809855a53123762906133b1d960/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..4522fcc1c1a6412383fc4a972ab64db3b470ddcc --- /dev/null +++ b/.triton/dump/962d1809855a53123762906133b1d960/triton_.llir @@ -0,0 +1,48 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 { + %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %4 = shl i32 %3, 2, !dbg !8 + %5 = and i32 %4, 508, !dbg !8 + %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9 + %7 = shl i32 %6, 10, !dbg !10 + %8 = or i32 %7, %5, !dbg !11 + %9 = or i32 %8, 512, !dbg !11 + %10 = icmp slt i32 %8, 12865792, !dbg !12 + %11 = icmp slt i32 %9, 12865792, !dbg !12 + %12 = sext i32 %8 to i64, !dbg !13 + %13 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !13 + %14 = sext i32 %9 to i64, !dbg !13 + %15 = getelementptr float, ptr addrspace(1) %0, i64 %14, !dbg !13 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %13, i1 %10) #1, !dbg !14 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %15, i1 %11) #1, !dbg !14 + ret void, !dbg !15 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py", directory: "/tmp/torchinductor_root/4y") +!3 = !{ptr @triton__0d1de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1de, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 36, scope: !5) +!9 = !DILocation(line: 20, column: 28, scope: !5) +!10 = !DILocation(line: 20, column: 33, scope: !5) +!11 = !DILocation(line: 21, column: 23, scope: !5) +!12 = !DILocation(line: 22, column: 21, scope: !5) +!13 = !DILocation(line: 25, column: 25, scope: !5) +!14 = !DILocation(line: 25, column: 36, scope: !5) +!15 = !DILocation(line: 25, column: 4, scope: !5) diff --git a/.triton/dump/962d1809855a53123762906133b1d960/triton_.ptx b/.triton/dump/962d1809855a53123762906133b1d960/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..323c8829e1a7910879356166e3d9e1111f3dc748 --- /dev/null +++ b/.triton/dump/962d1809855a53123762906133b1d960/triton_.ptx @@ -0,0 +1,282 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1de + +.visible .entry triton__0d1de( + .param .u64 triton__0d1de_param_0, + .param .u32 
triton__0d1de_param_1 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<3>; + .reg .b32 %r<16>; + .reg .b64 %rd<5>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1de_param_0]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r10, %tid.x; + shl.b32 %r11, %r10, 2; + and.b32 %r12, %r11, 508; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r13, %r1, 10; + .loc 1 21 23 + or.b32 %r14, %r13, %r12; + or.b32 %r15, %r14, 512; + .loc 1 22 21 + setp.lt.s32 %p1, %r14, 12865792; + setp.lt.s32 %p2, %r15, 12865792; + .loc 1 25 25 + mul.wide.s32 %rd4, %r14, 4; + add.s64 %rd1, %rd3, %rd4; + add.s64 %rd2, %rd1, 2048; + mov.b32 %r2, 0; + .loc 1 25 36 + @%p1 st.global.v4.b32 [ %rd1 + 0 ], { %r2, %r2, %r2, %r2 }; + @%p2 st.global.v4.b32 [ %rd2 + 0 ], { %r2, %r2, %r2, %r2 }; + .loc 1 25 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/4y/c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 172 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 52 +.b8 121 +.b8 115 +.b8 101 +.b8 108 +.b8 100 +.b8 119 +.b8 109 +.b8 117 +.b8 51 +.b8 116 +.b8 111 +.b8 53 +.b8 50 +.b8 112 +.b8 98 +.b8 104 +.b8 50 +.b8 109 +.b8 100 +.b8 50 +.b8 111 +.b8 101 +.b8 117 +.b8 102 +.b8 114 +.b8 113 +.b8 51 +.b8 102 +.b8 99 +.b8 100 +.b8 109 +.b8 97 +.b8 112 +.b8 107 +.b8 116 +.b8 52 +.b8 110 +.b8 120 +.b8 100 +.b8 122 +.b8 109 +.b8 121 +.b8 113 +.b8 116 +.b8 103 +.b8 100 +.b8 50 +.b8 121 +.b8 115 +.b8 112 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 52 +.b8 121 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 176 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 176 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/962d1809855a53123762906133b1d960/triton_.ttgir b/.triton/dump/962d1809855a53123762906133b1d960/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..9303a3344f8c5f2257bf9e0d9f4444114f148b48 --- /dev/null +++ b/.triton/dump/962d1809855a53123762906133b1d960/triton_.ttgir @@ -0,0 +1,18 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], 
CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<12865792> : tensor<1024xi32, #blocked> + %c1024_i32 = arith.constant 1024 : i32 + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked> + %5 = arith.cmpi slt, %4, %cst : tensor<1024xi32, #blocked> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + tt.store %7, %cst_0, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/962d1809855a53123762906133b1d960/triton_.ttir b/.triton/dump/962d1809855a53123762906133b1d960/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..2962f6703fcea2c27fc6abb5ca101dc337f76a29 --- /dev/null +++ b/.triton/dump/962d1809855a53123762906133b1d960/triton_.ttir @@ -0,0 +1,17 @@ +module { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32> + %cst_0 = arith.constant dense<12865792> : tensor<1024xi32> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32> + %4 = arith.addi %3, %2 : tensor<1024xi32> + %5 = arith.cmpi slt, %4, %cst_0 : tensor<1024xi32> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + tt.store %7, %cst, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32> + tt.return + } +} diff --git a/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.cubin b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..8f9cfb6d6f4fa1cf2c1de42fb44e82da71cb734e Binary files /dev/null and b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.cubin differ diff --git a/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..ba7bb4edb122f5853209f6a24260d4f7100dadec --- /dev/null +++ b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir @@ -0,0 +1,368 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3d4d5d6d7d8d9d10de11de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, i32 %10, i32 %11) local_unnamed_addr !dbg !5 { + %13 = tail call 
i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %14 = and i32 %13, 31, !dbg !8 + %15 = lshr i32 %13, 5, !dbg !8 + %16 = shl i32 %13, 2, !dbg !8 + %17 = and i32 %16, 60, !dbg !8 + %18 = and i32 %15, 3, !dbg !8 + %19 = lshr i32 %14, 1, !dbg !8 + %20 = shl nuw nsw i32 %18, 4, !dbg !8 + %21 = or i32 %20, %19, !dbg !8 + %22 = and i32 %16, 4, !dbg !9 + %23 = lshr i32 %14, 4, !dbg !9 + %24 = shl nuw nsw i32 %18, 1, !dbg !9 + %25 = or i32 %24, %23, !dbg !9 + %26 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 + %27 = shl i32 %26, 6, !dbg !11 + %28 = or i32 %27, %17, !dbg !12 + %29 = or i32 %27, %21, !dbg !12 + %.frozen = freeze i32 %28 + %30 = sdiv i32 %.frozen, 256, !dbg !13 + %31 = mul i32 %30, 256 + %.decomposed = sub i32 %.frozen, %31 + %32 = sdiv i32 %29, 256, !dbg !13 + %33 = shl i32 %30, 15, !dbg !14 + %34 = shl nsw i32 %32, 7, !dbg !15 + %35 = add i32 %33, %.decomposed + %36 = mul nuw nsw i32 %17, 12 + %37 = or i32 %25, %36 + %38 = zext nneg i32 %37 to i64 + %39 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %38 + %40 = or i32 %36, 12 + %41 = add nuw nsw i32 %40, %25 + %42 = zext nneg i32 %41 to i64 + %43 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %42 + %44 = add nuw nsw i32 %36, 24 + %45 = or i32 %44, %25 + %46 = zext nneg i32 %45 to i64 + %47 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %46 + %48 = add nuw nsw i32 %36, 36 + %49 = add nuw nsw i32 %48, %25 + %50 = zext nneg i32 %49 to i64 + %51 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %50 + %52 = mul nuw nsw i32 %21, 12 + %53 = add nuw nsw i32 %52, %22 + %54 = zext nneg i32 %53 to i64 + %55 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %54 + %56 = getelementptr float, ptr addrspace(3) @global_smem, i64 %38 + %57 = getelementptr float, ptr addrspace(3) @global_smem, i64 %42 + %58 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46 + %59 = getelementptr float, ptr addrspace(3) @global_smem, i64 %50 + %60 = getelementptr float, ptr addrspace(3) @global_smem, i64 %54 + %61 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 1 + %62 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 2 + %63 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 3 + br label %64, !dbg !16 + +64: ; preds = %12, %64 + %65 = phi i32 [ 0, %12 ], [ %205, %64 ] + %66 = phi <8 x float> [ zeroinitializer, %12 ], [ %204, %64 ] + %67 = or i32 %65, %22, !dbg !17 + %68 = or i32 %65, %25, !dbg !17 + %69 = shl i32 %68, 8, !dbg !18 + %70 = add i32 %35, %69, !dbg !19 + %71 = sext i32 %70 to i64, !dbg !20 + %72 = getelementptr i16, ptr addrspace(1) %0, i64 %71, !dbg !20 + %73 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %72, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21 + %74 = extractvalue { i32, i32 } %73, 0, !dbg !21 + %75 = extractvalue { i32, i32 } %73, 1, !dbg !21 + %76 = trunc i32 %74 to i16, !dbg !21 + %extelt.offset = lshr i32 %74, 16, !dbg !21 + %77 = trunc i32 %extelt.offset to i16, !dbg !21 + %78 = trunc i32 %75 to i16, !dbg !21 + %extelt.offset1 = lshr i32 %75, 16, !dbg !21 + %79 = trunc i32 %extelt.offset1 to i16, !dbg !21 + tail call void @llvm.nvvm.barrier0(), !dbg !22 + %80 = insertelement <1 x i16> undef, i16 %76, i64 0, !dbg !22 + store <1 x i16> %80, ptr addrspace(3) %39, align 2, !dbg !22 + %81 = insertelement <1 x i16> undef, i16 
%77, i64 0, !dbg !22 + store <1 x i16> %81, ptr addrspace(3) %43, align 2, !dbg !22 + %82 = insertelement <1 x i16> undef, i16 %78, i64 0, !dbg !22 + store <1 x i16> %82, ptr addrspace(3) %47, align 2, !dbg !22 + %83 = insertelement <1 x i16> undef, i16 %79, i64 0, !dbg !22 + store <1 x i16> %83, ptr addrspace(3) %51, align 2, !dbg !22 + tail call void @llvm.nvvm.barrier0(), !dbg !22 + %84 = load i16, ptr addrspace(3) %55, align 8, !dbg !22 + %85 = load i16, ptr addrspace(3) %61, align 2, !dbg !22 + %86 = load i16, ptr addrspace(3) %62, align 4, !dbg !22 + %87 = load i16, ptr addrspace(3) %63, align 2, !dbg !22 + %88 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #3, !dbg !22 + %89 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #3, !dbg !22 + %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %86) #3, !dbg !22 + %91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %87) #3, !dbg !22 + %92 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !23 + %93 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %92, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !24 + %94 = extractvalue { i32, i32, i32, i32 } %93, 0, !dbg !24 + %95 = extractvalue { i32, i32, i32, i32 } %93, 1, !dbg !24 + %96 = extractvalue { i32, i32, i32, i32 } %93, 2, !dbg !24 + %97 = extractvalue { i32, i32, i32, i32 } %93, 3, !dbg !24 + %98 = bitcast i32 %94 to float, !dbg !24 + %99 = bitcast i32 %95 to float, !dbg !24 + %100 = bitcast i32 %96 to float, !dbg !24 + %101 = bitcast i32 %97 to float, !dbg !24 + tail call void @llvm.nvvm.barrier0(), !dbg !24 + %102 = insertelement <1 x float> undef, float %98, i64 0, !dbg !24 + store <1 x float> %102, ptr addrspace(3) %56, align 4, !dbg !24 + %103 = insertelement <1 x float> undef, float %99, i64 0, !dbg !24 + store <1 x float> %103, ptr addrspace(3) %57, align 4, !dbg !24 + %104 = insertelement <1 x float> undef, float %100, i64 0, !dbg !24 + store <1 x float> %104, ptr addrspace(3) %58, align 4, !dbg !24 + %105 = insertelement <1 x float> undef, float %101, i64 0, !dbg !24 + store <1 x float> %105, ptr addrspace(3) %59, align 4, !dbg !24 + tail call void @llvm.nvvm.barrier0(), !dbg !24 + %106 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !24 + %107 = getelementptr i16, ptr addrspace(1) %2, i64 %71, !dbg !25 + %108 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !26 + %109 = extractvalue { i32, i32 } %108, 0, !dbg !26 + %110 = extractvalue { i32, i32 } %108, 1, !dbg !26 + %111 = trunc i32 %109 to i16, !dbg !26 + %extelt.offset2 = lshr i32 %109, 16, !dbg !26 + %112 = trunc i32 %extelt.offset2 to i16, !dbg !26 + %113 = trunc i32 %110 to i16, !dbg !26 + %extelt.offset3 = lshr i32 %110, 16, !dbg !26 + %114 = trunc i32 %extelt.offset3 to i16, !dbg !26 + %115 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %111) #3, !dbg !27 + %116 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %112) #3, !dbg !27 + %117 = tail call float 
asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #3, !dbg !27 + %118 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #3, !dbg !27 + %119 = add i32 %67, %34, !dbg !28 + %120 = sext i32 %119 to i64, !dbg !29 + %121 = getelementptr float, ptr addrspace(1) %3, i64 %120, !dbg !29 + %122 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %121, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !30 + %123 = extractvalue { i32, i32, i32, i32 } %122, 0, !dbg !30 + %124 = extractvalue { i32, i32, i32, i32 } %122, 1, !dbg !30 + %125 = extractvalue { i32, i32, i32, i32 } %122, 2, !dbg !30 + %126 = extractvalue { i32, i32, i32, i32 } %122, 3, !dbg !30 + %127 = getelementptr float, ptr addrspace(1) %4, i64 %120, !dbg !31 + %128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %127, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !32 + %129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !32 + %130 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !32 + %131 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !32 + %132 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !32 + %133 = getelementptr i16, ptr addrspace(1) %5, i64 %71, !dbg !33 + %134 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %133, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !34 + %135 = extractvalue { i32, i32 } %134, 0, !dbg !34 + %136 = extractvalue { i32, i32 } %134, 1, !dbg !34 + %137 = trunc i32 %135 to i16, !dbg !34 + %extelt.offset4 = lshr i32 %135, 16, !dbg !34 + %138 = trunc i32 %extelt.offset4 to i16, !dbg !34 + %139 = trunc i32 %136 to i16, !dbg !34 + %extelt.offset5 = lshr i32 %136, 16, !dbg !34 + %140 = trunc i32 %extelt.offset5 to i16, !dbg !34 + tail call void @llvm.nvvm.barrier0(), !dbg !35 + %141 = insertelement <1 x i16> undef, i16 %137, i64 0, !dbg !35 + store <1 x i16> %141, ptr addrspace(3) %39, align 2, !dbg !35 + %142 = insertelement <1 x i16> undef, i16 %138, i64 0, !dbg !35 + store <1 x i16> %142, ptr addrspace(3) %43, align 2, !dbg !35 + %143 = insertelement <1 x i16> undef, i16 %139, i64 0, !dbg !35 + store <1 x i16> %143, ptr addrspace(3) %47, align 2, !dbg !35 + %144 = insertelement <1 x i16> undef, i16 %140, i64 0, !dbg !35 + store <1 x i16> %144, ptr addrspace(3) %51, align 2, !dbg !35 + tail call void @llvm.nvvm.barrier0(), !dbg !35 + %145 = load i16, ptr addrspace(3) %55, align 8, !dbg !35 + %146 = load i16, ptr addrspace(3) %61, align 2, !dbg !35 + %147 = load i16, ptr addrspace(3) %62, align 4, !dbg !35 + %148 = load i16, ptr addrspace(3) %63, align 2, !dbg !35 + %149 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %145) #3, !dbg !35 + %150 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 
%146) #3, !dbg !35 + %151 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %147) #3, !dbg !35 + %152 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %148) #3, !dbg !35 + %153 = getelementptr float, ptr addrspace(1) %6, i64 %120, !dbg !36 + %154 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %153, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !37 + %155 = extractvalue { i32, i32, i32, i32 } %154, 0, !dbg !37 + %156 = extractvalue { i32, i32, i32, i32 } %154, 1, !dbg !37 + %157 = extractvalue { i32, i32, i32, i32 } %154, 2, !dbg !37 + %158 = extractvalue { i32, i32, i32, i32 } %154, 3, !dbg !37 + %159 = getelementptr float, ptr addrspace(1) %7, i64 %120, !dbg !38 + %160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %159, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !39 + %161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !39 + %162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !39 + %163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !39 + %164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !39 + %165 = fadd float %115, %98, !dbg !40 + %166 = fadd float %116, %99, !dbg !40 + %167 = fadd float %117, %100, !dbg !40 + %168 = fadd float %118, %101, !dbg !40 + tail call void @llvm.nvvm.barrier0(), !dbg !40 + %169 = insertelement <1 x float> undef, float %165, i64 0, !dbg !40 + store <1 x float> %169, ptr addrspace(3) %56, align 4, !dbg !40 + %170 = insertelement <1 x float> undef, float %166, i64 0, !dbg !40 + store <1 x float> %170, ptr addrspace(3) %57, align 4, !dbg !40 + %171 = insertelement <1 x float> undef, float %167, i64 0, !dbg !40 + store <1 x float> %171, ptr addrspace(3) %58, align 4, !dbg !40 + %172 = insertelement <1 x float> undef, float %168, i64 0, !dbg !40 + store <1 x float> %172, ptr addrspace(3) %59, align 4, !dbg !40 + tail call void @llvm.nvvm.barrier0(), !dbg !40 + %173 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !40 + %174 = insertelement <8 x i32> poison, i32 %155, i64 0, !dbg !37 + %175 = insertelement <8 x i32> %174, i32 %156, i64 1, !dbg !37 + %176 = insertelement <8 x i32> %175, i32 %157, i64 2, !dbg !37 + %177 = insertelement <8 x i32> %176, i32 %158, i64 3, !dbg !37 + %178 = insertelement <8 x i32> %177, i32 %123, i64 4, !dbg !37 + %179 = insertelement <8 x i32> %178, i32 %124, i64 5, !dbg !37 + %180 = insertelement <8 x i32> %179, i32 %125, i64 6, !dbg !37 + %181 = insertelement <8 x i32> %180, i32 %126, i64 7, !dbg !37 + %182 = bitcast <8 x i32> %181 to <8 x float>, !dbg !37 + %183 = insertelement <8 x i32> poison, i32 %161, i64 0, !dbg !39 + %184 = insertelement <8 x i32> %183, i32 %162, i64 1, !dbg !39 + %185 = insertelement <8 x i32> %184, i32 %163, i64 2, !dbg !39 + %186 = insertelement <8 x i32> %185, i32 %164, i64 3, !dbg !39 + %187 = insertelement <8 x i32> %186, i32 %129, i64 4, !dbg !39 + %188 = 
insertelement <8 x i32> %187, i32 %130, i64 5, !dbg !39 + %189 = insertelement <8 x i32> %188, i32 %131, i64 6, !dbg !39 + %190 = insertelement <8 x i32> %189, i32 %132, i64 7, !dbg !39 + %191 = bitcast <8 x i32> %190 to <8 x float>, !dbg !39 + %192 = shufflevector <4 x float> %106, <4 x float> %173, <8 x i32> , !dbg !41 + %193 = fsub <8 x float> %192, %182, !dbg !41 + %194 = fmul <8 x float> %193, %191, !dbg !42 + %195 = insertelement <8 x float> poison, float %149, i64 0, !dbg !43 + %196 = insertelement <8 x float> %195, float %150, i64 1, !dbg !43 + %197 = insertelement <8 x float> %196, float %151, i64 2, !dbg !43 + %198 = insertelement <8 x float> %197, float %152, i64 3, !dbg !43 + %199 = insertelement <8 x float> %198, float %88, i64 4, !dbg !43 + %200 = insertelement <8 x float> %199, float %89, i64 5, !dbg !43 + %201 = insertelement <8 x float> %200, float %90, i64 6, !dbg !43 + %202 = insertelement <8 x float> %201, float %91, i64 7, !dbg !43 + %203 = fmul <8 x float> %202, %194, !dbg !43 + %204 = fadd <8 x float> %66, %203, !dbg !44 + %205 = add nuw nsw i32 %65, 8, !dbg !16 + %206 = icmp ult i32 %65, 120, !dbg !16 + br i1 %206, label %64, label %207, !dbg !16 + +207: ; preds = %64 + %208 = and i32 %13, 63, !dbg !8 + %209 = or i32 %27, %208, !dbg !12 + %shift = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> , !dbg !45 + %210 = fadd <8 x float> %204, %shift, !dbg !45 + %shift28 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> , !dbg !45 + %211 = fadd <8 x float> %shift28, %210, !dbg !45 + %shift29 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> , !dbg !45 + %212 = fadd <8 x float> %shift29, %211, !dbg !45 + %213 = extractelement <8 x float> %212, i64 4, !dbg !45 + %214 = bitcast float %213 to i32, !dbg !51 + %215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 1, i32 31), !dbg !51 + %216 = bitcast i32 %215 to float, !dbg !51 + %217 = fadd float %213, %216, !dbg !45 + tail call void @llvm.nvvm.barrier0(), !dbg !53 + %218 = zext nneg i32 %21 to i64, !dbg !53 + %219 = getelementptr float, ptr addrspace(3) @global_smem, i64 %218, !dbg !53 + %220 = insertelement <1 x float> undef, float %217, i64 0, !dbg !53 + store <1 x float> %220, ptr addrspace(3) %219, align 4, !dbg !53 + tail call void @llvm.nvvm.barrier0(), !dbg !53 + %221 = zext nneg i32 %208 to i64, !dbg !53 + %222 = getelementptr float, ptr addrspace(3) @global_smem, i64 %221, !dbg !53 + %223 = load i32, ptr addrspace(3) %222, align 4, !dbg !53 + %224 = sext i32 %209 to i64, !dbg !54 + %225 = getelementptr float, ptr addrspace(1) %8, i64 %224, !dbg !54 + %226 = and i32 %13, 64, !dbg !55 + %227 = icmp eq i32 %226, 0, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %223, ptr addrspace(1) %225, i1 %227) #3, !dbg !55 + %shift30 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> , !dbg !56 + %228 = fadd <8 x float> %204, %shift30, !dbg !56 + %shift31 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> , !dbg !56 + %229 = fadd <8 x float> %shift31, %228, !dbg !56 + %shift32 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> , !dbg !56 + %230 = fadd <8 x float> %shift32, %229, !dbg !56 + %231 = extractelement <8 x float> %230, i64 0, !dbg !56 + %232 = bitcast float %231 to i32, !dbg !59 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !59 + %234 = bitcast i32 %233 to float, !dbg !59 + %235 = fadd float %231, %234, !dbg !56 + tail call void 
@llvm.nvvm.barrier0(), !dbg !61 + %236 = insertelement <1 x float> undef, float %235, i64 0, !dbg !61 + store <1 x float> %236, ptr addrspace(3) %219, align 4, !dbg !61 + tail call void @llvm.nvvm.barrier0(), !dbg !61 + %237 = load i32, ptr addrspace(3) %222, align 4, !dbg !61 + %238 = getelementptr float, ptr addrspace(1) %9, i64 %224, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %237, ptr addrspace(1) %238, i1 %227) #3, !dbg !63 + ret void, !dbg !64 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c3xxszvgtfnjb7welqvr33z4cqouxhqjy3dpwa2qmmx2xto6sgvz.py", directory: "/tmp/torchinductor_root/3x") +!3 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", linkageName: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 21, column: 28, scope: !5) +!11 = !DILocation(line: 21, column: 33, scope: !5) +!12 = !DILocation(line: 22, column: 23, scope: !5) +!13 = !DILocation(line: 26, column: 20, scope: !5) +!14 = !DILocation(line: 34, column: 57, scope: !5) +!15 = !DILocation(line: 37, column: 44, scope: !5) +!16 = !DILocation(line: 30, column: 36, scope: !5) +!17 = !DILocation(line: 31, column: 27, scope: !5) +!18 = !DILocation(line: 34, column: 44, scope: !5) +!19 = !DILocation(line: 34, column: 51, scope: !5) +!20 = !DILocation(line: 34, column: 34, scope: !5) +!21 = !DILocation(line: 34, column: 63, scope: !5) +!22 = !DILocation(line: 34, column: 115, scope: !5) +!23 = !DILocation(line: 35, column: 34, scope: !5) +!24 = !DILocation(line: 35, column: 63, scope: !5) +!25 = !DILocation(line: 36, column: 34, scope: !5) +!26 = !DILocation(line: 36, column: 63, scope: !5) +!27 = !DILocation(line: 36, column: 115, scope: !5) +!28 = !DILocation(line: 37, column: 40, scope: !5) +!29 = !DILocation(line: 37, column: 34, scope: !5) +!30 = !DILocation(line: 37, column: 50, scope: !5) +!31 = !DILocation(line: 38, column: 34, scope: !5) +!32 = !DILocation(line: 38, column: 50, scope: !5) +!33 = !DILocation(line: 39, column: 35, scope: !5) +!34 = !DILocation(line: 39, column: 64, scope: !5) +!35 = !DILocation(line: 39, column: 116, scope: !5) +!36 = !DILocation(line: 40, column: 35, scope: !5) 
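+; Note: locations !8 through !44 map the fused load/subtract/multiply/accumulate loop
+; back to the inductor kernel source; !45 onward cover the two sum reductions inlined
+; from triton/language/standard.py (lines 233 and 243).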
+!37 = !DILocation(line: 40, column: 51, scope: !5) +!38 = !DILocation(line: 41, column: 35, scope: !5) +!39 = !DILocation(line: 41, column: 51, scope: !5) +!40 = !DILocation(line: 44, column: 22, scope: !5) +!41 = !DILocation(line: 52, column: 23, scope: !5) +!42 = !DILocation(line: 53, column: 24, scope: !5) +!43 = !DILocation(line: 54, column: 24, scope: !5) +!44 = !DILocation(line: 57, column: 40, scope: !5) +!45 = !DILocation(line: 233, column: 15, scope: !46, inlinedAt: !49) +!46 = distinct !DILexicalBlockFile(scope: !48, file: !47, discriminator: 0) +!47 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!48 = distinct !DILexicalBlockFile(scope: !5, file: !47, discriminator: 0) +!49 = !DILocation(line: 243, column: 36, scope: !46, inlinedAt: !50) +!50 = !DILocation(line: 58, column: 27, scope: !46) +!51 = !DILocation(line: 243, column: 36, scope: !48, inlinedAt: !52) +!52 = !DILocation(line: 58, column: 27, scope: !48) +!53 = !DILocation(line: 58, column: 30, scope: !5) +!54 = !DILocation(line: 59, column: 25, scope: !5) +!55 = !DILocation(line: 59, column: 37, scope: !5) +!56 = !DILocation(line: 233, column: 15, scope: !46, inlinedAt: !57) +!57 = !DILocation(line: 243, column: 36, scope: !46, inlinedAt: !58) +!58 = !DILocation(line: 60, column: 27, scope: !46) +!59 = !DILocation(line: 243, column: 36, scope: !48, inlinedAt: !60) +!60 = !DILocation(line: 60, column: 27, scope: !48) +!61 = !DILocation(line: 60, column: 30, scope: !5) +!62 = !DILocation(line: 61, column: 25, scope: !5) +!63 = !DILocation(line: 61, column: 37, scope: !5) +!64 = !DILocation(line: 61, column: 4, scope: !5) diff --git a/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ptx b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..0d82479629540e2ba5a47d65f293a823b6aab5e6 --- /dev/null +++ b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ptx @@ -0,0 +1,771 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7d8d9d10de11de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3d4d5d6d7d8d9d10de11de( + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9, + .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_10, + .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_11 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<38>; + .reg .b16 %rs<13>; + .reg .b32 %r<135>; + .reg .f32 %f<103>; + .reg .b64 %rd<41>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9]; + ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8]; + ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5]; + ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2]; + ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1]; + ld.param.u64 %rd13, 
[triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3]; + shl.b32 %r17, %r1, 2; + ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4]; + and.b32 %r18, %r17, 60; + bfe.u32 %r19, %r1, 5, 2; + ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6]; + bfe.u32 %r20, %r1, 1, 4; + ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7]; + shl.b32 %r21, %r19, 4; + or.b32 %r2, %r21, %r20; + .loc 1 24 33 + and.b32 %r22, %r17, 4; + bfe.u32 %r23, %r1, 4, 1; + shl.b32 %r24, %r19, 1; + or.b32 %r25, %r24, %r23; + .loc 1 21 28 + mov.u32 %r15, %ctaid.x; + .loc 1 21 33 + shl.b32 %r3, %r15, 6; + .loc 1 22 23 + or.b32 %r26, %r3, %r18; + or.b32 %r27, %r3, %r2; + .loc 1 26 20 + shr.s32 %r29, %r26, 31; + shr.u32 %r30, %r29, 24; + add.s32 %r31, %r26, %r30; + shr.s32 %r32, %r31, 8; + bfe.s32 %r33, %r15, 25, 1; + shr.u32 %r34, %r33, 24; + add.s32 %r35, %r27, %r34; + shr.s32 %r36, %r35, 8; + .loc 1 37 44 + shl.b32 %r37, %r36, 7; + mul.lo.s32 %r38, %r18, 12; + or.b32 %r39, %r25, %r38; + shl.b32 %r40, %r39, 1; + mov.u32 %r41, global_smem; + add.s32 %r4, %r41, %r40; + mad.lo.s32 %r42, %r2, 12, %r22; + shl.b32 %r43, %r42, 1; + add.s32 %r6, %r41, %r43; + shl.b32 %r44, %r39, 2; + add.s32 %r7, %r41, %r44; + shl.b32 %r45, %r42, 2; + add.s32 %r9, %r41, %r45; + .loc 1 30 36 + mad.lo.s32 %r46, %r32, 32512, %r26; + shl.b32 %r47, %r19, 9; + add.s32 %r48, %r46, %r47; + shl.b32 %r49, %r23, 8; + add.s32 %r133, %r48, %r49; + or.b32 %r50, %r37, %r22; + mul.wide.s32 %rd23, %r50, 4; + add.s64 %rd40, %rd22, %rd23; + add.s64 %rd39, %rd21, %rd23; + add.s64 %rd38, %rd20, %rd23; + add.s64 %rd37, %rd19, %rd23; + mov.f32 %f95, 0f00000000; + mov.b32 %r134, -8; + mov.pred %p1, -1; + mov.f32 %f96, %f95; + mov.f32 %f97, %f95; + mov.f32 %f98, %f95; + mov.f32 %f99, %f95; + mov.f32 %f100, %f95; + mov.f32 %f101, %f95; + mov.f32 %f102, %f95; +$L__BB0_1: + .loc 1 34 34 + mul.wide.s32 %rd32, %r133, 2; + add.s64 %rd24, %rd13, %rd32; + mov.b32 %r53, 0; + .loc 1 34 63 + mov.u32 %r51, 0x0; + mov.u32 %r52, 0x0; + @%p1 ld.global.L1::evict_first.v2.b32 { %r51, %r52 }, [ %rd24 + 0 ]; + @!%p1 mov.u32 %r51, %r53; + @!%p1 mov.u32 %r52, %r53; + shr.u32 %r115, %r51, 16; + shr.u32 %r116, %r52, 16; + .loc 1 34 115 + bar.sync 0; + st.shared.u16 [%r4], %r51; + st.shared.u16 [%r4+24], %r115; + st.shared.u16 [%r4+48], %r52; + st.shared.u16 [%r4+72], %r116; + bar.sync 0; + ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%r6]; + cvt.f32.bf16 %r55, %rs1; + mov.b32 %f25, %r55; + cvt.f32.bf16 %r56, %rs2; + mov.b32 %f26, %r56; + cvt.f32.bf16 %r57, %rs3; + mov.b32 %f27, %r57; + cvt.f32.bf16 %r58, %rs4; + mov.b32 %f28, %r58; + .loc 1 35 34 + mul.wide.s32 %rd33, %r133, 4; + add.s64 %rd25, %rd14, %rd33; + .loc 1 35 63 + mov.u32 %r59, 0x0; + mov.u32 %r60, 0x0; + mov.u32 %r61, 0x0; + mov.u32 %r62, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r59, %r60, %r61, %r62 }, [ %rd25 + 0 ]; + @!%p1 mov.u32 %r59, %r53; + @!%p1 mov.u32 %r60, %r53; + @!%p1 mov.u32 %r61, %r53; + @!%p1 mov.u32 %r62, %r53; + mov.b32 %f29, %r59; + mov.b32 %f30, %r60; + mov.b32 %f31, %r61; + mov.b32 %f32, %r62; + bar.sync 0; + st.shared.u32 [%r7], %r59; + st.shared.u32 [%r7+48], %r60; + st.shared.u32 [%r7+96], %r61; + st.shared.u32 [%r7+144], %r62; + bar.sync 0; + ld.shared.v4.f32 {%f33, %f34, %f35, %f36}, [%r9]; + .loc 1 36 34 + add.s64 %rd26, %rd15, %rd32; + .loc 1 36 63 + mov.u32 %r67, 0x0; + mov.u32 %r68, 0x0; + @%p1 ld.global.L1::evict_first.v2.b32 { %r67, %r68 }, [ 
%rd26 + 0 ]; + @!%p1 mov.u32 %r67, %r53; + @!%p1 mov.u32 %r68, %r53; + cvt.u16.u32 %rs5, %r67; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r67; } + cvt.u16.u32 %rs7, %r68; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r68; } + .loc 1 36 115 + cvt.f32.bf16 %r71, %rs5; + mov.b32 %f37, %r71; + cvt.f32.bf16 %r72, %rs6; + mov.b32 %f38, %r72; + cvt.f32.bf16 %r73, %rs7; + mov.b32 %f39, %r73; + cvt.f32.bf16 %r74, %rs8; + mov.b32 %f40, %r74; + .loc 1 37 50 + mov.u32 %r75, 0x0; + mov.u32 %r76, 0x0; + mov.u32 %r77, 0x0; + mov.u32 %r78, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r75, %r76, %r77, %r78 }, [ %rd37 + 0 ]; + @!%p1 mov.u32 %r75, %r53; + @!%p1 mov.u32 %r76, %r53; + @!%p1 mov.u32 %r77, %r53; + @!%p1 mov.u32 %r78, %r53; + .loc 1 38 50 + mov.u32 %r83, 0x0; + mov.u32 %r84, 0x0; + mov.u32 %r85, 0x0; + mov.u32 %r86, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r83, %r84, %r85, %r86 }, [ %rd38 + 0 ]; + @!%p1 mov.u32 %r83, %r53; + @!%p1 mov.u32 %r84, %r53; + @!%p1 mov.u32 %r85, %r53; + @!%p1 mov.u32 %r86, %r53; + .loc 1 39 35 + add.s64 %rd29, %rd16, %rd32; + .loc 1 39 64 + mov.u32 %r91, 0x0; + mov.u32 %r92, 0x0; + @%p1 ld.global.L1::evict_first.v2.b32 { %r91, %r92 }, [ %rd29 + 0 ]; + @!%p1 mov.u32 %r91, %r53; + @!%p1 mov.u32 %r92, %r53; + shr.u32 %r117, %r91, 16; + shr.u32 %r118, %r92, 16; + .loc 1 39 116 + bar.sync 0; + st.shared.u16 [%r4], %r91; + st.shared.u16 [%r4+24], %r117; + st.shared.u16 [%r4+48], %r92; + st.shared.u16 [%r4+72], %r118; + bar.sync 0; + ld.shared.v4.u16 {%rs9, %rs10, %rs11, %rs12}, [%r6]; + cvt.f32.bf16 %r95, %rs9; + mov.b32 %f41, %r95; + cvt.f32.bf16 %r96, %rs10; + mov.b32 %f42, %r96; + cvt.f32.bf16 %r97, %rs11; + mov.b32 %f43, %r97; + cvt.f32.bf16 %r98, %rs12; + mov.b32 %f44, %r98; + .loc 1 40 51 + mov.u32 %r99, 0x0; + mov.u32 %r100, 0x0; + mov.u32 %r101, 0x0; + mov.u32 %r102, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd39 + 0 ]; + @!%p1 mov.u32 %r99, %r53; + @!%p1 mov.u32 %r100, %r53; + @!%p1 mov.u32 %r101, %r53; + @!%p1 mov.u32 %r102, %r53; + .loc 1 41 51 + mov.u32 %r107, 0x0; + mov.u32 %r108, 0x0; + mov.u32 %r109, 0x0; + mov.u32 %r110, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r107, %r108, %r109, %r110 }, [ %rd40 + 0 ]; + @!%p1 mov.u32 %r107, %r53; + @!%p1 mov.u32 %r108, %r53; + @!%p1 mov.u32 %r109, %r53; + @!%p1 mov.u32 %r110, %r53; + .loc 1 44 22 + add.f32 %f45, %f37, %f29; + add.f32 %f46, %f38, %f30; + add.f32 %f47, %f39, %f31; + add.f32 %f48, %f40, %f32; + bar.sync 0; + st.shared.f32 [%r7], %f45; + st.shared.f32 [%r7+48], %f46; + st.shared.f32 [%r7+96], %f47; + st.shared.f32 [%r7+144], %f48; + bar.sync 0; + ld.shared.v4.f32 {%f49, %f50, %f51, %f52}, [%r9]; + .loc 1 40 51 + mov.b32 %f53, %r75; + mov.b32 %f54, %r76; + mov.b32 %f55, %r77; + mov.b32 %f56, %r78; + mov.b32 %f57, %r99; + mov.b32 %f58, %r100; + mov.b32 %f59, %r101; + mov.b32 %f60, %r102; + .loc 1 41 51 + mov.b32 %f61, %r110; + mov.b32 %f62, %r109; + mov.b32 %f63, %r108; + mov.b32 %f64, %r107; + mov.b32 %f65, %r86; + mov.b32 %f66, %r85; + mov.b32 %f67, %r84; + mov.b32 %f68, %r83; + .loc 1 52 23 + sub.f32 %f69, %f36, %f60; + sub.f32 %f70, %f35, %f59; + sub.f32 %f71, %f34, %f58; + sub.f32 %f72, %f33, %f57; + sub.f32 %f73, %f52, %f56; + sub.f32 %f74, %f51, %f55; + sub.f32 %f75, %f50, %f54; + sub.f32 %f76, %f49, %f53; + .loc 1 53 24 + mul.f32 %f77, %f76, %f68; + mul.f32 %f78, %f75, %f67; + mul.f32 %f79, %f74, %f66; + mul.f32 %f80, %f73, %f65; + mul.f32 %f81, %f72, %f64; + mul.f32 %f82, %f71, %f63; + mul.f32 %f83, %f70, %f62; + mul.f32 %f84, %f69, %f61; + .loc 1 57 40 + 
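// Eight FMA chains keep the two running sums (%f95-%f98 and %f99-%f102) resident in
+ // registers across all 16 trips of $L__BB0_1 (rdim of 128, 8 columns per trip).
+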
fma.rn.f32 %f98, %f44, %f84, %f98; + fma.rn.f32 %f97, %f43, %f83, %f97; + fma.rn.f32 %f96, %f42, %f82, %f96; + fma.rn.f32 %f95, %f41, %f81, %f95; + fma.rn.f32 %f102, %f28, %f80, %f102; + fma.rn.f32 %f101, %f27, %f79, %f101; + fma.rn.f32 %f100, %f26, %f78, %f100; + fma.rn.f32 %f99, %f25, %f77, %f99; + .loc 1 30 36 + add.s32 %r134, %r134, 8; + add.s32 %r133, %r133, 2048; + add.s64 %rd40, %rd40, 32; + add.s64 %rd39, %rd39, 32; + add.s64 %rd38, %rd38, 32; + add.s64 %rd37, %rd37, 32; + setp.lt.u32 %p35, %r134, 120; + @%p35 bra $L__BB0_1; + .loc 1 22 44 + and.b32 %r121, %r1, 63; + .loc 1 22 23 + or.b32 %r122, %r3, %r121; +$L__tmp1: + .loc 2 233 15 + add.f32 %f85, %f99, %f100; + add.f32 %f86, %f101, %f85; + add.f32 %f87, %f102, %f86; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r123, %f87; + shfl.sync.bfly.b32 %r124, %r123, 1, 31, -1; + mov.b32 %f88, %r124; +$L__tmp3: + .loc 2 233 15 + add.f32 %f89, %f87, %f88; +$L__tmp4: + .loc 1 58 30 + bar.sync 0; + shl.b32 %r125, %r2, 2; + add.s32 %r127, %r41, %r125; + st.shared.f32 [%r127], %f89; + bar.sync 0; + shl.b32 %r128, %r121, 2; + add.s32 %r129, %r41, %r128; + ld.shared.u32 %r119, [%r129]; + .loc 1 59 25 + mul.wide.s32 %rd36, %r122, 4; + add.s64 %rd34, %rd17, %rd36; + .loc 1 59 37 + and.b32 %r130, %r1, 64; + setp.eq.s32 %p36, %r130, 0; + @%p36 st.global.b32 [ %rd34 + 0 ], { %r119 }; +$L__tmp5: + .loc 2 233 15 + add.f32 %f90, %f95, %f96; + add.f32 %f91, %f97, %f90; + add.f32 %f92, %f98, %f91; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r131, %f92; + shfl.sync.bfly.b32 %r132, %r131, 1, 31, -1; + mov.b32 %f93, %r132; +$L__tmp7: + .loc 2 233 15 + add.f32 %f94, %f92, %f93; +$L__tmp8: + .loc 1 60 30 + bar.sync 0; + st.shared.f32 [%r127], %f94; + bar.sync 0; + ld.shared.u32 %r120, [%r129]; + .loc 1 61 25 + add.s64 %rd35, %rd18, %rd36; + .loc 1 61 37 + @%p36 st.global.b32 [ %rd35 + 0 ], { %r120 }; + .loc 1 61 4 + ret; +$L__tmp9: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/3x/c3xxszvgtfnjb7welqvr33z4cqouxhqjy3dpwa2qmmx2xto6sgvz.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 371 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 51 +.b8 120 +.b8 120 +.b8 115 +.b8 122 +.b8 118 +.b8 103 +.b8 116 +.b8 102 +.b8 110 +.b8 106 +.b8 98 +.b8 55 +.b8 119 +.b8 101 +.b8 108 +.b8 113 +.b8 118 +.b8 114 +.b8 51 +.b8 51 +.b8 122 +.b8 52 +.b8 99 +.b8 113 +.b8 111 +.b8 117 +.b8 120 +.b8 104 +.b8 113 +.b8 106 +.b8 121 +.b8 51 +.b8 100 +.b8 112 +.b8 119 +.b8 97 +.b8 50 +.b8 113 +.b8 109 +.b8 109 +.b8 120 +.b8 50 +.b8 120 +.b8 116 +.b8 111 +.b8 54 +.b8 115 +.b8 103 +.b8 118 +.b8 122 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 
100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 51 +.b8 120 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 101 +.b8 49 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 101 +.b8 49 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp4 +.b8 2 +.b8 58 +.b8 27 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp4 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp3 +.b8 2 +.b8 58 +.b8 27 +.b8 4 +.b32 125 +.b64 $L__tmp5 +.b64 $L__tmp8 +.b8 2 +.b8 60 +.b8 27 +.b8 5 +.b32 125 +.b64 $L__tmp5 +.b64 $L__tmp8 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp6 +.b64 $L__tmp7 +.b8 2 +.b8 60 +.b8 27 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 375 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 101 +.b8 49 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 375 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..18a3ae9222737ae7ea7dd913c7f107d1a193a44a --- /dev/null +++ b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir @@ -0,0 +1,127 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: 
!tt.ptr {tt.divisibility = 16 : i32}, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi32, #blocked> + %cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked1> + %cst_1 = arith.constant dense<128> : tensor<64x1xi32, #blocked1> + %cst_2 = arith.constant dense<32768> : tensor<64x1xi32, #blocked> + %cst_3 = arith.constant dense<256> : tensor<1x8xi32, #blocked> + %cst_4 = arith.constant dense<128> : tensor<1x8xi32, #blocked1> + %cst_5 = arith.constant dense<128> : tensor<1x8xi32, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c128_i32 = arith.constant 128 : i32 + %c8_i32 = arith.constant 8 : i32 + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked1> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> + %5 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked> + %6 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1> + %7 = tt.expand_dims %4 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xi32, #blocked2> + %8 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked> + %9 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1> + %10 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked2> + %11 = arith.addi %8, %5 : tensor<64x1xi32, #blocked> + %12 = arith.addi %9, %6 : tensor<64x1xi32, #blocked1> + %13 = arith.addi %10, %7 : tensor<64x1xi32, #blocked2> + %14 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %15 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %16 = tt.expand_dims %14 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x8xi32, #blocked1> + %17 = tt.expand_dims %15 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked> + %18 = arith.remsi %11, %cst : tensor<64x1xi32, #blocked> + %19 = arith.divsi %11, %cst : tensor<64x1xi32, #blocked> + %20 = arith.divsi %12, %cst_0 : tensor<64x1xi32, #blocked1> + %21 = tt.broadcast %18 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %22 = arith.muli %19, %cst_2 : tensor<64x1xi32, #blocked> + %23 = tt.broadcast %22 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %24 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %25 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %26 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %27 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked1> + %28 = tt.broadcast %27 : (tensor<64x1xi32, 
#blocked1>) -> tensor<64x8xi32, #blocked1> + %29 = tt.splat %arg3 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked1> + %30 = tt.splat %arg4 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked1> + %31 = tt.splat %arg5 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %32 = tt.splat %arg6 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked1> + %33 = tt.splat %arg7 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked1> + %34:2 = scf.for %arg12 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg13 = %cst_6, %arg14 = %cst_6) -> (tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1>) : i32 { + %45 = tt.splat %arg12 : (i32) -> tensor<1x8xi32, #blocked1> + %46 = tt.splat %arg12 : (i32) -> tensor<1x8xi32, #blocked> + %47 = arith.addi %45, %16 : tensor<1x8xi32, #blocked1> + %48 = arith.addi %46, %17 : tensor<1x8xi32, #blocked> + %49 = arith.cmpi slt, %47, %cst_4 : tensor<1x8xi32, #blocked1> + %50 = arith.cmpi slt, %48, %cst_5 : tensor<1x8xi32, #blocked> + %51 = arith.muli %48, %cst_3 : tensor<1x8xi32, #blocked> + %52 = tt.broadcast %51 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %53 = arith.addi %21, %52 : tensor<64x8xi32, #blocked> + %54 = arith.addi %53, %23 : tensor<64x8xi32, #blocked> + %55 = tt.addptr %24, %54 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %56 = tt.broadcast %49 : (tensor<1x8xi1, #blocked1>) -> tensor<64x8xi1, #blocked1> + %57 = tt.broadcast %50 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked> + %58 = tt.load %55, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked> + %59 = triton_gpu.convert_layout %58 : (tensor<64x8xbf16, #blocked>) -> tensor<64x8xbf16, #blocked1> + %60 = arith.extf %59 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> + %61 = tt.addptr %25, %54 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %62 = tt.load %61, %57, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> + %63 = triton_gpu.convert_layout %62 : (tensor<64x8xf32, #blocked>) -> tensor<64x8xf32, #blocked1> + %64 = tt.addptr %26, %54 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %65 = tt.load %64, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked> + %66 = arith.extf %65 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> + %67 = tt.broadcast %47 : (tensor<1x8xi32, #blocked1>) -> tensor<64x8xi32, #blocked1> + %68 = arith.addi %67, %28 : tensor<64x8xi32, #blocked1> + %69 = tt.addptr %29, %68 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> + %70 = tt.load %69, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1> + %71 = tt.addptr %30, %68 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> + %72 = tt.load %71, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1> + %73 = tt.addptr %31, %54 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %74 = tt.load %73, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked> + %75 = triton_gpu.convert_layout %74 : (tensor<64x8xbf16, #blocked>) -> tensor<64x8xbf16, #blocked1> + %76 = arith.extf %75 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> + %77 = tt.addptr %32, %68 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> + %78 = tt.load %77, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1> + %79 = tt.addptr %33, %68 : 
tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> + %80 = tt.load %79, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1> + %81 = arith.addf %62, %66 : tensor<64x8xf32, #blocked> + %82 = triton_gpu.convert_layout %81 : (tensor<64x8xf32, #blocked>) -> tensor<64x8xf32, #blocked1> + %83 = arith.subf %82, %70 : tensor<64x8xf32, #blocked1> + %84 = arith.mulf %83, %72 : tensor<64x8xf32, #blocked1> + %85 = arith.mulf %60, %84 : tensor<64x8xf32, #blocked1> + %86 = arith.addf %arg13, %85 : tensor<64x8xf32, #blocked1> + %87 = arith.select %56, %86, %arg13 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1> + %88 = arith.subf %63, %78 : tensor<64x8xf32, #blocked1> + %89 = arith.mulf %88, %80 : tensor<64x8xf32, #blocked1> + %90 = arith.mulf %76, %89 : tensor<64x8xf32, #blocked1> + %91 = arith.addf %arg14, %90 : tensor<64x8xf32, #blocked1> + %92 = arith.select %56, %91, %arg14 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1> + scf.yield %87, %92 : tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1> + } + %35 = "tt.reduce"(%34#0) <{axis = 1 : i32}> ({ + ^bb0(%arg12: f32, %arg13: f32): + %45 = arith.addf %arg12, %arg13 : f32 + tt.reduce.return %45 : f32 + }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %36 = triton_gpu.convert_layout %35 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> + %37 = tt.expand_dims %36 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xf32, #blocked2> + %38 = tt.splat %arg8 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked2> + %39 = tt.addptr %38, %13 : tensor<64x1x!tt.ptr, #blocked2>, tensor<64x1xi32, #blocked2> + tt.store %39, %37 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked2> + %40 = "tt.reduce"(%34#1) <{axis = 1 : i32}> ({ + ^bb0(%arg12: f32, %arg13: f32): + %45 = arith.addf %arg12, %arg13 : f32 + tt.reduce.return %45 : f32 + }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %41 = triton_gpu.convert_layout %40 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> + %42 = tt.expand_dims %41 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xf32, #blocked2> + %43 = tt.splat %arg9 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked2> + %44 = tt.addptr %43, %13 : tensor<64x1x!tt.ptr, #blocked2>, tensor<64x1xi32, #blocked2> + tt.store %44, %42 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked2> + tt.return + } +} diff --git a/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttir b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..05e1ffc2ab9a81f61d71585a164f6a8278e2bd09 --- /dev/null +++ b/.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttir @@ -0,0 +1,100 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: !tt.ptr {tt.divisibility = 16 : i32}, 
%arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16> + %c8_i32 = arith.constant 8 : i32 + %c128_i32 = arith.constant 128 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_0 = arith.constant dense<128> : tensor<64x1xi32> + %cst_1 = arith.constant dense<32768> : tensor<64x1xi32> + %cst_2 = arith.constant dense<256> : tensor<1x8xi32> + %cst_3 = arith.constant dense<128> : tensor<1x8xi32> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> + %cst_5 = arith.constant dense<256> : tensor<64x1xi32> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32> + %8 = arith.remsi %5, %cst_5 : tensor<64x1xi32> + %9 = arith.divsi %5, %cst_5 : tensor<64x1xi32> + %10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %11 = arith.muli %9, %cst_1 : tensor<64x1xi32> + %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %13 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %14 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %15 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %16 = arith.muli %9, %cst_0 : tensor<64x1xi32> + %17 = tt.broadcast %16 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %18 = tt.splat %arg3 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %19 = tt.splat %arg4 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %20 = tt.splat %arg5 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %21 = tt.splat %arg6 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %22 = tt.splat %arg7 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %23:2 = scf.for %arg12 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg13 = %cst_4, %arg14 = %cst_4) -> (tensor<64x8xf32>, tensor<64x8xf32>) : i32 { + %32 = tt.splat %arg12 : (i32) -> tensor<1x8xi32> + %33 = arith.addi %32, %7 : tensor<1x8xi32> + %34 = arith.cmpi slt, %33, %cst_3 : tensor<1x8xi32> + %35 = arith.muli %33, %cst_2 : tensor<1x8xi32> + %36 = tt.broadcast %35 : (tensor<1x8xi32>) -> tensor<64x8xi32> + %37 = arith.addi %10, %36 : tensor<64x8xi32> + %38 = arith.addi %37, %12 : tensor<64x8xi32> + %39 = tt.addptr %13, %38 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %40 = tt.broadcast %34 : (tensor<1x8xi1>) -> tensor<64x8xi1> + %41 = tt.load %39, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16> + %42 = arith.extf %41 : tensor<64x8xbf16> to tensor<64x8xf32> + %43 = tt.addptr %14, %38 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %44 = tt.load %43, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32> + %45 = tt.addptr %15, %38 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %46 = tt.load %45, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16> + %47 = arith.extf %46 : tensor<64x8xbf16> to tensor<64x8xf32> + %48 = tt.broadcast %33 : (tensor<1x8xi32>) -> tensor<64x8xi32> + %49 = arith.addi %48, %17 : tensor<64x8xi32> + %50 = tt.addptr %18, %49 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %51 = tt.load %50, %40, %cst_4 {cache = 1 
: i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32> + %52 = tt.addptr %19, %49 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %53 = tt.load %52, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32> + %54 = tt.addptr %20, %38 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %55 = tt.load %54, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16> + %56 = arith.extf %55 : tensor<64x8xbf16> to tensor<64x8xf32> + %57 = tt.addptr %21, %49 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %58 = tt.load %57, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32> + %59 = tt.addptr %22, %49 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %60 = tt.load %59, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32> + %61 = arith.addf %44, %47 : tensor<64x8xf32> + %62 = arith.subf %61, %51 : tensor<64x8xf32> + %63 = arith.mulf %62, %53 : tensor<64x8xf32> + %64 = arith.mulf %42, %63 : tensor<64x8xf32> + %65 = arith.addf %arg13, %64 : tensor<64x8xf32> + %66 = arith.select %40, %65, %arg13 : tensor<64x8xi1>, tensor<64x8xf32> + %67 = arith.subf %44, %58 : tensor<64x8xf32> + %68 = arith.mulf %67, %60 : tensor<64x8xf32> + %69 = arith.mulf %56, %68 : tensor<64x8xf32> + %70 = arith.addf %arg14, %69 : tensor<64x8xf32> + %71 = arith.select %40, %70, %arg14 : tensor<64x8xi1>, tensor<64x8xf32> + scf.yield %66, %71 : tensor<64x8xf32>, tensor<64x8xf32> + } + %24 = "tt.reduce"(%23#0) <{axis = 1 : i32}> ({ + ^bb0(%arg12: f32, %arg13: f32): + %32 = arith.addf %arg12, %arg13 : f32 + tt.reduce.return %32 : f32 + }) : (tensor<64x8xf32>) -> tensor<64xf32> + %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %26 = tt.splat %arg8 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %27 = tt.addptr %26, %5 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + tt.store %27, %25 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32> + %28 = "tt.reduce"(%23#1) <{axis = 1 : i32}> ({ + ^bb0(%arg12: f32, %arg13: f32): + %32 = arith.addf %arg12, %arg13 : f32 + tt.reduce.return %32 : f32 + }) : (tensor<64x8xf32>) -> tensor<64xf32> + %29 = tt.expand_dims %28 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %30 = tt.splat %arg9 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %31 = tt.addptr %30, %5 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + tt.store %31, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32> + tt.return + } +} diff --git a/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.cubin b/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..117ab3bc119d01990d81a1e6afa178caa55b2500 Binary files /dev/null and b/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.cubin differ diff --git a/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ptx b/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c306878526ea0356721ecc09315514cc1afe3a91 --- /dev/null +++ b/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ptx @@ -0,0 +1,565 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3de4e +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3de4e( + .param .u64 triton__0d1d2d3de4e_param_0, + .param .u64 triton__0d1d2d3de4e_param_1, + .param .u64 triton__0d1d2d3de4e_param_2, + .param .u32 triton__0d1d2d3de4e_param_3, + .param .u32 triton__0d1d2d3de4e_param_4 +) 
+.maxntid 128, 1, 1 +{ + .reg .pred %p<18>; + .reg .b32 %r<92>; + .reg .f32 %f<43>; + .reg .b64 %rd<16>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1d2d3de4e_param_2]; + ld.param.u64 %rd2, [triton__0d1d2d3de4e_param_1]; + ld.param.u64 %rd1, [triton__0d1d2d3de4e_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + shl.b32 %r13, %r1, 2; + and.b32 %r3, %r13, 60; + .loc 1 24 33 + bfe.u32 %r4, %r1, 5, 2; + .loc 1 21 28 + mov.u32 %r11, %ctaid.x; + .loc 1 21 33 + shl.b32 %r5, %r11, 6; + .loc 1 27 36 + shl.b32 %r14, %r4, 18; + shl.b32 %r15, %r1, 13; + and.b32 %r16, %r15, 131072; + or.b32 %r17, %r14, %r16; + add.s32 %r18, %r17, %r5; + or.b32 %r90, %r18, %r3; + mov.f32 %f39, 0f00000000; + mov.b32 %r91, -8; + mov.pred %p1, -1; + mov.f32 %f40, %f39; + mov.f32 %f41, %f39; + mov.f32 %f42, %f39; +$L__BB0_1: + .loc 1 31 34 + mul.wide.s32 %rd5, %r90, 4; + add.s64 %rd4, %rd1, %rd5; + mov.b32 %r23, 0; + .loc 1 31 53 + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + mov.u32 %r22, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r19, %r20, %r21, %r22 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r19, %r23; + @!%p1 mov.u32 %r20, %r23; + @!%p1 mov.u32 %r21, %r23; + @!%p1 mov.u32 %r22, %r23; + mov.b32 %f13, %r19; + mov.b32 %f14, %r20; + mov.b32 %f15, %r21; + mov.b32 %f16, %r22; + .loc 1 34 38 + add.f32 %f42, %f42, %f16; + add.f32 %f41, %f41, %f15; + add.f32 %f40, %f40, %f14; + add.f32 %f39, %f39, %f13; + .loc 1 27 36 + add.s32 %r91, %r91, 8; + add.s32 %r90, %r90, 1048576; + setp.lt.u32 %p6, %r91, 112; + @%p6 bra $L__BB0_1; + .loc 1 22 44 + and.b32 %r45, %r1, 63; + .loc 1 22 23 + or.b32 %r46, %r5, %r45; +$L__tmp1: + .loc 2 243 36 + mov.b32 %r47, %f39; + shfl.sync.bfly.b32 %r48, %r47, 16, 31, -1; + mov.b32 %f17, %r48; +$L__tmp2: + .loc 2 233 15 + add.f32 %f18, %f39, %f17; +$L__tmp3: + .loc 2 243 36 + mov.b32 %r49, %f40; + shfl.sync.bfly.b32 %r50, %r49, 16, 31, -1; + mov.b32 %f19, %r50; +$L__tmp4: + .loc 2 233 15 + add.f32 %f20, %f40, %f19; +$L__tmp5: + .loc 2 243 36 + mov.b32 %r51, %f41; + shfl.sync.bfly.b32 %r52, %r51, 16, 31, -1; + mov.b32 %f21, %r52; +$L__tmp6: + .loc 2 233 15 + add.f32 %f22, %f41, %f21; +$L__tmp7: + .loc 2 243 36 + mov.b32 %r53, %f42; + shfl.sync.bfly.b32 %r54, %r53, 16, 31, -1; + mov.b32 %f23, %r54; +$L__tmp8: + .loc 2 233 15 + add.f32 %f24, %f42, %f23; +$L__tmp9: + .loc 2 243 36 + setp.lt.u32 %p7, %r2, 16; + shl.b32 %r55, %r3, 2; + or.b32 %r56, %r55, %r4; + shl.b32 %r57, %r56, 2; + mov.u32 %r58, global_smem; + add.s32 %r27, %r58, %r57; + mov.b32 %r28, %f18; + @%p7 st.shared.b32 [ %r27 + 0 ], %r28; + shl.b32 %r59, %r4, 2; + shl.b32 %r60, %r3, 4; + or.b32 %r61, %r60, 16; + or.b32 %r62, %r61, %r59; + add.s32 %r29, %r58, %r62; + mov.b32 %r30, %f20; + @%p7 st.shared.b32 [ %r29 + 0 ], %r30; + or.b32 %r63, %r60, 32; + or.b32 %r64, %r63, %r59; + add.s32 %r31, %r58, %r64; + mov.b32 %r32, %f22; + @%p7 st.shared.b32 [ %r31 + 0 ], %r32; + or.b32 %r65, %r60, 48; + or.b32 %r66, %r65, %r59; + add.s32 %r33, %r58, %r66; + mov.b32 %r34, %f24; + @%p7 st.shared.b32 [ %r33 + 0 ], %r34; + bar.sync 0; + setp.lt.s32 %p11, %r1, 256; + add.s32 %r36, %r58, %r13; + @%p11 ld.shared.b32 %r35, [ %r36 + 0 ]; + mov.b32 %f25, %r35; + shfl.sync.bfly.b32 %r68, %r35, 2, 31, -1; + mov.b32 %f26, %r68; +$L__tmp10: + .loc 2 233 15 + add.f32 %f27, %f25, %f26; +$L__tmp11: + .loc 2 243 36 + mov.b32 %r69, %f27; + shfl.sync.bfly.b32 %r70, %r69, 1, 31, -1; + mov.b32 %f28, %r70; +$L__tmp12: + .loc 2 233 15 + add.f32 %f29, %f27, %f28; +$L__tmp13: + .loc 2 243 36 + 
and.b32 %r71, %r1, 3; + setp.eq.s32 %p17, %r71, 0; + and.pred %p12, %p11, %p17; + mov.b32 %r38, %f29; + @%p12 st.shared.b32 [ %r36 + 0 ], %r38; + add.s32 %r40, %r36, 512; + @%p11 ld.shared.b32 %r39, [ %r40 + 0 ]; + mov.b32 %f30, %r39; + shfl.sync.bfly.b32 %r72, %r39, 2, 31, -1; + mov.b32 %f31, %r72; +$L__tmp14: + .loc 2 233 15 + add.f32 %f32, %f30, %f31; +$L__tmp15: + .loc 2 243 36 + mov.b32 %r73, %f32; + shfl.sync.bfly.b32 %r74, %r73, 1, 31, -1; + mov.b32 %f33, %r74; +$L__tmp16: + .loc 2 233 15 + add.f32 %f34, %f32, %f33; +$L__tmp17: + .loc 2 243 36 + mov.b32 %r42, %f34; + @%p12 st.shared.b32 [ %r40 + 0 ], %r42; + bar.sync 0; + add.s32 %r75, %r58, %r60; + ld.shared.f32 %f35, [%r75]; + add.s32 %r76, %r58, %r61; + ld.shared.f32 %f36, [%r76]; + add.s32 %r77, %r58, %r63; + ld.shared.f32 %f37, [%r77]; + add.s32 %r78, %r58, %r65; + ld.shared.f32 %f38, [%r78]; +$L__tmp18: + .loc 1 35 28 + bar.sync 0; + add.s32 %r79, %r58, %r55; + st.shared.f32 [%r79], %f35; + st.shared.f32 [%r79+4], %f36; + st.shared.f32 [%r79+8], %f37; + st.shared.f32 [%r79+12], %f38; + bar.sync 0; + shl.b32 %r80, %r45, 2; + add.s32 %r81, %r58, %r80; + .loc 1 36 20 + shr.s32 %r83, %r46, 31; + shr.u32 %r84, %r83, 24; + add.s32 %r85, %r46, %r84; + shr.s32 %r86, %r85, 8; + and.b32 %r87, %r85, -256; + sub.s32 %r88, %r46, %r87; + .loc 1 38 30 + mul.wide.s32 %rd9, %r86, 8; + add.s64 %rd7, %rd2, %rd9; + .loc 1 45 55 + ld.shared.u32 %r44, [%r81]; + .loc 1 38 35 + mov.u64 %rd6, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd6 }, [ %rd7 + 0 ]; + .loc 1 41 32 + shr.u64 %rd10, %rd6, 54; + and.b64 %rd11, %rd10, 512; + add.s64 %rd12, %rd11, %rd6; + .loc 1 45 30 + shl.b64 %rd13, %rd12, 10; + add.s64 %rd14, %rd3, %rd13; + mul.wide.s32 %rd15, %r88, 4; + add.s64 %rd8, %rd14, %rd15; + .loc 1 45 55 + and.b32 %r89, %r1, 64; + setp.eq.s32 %p16, %r89, 0; + mov.u32 %r43, 0x0; + @%p16 atom.global.gpu.acq_rel.add.f32 %r43, [ %rd8 + 0 ], %r44; + .loc 1 45 4 + ret; +$L__tmp19: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 264 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 105 +.b8 107 +.b8 53 +.b8 118 +.b8 120 +.b8 55 +.b8 112 +.b8 50 +.b8 50 +.b8 102 +.b8 112 +.b8 107 +.b8 52 +.b8 100 +.b8 99 +.b8 118 +.b8 104 +.b8 53 +.b8 53 +.b8 122 +.b8 105 +.b8 109 +.b8 119 +.b8 52 +.b8 116 +.b8 53 +.b8 110 +.b8 114 +.b8 53 +.b8 122 +.b8 110 +.b8 50 +.b8 98 +.b8 55 +.b8 105 +.b8 110 +.b8 117 +.b8 106 +.b8 120 +.b8 106 +.b8 97 +.b8 117 +.b8 120 +.b8 115 +.b8 104 +.b8 108 +.b8 106 +.b8 117 +.b8 109 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 
+.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp18 +.b8 2 +.b8 35 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp17 +.b8 2 +.b8 35 +.b8 25 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp17 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ttir b/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a096e70e65dcd9b1f20c23d9ab8f451813190f0d --- /dev/null +++ b/.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ttir @@ -0,0 +1,61 @@ +module { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi64> + %cst_0 = arith.constant dense<0> : tensor<64x1xi64> + %cst_1 = arith.constant dense<512> : tensor<64x1xi64> + %c8_i32 = arith.constant 8 : i32 + %c120_i32 = arith.constant 120 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_2 = arith.constant dense : tensor<64x1xi1> + %cst_3 = arith.constant dense<256> : tensor<64x1xi32> + %cst_4 = arith.constant dense<131072> : tensor<1x8xi32> + %cst_5 = arith.constant dense<120> : tensor<1x8xi32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32> + %8 = tt.broadcast %5 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %9 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c8_i32 iter_args(%arg6 = %cst_6) -> (tensor<64x8xf32>) : i32 { + %27 = tt.splat %arg5 : (i32) -> tensor<1x8xi32> + %28 = arith.addi %27, %7 : tensor<1x8xi32> + %29 = arith.cmpi slt, 
%28, %cst_5 : tensor<1x8xi32> + %30 = arith.muli %28, %cst_4 : tensor<1x8xi32> + %31 = tt.broadcast %30 : (tensor<1x8xi32>) -> tensor<64x8xi32> + %32 = arith.addi %8, %31 : tensor<64x8xi32> + %33 = tt.addptr %9, %32 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %34 = tt.broadcast %29 : (tensor<1x8xi1>) -> tensor<64x8xi1> + %35 = tt.load %33, %34, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32> + %36 = arith.addf %arg6, %35 : tensor<64x8xf32> + %37 = arith.select %34, %36, %arg6 : tensor<64x8xi1>, tensor<64x8xf32> + scf.yield %37 : tensor<64x8xf32> + } + %11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %27 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %27 : f32 + }) : (tensor<64x8xf32>) -> tensor<64xf32> + %12 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %13 = arith.divsi %5, %cst_3 : tensor<64x1xi32> + %14 = arith.remsi %5, %cst_3 : tensor<64x1xi32> + %15 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64> + %18 = arith.addi %17, %cst_1 : tensor<64x1xi64> + %19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64> + %20 = arith.select %19, %18, %17 : tensor<64x1xi1>, tensor<64x1xi64> + %21 = arith.muli %20, %cst : tensor<64x1xi64> + %22 = arith.extsi %14 : tensor<64x1xi32> to tensor<64x1xi64> + %23 = arith.addi %22, %21 : tensor<64x1xi64> + %24 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr>, tensor<64x1xi64> + %26 = "tt.atomic_rmw"(%25, %12, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr>, tensor<64x1xf32>, tensor<64x1xi1>) -> tensor<64x1xf32> + tt.return + } +} diff --git a/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.cubin b/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..8a3008b977645808ceea005075da47d6868cc967 Binary files /dev/null and b/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.cubin differ diff --git a/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.llir b/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..d3644c1c357e43b8cc162f9e4b1c53ba0d208e6d --- /dev/null +++ b/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.llir @@ -0,0 +1,41 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 { + %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %4 = shl i32 %3, 1, !dbg !8 + %5 = and i32 %4, 510, !dbg !8 + %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9 + %7 = shl i32 %6, 9, !dbg !10 + %8 = or i32 %7, %5, !dbg !11 + %9 = sext i32 %8 to i64, !dbg !12 + %10 = getelementptr float, ptr addrspace(1) %0, i64 %9, !dbg !12 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 0, i32 0, ptr addrspace(1) %10, i1 true) #1, !dbg !13 + ret void, !dbg !14 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} 
+!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c7w5r66fcggm6aokktzwmg24mlevq2hqdw2bgwzwlovrel6re5ym.py", directory: "/tmp/torchinductor_root/7w") +!3 = !{ptr @triton__0d1de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1de, !"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 36, scope: !5) +!9 = !DILocation(line: 20, column: 28, scope: !5) +!10 = !DILocation(line: 20, column: 33, scope: !5) +!11 = !DILocation(line: 21, column: 23, scope: !5) +!12 = !DILocation(line: 25, column: 25, scope: !5) +!13 = !DILocation(line: 25, column: 36, scope: !5) +!14 = !DILocation(line: 25, column: 4, scope: !5) diff --git a/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ptx b/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c92d2b0f18bdf137b5cad38e4aad7ac273105aa3 --- /dev/null +++ b/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ptx @@ -0,0 +1,277 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1de + +.visible .entry triton__0d1de( + .param .u64 triton__0d1de_param_0, + .param .u32 triton__0d1de_param_1 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<2>; + .reg .b32 %r<9>; + .reg .b64 %rd<4>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd2, [triton__0d1de_param_0]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 510; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r7, %r1, 9; + .loc 1 21 23 + or.b32 %r8, %r7, %r6; + .loc 1 25 25 + mul.wide.s32 %rd3, %r8, 4; + add.s64 %rd1, %rd2, %rd3; + mov.b32 %r2, 0; + mov.pred %p1, -1; + .loc 1 25 36 + @%p1 st.global.v2.b32 [ %rd1 + 0 ], { %r2, %r2 }; + .loc 1 25 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/7w/c7w5r66fcggm6aokktzwmg24mlevq2hqdw2bgwzwlovrel6re5ym.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 172 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 55 +.b8 119 +.b8 53 +.b8 114 +.b8 54 +.b8 54 +.b8 102 +.b8 99 +.b8 103 +.b8 103 +.b8 109 +.b8 54 +.b8 97 +.b8 111 +.b8 107 +.b8 107 +.b8 116 +.b8 122 +.b8 119 +.b8 109 +.b8 103 +.b8 50 +.b8 52 +.b8 109 +.b8 108 +.b8 101 +.b8 118 +.b8 113 +.b8 50 +.b8 104 +.b8 113 +.b8 100 +.b8 119 +.b8 50 +.b8 98 +.b8 103 +.b8 119 +.b8 122 +.b8 119 +.b8 108 +.b8 111 +.b8 118 +.b8 114 +.b8 101 +.b8 108 +.b8 54 +.b8 114 +.b8 101 +.b8 53 +.b8 121 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 
+.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 55 +.b8 119 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 176 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 176 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ttir b/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..111c5b31270e5035c994bf5be1773b8aab61a67b --- /dev/null +++ b/.triton/dump/ac249789b41c99e39c165fc12afa9269/triton_.ttir @@ -0,0 +1,15 @@ +module { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<512xf32> + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> + %3 = tt.splat %1 : (i32) -> tensor<512xi32> + %4 = arith.addi %3, %2 : tensor<512xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32> + tt.return + } +} diff --git a/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.cubin b/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..6a260c6b9779c130896a7c5918c9a7c4adcece55 Binary files /dev/null and b/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.cubin differ diff --git a/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.llir b/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..db627a837fd48f8183df2733ea2b1170f019cd95 --- /dev/null +++ b/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.llir @@ -0,0 +1,593 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %11 = lshr i32 %10, 5, !dbg !8 + %urem = and i32 %10, 255, !dbg !9 + %12 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 + %13 = sext i32 %12 to i64, !dbg !11 + %14 = shl nsw i64 %13, 3, !dbg !12 + %15 = or i64 %14, 1, !dbg !13 + %16 = or i64 %14, 2, !dbg !13 + %17 = or i64 %14, 3, !dbg !13 + %18 = or i64 %14, 4, !dbg !13 + %19 = or i64 %14, 5, !dbg !13 + %20 = or 
i64 %14, 6, !dbg !13 + %21 = or i64 %14, 7, !dbg !13 + %22 = getelementptr i64, ptr addrspace(1) %1, i64 %14, !dbg !14 + %23 = getelementptr i64, ptr addrspace(1) %1, i64 %15, !dbg !14 + %24 = getelementptr i64, ptr addrspace(1) %1, i64 %16, !dbg !14 + %25 = getelementptr i64, ptr addrspace(1) %1, i64 %17, !dbg !14 + %26 = getelementptr i64, ptr addrspace(1) %1, i64 %18, !dbg !14 + %27 = getelementptr i64, ptr addrspace(1) %1, i64 %19, !dbg !14 + %28 = getelementptr i64, ptr addrspace(1) %1, i64 %20, !dbg !14 + %29 = getelementptr i64, ptr addrspace(1) %1, i64 %21, !dbg !14 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #3, !dbg !15 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #3, !dbg !15 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #3, !dbg !15 + %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #3, !dbg !15 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #3, !dbg !15 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #3, !dbg !15 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %28, i1 true) #3, !dbg !15 + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %29, i1 true) #3, !dbg !15 + %38 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !16 + %39 = bitcast i32 %38 to float, !dbg !16 + %40 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !17 + %41 = bitcast i32 %40 to float, !dbg !17 + %42 = mul nsw i64 %13, 402056, !dbg !18 + %43 = mul nsw i64 %15, 50257, !dbg !18 + %44 = mul nsw i64 %16, 50257, !dbg !18 + %45 = mul nsw i64 %17, 50257, !dbg !18 + %46 = mul nsw i64 %18, 50257, !dbg !18 + %47 = mul nsw i64 %19, 50257, !dbg !18 + %48 = mul nsw i64 %20, 50257, !dbg !18 + %49 = mul nsw i64 %21, 50257, !dbg !18 + %50 = insertelement <8 x i64> poison, i64 %30, i64 0, !dbg !19 + %51 = insertelement <8 x i64> %50, i64 %31, i64 1, !dbg !19 + %52 = insertelement <8 x i64> %51, i64 %32, i64 2, !dbg !19 + %53 = insertelement <8 x i64> %52, i64 %33, i64 3, !dbg !19 + %54 = insertelement <8 x i64> %53, i64 %34, i64 4, !dbg !19 + %55 = insertelement <8 x i64> %54, i64 %35, i64 5, !dbg !19 + %56 = insertelement <8 x i64> %55, i64 %36, i64 6, !dbg !19 + %57 = insertelement <8 x i64> %56, i64 %37, i64 7, !dbg !19 + %58 = icmp eq <8 x i64> %57, , !dbg !19 + %59 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %39, float %41) #3, !dbg !20 + %60 = insertelement <8 x float> poison, float %59, i64 0, !dbg !21 + %61 = shufflevector <8 x float> %60, <8 x float> poison, <8 x i32> zeroinitializer, !dbg !21 + %62 = select <8 x i1> %58, <8 x float> zeroinitializer, <8 x float> %61, !dbg !21 + %63 = getelementptr 
float, ptr addrspace(1) %0, i64 %42 + %64 = getelementptr float, ptr addrspace(1) %0, i64 %43 + %65 = getelementptr float, ptr addrspace(1) %0, i64 %44 + %66 = getelementptr float, ptr addrspace(1) %0, i64 %45 + %67 = getelementptr float, ptr addrspace(1) %0, i64 %46 + %68 = getelementptr float, ptr addrspace(1) %0, i64 %47 + %69 = getelementptr float, ptr addrspace(1) %0, i64 %48 + %70 = getelementptr float, ptr addrspace(1) %0, i64 %49 + br label %71, !dbg !22 + +71: ; preds = %9, %71 + %72 = phi i32 [ 0, %9 ], [ %107, %71 ] + %73 = phi <8 x float> [ zeroinitializer, %9 ], [ %106, %71 ] + %74 = or i32 %72, %urem, !dbg !23 + %75 = zext nneg i32 %74 to i64, !dbg !23 + %76 = icmp ult i32 %74, 50257, !dbg !24 + %77 = getelementptr float, ptr addrspace(1) %63, i64 %75, !dbg !25 + %78 = getelementptr float, ptr addrspace(1) %64, i64 %75, !dbg !25 + %79 = getelementptr float, ptr addrspace(1) %65, i64 %75, !dbg !25 + %80 = getelementptr float, ptr addrspace(1) %66, i64 %75, !dbg !25 + %81 = getelementptr float, ptr addrspace(1) %67, i64 %75, !dbg !25 + %82 = getelementptr float, ptr addrspace(1) %68, i64 %75, !dbg !25 + %83 = getelementptr float, ptr addrspace(1) %69, i64 %75, !dbg !25 + %84 = getelementptr float, ptr addrspace(1) %70, i64 %75, !dbg !25 + %85 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %77, i1 %76, i32 0, i1 %76) #3, !dbg !26 + %86 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %78, i1 %76, i32 0, i1 %76) #3, !dbg !26 + %87 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %79, i1 %76, i32 0, i1 %76) #3, !dbg !26 + %88 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %80, i1 %76, i32 0, i1 %76) #3, !dbg !26 + %89 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %81, i1 %76, i32 0, i1 %76) #3, !dbg !26 + %90 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %82, i1 %76, i32 0, i1 %76) #3, !dbg !26 + %91 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %83, i1 %76, i32 0, i1 %76) #3, !dbg !26 + %92 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %84, i1 %76, i32 0, i1 %76) #3, !dbg !26 + %93 = insertelement <8 x i32> poison, i32 %85, i64 0, !dbg !26 + %94 = insertelement <8 x i32> %93, i32 %86, i64 1, !dbg !26 + %95 = insertelement <8 x i32> %94, i32 %87, i64 2, !dbg !26 + %96 = insertelement <8 x i32> %95, i32 %88, i64 3, !dbg !26 + %97 = insertelement <8 x i32> %96, i32 %89, i64 4, !dbg !26 + %98 = insertelement <8 x i32> %97, i32 %90, i64 5, !dbg !26 + %99 = insertelement <8 x i32> %98, i32 %91, i64 6, !dbg !26 + %100 = insertelement <8 x i32> %99, i32 %92, i64 7, !dbg !26 + %101 = bitcast <8 x i32> %100 to <8 x float>, !dbg !26 + %102 = fmul <8 x float> %62, 
%101, !dbg !27 + %103 = insertelement <8 x i1> poison, i1 %76, i64 0, !dbg !28 + %104 = shufflevector <8 x i1> %103, <8 x i1> poison, <8 x i32> zeroinitializer, !dbg !28 + %105 = select <8 x i1> %104, <8 x float> %102, <8 x float> , !dbg !28 + %106 = fadd <8 x float> %73, %105, !dbg !28 + %107 = add nuw nsw i32 %72, 256, !dbg !22 + %108 = icmp ult i32 %72, 50001, !dbg !22 + br i1 %108, label %71, label %109, !dbg !22 + +109: ; preds = %71 + %110 = and i32 %10, 31, !dbg !8 + %111 = and i32 %11, 7, !dbg !9 + %112 = extractelement <8 x float> %106, i64 0, !dbg !29 + %113 = bitcast float %112 to i32, !dbg !29 + %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 16, i32 31), !dbg !29 + %115 = bitcast i32 %114 to float, !dbg !29 + %116 = fadd float %112, %115, !dbg !33 + %117 = bitcast float %116 to i32, !dbg !29 + %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 8, i32 31), !dbg !29 + %119 = bitcast i32 %118 to float, !dbg !29 + %120 = fadd float %116, %119, !dbg !33 + %121 = bitcast float %120 to i32, !dbg !29 + %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 4, i32 31), !dbg !29 + %123 = bitcast i32 %122 to float, !dbg !29 + %124 = fadd float %120, %123, !dbg !33 + %125 = bitcast float %124 to i32, !dbg !29 + %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 2, i32 31), !dbg !29 + %127 = bitcast i32 %126 to float, !dbg !29 + %128 = fadd float %124, %127, !dbg !33 + %129 = bitcast float %128 to i32, !dbg !29 + %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 1, i32 31), !dbg !29 + %131 = bitcast i32 %130 to float, !dbg !29 + %132 = fadd float %128, %131, !dbg !33 + %133 = extractelement <8 x float> %106, i64 1, !dbg !29 + %134 = bitcast float %133 to i32, !dbg !29 + %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 16, i32 31), !dbg !29 + %136 = bitcast i32 %135 to float, !dbg !29 + %137 = fadd float %133, %136, !dbg !33 + %138 = bitcast float %137 to i32, !dbg !29 + %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 8, i32 31), !dbg !29 + %140 = bitcast i32 %139 to float, !dbg !29 + %141 = fadd float %137, %140, !dbg !33 + %142 = bitcast float %141 to i32, !dbg !29 + %143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 4, i32 31), !dbg !29 + %144 = bitcast i32 %143 to float, !dbg !29 + %145 = fadd float %141, %144, !dbg !33 + %146 = bitcast float %145 to i32, !dbg !29 + %147 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %146, i32 2, i32 31), !dbg !29 + %148 = bitcast i32 %147 to float, !dbg !29 + %149 = fadd float %145, %148, !dbg !33 + %150 = bitcast float %149 to i32, !dbg !29 + %151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 1, i32 31), !dbg !29 + %152 = bitcast i32 %151 to float, !dbg !29 + %153 = fadd float %149, %152, !dbg !33 + %154 = extractelement <8 x float> %106, i64 2, !dbg !29 + %155 = bitcast float %154 to i32, !dbg !29 + %156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 16, i32 31), !dbg !29 + %157 = bitcast i32 %156 to float, !dbg !29 + %158 = fadd float %154, %157, !dbg !33 + %159 = bitcast float %158 to i32, !dbg !29 + %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 8, i32 31), !dbg !29 + %161 = bitcast i32 %160 to float, !dbg !29 + %162 = fadd float %158, %161, !dbg !33 + %163 = bitcast float %162 to i32, !dbg !29 + %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 4, i32 31), !dbg !29 + 
%165 = bitcast i32 %164 to float, !dbg !29 + %166 = fadd float %162, %165, !dbg !33 + %167 = bitcast float %166 to i32, !dbg !29 + %168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 2, i32 31), !dbg !29 + %169 = bitcast i32 %168 to float, !dbg !29 + %170 = fadd float %166, %169, !dbg !33 + %171 = bitcast float %170 to i32, !dbg !29 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 1, i32 31), !dbg !29 + %173 = bitcast i32 %172 to float, !dbg !29 + %174 = fadd float %170, %173, !dbg !33 + %175 = extractelement <8 x float> %106, i64 3, !dbg !29 + %176 = bitcast float %175 to i32, !dbg !29 + %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 16, i32 31), !dbg !29 + %178 = bitcast i32 %177 to float, !dbg !29 + %179 = fadd float %175, %178, !dbg !33 + %180 = bitcast float %179 to i32, !dbg !29 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !29 + %182 = bitcast i32 %181 to float, !dbg !29 + %183 = fadd float %179, %182, !dbg !33 + %184 = bitcast float %183 to i32, !dbg !29 + %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 4, i32 31), !dbg !29 + %186 = bitcast i32 %185 to float, !dbg !29 + %187 = fadd float %183, %186, !dbg !33 + %188 = bitcast float %187 to i32, !dbg !29 + %189 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %188, i32 2, i32 31), !dbg !29 + %190 = bitcast i32 %189 to float, !dbg !29 + %191 = fadd float %187, %190, !dbg !33 + %192 = bitcast float %191 to i32, !dbg !29 + %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !29 + %194 = bitcast i32 %193 to float, !dbg !29 + %195 = fadd float %191, %194, !dbg !33 + %196 = extractelement <8 x float> %106, i64 4, !dbg !29 + %197 = bitcast float %196 to i32, !dbg !29 + %198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 16, i32 31), !dbg !29 + %199 = bitcast i32 %198 to float, !dbg !29 + %200 = fadd float %196, %199, !dbg !33 + %201 = bitcast float %200 to i32, !dbg !29 + %202 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 8, i32 31), !dbg !29 + %203 = bitcast i32 %202 to float, !dbg !29 + %204 = fadd float %200, %203, !dbg !33 + %205 = bitcast float %204 to i32, !dbg !29 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 4, i32 31), !dbg !29 + %207 = bitcast i32 %206 to float, !dbg !29 + %208 = fadd float %204, %207, !dbg !33 + %209 = bitcast float %208 to i32, !dbg !29 + %210 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %209, i32 2, i32 31), !dbg !29 + %211 = bitcast i32 %210 to float, !dbg !29 + %212 = fadd float %208, %211, !dbg !33 + %213 = bitcast float %212 to i32, !dbg !29 + %214 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 1, i32 31), !dbg !29 + %215 = bitcast i32 %214 to float, !dbg !29 + %216 = fadd float %212, %215, !dbg !33 + %217 = extractelement <8 x float> %106, i64 5, !dbg !29 + %218 = bitcast float %217 to i32, !dbg !29 + %219 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %218, i32 16, i32 31), !dbg !29 + %220 = bitcast i32 %219 to float, !dbg !29 + %221 = fadd float %217, %220, !dbg !33 + %222 = bitcast float %221 to i32, !dbg !29 + %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 8, i32 31), !dbg !29 + %224 = bitcast i32 %223 to float, !dbg !29 + %225 = fadd float %221, %224, !dbg !33 + %226 = bitcast float %225 to i32, !dbg !29 + %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, 
i32 4, i32 31), !dbg !29 + %228 = bitcast i32 %227 to float, !dbg !29 + %229 = fadd float %225, %228, !dbg !33 + %230 = bitcast float %229 to i32, !dbg !29 + %231 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %230, i32 2, i32 31), !dbg !29 + %232 = bitcast i32 %231 to float, !dbg !29 + %233 = fadd float %229, %232, !dbg !33 + %234 = bitcast float %233 to i32, !dbg !29 + %235 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %234, i32 1, i32 31), !dbg !29 + %236 = bitcast i32 %235 to float, !dbg !29 + %237 = fadd float %233, %236, !dbg !33 + %238 = extractelement <8 x float> %106, i64 6, !dbg !29 + %239 = bitcast float %238 to i32, !dbg !29 + %240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 16, i32 31), !dbg !29 + %241 = bitcast i32 %240 to float, !dbg !29 + %242 = fadd float %238, %241, !dbg !33 + %243 = bitcast float %242 to i32, !dbg !29 + %244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 8, i32 31), !dbg !29 + %245 = bitcast i32 %244 to float, !dbg !29 + %246 = fadd float %242, %245, !dbg !33 + %247 = bitcast float %246 to i32, !dbg !29 + %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 4, i32 31), !dbg !29 + %249 = bitcast i32 %248 to float, !dbg !29 + %250 = fadd float %246, %249, !dbg !33 + %251 = bitcast float %250 to i32, !dbg !29 + %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %251, i32 2, i32 31), !dbg !29 + %253 = bitcast i32 %252 to float, !dbg !29 + %254 = fadd float %250, %253, !dbg !33 + %255 = bitcast float %254 to i32, !dbg !29 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %255, i32 1, i32 31), !dbg !29 + %257 = bitcast i32 %256 to float, !dbg !29 + %258 = fadd float %254, %257, !dbg !33 + %259 = extractelement <8 x float> %106, i64 7, !dbg !29 + %260 = bitcast float %259 to i32, !dbg !29 + %261 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %260, i32 16, i32 31), !dbg !29 + %262 = bitcast i32 %261 to float, !dbg !29 + %263 = fadd float %259, %262, !dbg !33 + %264 = bitcast float %263 to i32, !dbg !29 + %265 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %264, i32 8, i32 31), !dbg !29 + %266 = bitcast i32 %265 to float, !dbg !29 + %267 = fadd float %263, %266, !dbg !33 + %268 = bitcast float %267 to i32, !dbg !29 + %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 4, i32 31), !dbg !29 + %270 = bitcast i32 %269 to float, !dbg !29 + %271 = fadd float %267, %270, !dbg !33 + %272 = bitcast float %271 to i32, !dbg !29 + %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 2, i32 31), !dbg !29 + %274 = bitcast i32 %273 to float, !dbg !29 + %275 = fadd float %271, %274, !dbg !33 + %276 = bitcast float %275 to i32, !dbg !29 + %277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 1, i32 31), !dbg !29 + %278 = bitcast i32 %277 to float, !dbg !29 + %279 = fadd float %275, %278, !dbg !33 + %280 = icmp eq i32 %110, 0, !dbg !29 + %281 = zext nneg i32 %111 to i64, !dbg !29 + %282 = getelementptr float, ptr addrspace(3) @global_smem, i64 %281, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %282, float %132, i1 %280) #3, !dbg !29 + %283 = or i32 %111, 8, !dbg !29 + %284 = zext nneg i32 %283 to i64, !dbg !29 + %285 = getelementptr float, ptr addrspace(3) @global_smem, i64 %284, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %285, float %153, i1 %280) #3, !dbg !29 + %286 = 
or i32 %111, 16, !dbg !29 + %287 = zext nneg i32 %286 to i64, !dbg !29 + %288 = getelementptr float, ptr addrspace(3) @global_smem, i64 %287, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %288, float %174, i1 %280) #3, !dbg !29 + %289 = or i32 %111, 24, !dbg !29 + %290 = zext nneg i32 %289 to i64, !dbg !29 + %291 = getelementptr float, ptr addrspace(3) @global_smem, i64 %290, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %291, float %195, i1 %280) #3, !dbg !29 + %292 = or i32 %111, 32, !dbg !29 + %293 = zext nneg i32 %292 to i64, !dbg !29 + %294 = getelementptr float, ptr addrspace(3) @global_smem, i64 %293, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %294, float %216, i1 %280) #3, !dbg !29 + %295 = or i32 %111, 40, !dbg !29 + %296 = zext nneg i32 %295 to i64, !dbg !29 + %297 = getelementptr float, ptr addrspace(3) @global_smem, i64 %296, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %297, float %237, i1 %280) #3, !dbg !29 + %298 = or i32 %111, 48, !dbg !29 + %299 = zext nneg i32 %298 to i64, !dbg !29 + %300 = getelementptr float, ptr addrspace(3) @global_smem, i64 %299, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %300, float %258, i1 %280) #3, !dbg !29 + %301 = or i32 %111, 56, !dbg !29 + %302 = zext nneg i32 %301 to i64, !dbg !29 + %303 = getelementptr float, ptr addrspace(3) @global_smem, i64 %302, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %303, float %279, i1 %280) #3, !dbg !29 + tail call void @llvm.nvvm.barrier0(), !dbg !29 + %304 = icmp slt i32 %10, 64, !dbg !29 + %305 = sext i32 %10 to i64, !dbg !29 + %306 = getelementptr float, ptr addrspace(3) @global_smem, i64 %305, !dbg !29 + %307 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %306, i1 %304) #3, !dbg !29 + %308 = bitcast float %307 to i32, !dbg !29 + %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 4, i32 31), !dbg !29 + %310 = bitcast i32 %309 to float, !dbg !29 + %311 = fadd float %307, %310, !dbg !33 + %312 = bitcast float %311 to i32, !dbg !29 + %313 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %312, i32 2, i32 31), !dbg !29 + %314 = bitcast i32 %313 to float, !dbg !29 + %315 = fadd float %311, %314, !dbg !33 + %316 = bitcast float %315 to i32, !dbg !29 + %317 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %316, i32 1, i32 31), !dbg !29 + %318 = bitcast i32 %317 to float, !dbg !29 + %319 = fadd float %315, %318, !dbg !33 + %320 = and i32 %10, 7, !dbg !29 + %321 = icmp eq i32 %320, 0, !dbg !29 + %322 = and i1 %304, %321, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %306, float %319, i1 %322) #3, !dbg !29 + tail call void @llvm.nvvm.barrier0(), !dbg !29 + %323 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !29 + %324 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), align 4, !dbg !29 + %325 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 64), align 4, !dbg !29 + %326 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 96), align 4, !dbg !29 + %327 = load float, ptr 
addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 128), align 4, !dbg !29 + %328 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 160), align 4, !dbg !29 + %329 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 192), align 4, !dbg !29 + %330 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 224), align 4, !dbg !29 + %331 = extractelement <8 x float> %62, i64 0, !dbg !37 + %332 = extractelement <8 x float> %62, i64 1, !dbg !37 + %333 = extractelement <8 x float> %62, i64 2, !dbg !37 + %334 = extractelement <8 x float> %62, i64 3, !dbg !37 + %335 = extractelement <8 x float> %62, i64 4, !dbg !37 + %336 = extractelement <8 x float> %62, i64 5, !dbg !37 + %337 = extractelement <8 x float> %62, i64 6, !dbg !37 + %338 = extractelement <8 x float> %62, i64 7, !dbg !37 + br label %339, !dbg !38 + +339: ; preds = %109, %339 + %340 = phi i32 [ 0, %109 ], [ %488, %339 ] + %341 = or i32 %340, %urem, !dbg !39 + %342 = zext nneg i32 %341 to i64, !dbg !39 + %343 = icmp ult i32 %341, 50257, !dbg !40 + %344 = add nsw i64 %42, %342, !dbg !41 + %345 = add nsw i64 %43, %342, !dbg !41 + %346 = add nsw i64 %44, %342, !dbg !41 + %347 = add nsw i64 %45, %342, !dbg !41 + %348 = add nsw i64 %46, %342, !dbg !41 + %349 = add nsw i64 %47, %342, !dbg !41 + %350 = add nsw i64 %48, %342, !dbg !41 + %351 = add nsw i64 %49, %342, !dbg !41 + %352 = getelementptr i16, ptr addrspace(1) %4, i64 %344, !dbg !42 + %353 = getelementptr i16, ptr addrspace(1) %4, i64 %345, !dbg !42 + %354 = getelementptr i16, ptr addrspace(1) %4, i64 %346, !dbg !42 + %355 = getelementptr i16, ptr addrspace(1) %4, i64 %347, !dbg !42 + %356 = getelementptr i16, ptr addrspace(1) %4, i64 %348, !dbg !42 + %357 = getelementptr i16, ptr addrspace(1) %4, i64 %349, !dbg !42 + %358 = getelementptr i16, ptr addrspace(1) %4, i64 %350, !dbg !42 + %359 = getelementptr i16, ptr addrspace(1) %4, i64 %351, !dbg !42 + %360 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %352, i1 %343, i16 0, i1 %343) #3, !dbg !43 + %361 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %353, i1 %343, i16 0, i1 %343) #3, !dbg !43 + %362 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %354, i1 %343, i16 0, i1 %343) #3, !dbg !43 + %363 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %355, i1 %343, i16 0, i1 %343) #3, !dbg !43 + %364 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %356, i1 %343, i16 0, i1 %343) #3, !dbg !43 + %365 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %357, i1 %343, i16 0, i1 %343) #3, !dbg !43 + %366 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %358, i1 %343, i16 0, i1 %343) 
#3, !dbg !43 + %367 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %359, i1 %343, i16 0, i1 %343) #3, !dbg !43 + %368 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %360) #3, !dbg !44 + %369 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %361) #3, !dbg !44 + %370 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %362) #3, !dbg !44 + %371 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %363) #3, !dbg !44 + %372 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %364) #3, !dbg !44 + %373 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %365) #3, !dbg !44 + %374 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %366) #3, !dbg !44 + %375 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %367) #3, !dbg !44 + %376 = getelementptr float, ptr addrspace(1) %0, i64 %344, !dbg !45 + %377 = getelementptr float, ptr addrspace(1) %0, i64 %345, !dbg !45 + %378 = getelementptr float, ptr addrspace(1) %0, i64 %346, !dbg !45 + %379 = getelementptr float, ptr addrspace(1) %0, i64 %347, !dbg !45 + %380 = getelementptr float, ptr addrspace(1) %0, i64 %348, !dbg !45 + %381 = getelementptr float, ptr addrspace(1) %0, i64 %349, !dbg !45 + %382 = getelementptr float, ptr addrspace(1) %0, i64 %350, !dbg !45 + %383 = getelementptr float, ptr addrspace(1) %0, i64 %351, !dbg !45 + %384 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %376, i1 %343, i32 0, i1 %343) #3, !dbg !46 + %385 = bitcast i32 %384 to float, !dbg !46 + %386 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %377, i1 %343, i32 0, i1 %343) #3, !dbg !46 + %387 = bitcast i32 %386 to float, !dbg !46 + %388 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %378, i1 %343, i32 0, i1 %343) #3, !dbg !46 + %389 = bitcast i32 %388 to float, !dbg !46 + %390 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %379, i1 %343, i32 0, i1 %343) #3, !dbg !46 + %391 = bitcast i32 %390 to float, !dbg !46 + %392 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %380, i1 %343, i32 0, i1 %343) #3, !dbg !46 + %393 = bitcast i32 %392 to float, !dbg !46 + %394 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %381, i1 %343, i32 0, i1 %343) #3, !dbg !46 + %395 = bitcast i32 %394 to float, !dbg !46 + %396 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %382, i1 %343, i32 0, i1 %343) #3, !dbg !46 + %397 = bitcast i32 %396 to float, !dbg !46 + %398 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %383, i1 %343, i32 0, i1 %343) #3, !dbg !46 + %399 = bitcast i32 %398 to float, !dbg !46 + 
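; Each iteration of this loop gathers three operand streams at the same per-row offsets %344..%351, every access predicated on %343 (the col < 50257 bounds mask): bf16 values behind %4, widened by the cvt.f32.bf16 calls above, the f32 values behind %0 loaded just above, and a second bf16 stream behind %5 below. +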
%400 = getelementptr i16, ptr addrspace(1) %5, i64 %344, !dbg !47 + %401 = getelementptr i16, ptr addrspace(1) %5, i64 %345, !dbg !47 + %402 = getelementptr i16, ptr addrspace(1) %5, i64 %346, !dbg !47 + %403 = getelementptr i16, ptr addrspace(1) %5, i64 %347, !dbg !47 + %404 = getelementptr i16, ptr addrspace(1) %5, i64 %348, !dbg !47 + %405 = getelementptr i16, ptr addrspace(1) %5, i64 %349, !dbg !47 + %406 = getelementptr i16, ptr addrspace(1) %5, i64 %350, !dbg !47 + %407 = getelementptr i16, ptr addrspace(1) %5, i64 %351, !dbg !47 + %408 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %400, i1 %343, i16 0, i1 %343) #3, !dbg !48 + %409 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %401, i1 %343, i16 0, i1 %343) #3, !dbg !48 + %410 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %402, i1 %343, i16 0, i1 %343) #3, !dbg !48 + %411 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %403, i1 %343, i16 0, i1 %343) #3, !dbg !48 + %412 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %404, i1 %343, i16 0, i1 %343) #3, !dbg !48 + %413 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %405, i1 %343, i16 0, i1 %343) #3, !dbg !48 + %414 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %406, i1 %343, i16 0, i1 %343) #3, !dbg !48 + %415 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %407, i1 %343, i16 0, i1 %343) #3, !dbg !48 + %416 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %408) #3, !dbg !49 + %417 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %409) #3, !dbg !49 + %418 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %410) #3, !dbg !49 + %419 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %411) #3, !dbg !49 + %420 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %412) #3, !dbg !49 + %421 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %413) #3, !dbg !49 + %422 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %414) #3, !dbg !49 + %423 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %415) #3, !dbg !49 + %424 = fmul float %331, %385, !dbg !37 + %425 = fmul float %332, %387, !dbg !37 + %426 = fmul float %333, %389, !dbg !37 + %427 = fmul float %334, %391, !dbg !37 + %428 = fmul float %335, %393, !dbg !37 + %429 = fmul float %336, %395, !dbg !37 + %430 = fmul float %337, %397, !dbg !37 + %431 = fmul float %338, %399, !dbg !37 + %432 = fmul float %416, 0x3FF7154760000000, !dbg !50 + %433 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %432) #3, !dbg !50 + %434 = fmul float %417, 0x3FF7154760000000, !dbg !50 + %435 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %434) #3, !dbg !50 
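+ ; exp(x) is implemented as ex2.approx(x * log2 e): 0x3FF7154760000000 is LLVM's hexadecimal-double spelling of 1.44269502 (log2 e rounded to f32), and the multiply/ex2 pair repeats for all eight lanes of the tile.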
+ %436 = fmul float %418, 0x3FF7154760000000, !dbg !50 + %437 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %436) #3, !dbg !50 + %438 = fmul float %419, 0x3FF7154760000000, !dbg !50 + %439 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %438) #3, !dbg !50 + %440 = fmul float %420, 0x3FF7154760000000, !dbg !50 + %441 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %440) #3, !dbg !50 + %442 = fmul float %421, 0x3FF7154760000000, !dbg !50 + %443 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %442) #3, !dbg !50 + %444 = fmul float %422, 0x3FF7154760000000, !dbg !50 + %445 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %444) #3, !dbg !50 + %446 = fmul float %423, 0x3FF7154760000000, !dbg !50 + %447 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %446) #3, !dbg !50 + %448 = fmul float %323, %433, !dbg !51 + %449 = fmul float %324, %435, !dbg !51 + %450 = fmul float %325, %437, !dbg !51 + %451 = fmul float %326, %439, !dbg !51 + %452 = fmul float %327, %441, !dbg !51 + %453 = fmul float %328, %443, !dbg !51 + %454 = fmul float %329, %445, !dbg !51 + %455 = fmul float %330, %447, !dbg !51 + %456 = fsub float %424, %448, !dbg !52 + %457 = fsub float %425, %449, !dbg !52 + %458 = fsub float %426, %450, !dbg !52 + %459 = fsub float %427, %451, !dbg !52 + %460 = fsub float %428, %452, !dbg !52 + %461 = fsub float %429, %453, !dbg !52 + %462 = fsub float %430, %454, !dbg !52 + %463 = fsub float %431, %455, !dbg !52 + %464 = fadd float %368, %456, !dbg !53 + %465 = fadd float %369, %457, !dbg !53 + %466 = fadd float %370, %458, !dbg !53 + %467 = fadd float %371, %459, !dbg !53 + %468 = fadd float %372, %460, !dbg !53 + %469 = fadd float %373, %461, !dbg !53 + %470 = fadd float %374, %462, !dbg !53 + %471 = fadd float %375, %463, !dbg !53 + %472 = getelementptr i16, ptr addrspace(1) %6, i64 %344, !dbg !54 + %473 = getelementptr i16, ptr addrspace(1) %6, i64 %345, !dbg !54 + %474 = getelementptr i16, ptr addrspace(1) %6, i64 %346, !dbg !54 + %475 = getelementptr i16, ptr addrspace(1) %6, i64 %347, !dbg !54 + %476 = getelementptr i16, ptr addrspace(1) %6, i64 %348, !dbg !54 + %477 = getelementptr i16, ptr addrspace(1) %6, i64 %349, !dbg !54 + %478 = getelementptr i16, ptr addrspace(1) %6, i64 %350, !dbg !54 + %479 = getelementptr i16, ptr addrspace(1) %6, i64 %351, !dbg !54 + %480 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %464) #3, !dbg !55 + %481 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %465) #3, !dbg !55 + %482 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %466) #3, !dbg !55 + %483 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %467) #3, !dbg !55 + %484 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %468) #3, !dbg !55 + %485 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %469) #3, !dbg !55 + %486 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %470) #3, !dbg !55 + %487 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %471) #3, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %480, ptr addrspace(1) %472, i1 %343) #3, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %481, ptr addrspace(1) %473, i1 %343) #3, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %482, ptr addrspace(1) %474, i1 %343) #3, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", 
"c,l,b"(i16 %483, ptr addrspace(1) %475, i1 %343) #3, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %484, ptr addrspace(1) %476, i1 %343) #3, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %485, ptr addrspace(1) %477, i1 %343) #3, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %486, ptr addrspace(1) %478, i1 %343) #3, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %487, ptr addrspace(1) %479, i1 %343) #3, !dbg !55 + %488 = add nuw nsw i32 %340, 256, !dbg !38 + %489 = icmp ult i32 %340, 50001, !dbg !38 + br i1 %489, label %339, label %490, !dbg !38 + +490: ; preds = %339 + ret void, !dbg !56 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz") +!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 21, column: 28, scope: !5) +!11 = !DILocation(line: 21, column: 34, scope: !5) +!12 = !DILocation(line: 21, column: 46, scope: !5) +!13 = !DILocation(line: 22, column: 23, scope: !5) +!14 = !DILocation(line: 26, column: 30, scope: !5) +!15 = !DILocation(line: 26, column: 35, scope: !5) +!16 = !DILocation(line: 27, column: 19, scope: !5) +!17 = !DILocation(line: 29, column: 19, scope: !5) +!18 = !DILocation(line: 36, column: 46, scope: !5) +!19 = !DILocation(line: 38, column: 23, scope: !5) +!20 = !DILocation(line: 39, column: 22, scope: !5) +!21 = !DILocation(line: 41, column: 37, scope: !5) +!22 = !DILocation(line: 32, column: 36, scope: !5) +!23 = !DILocation(line: 33, column: 27, scope: !5) +!24 = !DILocation(line: 34, column: 25, scope: !5) +!25 = !DILocation(line: 36, column: 34, scope: !5) +!26 = !DILocation(line: 36, column: 52, scope: !5) +!27 = !DILocation(line: 42, column: 23, scope: !5) +!28 = !DILocation(line: 45, column: 40, scope: !5) +!29 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32) +!30 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0) +!31 = !DIFile(filename: 
"standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!32 = !DILocation(line: 46, column: 27, scope: !30) +!33 = !DILocation(line: 233, column: 15, scope: !34, inlinedAt: !35) +!34 = distinct !DILexicalBlockFile(scope: !30, file: !31, discriminator: 0) +!35 = !DILocation(line: 243, column: 36, scope: !34, inlinedAt: !36) +!36 = !DILocation(line: 46, column: 27, scope: !34) +!37 = !DILocation(line: 63, column: 24, scope: !5) +!38 = !DILocation(line: 51, column: 36, scope: !5) +!39 = !DILocation(line: 52, column: 27, scope: !5) +!40 = !DILocation(line: 53, column: 25, scope: !5) +!41 = !DILocation(line: 55, column: 41, scope: !5) +!42 = !DILocation(line: 55, column: 35, scope: !5) +!43 = !DILocation(line: 55, column: 53, scope: !5) +!44 = !DILocation(line: 55, column: 105, scope: !5) +!45 = !DILocation(line: 56, column: 35, scope: !5) +!46 = !DILocation(line: 56, column: 53, scope: !5) +!47 = !DILocation(line: 57, column: 35, scope: !5) +!48 = !DILocation(line: 57, column: 53, scope: !5) +!49 = !DILocation(line: 57, column: 105, scope: !5) +!50 = !DILocation(line: 65, column: 23, scope: !5) +!51 = !DILocation(line: 66, column: 24, scope: !5) +!52 = !DILocation(line: 67, column: 24, scope: !5) +!53 = !DILocation(line: 69, column: 24, scope: !5) +!54 = !DILocation(line: 70, column: 29, scope: !5) +!55 = !DILocation(line: 70, column: 54, scope: !5) +!56 = !DILocation(line: 51, column: 4, scope: !5) diff --git a/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.ptx b/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..6bdf59f2b6cb47737f9678c87cfa98f48fd8b1d1 --- /dev/null +++ b/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.ptx @@ -0,0 +1,1194 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7de8 +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3d4d5d6d7de8( + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_6, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_7, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_8 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<104>; + .reg .b16 %rs<65>; + .reg .b32 %r<187>; + .reg .f32 %f<241>; + .reg .b64 %rd<94>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8_param_6]; + ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8_param_5]; + ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8_param_4]; + ld.param.u64 %rd39, [triton__0d1d2d3d4d5d6d7de8_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + ld.param.u64 %rd40, [triton__0d1d2d3d4d5d6d7de8_param_1]; + shr.u32 %r2, %r1, 5; + ld.param.u64 %rd37, [triton__0d1d2d3d4d5d6d7de8_param_2]; + .loc 1 24 33 + and.b32 %r3, %r1, 255; + ld.param.u64 %rd38, [triton__0d1d2d3d4d5d6d7de8_param_3]; + .loc 1 21 28 + mov.u32 %r8, %ctaid.x; + .loc 1 21 34 + cvt.s64.s32 %rd1, %r8; + .loc 1 21 46 + mul.wide.s32 %rd41, %r8, 8; + .loc 1 26 30 + shl.b64 %rd42, %rd41, 3; + add.s64 %rd22, %rd40, %rd42; + add.s64 %rd24, %rd22, 8; + add.s64 %rd26, %rd22, 16; + add.s64 %rd28, %rd22, 24; + add.s64 %rd30, %rd22, 32; + add.s64 %rd32, %rd22, 40; + add.s64 %rd34, %rd22, 48; + add.s64 
%rd36, %rd22, 56; + mov.pred %p1, -1; + .loc 1 26 35 + mov.u64 %rd21, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd22 + 0 ]; + mov.u64 %rd23, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd24 + 0 ]; + mov.u64 %rd25, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd26 + 0 ]; + mov.u64 %rd27, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd28 + 0 ]; + mov.u64 %rd29, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd30 + 0 ]; + mov.u64 %rd31, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd32 + 0 ]; + mov.u64 %rd33, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd34 + 0 ]; + mov.u64 %rd35, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd36 + 0 ]; + .loc 1 27 19 + mov.u32 %r12, 0x0; + @%p1 ld.global.b32 { %r12 }, [ %rd37 + 0 ]; + .loc 1 29 19 + mov.u32 %r13, 0x0; + @%p1 ld.global.b32 { %r13 }, [ %rd38 + 0 ]; + .loc 1 38 23 + setp.eq.s64 %p11, %rd21, -1; + setp.eq.s64 %p12, %rd23, -1; + setp.eq.s64 %p13, %rd25, -1; + setp.eq.s64 %p14, %rd27, -1; + setp.eq.s64 %p15, %rd29, -1; + setp.eq.s64 %p16, %rd31, -1; + setp.eq.s64 %p17, %rd33, -1; + setp.eq.s64 %p18, %rd35, -1; + .loc 1 39 22 + div.full.f32 %r11, %r12, %r13; + mov.b32 %f49, %r11; + .loc 1 41 37 + selp.f32 %f8, 0f00000000, %f49, %p18; + selp.f32 %f7, 0f00000000, %f49, %p17; + selp.f32 %f6, 0f00000000, %f49, %p16; + selp.f32 %f5, 0f00000000, %f49, %p15; + selp.f32 %f4, 0f00000000, %f49, %p14; + selp.f32 %f3, 0f00000000, %f49, %p13; + selp.f32 %f2, 0f00000000, %f49, %p12; + selp.f32 %f1, 0f00000000, %f49, %p11; + .loc 1 32 36 + mul.wide.s32 %rd43, %r8, 1608224; + cvt.u64.u32 %rd44, %r1; + and.b64 %rd2, %rd44, 255; + mul.wide.u32 %rd45, %r3, 4; + add.s64 %rd46, %rd43, %rd45; + add.s64 %rd47, %rd46, %rd39; + add.s64 %rd93, %rd47, 1407196; + mov.f32 %f233, 0f00000000; + mov.b32 %r185, -256; + mov.u64 %rd89, %rd93; + mov.f32 %f234, %f233; + mov.f32 %f235, %f233; + mov.f32 %f236, %f233; + mov.f32 %f237, %f233; + mov.f32 %f238, %f233; + mov.f32 %f239, %f233; + mov.f32 %f240, %f233; +$L__BB0_1: + add.s32 %r185, %r185, 256; + .loc 1 33 27 + add.s32 %r31, %r185, %r3; + .loc 1 34 25 + setp.lt.u32 %p19, %r31, 50257; + .loc 1 36 34 + add.s64 %rd48, %rd89, -1407196; + add.s64 %rd49, %rd89, -1206168; + add.s64 %rd50, %rd89, -1005140; + add.s64 %rd51, %rd89, -804112; + add.s64 %rd52, %rd89, -603084; + add.s64 %rd53, %rd89, -402056; + add.s64 %rd54, %rd89, -201028; + mov.b32 %r153, 0; + .loc 1 36 52 + mov.u32 %r15, 0x0; + @%p19 ld.global.L1::evict_last.b32 { %r15 }, [ %rd48 + 0 ]; + @!%p19 mov.u32 %r15, %r153; + mov.u32 %r17, 0x0; + @%p19 ld.global.L1::evict_last.b32 { %r17 }, [ %rd49 + 0 ]; + @!%p19 mov.u32 %r17, %r153; + mov.u32 %r19, 0x0; + @%p19 ld.global.L1::evict_last.b32 { %r19 }, [ %rd50 + 0 ]; + @!%p19 mov.u32 %r19, %r153; + mov.u32 %r21, 0x0; + @%p19 ld.global.L1::evict_last.b32 { %r21 }, [ %rd51 + 0 ]; + @!%p19 mov.u32 %r21, %r153; + mov.u32 %r23, 0x0; + @%p19 ld.global.L1::evict_last.b32 { %r23 }, [ %rd52 + 0 ]; + @!%p19 mov.u32 %r23, %r153; + mov.u32 %r25, 0x0; + @%p19 ld.global.L1::evict_last.b32 { %r25 }, [ %rd53 + 0 ]; + @!%p19 mov.u32 %r25, %r153; + mov.u32 %r27, 0x0; + @%p19 ld.global.L1::evict_last.b32 { %r27 }, [ %rd54 + 0 ]; + @!%p19 mov.u32 %r27, %r153; + mov.u32 %r29, 0x0; + @%p19 ld.global.L1::evict_last.b32 { %r29 }, [ %rd89 + 0 ]; + @!%p19 mov.u32 %r29, %r153; + mov.b32 %f50, %r29; + mov.b32 %f51, %r27; + mov.b32 %f52, %r25; + mov.b32 %f53, %r23; + mov.b32 %f54, %r21; + mov.b32 %f55, %r19; + mov.b32 %f56, %r17; + mov.b32 %f57, %r15; + .loc 1 42 23 + mul.f32 
%f58, %f1, %f57; + mul.f32 %f59, %f2, %f56; + mul.f32 %f60, %f3, %f55; + mul.f32 %f61, %f4, %f54; + mul.f32 %f62, %f5, %f53; + mul.f32 %f63, %f6, %f52; + mul.f32 %f64, %f7, %f51; + mul.f32 %f65, %f8, %f50; + .loc 1 45 40 + selp.f32 %f66, %f65, 0f80000000, %p19; + selp.f32 %f67, %f64, 0f80000000, %p19; + selp.f32 %f68, %f63, 0f80000000, %p19; + selp.f32 %f69, %f62, 0f80000000, %p19; + selp.f32 %f70, %f61, 0f80000000, %p19; + selp.f32 %f71, %f60, 0f80000000, %p19; + selp.f32 %f72, %f59, 0f80000000, %p19; + selp.f32 %f73, %f58, 0f80000000, %p19; + add.f32 %f233, %f233, %f73; + add.f32 %f234, %f234, %f72; + add.f32 %f235, %f235, %f71; + add.f32 %f236, %f236, %f70; + add.f32 %f237, %f237, %f69; + add.f32 %f238, %f238, %f68; + add.f32 %f239, %f239, %f67; + add.f32 %f240, %f240, %f66; + .loc 1 32 36 + add.s64 %rd89, %rd89, 1024; + setp.lt.u32 %p35, %r185, 50001; + @%p35 bra $L__BB0_1; + .loc 1 22 44 + and.b32 %r53, %r1, 31; + .loc 1 24 33 + and.b32 %r54, %r2, 7; +$L__tmp1: + .loc 2 243 36 + mov.b32 %r55, %f233; + shfl.sync.bfly.b32 %r56, %r55, 16, 31, -1; + mov.b32 %f74, %r56; +$L__tmp2: + .loc 2 233 15 + add.f32 %f75, %f233, %f74; +$L__tmp3: + .loc 2 243 36 + mov.b32 %r57, %f75; + shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1; + mov.b32 %f76, %r58; +$L__tmp4: + .loc 2 233 15 + add.f32 %f77, %f75, %f76; +$L__tmp5: + .loc 2 243 36 + mov.b32 %r59, %f77; + shfl.sync.bfly.b32 %r60, %r59, 4, 31, -1; + mov.b32 %f78, %r60; +$L__tmp6: + .loc 2 233 15 + add.f32 %f79, %f77, %f78; +$L__tmp7: + .loc 2 243 36 + mov.b32 %r61, %f79; + shfl.sync.bfly.b32 %r62, %r61, 2, 31, -1; + mov.b32 %f80, %r62; +$L__tmp8: + .loc 2 233 15 + add.f32 %f81, %f79, %f80; +$L__tmp9: + .loc 2 243 36 + mov.b32 %r63, %f81; + shfl.sync.bfly.b32 %r64, %r63, 1, 31, -1; + mov.b32 %f82, %r64; +$L__tmp10: + .loc 2 233 15 + add.f32 %f83, %f81, %f82; +$L__tmp11: + .loc 2 243 36 + mov.b32 %r65, %f234; + shfl.sync.bfly.b32 %r66, %r65, 16, 31, -1; + mov.b32 %f84, %r66; +$L__tmp12: + .loc 2 233 15 + add.f32 %f85, %f234, %f84; +$L__tmp13: + .loc 2 243 36 + mov.b32 %r67, %f85; + shfl.sync.bfly.b32 %r68, %r67, 8, 31, -1; + mov.b32 %f86, %r68; +$L__tmp14: + .loc 2 233 15 + add.f32 %f87, %f85, %f86; +$L__tmp15: + .loc 2 243 36 + mov.b32 %r69, %f87; + shfl.sync.bfly.b32 %r70, %r69, 4, 31, -1; + mov.b32 %f88, %r70; +$L__tmp16: + .loc 2 233 15 + add.f32 %f89, %f87, %f88; +$L__tmp17: + .loc 2 243 36 + mov.b32 %r71, %f89; + shfl.sync.bfly.b32 %r72, %r71, 2, 31, -1; + mov.b32 %f90, %r72; +$L__tmp18: + .loc 2 233 15 + add.f32 %f91, %f89, %f90; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r73, %f91; + shfl.sync.bfly.b32 %r74, %r73, 1, 31, -1; + mov.b32 %f92, %r74; +$L__tmp20: + .loc 2 233 15 + add.f32 %f93, %f91, %f92; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r75, %f235; + shfl.sync.bfly.b32 %r76, %r75, 16, 31, -1; + mov.b32 %f94, %r76; +$L__tmp22: + .loc 2 233 15 + add.f32 %f95, %f235, %f94; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r77, %f95; + shfl.sync.bfly.b32 %r78, %r77, 8, 31, -1; + mov.b32 %f96, %r78; +$L__tmp24: + .loc 2 233 15 + add.f32 %f97, %f95, %f96; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r79, %f97; + shfl.sync.bfly.b32 %r80, %r79, 4, 31, -1; + mov.b32 %f98, %r80; +$L__tmp26: + .loc 2 233 15 + add.f32 %f99, %f97, %f98; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r81, %f99; + shfl.sync.bfly.b32 %r82, %r81, 2, 31, -1; + mov.b32 %f100, %r82; +$L__tmp28: + .loc 2 233 15 + add.f32 %f101, %f99, %f100; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r83, %f101; + shfl.sync.bfly.b32 %r84, %r83, 1, 31, -1; + mov.b32 %f102, %r84; +$L__tmp30: + .loc 2 233 15 + add.f32 %f103, %f101, 
%f102; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r85, %f236; + shfl.sync.bfly.b32 %r86, %r85, 16, 31, -1; + mov.b32 %f104, %r86; +$L__tmp32: + .loc 2 233 15 + add.f32 %f105, %f236, %f104; +$L__tmp33: + .loc 2 243 36 + mov.b32 %r87, %f105; + shfl.sync.bfly.b32 %r88, %r87, 8, 31, -1; + mov.b32 %f106, %r88; +$L__tmp34: + .loc 2 233 15 + add.f32 %f107, %f105, %f106; +$L__tmp35: + .loc 2 243 36 + mov.b32 %r89, %f107; + shfl.sync.bfly.b32 %r90, %r89, 4, 31, -1; + mov.b32 %f108, %r90; +$L__tmp36: + .loc 2 233 15 + add.f32 %f109, %f107, %f108; +$L__tmp37: + .loc 2 243 36 + mov.b32 %r91, %f109; + shfl.sync.bfly.b32 %r92, %r91, 2, 31, -1; + mov.b32 %f110, %r92; +$L__tmp38: + .loc 2 233 15 + add.f32 %f111, %f109, %f110; +$L__tmp39: + .loc 2 243 36 + mov.b32 %r93, %f111; + shfl.sync.bfly.b32 %r94, %r93, 1, 31, -1; + mov.b32 %f112, %r94; +$L__tmp40: + .loc 2 233 15 + add.f32 %f113, %f111, %f112; +$L__tmp41: + .loc 2 243 36 + mov.b32 %r95, %f237; + shfl.sync.bfly.b32 %r96, %r95, 16, 31, -1; + mov.b32 %f114, %r96; +$L__tmp42: + .loc 2 233 15 + add.f32 %f115, %f237, %f114; +$L__tmp43: + .loc 2 243 36 + mov.b32 %r97, %f115; + shfl.sync.bfly.b32 %r98, %r97, 8, 31, -1; + mov.b32 %f116, %r98; +$L__tmp44: + .loc 2 233 15 + add.f32 %f117, %f115, %f116; +$L__tmp45: + .loc 2 243 36 + mov.b32 %r99, %f117; + shfl.sync.bfly.b32 %r100, %r99, 4, 31, -1; + mov.b32 %f118, %r100; +$L__tmp46: + .loc 2 233 15 + add.f32 %f119, %f117, %f118; +$L__tmp47: + .loc 2 243 36 + mov.b32 %r101, %f119; + shfl.sync.bfly.b32 %r102, %r101, 2, 31, -1; + mov.b32 %f120, %r102; +$L__tmp48: + .loc 2 233 15 + add.f32 %f121, %f119, %f120; +$L__tmp49: + .loc 2 243 36 + mov.b32 %r103, %f121; + shfl.sync.bfly.b32 %r104, %r103, 1, 31, -1; + mov.b32 %f122, %r104; +$L__tmp50: + .loc 2 233 15 + add.f32 %f123, %f121, %f122; +$L__tmp51: + .loc 2 243 36 + mov.b32 %r105, %f238; + shfl.sync.bfly.b32 %r106, %r105, 16, 31, -1; + mov.b32 %f124, %r106; +$L__tmp52: + .loc 2 233 15 + add.f32 %f125, %f238, %f124; +$L__tmp53: + .loc 2 243 36 + mov.b32 %r107, %f125; + shfl.sync.bfly.b32 %r108, %r107, 8, 31, -1; + mov.b32 %f126, %r108; +$L__tmp54: + .loc 2 233 15 + add.f32 %f127, %f125, %f126; +$L__tmp55: + .loc 2 243 36 + mov.b32 %r109, %f127; + shfl.sync.bfly.b32 %r110, %r109, 4, 31, -1; + mov.b32 %f128, %r110; +$L__tmp56: + .loc 2 233 15 + add.f32 %f129, %f127, %f128; +$L__tmp57: + .loc 2 243 36 + mov.b32 %r111, %f129; + shfl.sync.bfly.b32 %r112, %r111, 2, 31, -1; + mov.b32 %f130, %r112; +$L__tmp58: + .loc 2 233 15 + add.f32 %f131, %f129, %f130; +$L__tmp59: + .loc 2 243 36 + mov.b32 %r113, %f131; + shfl.sync.bfly.b32 %r114, %r113, 1, 31, -1; + mov.b32 %f132, %r114; +$L__tmp60: + .loc 2 233 15 + add.f32 %f133, %f131, %f132; +$L__tmp61: + .loc 2 243 36 + mov.b32 %r115, %f239; + shfl.sync.bfly.b32 %r116, %r115, 16, 31, -1; + mov.b32 %f134, %r116; +$L__tmp62: + .loc 2 233 15 + add.f32 %f135, %f239, %f134; +$L__tmp63: + .loc 2 243 36 + mov.b32 %r117, %f135; + shfl.sync.bfly.b32 %r118, %r117, 8, 31, -1; + mov.b32 %f136, %r118; +$L__tmp64: + .loc 2 233 15 + add.f32 %f137, %f135, %f136; +$L__tmp65: + .loc 2 243 36 + mov.b32 %r119, %f137; + shfl.sync.bfly.b32 %r120, %r119, 4, 31, -1; + mov.b32 %f138, %r120; +$L__tmp66: + .loc 2 233 15 + add.f32 %f139, %f137, %f138; +$L__tmp67: + .loc 2 243 36 + mov.b32 %r121, %f139; + shfl.sync.bfly.b32 %r122, %r121, 2, 31, -1; + mov.b32 %f140, %r122; +$L__tmp68: + .loc 2 233 15 + add.f32 %f141, %f139, %f140; +$L__tmp69: + .loc 2 243 36 + mov.b32 %r123, %f141; + shfl.sync.bfly.b32 %r124, %r123, 1, 31, -1; + mov.b32 %f142, %r124; +$L__tmp70: 
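+ // Same butterfly schedule for every accumulator: shfl.sync.bfly.b32 with offsets 16, 8, 4, 2, 1 folds the other half-warp into each lane, so lane 0 ends up holding the complete 32-lane sum.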
+ .loc 2 233 15 + add.f32 %f143, %f141, %f142; +$L__tmp71: + .loc 2 243 36 + mov.b32 %r125, %f240; + shfl.sync.bfly.b32 %r126, %r125, 16, 31, -1; + mov.b32 %f144, %r126; +$L__tmp72: + .loc 2 233 15 + add.f32 %f145, %f240, %f144; +$L__tmp73: + .loc 2 243 36 + mov.b32 %r127, %f145; + shfl.sync.bfly.b32 %r128, %r127, 8, 31, -1; + mov.b32 %f146, %r128; +$L__tmp74: + .loc 2 233 15 + add.f32 %f147, %f145, %f146; +$L__tmp75: + .loc 2 243 36 + mov.b32 %r129, %f147; + shfl.sync.bfly.b32 %r130, %r129, 4, 31, -1; + mov.b32 %f148, %r130; +$L__tmp76: + .loc 2 233 15 + add.f32 %f149, %f147, %f148; +$L__tmp77: + .loc 2 243 36 + mov.b32 %r131, %f149; + shfl.sync.bfly.b32 %r132, %r131, 2, 31, -1; + mov.b32 %f150, %r132; +$L__tmp78: + .loc 2 233 15 + add.f32 %f151, %f149, %f150; +$L__tmp79: + .loc 2 243 36 + mov.b32 %r133, %f151; + shfl.sync.bfly.b32 %r134, %r133, 1, 31, -1; + mov.b32 %f152, %r134; +$L__tmp80: + .loc 2 233 15 + add.f32 %f153, %f151, %f152; +$L__tmp81: + .loc 2 243 36 + setp.eq.s32 %p36, %r53, 0; + shl.b32 %r135, %r54, 2; + mov.u32 %r136, global_smem; + add.s32 %r32, %r136, %r135; + mov.b32 %r33, %f83; + @%p36 st.shared.b32 [ %r32 + 0 ], %r33; + add.s32 %r34, %r32, 32; + mov.b32 %r35, %f93; + @%p36 st.shared.b32 [ %r34 + 0 ], %r35; + add.s32 %r36, %r32, 64; + mov.b32 %r37, %f103; + @%p36 st.shared.b32 [ %r36 + 0 ], %r37; + add.s32 %r38, %r32, 96; + mov.b32 %r39, %f113; + @%p36 st.shared.b32 [ %r38 + 0 ], %r39; + add.s32 %r40, %r32, 128; + mov.b32 %r41, %f123; + @%p36 st.shared.b32 [ %r40 + 0 ], %r41; + add.s32 %r42, %r32, 160; + mov.b32 %r43, %f133; + @%p36 st.shared.b32 [ %r42 + 0 ], %r43; + add.s32 %r44, %r32, 192; + mov.b32 %r45, %f143; + @%p36 st.shared.b32 [ %r44 + 0 ], %r45; + add.s32 %r46, %r32, 224; + mov.b32 %r47, %f153; + @%p36 st.shared.b32 [ %r46 + 0 ], %r47; + bar.sync 0; + setp.lt.s32 %p44, %r1, 64; + shl.b32 %r137, %r1, 2; + add.s32 %r49, %r136, %r137; + @%p44 ld.shared.b32 %r48, [ %r49 + 0 ]; + mov.b32 %f154, %r48; + shfl.sync.bfly.b32 %r138, %r48, 4, 31, -1; + mov.b32 %f155, %r138; +$L__tmp82: + .loc 2 233 15 + add.f32 %f156, %f154, %f155; +$L__tmp83: + .loc 2 243 36 + mov.b32 %r139, %f156; + shfl.sync.bfly.b32 %r140, %r139, 2, 31, -1; + mov.b32 %f157, %r140; +$L__tmp84: + .loc 2 233 15 + add.f32 %f158, %f156, %f157; +$L__tmp85: + .loc 2 243 36 + mov.b32 %r141, %f158; + shfl.sync.bfly.b32 %r142, %r141, 1, 31, -1; + mov.b32 %f159, %r142; +$L__tmp86: + .loc 2 233 15 + add.f32 %f160, %f158, %f159; +$L__tmp87: + .loc 2 243 36 + and.b32 %r143, %r1, 7; + setp.eq.s32 %p46, %r143, 0; + and.pred %p45, %p44, %p46; + mov.b32 %r51, %f160; + @%p45 st.shared.b32 [ %r49 + 0 ], %r51; + bar.sync 0; + ld.shared.f32 %f25, [global_smem]; + ld.shared.f32 %f26, [global_smem+32]; + ld.shared.f32 %f27, [global_smem+64]; + ld.shared.f32 %f28, [global_smem+96]; + ld.shared.f32 %f29, [global_smem+128]; + ld.shared.f32 %f30, [global_smem+160]; + ld.shared.f32 %f31, [global_smem+192]; + ld.shared.f32 %f32, [global_smem+224]; +$L__tmp88: + .loc 1 51 36 + shl.b64 %rd6, %rd2, 1; + mul.lo.s64 %rd56, %rd1, 804112; + add.s64 %rd92, %rd20, %rd56; + add.s64 %rd91, %rd19, %rd56; + add.s64 %rd90, %rd18, %rd56; + mov.b32 %r186, -256; + mov.u16 %rs2, 0; +$L__BB0_3: + add.s32 %r186, %r186, 256; + .loc 1 52 27 + add.s32 %r184, %r186, %r3; + .loc 1 53 25 + setp.lt.u32 %p47, %r184, 50257; + .loc 1 55 35 + add.s64 %rd57, %rd90, %rd6; + add.s64 %rd58, %rd57, 100514; + add.s64 %rd59, %rd57, 201028; + add.s64 %rd60, %rd57, 301542; + add.s64 %rd61, %rd57, 402056; + add.s64 %rd62, %rd57, 502570; + add.s64 %rd63, %rd57, 
603084; + .loc 1 55 53 + add.s64 %rd64, %rd57, 703598; + mov.u16 %rs1, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd57 + 0 ]; + @!%p47 mov.u16 %rs1, %rs2; + mov.u16 %rs3, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd58 + 0 ]; + @!%p47 mov.u16 %rs3, %rs2; + mov.u16 %rs5, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd59 + 0 ]; + @!%p47 mov.u16 %rs5, %rs2; + mov.u16 %rs7, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd60 + 0 ]; + @!%p47 mov.u16 %rs7, %rs2; + mov.u16 %rs9, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs9 }, [ %rd61 + 0 ]; + @!%p47 mov.u16 %rs9, %rs2; + mov.u16 %rs11, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs11 }, [ %rd62 + 0 ]; + @!%p47 mov.u16 %rs11, %rs2; + mov.u16 %rs13, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd63 + 0 ]; + @!%p47 mov.u16 %rs13, %rs2; + mov.u16 %rs15, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd64 + 0 ]; + @!%p47 mov.u16 %rs15, %rs2; + .loc 1 55 105 + cvt.f32.bf16 %r144, %rs1; + mov.b32 %f177, %r144; + cvt.f32.bf16 %r145, %rs3; + mov.b32 %f178, %r145; + cvt.f32.bf16 %r146, %rs5; + mov.b32 %f179, %r146; + cvt.f32.bf16 %r147, %rs7; + mov.b32 %f180, %r147; + cvt.f32.bf16 %r148, %rs9; + mov.b32 %f181, %r148; + cvt.f32.bf16 %r149, %rs11; + mov.b32 %f182, %r149; + cvt.f32.bf16 %r150, %rs13; + mov.b32 %f183, %r150; + cvt.f32.bf16 %r151, %rs15; + mov.b32 %f184, %r151; + .loc 1 56 35 + add.s64 %rd65, %rd93, -1407196; + add.s64 %rd66, %rd93, -1206168; + add.s64 %rd67, %rd93, -1005140; + add.s64 %rd68, %rd93, -804112; + add.s64 %rd69, %rd93, -603084; + add.s64 %rd70, %rd93, -402056; + add.s64 %rd71, %rd93, -201028; + .loc 1 56 53 + mov.u32 %r152, 0x0; + @%p47 ld.global.L1::evict_first.b32 { %r152 }, [ %rd65 + 0 ]; + @!%p47 mov.u32 %r152, %r153; + mov.b32 %f185, %r152; + mov.u32 %r154, 0x0; + @%p47 ld.global.L1::evict_first.b32 { %r154 }, [ %rd66 + 0 ]; + @!%p47 mov.u32 %r154, %r153; + mov.b32 %f186, %r154; + mov.u32 %r156, 0x0; + @%p47 ld.global.L1::evict_first.b32 { %r156 }, [ %rd67 + 0 ]; + @!%p47 mov.u32 %r156, %r153; + mov.b32 %f187, %r156; + mov.u32 %r158, 0x0; + @%p47 ld.global.L1::evict_first.b32 { %r158 }, [ %rd68 + 0 ]; + @!%p47 mov.u32 %r158, %r153; + mov.b32 %f188, %r158; + mov.u32 %r160, 0x0; + @%p47 ld.global.L1::evict_first.b32 { %r160 }, [ %rd69 + 0 ]; + @!%p47 mov.u32 %r160, %r153; + mov.b32 %f189, %r160; + mov.u32 %r162, 0x0; + @%p47 ld.global.L1::evict_first.b32 { %r162 }, [ %rd70 + 0 ]; + @!%p47 mov.u32 %r162, %r153; + mov.b32 %f190, %r162; + mov.u32 %r164, 0x0; + @%p47 ld.global.L1::evict_first.b32 { %r164 }, [ %rd71 + 0 ]; + @!%p47 mov.u32 %r164, %r153; + mov.b32 %f191, %r164; + mov.u32 %r166, 0x0; + @%p47 ld.global.L1::evict_first.b32 { %r166 }, [ %rd93 + 0 ]; + @!%p47 mov.u32 %r166, %r153; + mov.b32 %f192, %r166; + .loc 1 57 35 + add.s64 %rd73, %rd91, %rd6; + add.s64 %rd74, %rd73, 100514; + add.s64 %rd75, %rd73, 201028; + add.s64 %rd76, %rd73, 301542; + add.s64 %rd77, %rd73, 402056; + add.s64 %rd78, %rd73, 502570; + add.s64 %rd79, %rd73, 603084; + .loc 1 57 53 + add.s64 %rd80, %rd73, 703598; + mov.u16 %rs25, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd73 + 0 ]; + @!%p47 mov.u16 %rs25, %rs2; + mov.u16 %rs27, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd74 + 0 ]; + @!%p47 mov.u16 %rs27, %rs2; + mov.u16 %rs29, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd75 + 0 ]; + @!%p47 mov.u16 %rs29, %rs2; + mov.u16 %rs31, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd76 + 0 ]; + @!%p47 mov.u16 %rs31, %rs2; + mov.u16 
%rs33, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs33 }, [ %rd77 + 0 ]; + @!%p47 mov.u16 %rs33, %rs2; + mov.u16 %rs35, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs35 }, [ %rd78 + 0 ]; + @!%p47 mov.u16 %rs35, %rs2; + mov.u16 %rs37, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs37 }, [ %rd79 + 0 ]; + @!%p47 mov.u16 %rs37, %rs2; + mov.u16 %rs39, 0x0; + @%p47 ld.global.L1::evict_first.b16 { %rs39 }, [ %rd80 + 0 ]; + @!%p47 mov.u16 %rs39, %rs2; + .loc 1 57 105 + cvt.f32.bf16 %r168, %rs25; + mov.b32 %f193, %r168; + cvt.f32.bf16 %r169, %rs27; + mov.b32 %f194, %r169; + cvt.f32.bf16 %r170, %rs29; + mov.b32 %f195, %r170; + cvt.f32.bf16 %r171, %rs31; + mov.b32 %f196, %r171; + cvt.f32.bf16 %r172, %rs33; + mov.b32 %f197, %r172; + cvt.f32.bf16 %r173, %rs35; + mov.b32 %f198, %r173; + cvt.f32.bf16 %r174, %rs37; + mov.b32 %f199, %r174; + cvt.f32.bf16 %r175, %rs39; + mov.b32 %f200, %r175; + .loc 1 65 23 + mul.f32 %f162, %f193, 0f3FB8AA3B; + ex2.approx.f32 %f161, %f162; + mul.f32 %f164, %f194, 0f3FB8AA3B; + ex2.approx.f32 %f163, %f164; + mul.f32 %f166, %f195, 0f3FB8AA3B; + ex2.approx.f32 %f165, %f166; + mul.f32 %f168, %f196, 0f3FB8AA3B; + ex2.approx.f32 %f167, %f168; + mul.f32 %f170, %f197, 0f3FB8AA3B; + ex2.approx.f32 %f169, %f170; + mul.f32 %f172, %f198, 0f3FB8AA3B; + ex2.approx.f32 %f171, %f172; + mul.f32 %f174, %f199, 0f3FB8AA3B; + ex2.approx.f32 %f173, %f174; + mul.f32 %f176, %f200, 0f3FB8AA3B; + ex2.approx.f32 %f175, %f176; + .loc 1 66 24 + mul.f32 %f201, %f25, %f161; + mul.f32 %f202, %f26, %f163; + mul.f32 %f203, %f27, %f165; + mul.f32 %f204, %f28, %f167; + mul.f32 %f205, %f29, %f169; + mul.f32 %f206, %f30, %f171; + mul.f32 %f207, %f31, %f173; + mul.f32 %f208, %f32, %f175; + .loc 1 67 24 + neg.f32 %f209, %f201; + fma.rn.f32 %f210, %f1, %f185, %f209; + neg.f32 %f211, %f202; + fma.rn.f32 %f212, %f2, %f186, %f211; + neg.f32 %f213, %f203; + fma.rn.f32 %f214, %f3, %f187, %f213; + neg.f32 %f215, %f204; + fma.rn.f32 %f216, %f4, %f188, %f215; + neg.f32 %f217, %f205; + fma.rn.f32 %f218, %f5, %f189, %f217; + neg.f32 %f219, %f206; + fma.rn.f32 %f220, %f6, %f190, %f219; + neg.f32 %f221, %f207; + fma.rn.f32 %f222, %f7, %f191, %f221; + neg.f32 %f223, %f208; + fma.rn.f32 %f224, %f8, %f192, %f223; + .loc 1 69 24 + add.f32 %f225, %f177, %f210; + add.f32 %f226, %f178, %f212; + add.f32 %f227, %f179, %f214; + add.f32 %f228, %f180, %f216; + add.f32 %f229, %f181, %f218; + add.f32 %f230, %f182, %f220; + add.f32 %f231, %f183, %f222; + add.f32 %f232, %f184, %f224; + .loc 1 70 29 + add.s64 %rd81, %rd92, %rd6; + add.s64 %rd82, %rd81, 100514; + add.s64 %rd83, %rd81, 201028; + add.s64 %rd84, %rd81, 301542; + add.s64 %rd85, %rd81, 402056; + add.s64 %rd86, %rd81, 502570; + add.s64 %rd87, %rd81, 603084; + .loc 1 70 54 + add.s64 %rd88, %rd81, 703598; + mov.b32 %r176, %f225; + cvt.rn.bf16.f32 %rs49, %r176; + mov.b32 %r177, %f226; + cvt.rn.bf16.f32 %rs50, %r177; + mov.b32 %r178, %f227; + cvt.rn.bf16.f32 %rs51, %r178; + mov.b32 %r179, %f228; + cvt.rn.bf16.f32 %rs52, %r179; + mov.b32 %r180, %f229; + cvt.rn.bf16.f32 %rs53, %r180; + mov.b32 %r181, %f230; + cvt.rn.bf16.f32 %rs54, %r181; + mov.b32 %r182, %f231; + cvt.rn.bf16.f32 %rs55, %r182; + mov.b32 %r183, %f232; + cvt.rn.bf16.f32 %rs56, %r183; + @%p47 st.global.b16 [ %rd81 + 0 ], { %rs49 }; + @%p47 st.global.b16 [ %rd82 + 0 ], { %rs50 }; + @%p47 st.global.b16 [ %rd83 + 0 ], { %rs51 }; + @%p47 st.global.b16 [ %rd84 + 0 ], { %rs52 }; + @%p47 st.global.b16 [ %rd85 + 0 ], { %rs53 }; + @%p47 st.global.b16 [ %rd86 + 0 ], { %rs54 }; + @%p47 st.global.b16 [ %rd87 + 0 ], { %rs55 }; + 
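// The f32 results were rounded with cvt.rn.bf16.f32 and are stored under the same %p47 bounds predicate; after the eighth store the base pointers advance one 256-column tile (1024 B for the f32 stream, 512 B for each bf16 stream) and the loop repeats while %r186 < 50001. +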
@%p47 st.global.b16 [ %rd88 + 0 ], { %rs56 }; + .loc 1 51 36 + add.s64 %rd93, %rd93, 1024; + add.s64 %rd92, %rd92, 512; + add.s64 %rd91, %rd91, 512; + add.s64 %rd90, %rd90, 512; + setp.lt.u32 %p103, %r186, 50001; + @%p103 bra $L__BB0_3; + .loc 1 51 4 + ret; +$L__tmp89: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 278 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 107 +.b8 122 +.b8 103 +.b8 108 +.b8 55 +.b8 116 +.b8 104 +.b8 98 +.b8 52 +.b8 120 +.b8 100 +.b8 102 +.b8 107 +.b8 102 +.b8 110 +.b8 100 +.b8 50 +.b8 116 +.b8 105 +.b8 100 +.b8 107 +.b8 115 +.b8 54 +.b8 109 +.b8 116 +.b8 53 +.b8 102 +.b8 51 +.b8 104 +.b8 97 +.b8 117 +.b8 119 +.b8 102 +.b8 121 +.b8 106 +.b8 102 +.b8 108 +.b8 98 +.b8 116 +.b8 122 +.b8 121 +.b8 101 +.b8 112 +.b8 111 +.b8 53 +.b8 111 +.b8 120 +.b8 107 +.b8 118 +.b8 104 +.b8 107 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 107 +.b8 122 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 101 +.b8 56 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 101 +.b8 56 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp88 +.b8 2 +.b8 46 +.b8 27 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp87 +.b8 2 +.b8 46 +.b8 27 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp87 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 282 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 101 +.b8 56 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 282 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git 
a/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.ttgir b/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5a267aa35c573d8c6b18e5304b5afaa1e7d90d3d --- /dev/null +++ b/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.ttgir @@ -0,0 +1,92 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<8x1xf32, #blocked> + %cst_0 = arith.constant dense<50257> : tensor<8x1xi64, #blocked> + %cst_1 = arith.constant dense<-1> : tensor<8x1xi64, #blocked> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<8x256xf32, #blocked> + %c8_i64 = arith.constant 8 : i64 + %cst_3 = arith.constant dense<50257> : tensor<1x256xi64, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c256_i32 = arith.constant 256 : i32 + %c50257_i32 = arith.constant 50257 : i32 + %cst_4 = arith.constant dense<0.000000e+00> : tensor<8x256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = arith.extsi %0 : i32 to i64 + %2 = arith.muli %1, %c8_i64 : i64 + %3 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<8x1xi32, #blocked> + %5 = arith.extsi %4 : tensor<8x1xi32, #blocked> to tensor<8x1xi64, #blocked> + %6 = tt.splat %2 : (i64) -> tensor<8x1xi64, #blocked> + %7 = arith.addi %6, %5 : tensor<8x1xi64, #blocked> + %8 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked> + %10 = arith.extsi %9 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked> + %11 = tt.splat %arg1 : (!tt.ptr) -> tensor<8x1x!tt.ptr, #blocked> + %12 = tt.addptr %11, %7 : tensor<8x1x!tt.ptr, #blocked>, tensor<8x1xi64, #blocked> + %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x1xi64, #blocked> + %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr, i32 + %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32 + %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr, i32 + %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32 + %18 = arith.muli %7, %cst_0 : tensor<8x1xi64, #blocked> + %19 = tt.broadcast %18 : (tensor<8x1xi64, #blocked>) -> tensor<8x256xi64, #blocked> + %20 = tt.splat %arg0 : (!tt.ptr) -> tensor<8x256x!tt.ptr, #blocked> + %21 = arith.cmpi ne, %13, %cst_1 : tensor<8x1xi64, #blocked> + %22 = arith.divf %15, %17 : f32 + %23 = tt.splat %22 : (f32) -> tensor<8x1xf32, #blocked> + %24 = arith.select %21, 
%23, %cst : tensor<8x1xi1, #blocked>, tensor<8x1xf32, #blocked> + %25 = tt.broadcast %24 : (tensor<8x1xf32, #blocked>) -> tensor<8x256xf32, #blocked> + %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c256_i32 iter_args(%arg10 = %cst_2) -> (tensor<8x256xf32, #blocked>) : i32 { + %33 = arith.extsi %arg9 : i32 to i64 + %34 = tt.splat %33 : (i64) -> tensor<1x256xi64, #blocked> + %35 = arith.addi %34, %10 : tensor<1x256xi64, #blocked> + %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x256xi64, #blocked> + %37 = tt.broadcast %35 : (tensor<1x256xi64, #blocked>) -> tensor<8x256xi64, #blocked> + %38 = arith.addi %37, %19 : tensor<8x256xi64, #blocked> + %39 = tt.addptr %20, %38 : tensor<8x256x!tt.ptr, #blocked>, tensor<8x256xi64, #blocked> + %40 = tt.broadcast %36 : (tensor<1x256xi1, #blocked>) -> tensor<8x256xi1, #blocked> + %41 = tt.load %39, %40, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x256xf32, #blocked> + %42 = arith.mulf %41, %25 : tensor<8x256xf32, #blocked> + %43 = arith.addf %arg10, %42 : tensor<8x256xf32, #blocked> + %44 = arith.select %40, %43, %arg10 : tensor<8x256xi1, #blocked>, tensor<8x256xf32, #blocked> + scf.yield %44 : tensor<8x256xf32, #blocked> + } + %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32, %arg10: f32): + %33 = arith.addf %arg9, %arg10 : f32 + tt.reduce.return %33 : f32 + }) : (tensor<8x256xf32, #blocked>) -> tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<8x1xf32, #blocked> + %29 = tt.splat %arg4 : (!tt.ptr) -> tensor<8x256x!tt.ptr, #blocked> + %30 = tt.splat %arg5 : (!tt.ptr) -> tensor<8x256x!tt.ptr, #blocked> + %31 = tt.broadcast %28 : (tensor<8x1xf32, #blocked>) -> tensor<8x256xf32, #blocked> + %32 = tt.splat %arg6 : (!tt.ptr) -> tensor<8x256x!tt.ptr, #blocked> + scf.for %arg9 = %c0_i32 to %c50257_i32 step %c256_i32 : i32 { + %33 = arith.extsi %arg9 : i32 to i64 + %34 = tt.splat %33 : (i64) -> tensor<1x256xi64, #blocked> + %35 = arith.addi %34, %10 : tensor<1x256xi64, #blocked> + %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x256xi64, #blocked> + %37 = tt.broadcast %35 : (tensor<1x256xi64, #blocked>) -> tensor<8x256xi64, #blocked> + %38 = arith.addi %37, %19 : tensor<8x256xi64, #blocked> + %39 = tt.addptr %29, %38 : tensor<8x256x!tt.ptr, #blocked>, tensor<8x256xi64, #blocked> + %40 = tt.broadcast %36 : (tensor<1x256xi1, #blocked>) -> tensor<8x256xi1, #blocked> + %41 = tt.load %39, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x256xbf16, #blocked> + %42 = arith.extf %41 : tensor<8x256xbf16, #blocked> to tensor<8x256xf32, #blocked> + %43 = tt.addptr %20, %38 : tensor<8x256x!tt.ptr, #blocked>, tensor<8x256xi64, #blocked> + %44 = tt.load %43, %40, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x256xf32, #blocked> + %45 = tt.addptr %30, %38 : tensor<8x256x!tt.ptr, #blocked>, tensor<8x256xi64, #blocked> + %46 = tt.load %45, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x256xbf16, #blocked> + %47 = arith.extf %46 : tensor<8x256xbf16, #blocked> to tensor<8x256xf32, #blocked> + %48 = arith.mulf %44, %25 : tensor<8x256xf32, #blocked> + %49 = math.exp %47 : tensor<8x256xf32, #blocked> + %50 = arith.mulf %49, %31 : tensor<8x256xf32, #blocked> + %51 = arith.subf %48, %50 : tensor<8x256xf32, #blocked> + %52 = arith.addf %42, %51 : tensor<8x256xf32, #blocked> + %53 = tt.addptr %32, %38 : tensor<8x256x!tt.ptr, #blocked>, 
tensor<8x256xi64, #blocked> + %54 = arith.truncf %52 : tensor<8x256xf32, #blocked> to tensor<8x256xbf16, #blocked> + tt.store %53, %54, %40 {cache = 1 : i32, evict = 1 : i32} : tensor<8x256xbf16, #blocked> + } + tt.return + } +} diff --git a/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.ttir b/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..91c5714cfcec82a1a114c2c2fb43f96ffb900fc5 --- /dev/null +++ b/.triton/dump/b439d4fb5c699c6d430247617f6325f6/triton_.ttir @@ -0,0 +1,99 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<8x256xbf16> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<8x1xf32> + %c50257_i32 = arith.constant 50257 : i32 + %c256_i32 = arith.constant 256 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<50257> : tensor<8x1xi64> + %cst_2 = arith.constant dense<50257> : tensor<1x256xi64> + %c8_i64 = arith.constant 8 : i64 + %cst_3 = arith.constant dense<-1> : tensor<8x1xi64> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<8x256xf32> + %0 = tt.get_program_id x : i32 + %1 = arith.extsi %0 : i32 to i64 + %2 = arith.muli %1, %c8_i64 : i64 + %3 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> + %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<8xi32>) -> tensor<8x1xi32> + %5 = arith.extsi %4 : tensor<8x1xi32> to tensor<8x1xi64> + %6 = tt.splat %2 : (i64) -> tensor<8x1xi64> + %7 = arith.addi %6, %5 : tensor<8x1xi64> + %8 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32> + %10 = arith.extsi %9 : tensor<1x256xi32> to tensor<1x256xi64> + %11 = tt.splat %arg1 : (!tt.ptr) -> tensor<8x1x!tt.ptr> + %12 = tt.addptr %11, %7 : tensor<8x1x!tt.ptr>, tensor<8x1xi64> + %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x1xi64> + %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr, i32 + %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32 + %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr, i32 + %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32 + %18 = arith.muli %7, %cst_1 : tensor<8x1xi64> + %19 = tt.broadcast %18 : (tensor<8x1xi64>) -> tensor<8x256xi64> + %20 = tt.splat %arg0 : (!tt.ptr) -> tensor<8x256x!tt.ptr> + %21 = arith.cmpi ne, %13, %cst_3 : tensor<8x1xi64> + %22 = arith.divf %15, %17 : f32 + %23 = tt.splat %22 : (f32) -> tensor<8x1xf32> + %24 = arith.select %21, %23, %cst_0 : tensor<8x1xi1>, tensor<8x1xf32> + %25 = tt.broadcast %24 : (tensor<8x1xf32>) -> tensor<8x256xf32> + %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c256_i32 iter_args(%arg10 = %cst_4) -> (tensor<8x256xf32>) : i32 { + %41 = arith.extsi %arg9 : i32 to i64 + %42 = tt.splat %41 : (i64) -> tensor<1x256xi64> + %43 = arith.addi %42, %10 : tensor<1x256xi64> + %44 = arith.cmpi slt, %43, %cst_2 : tensor<1x256xi64> + %45 = tt.broadcast %43 : (tensor<1x256xi64>) -> tensor<8x256xi64> + %46 = arith.addi %45, %19 : tensor<8x256xi64> + %47 = 
tt.addptr %20, %46 : tensor<8x256x!tt.ptr>, tensor<8x256xi64> + %48 = tt.broadcast %44 : (tensor<1x256xi1>) -> tensor<8x256xi1> + %49 = tt.load %47, %48, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x256xf32> + %50 = arith.mulf %49, %25 : tensor<8x256xf32> + %51 = arith.addf %arg10, %50 : tensor<8x256xf32> + %52 = arith.select %48, %51, %arg10 : tensor<8x256xi1>, tensor<8x256xf32> + scf.yield %52 : tensor<8x256xf32> + } + %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32, %arg10: f32): + %41 = arith.addf %arg9, %arg10 : f32 + tt.reduce.return %41 : f32 + }) : (tensor<8x256xf32>) -> tensor<8xf32> + %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<8xf32>) -> tensor<8x1xf32> + %29 = arith.muli %7, %cst_1 : tensor<8x1xi64> + %30 = tt.broadcast %29 : (tensor<8x1xi64>) -> tensor<8x256xi64> + %31 = tt.splat %arg4 : (!tt.ptr) -> tensor<8x256x!tt.ptr> + %32 = tt.splat %arg0 : (!tt.ptr) -> tensor<8x256x!tt.ptr> + %33 = tt.splat %arg5 : (!tt.ptr) -> tensor<8x256x!tt.ptr> + %34 = arith.cmpi ne, %13, %cst_3 : tensor<8x1xi64> + %35 = arith.divf %15, %17 : f32 + %36 = tt.splat %35 : (f32) -> tensor<8x1xf32> + %37 = arith.select %34, %36, %cst_0 : tensor<8x1xi1>, tensor<8x1xf32> + %38 = tt.broadcast %37 : (tensor<8x1xf32>) -> tensor<8x256xf32> + %39 = tt.broadcast %28 : (tensor<8x1xf32>) -> tensor<8x256xf32> + %40 = tt.splat %arg6 : (!tt.ptr) -> tensor<8x256x!tt.ptr> + scf.for %arg9 = %c0_i32 to %c50257_i32 step %c256_i32 : i32 { + %41 = arith.extsi %arg9 : i32 to i64 + %42 = tt.splat %41 : (i64) -> tensor<1x256xi64> + %43 = arith.addi %42, %10 : tensor<1x256xi64> + %44 = arith.cmpi slt, %43, %cst_2 : tensor<1x256xi64> + %45 = tt.broadcast %43 : (tensor<1x256xi64>) -> tensor<8x256xi64> + %46 = arith.addi %45, %30 : tensor<8x256xi64> + %47 = tt.addptr %31, %46 : tensor<8x256x!tt.ptr>, tensor<8x256xi64> + %48 = tt.broadcast %44 : (tensor<1x256xi1>) -> tensor<8x256xi1> + %49 = tt.load %47, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x256xbf16> + %50 = arith.extf %49 : tensor<8x256xbf16> to tensor<8x256xf32> + %51 = tt.addptr %32, %46 : tensor<8x256x!tt.ptr>, tensor<8x256xi64> + %52 = tt.load %51, %48, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x256xf32> + %53 = tt.addptr %33, %46 : tensor<8x256x!tt.ptr>, tensor<8x256xi64> + %54 = tt.load %53, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x256xbf16> + %55 = arith.extf %54 : tensor<8x256xbf16> to tensor<8x256xf32> + %56 = arith.mulf %52, %38 : tensor<8x256xf32> + %57 = math.exp %55 : tensor<8x256xf32> + %58 = arith.mulf %57, %39 : tensor<8x256xf32> + %59 = arith.subf %56, %58 : tensor<8x256xf32> + %60 = arith.addf %50, %59 : tensor<8x256xf32> + %61 = tt.addptr %40, %46 : tensor<8x256x!tt.ptr>, tensor<8x256xi64> + %62 = arith.truncf %60 : tensor<8x256xf32> to tensor<8x256xbf16> + tt.store %61, %62, %48 {cache = 1 : i32, evict = 1 : i32} : tensor<8x256xbf16> + } + tt.return + } +} diff --git a/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.cubin b/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..b6af25c5dcf0c52fc79ccdced6bb30c99a054e79 Binary files /dev/null and b/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.cubin differ diff --git a/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.llir b/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.llir new file mode 100644 index 
0000000000000000000000000000000000000000..172e2d34e1e88aa3eebf0aeef9067bb2198eaef2 --- /dev/null +++ b/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.llir @@ -0,0 +1,300 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %11 = lshr i32 %10, 5, !dbg !8 + %urem = and i32 %10, 255, !dbg !8 + %12 = or i32 %urem, 256, !dbg !8 + %13 = or i32 %urem, 512, !dbg !8 + %14 = or i32 %urem, 768, !dbg !8 + %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9 + %16 = sext i32 %15 to i64, !dbg !10 + %17 = insertelement <4 x i32> poison, i32 %urem, i64 0 + %18 = insertelement <4 x i32> %17, i32 %12, i64 1 + %19 = insertelement <4 x i32> %18, i32 %13, i64 2 + %20 = insertelement <4 x i32> %19, i32 %14, i64 3 + %21 = zext nneg <4 x i32> %20 to <4 x i64> + %22 = getelementptr i64, ptr addrspace(1) %1, i64 %16, !dbg !11 + %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #3, !dbg !12 + %24 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !13 + %25 = bitcast i32 %24 to float, !dbg !13 + %26 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !14 + %27 = bitcast i32 %26 to float, !dbg !14 + %28 = mul nsw i64 %16, 50257, !dbg !15 + %.not = icmp eq i64 %23, -1, !dbg !16 + %29 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %25, float %27) #3, !dbg !17 + %30 = select i1 %.not, float 0.000000e+00, float %29, !dbg !18 + %invariant.gep = getelementptr float, ptr addrspace(1) %0, i64 %28, !dbg !19 + %31 = insertelement <4 x float> poison, float %30, i64 0, !dbg !20 + %32 = shufflevector <4 x float> %31, <4 x float> poison, <4 x i32> zeroinitializer, !dbg !20 + br label %33, !dbg !19 + +33: ; preds = %9, %33 + %34 = phi i32 [ 0, %9 ], [ %61, %33 ] + %35 = phi <4 x float> [ zeroinitializer, %9 ], [ %60, %33 ] + %36 = zext nneg i32 %34 to i64, !dbg !21 + %37 = insertelement <4 x i64> poison, i64 %36, i64 0, !dbg !21 + %38 = shufflevector <4 x i64> %37, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !21 + %39 = or <4 x i64> %38, %21, !dbg !21 + %40 = icmp ult <4 x i64> %39, , !dbg !22 + %41 = extractelement <4 x i64> %39, i64 0, !dbg !23 + %gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %41, !dbg !23 + %42 = extractelement <4 x i64> %39, i64 1, !dbg !23 + %gep3 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %42, !dbg !23 + %43 = extractelement <4 x i64> %39, i64 2, !dbg !23 + %gep5 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %43, !dbg !23 + %44 = extractelement <4 x i64> %39, i64 3, !dbg !23 + %gep7 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %44, !dbg !23 + %45 = extractelement <4 x i1> %40, i64 0, !dbg !24 + %46 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep, i1 %45, i32 0, i1 %45) #3, !dbg !24 + %47 = extractelement <4 x i1> %40, i64 1, !dbg 
!24 + %48 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep3, i1 %47, i32 0, i1 %47) #3, !dbg !24 + %49 = extractelement <4 x i1> %40, i64 2, !dbg !24 + %50 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep5, i1 %49, i32 0, i1 %49) #3, !dbg !24 + %51 = extractelement <4 x i1> %40, i64 3, !dbg !24 + %52 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep7, i1 %51, i32 0, i1 %51) #3, !dbg !24 + %53 = insertelement <4 x i32> poison, i32 %46, i64 0, !dbg !24 + %54 = insertelement <4 x i32> %53, i32 %48, i64 1, !dbg !24 + %55 = insertelement <4 x i32> %54, i32 %50, i64 2, !dbg !24 + %56 = insertelement <4 x i32> %55, i32 %52, i64 3, !dbg !24 + %57 = bitcast <4 x i32> %56 to <4 x float>, !dbg !24 + %58 = fmul <4 x float> %32, %57, !dbg !20 + %59 = select <4 x i1> %40, <4 x float> %58, <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !25 + %60 = fadd <4 x float> %35, %59, !dbg !25 + %61 = add nuw nsw i32 %34, 1024, !dbg !19 + %62 = icmp ult i32 %34, 49233, !dbg !19 + br i1 %62, label %33, label %63, !dbg !19 + +63: ; preds = %33 + %64 = and i32 %10, 31, !dbg !8 + %65 = and i32 %11, 7, !dbg !8 + %shift = shufflevector <4 x float> %60, <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>, !dbg !26 + %66 = fadd <4 x float> %60, %shift, !dbg !26 + %shift17 = shufflevector <4 x float> %60, <4 x float> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>, !dbg !26 + %67 = fadd <4 x float> %shift17, %66, !dbg !26 + %shift18 = shufflevector <4 x float> %60, <4 x float> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>, !dbg !26 + %68 = fadd <4 x float> %shift18, %67, !dbg !26 + %69 = extractelement <4 x float> %68, i64 0, !dbg !26 + %70 = bitcast float %69 to i32, !dbg !32 + %71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 16, i32 31), !dbg !32 + %72 = bitcast i32 %71 to float, !dbg !32 + %73 = fadd float %69, %72, !dbg !26 + %74 = bitcast float %73 to i32, !dbg !32 + %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 8, i32 31), !dbg !32 + %76 = bitcast i32 %75 to float, !dbg !32 + %77 = fadd float %73, %76, !dbg !26 + %78 = bitcast float %77 to i32, !dbg !32 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 4, i32 31), !dbg !32 + %80 = bitcast i32 %79 to float, !dbg !32 + %81 = fadd float %77, %80, !dbg !26 + %82 = bitcast float %81 to i32, !dbg !32 + %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !32 + %84 = bitcast i32 %83 to float, !dbg !32 + %85 = fadd float %81, %84, !dbg !26 + %86 = bitcast float %85 to i32, !dbg !32 + %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !32 + %88 = bitcast i32 %87 to float, !dbg !32 + %89 = fadd float %85, %88, !dbg !26 + %90 = icmp eq i32 %64, 0, !dbg !32 + %91 = zext nneg i32 %65 to i64, !dbg !32 + %92 = getelementptr float, ptr addrspace(3) @global_smem, i64 %91, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %89, i1 %90) #3, !dbg !32 + tail call void @llvm.nvvm.barrier0(), !dbg !32 + %93 = icmp slt i32 %10, 8, !dbg !32 + %94 = sext i32 %10 to i64, !dbg !32 + %95 = getelementptr float, ptr addrspace(3) @global_smem, i64 %94, !dbg !32 + %96 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];",
"=r,r,b"(ptr addrspace(3) %95, i1 %93) #3, !dbg !32 + %97 = bitcast float %96 to i32, !dbg !32 + %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 4, i32 31), !dbg !32 + %99 = bitcast i32 %98 to float, !dbg !32 + %100 = fadd float %96, %99, !dbg !26 + %101 = bitcast float %100 to i32, !dbg !32 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 2, i32 31), !dbg !32 + %103 = bitcast i32 %102 to float, !dbg !32 + %104 = fadd float %100, %103, !dbg !26 + %105 = bitcast float %104 to i32, !dbg !32 + %106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 1, i32 31), !dbg !32 + %107 = bitcast i32 %106 to float, !dbg !32 + %108 = fadd float %104, %107, !dbg !26 + %109 = and i32 %10, 7, !dbg !32 + %110 = icmp eq i32 %109, 0, !dbg !32 + %111 = and i1 %93, %110, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %108, i1 %111) #3, !dbg !32 + tail call void @llvm.nvvm.barrier0(), !dbg !32 + %112 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32 + %113 = extractelement <4 x i64> %21, i64 0, !dbg !34 + %114 = extractelement <4 x i64> %21, i64 1, !dbg !34 + %115 = extractelement <4 x i64> %21, i64 2, !dbg !34 + %116 = extractelement <4 x i64> %21, i64 3, !dbg !34 + br label %117, !dbg !35 + +117: ; preds = %63, %117 + %118 = phi i32 [ 0, %63 ], [ %200, %117 ] + %119 = zext nneg i32 %118 to i64, !dbg !34 + %120 = or i64 %113, %119, !dbg !34 + %121 = or i64 %114, %119, !dbg !34 + %122 = or i64 %115, %119, !dbg !34 + %123 = or i64 %116, %119, !dbg !34 + %124 = icmp ult i64 %120, 50257, !dbg !36 + %125 = icmp ult i64 %121, 50257, !dbg !36 + %126 = icmp ult i64 %122, 50257, !dbg !36 + %127 = icmp ult i64 %123, 50257, !dbg !36 + %128 = add nsw i64 %120, %28, !dbg !37 + %129 = add nsw i64 %121, %28, !dbg !37 + %130 = add nsw i64 %122, %28, !dbg !37 + %131 = add nsw i64 %123, %28, !dbg !37 + %132 = getelementptr i16, ptr addrspace(1) %4, i64 %128, !dbg !38 + %133 = getelementptr i16, ptr addrspace(1) %4, i64 %129, !dbg !38 + %134 = getelementptr i16, ptr addrspace(1) %4, i64 %130, !dbg !38 + %135 = getelementptr i16, ptr addrspace(1) %4, i64 %131, !dbg !38 + %136 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %132, i1 %124, i16 0, i1 %124) #3, !dbg !39 + %137 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %133, i1 %125, i16 0, i1 %125) #3, !dbg !39 + %138 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %134, i1 %126, i16 0, i1 %126) #3, !dbg !39 + %139 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %135, i1 %127, i16 0, i1 %127) #3, !dbg !39 + %140 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %136) #3, !dbg !40 + %141 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %137) #3, !dbg !40 + %142 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %138) #3, !dbg !40 + %143 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %139) #3, !dbg !40 + %144 = getelementptr float, ptr addrspace(1) %0, i64 %128, !dbg !41 + %145 = getelementptr float, ptr addrspace(1) %0, i64 %129, !dbg !41 + 
%146 = getelementptr float, ptr addrspace(1) %0, i64 %130, !dbg !41 + %147 = getelementptr float, ptr addrspace(1) %0, i64 %131, !dbg !41 + %148 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %144, i1 %124, i32 0, i1 %124) #3, !dbg !42 + %149 = bitcast i32 %148 to float, !dbg !42 + %150 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %145, i1 %125, i32 0, i1 %125) #3, !dbg !42 + %151 = bitcast i32 %150 to float, !dbg !42 + %152 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %146, i1 %126, i32 0, i1 %126) #3, !dbg !42 + %153 = bitcast i32 %152 to float, !dbg !42 + %154 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %147, i1 %127, i32 0, i1 %127) #3, !dbg !42 + %155 = bitcast i32 %154 to float, !dbg !42 + %156 = getelementptr i16, ptr addrspace(1) %5, i64 %128, !dbg !43 + %157 = getelementptr i16, ptr addrspace(1) %5, i64 %129, !dbg !43 + %158 = getelementptr i16, ptr addrspace(1) %5, i64 %130, !dbg !43 + %159 = getelementptr i16, ptr addrspace(1) %5, i64 %131, !dbg !43 + %160 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %156, i1 %124, i16 0, i1 %124) #3, !dbg !44 + %161 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %157, i1 %125, i16 0, i1 %125) #3, !dbg !44 + %162 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %158, i1 %126, i16 0, i1 %126) #3, !dbg !44 + %163 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %159, i1 %127, i16 0, i1 %127) #3, !dbg !44 + %164 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %160) #3, !dbg !45 + %165 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %161) #3, !dbg !45 + %166 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %162) #3, !dbg !45 + %167 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %163) #3, !dbg !45 + %168 = fmul float %30, %149, !dbg !46 + %169 = fmul float %30, %151, !dbg !46 + %170 = fmul float %30, %153, !dbg !46 + %171 = fmul float %30, %155, !dbg !46 + %172 = fmul float %164, 0x3FF7154760000000, !dbg !47 + %173 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %172) #3, !dbg !47 + %174 = fmul float %165, 0x3FF7154760000000, !dbg !47 + %175 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %174) #3, !dbg !47 + %176 = fmul float %166, 0x3FF7154760000000, !dbg !47 + %177 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %176) #3, !dbg !47 + %178 = fmul float %167, 0x3FF7154760000000, !dbg !47 + %179 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %178) #3, !dbg !47 + %180 = fmul float %112, %173, !dbg !48 + %181 = fmul float %112, %175, !dbg !48 + %182 = fmul float %112, %177, !dbg !48 + %183 = fmul float %112, %179, !dbg !48 + 
%184 = fsub float %168, %180, !dbg !49 + %185 = fsub float %169, %181, !dbg !49 + %186 = fsub float %170, %182, !dbg !49 + %187 = fsub float %171, %183, !dbg !49 + %188 = fadd float %140, %184, !dbg !50 + %189 = fadd float %141, %185, !dbg !50 + %190 = fadd float %142, %186, !dbg !50 + %191 = fadd float %143, %187, !dbg !50 + %192 = getelementptr i16, ptr addrspace(1) %6, i64 %128, !dbg !51 + %193 = getelementptr i16, ptr addrspace(1) %6, i64 %129, !dbg !51 + %194 = getelementptr i16, ptr addrspace(1) %6, i64 %130, !dbg !51 + %195 = getelementptr i16, ptr addrspace(1) %6, i64 %131, !dbg !51 + %196 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %188) #3, !dbg !52 + %197 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %189) #3, !dbg !52 + %198 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %190) #3, !dbg !52 + %199 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %191) #3, !dbg !52 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %196, ptr addrspace(1) %192, i1 %124) #3, !dbg !52 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %197, ptr addrspace(1) %193, i1 %125) #3, !dbg !52 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %198, ptr addrspace(1) %194, i1 %126) #3, !dbg !52 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %199, ptr addrspace(1) %195, i1 %127) #3, !dbg !52 + %200 = add nuw nsw i32 %118, 1024, !dbg !35 + %201 = icmp ult i32 %118, 49233, !dbg !35 + br i1 %201, label %117, label %202, !dbg !35 + +202: ; preds = %117 + ret void, !dbg !53 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz") +!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 24, column: 33, scope: !5) +!9 = !DILocation(line: 21, column: 28, scope: !5) +!10 = !DILocation(line: 21, column: 34, scope: !5) +!11 = !DILocation(line: 26, column: 30, scope: !5) +!12 = !DILocation(line: 26, column: 35, scope: !5) +!13 = !DILocation(line: 27, column: 19, scope: !5) +!14 = !DILocation(line: 29, column: 19, scope: !5) +!15 = !DILocation(line: 
36, column: 46, scope: !5) +!16 = !DILocation(line: 38, column: 23, scope: !5) +!17 = !DILocation(line: 39, column: 22, scope: !5) +!18 = !DILocation(line: 41, column: 37, scope: !5) +!19 = !DILocation(line: 32, column: 36, scope: !5) +!20 = !DILocation(line: 42, column: 23, scope: !5) +!21 = !DILocation(line: 33, column: 27, scope: !5) +!22 = !DILocation(line: 34, column: 25, scope: !5) +!23 = !DILocation(line: 36, column: 34, scope: !5) +!24 = !DILocation(line: 36, column: 52, scope: !5) +!25 = !DILocation(line: 45, column: 40, scope: !5) +!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30) +!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0) +!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!29 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0) +!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31) +!31 = !DILocation(line: 46, column: 27, scope: !27) +!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33) +!33 = !DILocation(line: 46, column: 27, scope: !29) +!34 = !DILocation(line: 52, column: 27, scope: !5) +!35 = !DILocation(line: 51, column: 36, scope: !5) +!36 = !DILocation(line: 53, column: 25, scope: !5) +!37 = !DILocation(line: 55, column: 41, scope: !5) +!38 = !DILocation(line: 55, column: 35, scope: !5) +!39 = !DILocation(line: 55, column: 53, scope: !5) +!40 = !DILocation(line: 55, column: 105, scope: !5) +!41 = !DILocation(line: 56, column: 35, scope: !5) +!42 = !DILocation(line: 56, column: 53, scope: !5) +!43 = !DILocation(line: 57, column: 35, scope: !5) +!44 = !DILocation(line: 57, column: 53, scope: !5) +!45 = !DILocation(line: 57, column: 105, scope: !5) +!46 = !DILocation(line: 63, column: 24, scope: !5) +!47 = !DILocation(line: 65, column: 23, scope: !5) +!48 = !DILocation(line: 66, column: 24, scope: !5) +!49 = !DILocation(line: 67, column: 24, scope: !5) +!50 = !DILocation(line: 69, column: 24, scope: !5) +!51 = !DILocation(line: 70, column: 29, scope: !5) +!52 = !DILocation(line: 70, column: 54, scope: !5) +!53 = !DILocation(line: 51, column: 4, scope: !5) diff --git a/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.ptx b/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..943b75c14ff3ce6fd58579149db621b72f1c3801 --- /dev/null +++ b/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.ptx @@ -0,0 +1,733 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7de8 +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3d4d5d6d7de8( + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_6, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_7, + .param .u64 triton__0d1d2d3d4d5d6d7de8_param_8 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<47>; + .reg .b16 %rs<33>; + .reg .b32 %r<72>; + .reg .f32 %f<92>; + .reg .b64 %rd<74>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8_param_6]; + ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8_param_5]; + ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8_param_4]; + ld.param.u64 
%rd23, [triton__0d1d2d3d4d5d6d7de8_param_0]; +$L__tmp0: + .loc 1 24 33 + mov.u32 %r1, %tid.x; + ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6d7de8_param_1]; + shr.u32 %r2, %r1, 5; + ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8_param_2]; + and.b32 %r9, %r1, 255; + ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7de8_param_3]; + or.b32 %r10, %r9, 256; + or.b32 %r11, %r9, 512; + or.b32 %r12, %r9, 768; + .loc 1 21 28 + mov.u32 %r3, %ctaid.x; + cvt.u64.u32 %rd1, %r9; + cvt.u64.u32 %rd4, %r12; + cvt.u64.u32 %rd3, %r11; + cvt.u64.u32 %rd2, %r10; + .loc 1 26 30 + mul.wide.s32 %rd25, %r3, 8; + add.s64 %rd19, %rd24, %rd25; + mov.pred %p1, -1; + .loc 1 26 35 + mov.u64 %rd18, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ]; + .loc 1 27 19 + mov.u32 %r7, 0x0; + @%p1 ld.global.b32 { %r7 }, [ %rd20 + 0 ]; + .loc 1 29 19 + mov.u32 %r8, 0x0; + @%p1 ld.global.b32 { %r8 }, [ %rd21 + 0 ]; + .loc 1 36 46 + mul.wide.s32 %rd5, %r3, 50257; + .loc 1 38 23 + setp.eq.s64 %p4, %rd18, -1; + .loc 1 39 22 + div.full.f32 %r6, %r7, %r8; + mov.b32 %f19, %r6; + .loc 1 41 37 + selp.f32 %f2, 0f00000000, %f19, %p4; + .loc 1 32 36 + shl.b64 %rd26, %rd5, 2; + add.s64 %rd6, %rd23, %rd26; + mov.f32 %f88, 0f00000000; + mov.u64 %rd72, 0; + mov.f32 %f89, %f88; + mov.f32 %f90, %f88; + mov.f32 %f91, %f88; +$L__BB0_1: + .loc 1 33 27 + or.b64 %rd31, %rd72, %rd1; + or.b64 %rd32, %rd72, %rd2; + or.b64 %rd33, %rd72, %rd3; + or.b64 %rd34, %rd72, %rd4; + .loc 1 34 25 + setp.lt.u64 %p12, %rd34, 50257; + setp.lt.u64 %p10, %rd33, 50257; + setp.lt.u64 %p8, %rd32, 50257; + setp.lt.u64 %p6, %rd31, 50257; + .loc 1 36 34 + shl.b64 %rd35, %rd31, 2; + add.s64 %rd27, %rd6, %rd35; + shl.b64 %rd36, %rd32, 2; + add.s64 %rd28, %rd6, %rd36; + shl.b64 %rd37, %rd33, 2; + add.s64 %rd29, %rd6, %rd37; + shl.b64 %rd38, %rd34, 2; + add.s64 %rd30, %rd6, %rd38; + mov.b32 %r55, 0; + .loc 1 36 52 + mov.u32 %r13, 0x0; + @%p6 ld.global.L1::evict_last.b32 { %r13 }, [ %rd27 + 0 ]; + @!%p6 mov.u32 %r13, %r55; + mov.u32 %r15, 0x0; + @%p8 ld.global.L1::evict_last.b32 { %r15 }, [ %rd28 + 0 ]; + @!%p8 mov.u32 %r15, %r55; + mov.u32 %r17, 0x0; + @%p10 ld.global.L1::evict_last.b32 { %r17 }, [ %rd29 + 0 ]; + @!%p10 mov.u32 %r17, %r55; + mov.u32 %r19, 0x0; + @%p12 ld.global.L1::evict_last.b32 { %r19 }, [ %rd30 + 0 ]; + @!%p12 mov.u32 %r19, %r55; + mov.b32 %f20, %r19; + mov.b32 %f21, %r17; + mov.b32 %f22, %r15; + mov.b32 %f23, %r13; + .loc 1 42 23 + mul.f32 %f24, %f2, %f23; + mul.f32 %f25, %f2, %f22; + mul.f32 %f26, %f2, %f21; + mul.f32 %f27, %f2, %f20; + .loc 1 45 40 + selp.f32 %f28, %f27, 0f80000000, %p12; + selp.f32 %f29, %f26, 0f80000000, %p10; + selp.f32 %f30, %f25, 0f80000000, %p8; + selp.f32 %f31, %f24, 0f80000000, %p6; + add.f32 %f88, %f88, %f31; + add.f32 %f89, %f89, %f30; + add.f32 %f90, %f90, %f29; + add.f32 %f91, %f91, %f28; + .loc 1 32 36 + add.s64 %rd72, %rd72, 1024; + cvt.u32.u64 %r21, %rd72; + add.s32 %r22, %r21, -1024; + setp.lt.u32 %p13, %r22, 49233; + @%p13 bra $L__BB0_1; + .loc 1 24 33 + and.b32 %r29, %r1, 31; + and.b32 %r30, %r2, 7; +$L__tmp1: + .loc 2 233 15 + add.f32 %f32, %f88, %f89; + add.f32 %f33, %f90, %f32; + add.f32 %f34, %f91, %f33; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r31, %f34; + shfl.sync.bfly.b32 %r32, %r31, 16, 31, -1; + mov.b32 %f35, %r32; +$L__tmp3: + .loc 2 233 15 + add.f32 %f36, %f34, %f35; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r33, %f36; + shfl.sync.bfly.b32 %r34, %r33, 8, 31, -1; + mov.b32 %f37, %r34; +$L__tmp5: + .loc 2 233 15 + add.f32 %f38, %f36, %f37; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r35, %f38; + shfl.sync.bfly.b32 %r36, 
%r35, 4, 31, -1; + mov.b32 %f39, %r36; +$L__tmp7: + .loc 2 233 15 + add.f32 %f40, %f38, %f39; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r37, %f40; + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; + mov.b32 %f41, %r38; +$L__tmp9: + .loc 2 233 15 + add.f32 %f42, %f40, %f41; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r39, %f42; + shfl.sync.bfly.b32 %r40, %r39, 1, 31, -1; + mov.b32 %f43, %r40; +$L__tmp11: + .loc 2 233 15 + add.f32 %f44, %f42, %f43; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p14, %r29, 0; + shl.b32 %r41, %r30, 2; + mov.u32 %r42, global_smem; + add.s32 %r23, %r42, %r41; + mov.b32 %r24, %f44; + @%p14 st.shared.b32 [ %r23 + 0 ], %r24; + bar.sync 0; + setp.lt.s32 %p15, %r1, 8; + shl.b32 %r43, %r1, 2; + add.s32 %r26, %r42, %r43; + @%p15 ld.shared.b32 %r25, [ %r26 + 0 ]; + mov.b32 %f45, %r25; + shfl.sync.bfly.b32 %r44, %r25, 4, 31, -1; + mov.b32 %f46, %r44; +$L__tmp13: + .loc 2 233 15 + add.f32 %f47, %f45, %f46; +$L__tmp14: + .loc 2 243 36 + mov.b32 %r45, %f47; + shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1; + mov.b32 %f48, %r46; +$L__tmp15: + .loc 2 233 15 + add.f32 %f49, %f47, %f48; +$L__tmp16: + .loc 2 243 36 + mov.b32 %r47, %f49; + shfl.sync.bfly.b32 %r48, %r47, 1, 31, -1; + mov.b32 %f50, %r48; +$L__tmp17: + .loc 2 233 15 + add.f32 %f51, %f49, %f50; +$L__tmp18: + .loc 2 243 36 + and.b32 %r49, %r1, 7; + setp.eq.s32 %p17, %r49, 0; + and.pred %p16, %p15, %p17; + mov.b32 %r28, %f51; + @%p16 st.shared.b32 [ %r26 + 0 ], %r28; + bar.sync 0; + ld.shared.f32 %f14, [global_smem]; + mov.u64 %rd73, 0; + mov.u16 %rs2, 0; +$L__tmp19: +$L__BB0_3: + .loc 1 52 27 + or.b64 %rd56, %rd1, %rd73; + or.b64 %rd57, %rd2, %rd73; + or.b64 %rd58, %rd3, %rd73; + or.b64 %rd59, %rd4, %rd73; + .loc 1 53 25 + setp.lt.u64 %p18, %rd56, 50257; + setp.lt.u64 %p20, %rd57, 50257; + setp.lt.u64 %p22, %rd58, 50257; + setp.lt.u64 %p24, %rd59, 50257; + .loc 1 55 41 + add.s64 %rd60, %rd56, %rd5; + add.s64 %rd61, %rd57, %rd5; + add.s64 %rd62, %rd58, %rd5; + add.s64 %rd63, %rd59, %rd5; + .loc 1 55 35 + shl.b64 %rd64, %rd60, 1; + add.s64 %rd40, %rd15, %rd64; + shl.b64 %rd65, %rd61, 1; + add.s64 %rd41, %rd15, %rd65; + shl.b64 %rd66, %rd62, 1; + add.s64 %rd42, %rd15, %rd66; + shl.b64 %rd67, %rd63, 1; + add.s64 %rd43, %rd15, %rd67; + .loc 1 55 53 + mov.u16 %rs1, 0x0; + @%p18 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd40 + 0 ]; + @!%p18 mov.u16 %rs1, %rs2; + mov.u16 %rs3, 0x0; + @%p20 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd41 + 0 ]; + @!%p20 mov.u16 %rs3, %rs2; + mov.u16 %rs5, 0x0; + @%p22 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd42 + 0 ]; + @!%p22 mov.u16 %rs5, %rs2; + mov.u16 %rs7, 0x0; + @%p24 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd43 + 0 ]; + @!%p24 mov.u16 %rs7, %rs2; + .loc 1 55 105 + cvt.f32.bf16 %r50, %rs1; + mov.b32 %f60, %r50; + cvt.f32.bf16 %r51, %rs3; + mov.b32 %f61, %r51; + cvt.f32.bf16 %r52, %rs5; + mov.b32 %f62, %r52; + cvt.f32.bf16 %r53, %rs7; + mov.b32 %f63, %r53; + .loc 1 56 35 + shl.b64 %rd68, %rd56, 2; + add.s64 %rd44, %rd6, %rd68; + shl.b64 %rd69, %rd57, 2; + add.s64 %rd45, %rd6, %rd69; + shl.b64 %rd70, %rd58, 2; + add.s64 %rd46, %rd6, %rd70; + shl.b64 %rd71, %rd59, 2; + add.s64 %rd47, %rd6, %rd71; + .loc 1 56 53 + mov.u32 %r54, 0x0; + @%p18 ld.global.L1::evict_first.b32 { %r54 }, [ %rd44 + 0 ]; + @!%p18 mov.u32 %r54, %r55; + mov.b32 %f64, %r54; + mov.u32 %r56, 0x0; + @%p20 ld.global.L1::evict_first.b32 { %r56 }, [ %rd45 + 0 ]; + @!%p20 mov.u32 %r56, %r55; + mov.b32 %f65, %r56; + mov.u32 %r58, 0x0; + @%p22 ld.global.L1::evict_first.b32 { %r58 }, [ %rd46 + 0 ]; + @!%p22 mov.u32 %r58, %r55; + mov.b32 %f66, 
%r58; + mov.u32 %r60, 0x0; + @%p24 ld.global.L1::evict_first.b32 { %r60 }, [ %rd47 + 0 ]; + @!%p24 mov.u32 %r60, %r55; + mov.b32 %f67, %r60; + .loc 1 57 35 + add.s64 %rd48, %rd16, %rd64; + add.s64 %rd49, %rd16, %rd65; + add.s64 %rd50, %rd16, %rd66; + add.s64 %rd51, %rd16, %rd67; + .loc 1 57 53 + mov.u16 %rs13, 0x0; + @%p18 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd48 + 0 ]; + @!%p18 mov.u16 %rs13, %rs2; + mov.u16 %rs15, 0x0; + @%p20 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd49 + 0 ]; + @!%p20 mov.u16 %rs15, %rs2; + mov.u16 %rs17, 0x0; + @%p22 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd50 + 0 ]; + @!%p22 mov.u16 %rs17, %rs2; + mov.u16 %rs19, 0x0; + @%p24 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd51 + 0 ]; + @!%p24 mov.u16 %rs19, %rs2; + .loc 1 57 105 + cvt.f32.bf16 %r62, %rs13; + mov.b32 %f68, %r62; + cvt.f32.bf16 %r63, %rs15; + mov.b32 %f69, %r63; + cvt.f32.bf16 %r64, %rs17; + mov.b32 %f70, %r64; + cvt.f32.bf16 %r65, %rs19; + mov.b32 %f71, %r65; + .loc 1 65 23 + mul.f32 %f53, %f68, 0f3FB8AA3B; + ex2.approx.f32 %f52, %f53; + mul.f32 %f55, %f69, 0f3FB8AA3B; + ex2.approx.f32 %f54, %f55; + mul.f32 %f57, %f70, 0f3FB8AA3B; + ex2.approx.f32 %f56, %f57; + mul.f32 %f59, %f71, 0f3FB8AA3B; + ex2.approx.f32 %f58, %f59; + .loc 1 66 24 + mul.f32 %f72, %f14, %f52; + mul.f32 %f73, %f14, %f54; + mul.f32 %f74, %f14, %f56; + mul.f32 %f75, %f14, %f58; + .loc 1 67 24 + neg.f32 %f76, %f72; + fma.rn.f32 %f77, %f2, %f64, %f76; + neg.f32 %f78, %f73; + fma.rn.f32 %f79, %f2, %f65, %f78; + neg.f32 %f80, %f74; + fma.rn.f32 %f81, %f2, %f66, %f80; + neg.f32 %f82, %f75; + fma.rn.f32 %f83, %f2, %f67, %f82; + .loc 1 69 24 + add.f32 %f84, %f60, %f77; + add.f32 %f85, %f61, %f79; + add.f32 %f86, %f62, %f81; + add.f32 %f87, %f63, %f83; + .loc 1 70 29 + add.s64 %rd52, %rd17, %rd64; + add.s64 %rd53, %rd17, %rd65; + add.s64 %rd54, %rd17, %rd66; + add.s64 %rd55, %rd17, %rd67; + .loc 1 70 54 + mov.b32 %r66, %f84; + cvt.rn.bf16.f32 %rs25, %r66; + mov.b32 %r67, %f85; + cvt.rn.bf16.f32 %rs26, %r67; + mov.b32 %r68, %f86; + cvt.rn.bf16.f32 %rs27, %r68; + mov.b32 %r69, %f87; + cvt.rn.bf16.f32 %rs28, %r69; + @%p18 st.global.b16 [ %rd52 + 0 ], { %rs25 }; + @%p20 st.global.b16 [ %rd53 + 0 ], { %rs26 }; + @%p22 st.global.b16 [ %rd54 + 0 ], { %rs27 }; + @%p24 st.global.b16 [ %rd55 + 0 ], { %rs28 }; + .loc 1 51 36 + add.s64 %rd73, %rd73, 1024; + cvt.u32.u64 %r70, %rd73; + add.s32 %r71, %r70, -1024; + setp.lt.u32 %p46, %r71, 49233; + @%p46 bra $L__BB0_3; + .loc 1 51 4 + ret; +$L__tmp20: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 278 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 
99 +.b8 107 +.b8 122 +.b8 103 +.b8 108 +.b8 55 +.b8 116 +.b8 104 +.b8 98 +.b8 52 +.b8 120 +.b8 100 +.b8 102 +.b8 107 +.b8 102 +.b8 110 +.b8 100 +.b8 50 +.b8 116 +.b8 105 +.b8 100 +.b8 107 +.b8 115 +.b8 54 +.b8 109 +.b8 116 +.b8 53 +.b8 102 +.b8 51 +.b8 104 +.b8 97 +.b8 117 +.b8 119 +.b8 102 +.b8 121 +.b8 106 +.b8 102 +.b8 108 +.b8 98 +.b8 116 +.b8 122 +.b8 121 +.b8 101 +.b8 112 +.b8 111 +.b8 53 +.b8 111 +.b8 120 +.b8 107 +.b8 118 +.b8 104 +.b8 107 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 107 +.b8 122 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 101 +.b8 56 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 101 +.b8 56 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp18 +.b8 2 +.b8 46 +.b8 27 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp18 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp19 +.b8 2 +.b8 46 +.b8 27 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 282 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 101 +.b8 56 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 282 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.ttgir b/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5618dc9f84257d09aeab684a881fd40e2a3d7a37 --- /dev/null +++ b/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.ttgir @@ -0,0 +1,81 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x1xf32, #blocked> + %cst_0 = arith.constant dense<-1> : tensor<1x1xi64, #blocked> + %cst_1 = arith.constant 
dense<0.000000e+00> : tensor<1x1024xf32, #blocked> + %cst_2 = arith.constant dense<50257> : tensor<1x1024xi64, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c1024_i32 = arith.constant 1024 : i32 + %c50257_i32 = arith.constant 50257 : i32 + %c50257_i64 = arith.constant 50257 : i64 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x1024xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = arith.extsi %0 : i32 to i64 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<1024xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x1024xi32, #blocked> + %4 = arith.extsi %3 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> + %5 = tt.addptr %arg1, %1 : !tt.ptr, i64 + %6 = tt.splat %5 : (!tt.ptr) -> tensor<1x1x!tt.ptr, #blocked> + %7 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked> + %8 = tt.addptr %arg2, %c0_i32 : !tt.ptr, i32 + %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32 + %10 = tt.addptr %arg3, %c0_i32 : !tt.ptr, i32 + %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32 + %12 = arith.muli %1, %c50257_i64 : i64 + %13 = tt.splat %12 : (i64) -> tensor<1x1024xi64, #blocked> + %14 = tt.splat %arg0 : (!tt.ptr) -> tensor<1x1024x!tt.ptr, #blocked> + %15 = arith.cmpi ne, %7, %cst_0 : tensor<1x1xi64, #blocked> + %16 = arith.divf %9, %11 : f32 + %17 = tt.splat %16 : (f32) -> tensor<1x1xf32, #blocked> + %18 = arith.select %15, %17, %cst : tensor<1x1xi1, #blocked>, tensor<1x1xf32, #blocked> + %19 = tt.broadcast %18 : (tensor<1x1xf32, #blocked>) -> tensor<1x1024xf32, #blocked> + %20 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c1024_i32 iter_args(%arg10 = %cst_1) -> (tensor<1x1024xf32, #blocked>) : i32 { + %27 = arith.extsi %arg9 : i32 to i64 + %28 = tt.splat %27 : (i64) -> tensor<1x1024xi64, #blocked> + %29 = arith.addi %28, %4 : tensor<1x1024xi64, #blocked> + %30 = arith.cmpi slt, %29, %cst_2 : tensor<1x1024xi64, #blocked> + %31 = arith.addi %29, %13 : tensor<1x1024xi64, #blocked> + %32 = tt.addptr %14, %31 : tensor<1x1024x!tt.ptr, #blocked>, tensor<1x1024xi64, #blocked> + %33 = tt.load %32, %30, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1024xf32, #blocked> + %34 = arith.mulf %33, %19 : tensor<1x1024xf32, #blocked> + %35 = arith.addf %arg10, %34 : tensor<1x1024xf32, #blocked> + %36 = arith.select %30, %35, %arg10 : tensor<1x1024xi1, #blocked>, tensor<1x1024xf32, #blocked> + scf.yield %36 : tensor<1x1024xf32, #blocked> + } + %21 = "tt.reduce"(%20) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32, %arg10: f32): + %27 = arith.addf %arg9, %arg10 : f32 + tt.reduce.return %27 : f32 + }) : (tensor<1x1024xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked> + %23 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x1024x!tt.ptr, #blocked> + %24 = tt.splat %arg5 : (!tt.ptr) -> tensor<1x1024x!tt.ptr, #blocked> + %25 = tt.broadcast %22 : (tensor<1x1xf32, #blocked>) -> tensor<1x1024xf32, #blocked> + %26 = tt.splat %arg6 : (!tt.ptr) -> tensor<1x1024x!tt.ptr, #blocked> + scf.for %arg9 = %c0_i32 to %c50257_i32 step %c1024_i32 : i32 { + %27 = arith.extsi %arg9 : i32 to i64 + %28 = tt.splat %27 : (i64) -> tensor<1x1024xi64, #blocked> + %29 = arith.addi %28, %4 : tensor<1x1024xi64, 
#blocked> + %30 = arith.cmpi slt, %29, %cst_2 : tensor<1x1024xi64, #blocked> + %31 = arith.addi %29, %13 : tensor<1x1024xi64, #blocked> + %32 = tt.addptr %23, %31 : tensor<1x1024x!tt.ptr, #blocked>, tensor<1x1024xi64, #blocked> + %33 = tt.load %32, %30, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x1024xbf16, #blocked> + %34 = arith.extf %33 : tensor<1x1024xbf16, #blocked> to tensor<1x1024xf32, #blocked> + %35 = tt.addptr %14, %31 : tensor<1x1024x!tt.ptr, #blocked>, tensor<1x1024xi64, #blocked> + %36 = tt.load %35, %30, %cst_1 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x1024xf32, #blocked> + %37 = tt.addptr %24, %31 : tensor<1x1024x!tt.ptr, #blocked>, tensor<1x1024xi64, #blocked> + %38 = tt.load %37, %30, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x1024xbf16, #blocked> + %39 = arith.extf %38 : tensor<1x1024xbf16, #blocked> to tensor<1x1024xf32, #blocked> + %40 = arith.mulf %36, %19 : tensor<1x1024xf32, #blocked> + %41 = math.exp %39 : tensor<1x1024xf32, #blocked> + %42 = arith.mulf %41, %25 : tensor<1x1024xf32, #blocked> + %43 = arith.subf %40, %42 : tensor<1x1024xf32, #blocked> + %44 = arith.addf %34, %43 : tensor<1x1024xf32, #blocked> + %45 = tt.addptr %26, %31 : tensor<1x1024x!tt.ptr, #blocked>, tensor<1x1024xi64, #blocked> + %46 = arith.truncf %44 : tensor<1x1024xf32, #blocked> to tensor<1x1024xbf16, #blocked> + tt.store %45, %46, %30 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1024xbf16, #blocked> + } + tt.return + } +} diff --git a/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.ttir b/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..aac19a60e0fe180f1dd2d036d83186c14cd7637f --- /dev/null +++ b/.triton/dump/d43e80648db5fd3dd8ecf4f6006c83ba/triton_.ttir @@ -0,0 +1,88 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x1024xbf16> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x1xf32> + %c50257_i64 = arith.constant 50257 : i64 + %c50257_i32 = arith.constant 50257 : i32 + %c1024_i32 = arith.constant 1024 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<50257> : tensor<1x1024xi64> + %cst_2 = arith.constant dense<-1> : tensor<1x1xi64> + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x1024xf32> + %0 = tt.get_program_id x : i32 + %1 = arith.extsi %0 : i32 to i64 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> + %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<1024xi32>) -> tensor<1x1024xi32> + %4 = arith.extsi %3 : tensor<1x1024xi32> to tensor<1x1024xi64> + %5 = tt.addptr %arg1, %1 : !tt.ptr, i64 + %6 = tt.splat %5 : (!tt.ptr) -> tensor<1x1x!tt.ptr> + %7 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64> + %8 = tt.addptr %arg2, %c0_i32 : !tt.ptr, i32 + %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32 + %10 = tt.addptr %arg3, %c0_i32 : !tt.ptr, i32 + %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = 
false} : f32 + %12 = arith.muli %1, %c50257_i64 : i64 + %13 = tt.splat %12 : (i64) -> tensor<1x1024xi64> + %14 = tt.splat %arg0 : (!tt.ptr) -> tensor<1x1024x!tt.ptr> + %15 = arith.cmpi ne, %7, %cst_2 : tensor<1x1xi64> + %16 = arith.divf %9, %11 : f32 + %17 = tt.splat %16 : (f32) -> tensor<1x1xf32> + %18 = arith.select %15, %17, %cst_0 : tensor<1x1xi1>, tensor<1x1xf32> + %19 = tt.broadcast %18 : (tensor<1x1xf32>) -> tensor<1x1024xf32> + %20 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c1024_i32 iter_args(%arg10 = %cst_3) -> (tensor<1x1024xf32>) : i32 { + %35 = arith.extsi %arg9 : i32 to i64 + %36 = tt.splat %35 : (i64) -> tensor<1x1024xi64> + %37 = arith.addi %36, %4 : tensor<1x1024xi64> + %38 = arith.cmpi slt, %37, %cst_1 : tensor<1x1024xi64> + %39 = arith.addi %37, %13 : tensor<1x1024xi64> + %40 = tt.addptr %14, %39 : tensor<1x1024x!tt.ptr>, tensor<1x1024xi64> + %41 = tt.load %40, %38, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1024xf32> + %42 = arith.mulf %41, %19 : tensor<1x1024xf32> + %43 = arith.addf %arg10, %42 : tensor<1x1024xf32> + %44 = arith.select %38, %43, %arg10 : tensor<1x1024xi1>, tensor<1x1024xf32> + scf.yield %44 : tensor<1x1024xf32> + } + %21 = "tt.reduce"(%20) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32, %arg10: f32): + %35 = arith.addf %arg9, %arg10 : f32 + tt.reduce.return %35 : f32 + }) : (tensor<1x1024xf32>) -> tensor<1xf32> + %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32> + %23 = arith.muli %1, %c50257_i64 : i64 + %24 = tt.splat %23 : (i64) -> tensor<1x1024xi64> + %25 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x1024x!tt.ptr> + %26 = tt.splat %arg0 : (!tt.ptr) -> tensor<1x1024x!tt.ptr> + %27 = tt.splat %arg5 : (!tt.ptr) -> tensor<1x1024x!tt.ptr> + %28 = arith.cmpi ne, %7, %cst_2 : tensor<1x1xi64> + %29 = arith.divf %9, %11 : f32 + %30 = tt.splat %29 : (f32) -> tensor<1x1xf32> + %31 = arith.select %28, %30, %cst_0 : tensor<1x1xi1>, tensor<1x1xf32> + %32 = tt.broadcast %31 : (tensor<1x1xf32>) -> tensor<1x1024xf32> + %33 = tt.broadcast %22 : (tensor<1x1xf32>) -> tensor<1x1024xf32> + %34 = tt.splat %arg6 : (!tt.ptr) -> tensor<1x1024x!tt.ptr> + scf.for %arg9 = %c0_i32 to %c50257_i32 step %c1024_i32 : i32 { + %35 = arith.extsi %arg9 : i32 to i64 + %36 = tt.splat %35 : (i64) -> tensor<1x1024xi64> + %37 = arith.addi %36, %4 : tensor<1x1024xi64> + %38 = arith.cmpi slt, %37, %cst_1 : tensor<1x1024xi64> + %39 = arith.addi %37, %24 : tensor<1x1024xi64> + %40 = tt.addptr %25, %39 : tensor<1x1024x!tt.ptr>, tensor<1x1024xi64> + %41 = tt.load %40, %38, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x1024xbf16> + %42 = arith.extf %41 : tensor<1x1024xbf16> to tensor<1x1024xf32> + %43 = tt.addptr %26, %39 : tensor<1x1024x!tt.ptr>, tensor<1x1024xi64> + %44 = tt.load %43, %38, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x1024xf32> + %45 = tt.addptr %27, %39 : tensor<1x1024x!tt.ptr>, tensor<1x1024xi64> + %46 = tt.load %45, %38, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x1024xbf16> + %47 = arith.extf %46 : tensor<1x1024xbf16> to tensor<1x1024xf32> + %48 = arith.mulf %44, %32 : tensor<1x1024xf32> + %49 = math.exp %47 : tensor<1x1024xf32> + %50 = arith.mulf %49, %33 : tensor<1x1024xf32> + %51 = arith.subf %48, %50 : tensor<1x1024xf32> + %52 = arith.addf %42, %51 : tensor<1x1024xf32> + %53 = tt.addptr %34, %39 : tensor<1x1024x!tt.ptr>, tensor<1x1024xi64> + %54 = arith.truncf %52 : tensor<1x1024xf32> to tensor<1x1024xbf16> + tt.store %53, %54, %38 {cache = 1 : 
i32, evict = 1 : i32} : tensor<1x1024xbf16> + } + tt.return + } +} diff --git a/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.cubin b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a80c9962b644d504d59d67501eb4ca259df024bc Binary files /dev/null and b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.cubin differ diff --git a/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.llir b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..cf3b0ffcce6361725dac87b4a9792b5459a8b245 --- /dev/null +++ b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.llir @@ -0,0 +1,327 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !5 { + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %12 = and i32 %11, 31, !dbg !8 + %13 = lshr i32 %11, 5, !dbg !8 + %14 = and i32 %13, 1, !dbg !8 + %urem = shl i32 %11, 2, !dbg !8 + %15 = and i32 %urem, 252, !dbg !8 + %16 = or i32 %15, 1, !dbg !8 + %17 = or i32 %15, 2, !dbg !8 + %18 = or i32 %15, 3, !dbg !8 + %19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9 + %20 = shl i32 %19, 8, !dbg !10 + %21 = or i32 %20, %15, !dbg !11 + %22 = sext i32 %21 to i64, !dbg !12 + %23 = getelementptr i16, ptr addrspace(1) %1, i64 %22, !dbg !12 + %24 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %23, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13 + %25 = extractvalue { i32, i32 } %24, 0, !dbg !13 + %26 = extractvalue { i32, i32 } %24, 1, !dbg !13 + %27 = trunc i32 %25 to i16, !dbg !13 + %extelt.offset = lshr i32 %25, 16, !dbg !13 + %28 = trunc i32 %extelt.offset to i16, !dbg !13 + %29 = trunc i32 %26 to i16, !dbg !13 + %extelt.offset1 = lshr i32 %26, 16, !dbg !13 + %30 = trunc i32 %extelt.offset1 to i16, !dbg !13 + %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %27) #3, !dbg !14 + %32 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %28) #3, !dbg !14 + %33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #3, !dbg !14 + %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #3, !dbg !14 + %35 = zext nneg i32 %15 to i64, !dbg !15 + %36 = getelementptr float, ptr addrspace(1) %2, i64 %35, !dbg !15 + %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !16 + %39 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !16 + %40 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !16 + %41 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !16 + %42 = bitcast i32 %38 to float, !dbg !16 + 
%43 = bitcast i32 %39 to float, !dbg !16 + %44 = bitcast i32 %40 to float, !dbg !16 + %45 = bitcast i32 %41 to float, !dbg !16 + %46 = getelementptr float, ptr addrspace(1) %3, i64 %22, !dbg !17 + %47 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %46, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18 + %48 = extractvalue { i32, i32, i32, i32 } %47, 0, !dbg !18 + %49 = extractvalue { i32, i32, i32, i32 } %47, 1, !dbg !18 + %50 = extractvalue { i32, i32, i32, i32 } %47, 2, !dbg !18 + %51 = extractvalue { i32, i32, i32, i32 } %47, 3, !dbg !18 + %52 = bitcast i32 %48 to float, !dbg !18 + %53 = bitcast i32 %49 to float, !dbg !18 + %54 = bitcast i32 %50 to float, !dbg !18 + %55 = bitcast i32 %51 to float, !dbg !18 + %56 = sext i32 %19 to i64, !dbg !19 + %57 = getelementptr float, ptr addrspace(1) %4, i64 %56, !dbg !19 + %58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %57, i1 true) #3, !dbg !20 + %59 = bitcast i32 %58 to float, !dbg !20 + %60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %57, i1 true) #3, !dbg !20 + %61 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %57, i1 true) #3, !dbg !20 + %62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %57, i1 true) #3, !dbg !20 + %63 = getelementptr float, ptr addrspace(1) %5, i64 %56, !dbg !21 + %64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %63, i1 true) #3, !dbg !22 + %65 = bitcast i32 %64 to float, !dbg !22 + %66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %63, i1 true) #3, !dbg !22 + %67 = bitcast i32 %66 to float, !dbg !22 + %68 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %63, i1 true) #3, !dbg !22 + %69 = bitcast i32 %68 to float, !dbg !22 + %70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %63, i1 true) #3, !dbg !22 + %71 = bitcast i32 %70 to float, !dbg !22 + %72 = getelementptr float, ptr addrspace(1) %0, i64 %22, !dbg !23 + %73 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %72, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !24 + %74 = extractvalue { i32, i32, i32, i32 } %73, 0, !dbg !24 + %75 = extractvalue { i32, i32, i32, i32 } %73, 1, !dbg !24 + %76 = extractvalue { i32, i32, i32, i32 } %73, 2, !dbg !24 + %77 = extractvalue { i32, i32, i32, i32 } %73, 3, !dbg 
!24 + %78 = bitcast i32 %74 to float, !dbg !24 + %79 = bitcast i32 %75 to float, !dbg !24 + %80 = bitcast i32 %76 to float, !dbg !24 + %81 = bitcast i32 %77 to float, !dbg !24 + %82 = getelementptr i64, ptr addrspace(1) %6, i64 %56, !dbg !25 + %83 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %82, i1 true) #3, !dbg !26 + %84 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %82, i1 true) #3, !dbg !26 + %85 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %82, i1 true) #3, !dbg !26 + %86 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %82, i1 true) #3, !dbg !26 + %87 = fmul float %31, %42, !dbg !27 + %88 = fmul float %32, %43, !dbg !27 + %89 = fmul float %33, %44, !dbg !27 + %90 = fmul float %34, %45, !dbg !27 + %91 = fadd float %87, %88, !dbg !28 + %92 = fadd float %89, %91, !dbg !28 + %93 = fadd float %90, %92, !dbg !28 + %94 = bitcast float %93 to i32, !dbg !34 + %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 16, i32 31), !dbg !34 + %96 = bitcast i32 %95 to float, !dbg !34 + %97 = fadd float %93, %96, !dbg !28 + %98 = bitcast float %97 to i32, !dbg !34 + %99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 8, i32 31), !dbg !34 + %100 = bitcast i32 %99 to float, !dbg !34 + %101 = fadd float %97, %100, !dbg !28 + %102 = bitcast float %101 to i32, !dbg !34 + %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 4, i32 31), !dbg !34 + %104 = bitcast i32 %103 to float, !dbg !34 + %105 = fadd float %101, %104, !dbg !28 + %106 = bitcast float %105 to i32, !dbg !34 + %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 2, i32 31), !dbg !34 + %108 = bitcast i32 %107 to float, !dbg !34 + %109 = fadd float %105, %108, !dbg !28 + %110 = bitcast float %109 to i32, !dbg !34 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 1, i32 31), !dbg !34 + %112 = bitcast i32 %111 to float, !dbg !34 + %113 = fadd float %109, %112, !dbg !28 + %114 = icmp eq i32 %12, 0, !dbg !34 + %115 = zext nneg i32 %14 to i64, !dbg !34 + %116 = getelementptr float, ptr addrspace(3) @global_smem, i64 %115, !dbg !34 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %116, float %113, i1 %114) #3, !dbg !34 + tail call void @llvm.nvvm.barrier0(), !dbg !34 + %117 = icmp slt i32 %11, 2, !dbg !34 + %118 = sext i32 %11 to i64, !dbg !34 + %119 = getelementptr float, ptr addrspace(3) @global_smem, i64 %118, !dbg !34 + %120 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %119, i1 %117) #3, !dbg !34 + %121 = bitcast float %120 to i32, !dbg !34 + %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 1, i32 31), !dbg !34 + %123 = bitcast i32 %122 to float, !dbg !34 + %124 = fadd float %120, %123, !dbg !28 + %125 = and i32 %11, 1, !dbg !34 + %126 = icmp eq i32 %125, 0, !dbg !34 + %127 = and i1 %117, %126, !dbg !34 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %119, float %124, i1 %127) #3, !dbg !34 + tail call void @llvm.nvvm.barrier0(), !dbg !34 + %128 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !34 + %129 = fadd float %128, 
0.000000e+00, !dbg !36 + %130 = fsub float %52, %59, !dbg !40 + %131 = fsub float %53, %59, !dbg !40 + %132 = fsub float %54, %59, !dbg !40 + %133 = fsub float %55, %59, !dbg !40 + %134 = fmul float %130, %65, !dbg !41 + %135 = fmul float %131, %65, !dbg !41 + %136 = fmul float %132, %65, !dbg !41 + %137 = fmul float %133, %65, !dbg !41 + %138 = fmul float %87, %134, !dbg !42 + %139 = fmul float %88, %135, !dbg !42 + %140 = fmul float %89, %136, !dbg !42 + %141 = fmul float %90, %137, !dbg !42 + tail call void @llvm.nvvm.barrier0(), !dbg !43 + %142 = fadd float %138, %139, !dbg !45 + %143 = fadd float %140, %142, !dbg !45 + %144 = fadd float %141, %143, !dbg !45 + %145 = bitcast float %144 to i32, !dbg !43 + %146 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %145, i32 16, i32 31), !dbg !43 + %147 = bitcast i32 %146 to float, !dbg !43 + %148 = fadd float %144, %147, !dbg !45 + %149 = bitcast float %148 to i32, !dbg !43 + %150 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %149, i32 8, i32 31), !dbg !43 + %151 = bitcast i32 %150 to float, !dbg !43 + %152 = fadd float %148, %151, !dbg !45 + %153 = bitcast float %152 to i32, !dbg !43 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 4, i32 31), !dbg !43 + %155 = bitcast i32 %154 to float, !dbg !43 + %156 = fadd float %152, %155, !dbg !45 + %157 = bitcast float %156 to i32, !dbg !43 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 2, i32 31), !dbg !43 + %159 = bitcast i32 %158 to float, !dbg !43 + %160 = fadd float %156, %159, !dbg !45 + %161 = bitcast float %160 to i32, !dbg !43 + %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 1, i32 31), !dbg !43 + %163 = bitcast i32 %162 to float, !dbg !43 + %164 = fadd float %160, %163, !dbg !45 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %116, float %164, i1 %114) #3, !dbg !43 + tail call void @llvm.nvvm.barrier0(), !dbg !43 + %165 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %119, i1 %117) #3, !dbg !43 + %166 = bitcast float %165 to i32, !dbg !43 + %167 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %166, i32 1, i32 31), !dbg !43 + %168 = bitcast i32 %167 to float, !dbg !43 + %169 = fadd float %165, %168, !dbg !45 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %119, float %169, i1 %127) #3, !dbg !43 + tail call void @llvm.nvvm.barrier0(), !dbg !43 + %170 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !43 + %171 = fadd float %170, 0.000000e+00, !dbg !48 + %172 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %65, float 2.560000e+02) #3, !dbg !50 + %173 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %67, float 2.560000e+02) #3, !dbg !50 + %174 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %69, float 2.560000e+02) #3, !dbg !50 + %175 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %71, float 2.560000e+02) #3, !dbg !50 + %176 = fmul float %87, 2.560000e+02, !dbg !51 + %177 = fmul float %88, 2.560000e+02, !dbg !51 + %178 = fmul float %89, 2.560000e+02, !dbg !51 + %179 = fmul float %90, 2.560000e+02, !dbg !51 + %180 = fsub float %176, %129, !dbg !52 + %181 = fsub float %177, %129, !dbg !52 + %182 = fsub float %178, %129, !dbg !52 + %183 = fsub float %179, %129, !dbg !52 + %184 = fmul float %134, %171, !dbg !53 + %185 = fmul float %135, %171, !dbg !53 + %186 = fmul float %136, 
%171, !dbg !53 + %187 = fmul float %137, %171, !dbg !53 + %188 = fsub float %180, %184, !dbg !54 + %189 = fsub float %181, %185, !dbg !54 + %190 = fsub float %182, %186, !dbg !54 + %191 = fsub float %183, %187, !dbg !54 + %192 = fmul float %172, %188, !dbg !55 + %193 = fmul float %172, %189, !dbg !55 + %194 = fmul float %172, %190, !dbg !55 + %195 = fmul float %172, %191, !dbg !55 + %196 = fadd float %192, %78, !dbg !56 + %197 = fadd float %193, %79, !dbg !56 + %198 = fadd float %194, %80, !dbg !56 + %199 = fadd float %195, %81, !dbg !56 + %200 = icmp slt i64 %83, 0, !dbg !57 + %201 = icmp eq i64 %83, -1, !dbg !58 + %202 = select i1 %201, float 0.000000e+00, float %196, !dbg !59 + %203 = select i1 %201, float 0.000000e+00, float %197, !dbg !59 + %204 = select i1 %201, float 0.000000e+00, float %198, !dbg !59 + %205 = select i1 %201, float 0.000000e+00, float %199, !dbg !59 + %206 = bitcast float %196 to i32, !dbg !60 + %207 = bitcast float %197 to i32, !dbg !60 + %208 = bitcast float %198 to i32, !dbg !60 + %209 = bitcast float %199 to i32, !dbg !60 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %206, i32 %207, i32 %208, i32 %209, ptr addrspace(1) %72, i1 true) #3, !dbg !60 + %210 = shl i64 %83, 8, !dbg !61 + %211 = add i64 %210, 12865792, !dbg !61 + %212 = select i1 %200, i64 %211, i64 %210, !dbg !61 + %213 = zext nneg i32 %16 to i64 + %214 = zext nneg i32 %17 to i64 + %215 = zext nneg i32 %18 to i64 + %216 = or i64 %212, %35, !dbg !62 + %217 = or i64 %212, %213, !dbg !62 + %218 = or i64 %212, %214, !dbg !62 + %219 = or i64 %212, %215, !dbg !62 + %220 = getelementptr float, ptr addrspace(1) %7, i64 %216, !dbg !63 + %221 = getelementptr float, ptr addrspace(1) %7, i64 %217, !dbg !63 + %222 = getelementptr float, ptr addrspace(1) %7, i64 %218, !dbg !63 + %223 = getelementptr float, ptr addrspace(1) %7, i64 %219, !dbg !63 + %224 = insertelement <1 x float> undef, float %202, i64 0, !dbg !64 + %225 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %220, <1 x float> %224, i1 true) #3, !dbg !64 + %226 = insertelement <1 x float> undef, float %203, i64 0, !dbg !64 + %227 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %221, <1 x float> %226, i1 true) #3, !dbg !64 + %228 = insertelement <1 x float> undef, float %204, i64 0, !dbg !64 + %229 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %222, <1 x float> %228, i1 true) #3, !dbg !64 + %230 = insertelement <1 x float> undef, float %205, i64 0, !dbg !64 + %231 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %223, <1 x float> %230, i1 true) #3, !dbg !64 + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback 
nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "cpofrksimh6penb3xp2bds523dhndqtyshsszqampnuelfbypvda.py", directory: "/tmp/torchinductor_root/po") +!3 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 26, column: 26, scope: !5) +!9 = !DILocation(line: 23, column: 28, scope: !5) +!10 = !DILocation(line: 30, column: 40, scope: !5) +!11 = !DILocation(line: 30, column: 36, scope: !5) +!12 = !DILocation(line: 30, column: 30, scope: !5) +!13 = !DILocation(line: 30, column: 46, scope: !5) +!14 = !DILocation(line: 30, column: 67, scope: !5) +!15 = !DILocation(line: 31, column: 30, scope: !5) +!16 = !DILocation(line: 31, column: 35, scope: !5) +!17 = !DILocation(line: 32, column: 30, scope: !5) +!18 = !DILocation(line: 32, column: 46, scope: !5) +!19 = !DILocation(line: 33, column: 30, scope: !5) +!20 = !DILocation(line: 33, column: 35, scope: !5) +!21 = !DILocation(line: 34, column: 31, scope: !5) +!22 = !DILocation(line: 34, column: 36, scope: !5) +!23 = !DILocation(line: 35, column: 35, scope: !5) +!24 = !DILocation(line: 35, column: 51, scope: !5) +!25 = !DILocation(line: 36, column: 31, scope: !5) +!26 = !DILocation(line: 36, column: 36, scope: !5) +!27 = !DILocation(line: 38, column: 18, scope: !5) +!28 = !DILocation(line: 233, column: 15, scope: !29, inlinedAt: !32) +!29 = distinct !DILexicalBlockFile(scope: !31, file: !30, discriminator: 0) +!30 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!31 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0) +!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33) +!33 = !DILocation(line: 41, column: 57, scope: !29) +!34 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !35) +!35 = !DILocation(line: 41, column: 57, scope: !31) +!36 = !DILocation(line: 8, column: 15, scope: !37, inlinedAt: !39) +!37 = distinct !DILexicalBlockFile(scope: !5, file: !38, discriminator: 0) +!38 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!39 = !DILocation(line: 41, column: 44, scope: !37) +!40 = !DILocation(line: 42, column: 19, scope: !5) +!41 = !DILocation(line: 43, column: 20, scope: !5) +!42 = !DILocation(line: 44, column: 19, scope: !5) +!43 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !44) +!44 = !DILocation(line: 47, column: 59, scope: !31) +!45 = !DILocation(line: 233, column: 15, scope: !29, inlinedAt: !46) +!46 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !47) +!47 = !DILocation(line: 47, column: 59, scope: !29) +!48 = !DILocation(line: 8, column: 15, scope: !37, inlinedAt: !49) +!49 = !DILocation(line: 47, column: 45, scope: !37) +!50 = !DILocation(line: 49, column: 20, scope: !5) +!51 = 
!DILocation(line: 50, column: 19, scope: !5) +!52 = !DILocation(line: 51, column: 20, scope: !5) +!53 = !DILocation(line: 52, column: 20, scope: !5) +!54 = !DILocation(line: 53, column: 20, scope: !5) +!55 = !DILocation(line: 54, column: 20, scope: !5) +!56 = !DILocation(line: 55, column: 20, scope: !5) +!57 = !DILocation(line: 57, column: 20, scope: !5) +!58 = !DILocation(line: 60, column: 21, scope: !5) +!59 = !DILocation(line: 62, column: 35, scope: !5) +!60 = !DILocation(line: 63, column: 51, scope: !5) +!61 = !DILocation(line: 64, column: 56, scope: !5) +!62 = !DILocation(line: 64, column: 52, scope: !5) +!63 = !DILocation(line: 64, column: 30, scope: !5) +!64 = !DILocation(line: 64, column: 83, scope: !5) +!65 = !DILocation(line: 64, column: 4, scope: !5) diff --git a/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.ptx b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..58e7414776718dffef1a33026b3cf4e94a499835 --- /dev/null +++ b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.ptx @@ -0,0 +1,788 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7d8de9de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3d4d5d6d7d8de9de( + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7, + .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8, + .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<46>; + .reg .b16 %rs<5>; + .reg .b32 %r<110>; + .reg .f32 %f<90>; + .reg .b64 %rd<45>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7d8de9de_param_0]; + ld.param.u64 %rd27, [triton__0d1d2d3d4d5d6d7d8de9de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r78, %tid.x; + and.b32 %r79, %r78, 31; + ld.param.u64 %rd28, [triton__0d1d2d3d4d5d6d7d8de9de_param_2]; + ld.param.u64 %rd29, [triton__0d1d2d3d4d5d6d7d8de9de_param_3]; + ld.param.u64 %rd30, [triton__0d1d2d3d4d5d6d7d8de9de_param_4]; + shl.b32 %r80, %r78, 2; + ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6d7d8de9de_param_5]; + and.b32 %r81, %r80, 252; + ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6d7d8de9de_param_6]; + ld.param.u64 %rd33, [triton__0d1d2d3d4d5d6d7d8de9de_param_7]; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r82, %r1, 8; + .loc 1 30 36 + or.b32 %r83, %r82, %r81; + .loc 1 30 30 + mul.wide.s32 %rd34, %r83, 2; + add.s64 %rd1, %rd27, %rd34; + mov.b32 %r4, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r4; + @!%p1 mov.u32 %r3, %r4; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + cvt.u16.u32 %rs3, %r3; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } + .loc 1 30 67 + cvt.f32.bf16 %r6, %rs1; + mov.b32 %f1, %r6; + cvt.f32.bf16 %r7, %rs2; + mov.b32 %f2, %r7; + cvt.f32.bf16 %r8, %rs3; + mov.b32 %f3, %r8; + cvt.f32.bf16 %r9, %rs4; + mov.b32 %f4, %r9; + .loc 1 31 30 + cvt.u64.u32 %rd35, %r81; + mul.wide.u32 %rd36, %r81, 4; + add.s64 %rd2, %rd28, %rd36; + .loc 1 31 35 
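// Annotation (not part of the generated PTX): the loads that follow show the
// dump's recurring pattern — zero-initialize the destination registers, issue a
// predicated vectorized load ("@%p1 ld.global.L1::evict_last.v4.b32 { ... }"),
// then "@!%p1 mov" fallbacks for masked-off lanes. Here %p1 was set with
// "mov.pred %p1, -1" (always true), so the guards are a vestige of Triton's
// generic masked-load lowering; the L1::evict_last hint biases the cache to
// keep operands that are reused across the 256-wide row resident in L1.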
+ mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r4; + @!%p1 mov.u32 %r11, %r4; + @!%p1 mov.u32 %r12, %r4; + @!%p1 mov.u32 %r13, %r4; + mov.b32 %f5, %r10; + mov.b32 %f6, %r11; + mov.b32 %f7, %r12; + mov.b32 %f8, %r13; + .loc 1 32 30 + mul.wide.s32 %rd37, %r83, 4; + add.s64 %rd3, %rd29, %rd37; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + @%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r4; + @!%p1 mov.u32 %r19, %r4; + @!%p1 mov.u32 %r20, %r4; + @!%p1 mov.u32 %r21, %r4; + mov.b32 %f9, %r18; + mov.b32 %f10, %r19; + mov.b32 %f11, %r20; + mov.b32 %f12, %r21; + .loc 1 33 30 + mul.wide.s32 %rd38, %r1, 4; + add.s64 %rd4, %rd30, %rd38; + .loc 1 33 35 + mov.u32 %r26, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ]; + mov.b32 %f13, %r26; + mov.u32 %r27, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ]; + mov.u32 %r28, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ]; + mov.u32 %r29, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ]; + .loc 1 34 31 + add.s64 %rd8, %rd31, %rd38; + .loc 1 34 36 + mov.u32 %r55, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r55 }, [ %rd8 + 0 ]; + mov.b32 %f14, %r55; + mov.u32 %r31, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ]; + mov.u32 %r32, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ]; + mov.u32 %r33, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ]; + .loc 1 35 35 + add.s64 %rd12, %rd26, %rd37; + .loc 1 35 51 + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + mov.u32 %r37, 0x0; + @%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd12 + 0 ]; + @!%p1 mov.u32 %r34, %r4; + @!%p1 mov.u32 %r35, %r4; + @!%p1 mov.u32 %r36, %r4; + @!%p1 mov.u32 %r37, %r4; + mov.b32 %f15, %r34; + mov.b32 %f16, %r35; + mov.b32 %f17, %r36; + mov.b32 %f18, %r37; + .loc 1 36 31 + mul.wide.s32 %rd39, %r1, 8; + add.s64 %rd14, %rd32, %rd39; + .loc 1 36 36 + mov.u64 %rd13, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd13 }, [ %rd14 + 0 ]; + mov.u64 %rd15, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd14 + 0 ]; + mov.u64 %rd17, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd14 + 0 ]; + mov.u64 %rd19, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd14 + 0 ]; + .loc 1 38 18 + mul.f32 %f19, %f1, %f5; + mul.f32 %f20, %f2, %f6; + mul.f32 %f21, %f3, %f7; + mul.f32 %f22, %f4, %f8; +$L__tmp1: + .loc 2 233 15 + fma.rn.f32 %f23, %f1, %f5, %f20; + fma.rn.f32 %f24, %f3, %f7, %f23; + fma.rn.f32 %f25, %f4, %f8, %f24; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r84, %f25; + shfl.sync.bfly.b32 %r85, %r84, 16, 31, -1; + mov.b32 %f26, %r85; +$L__tmp3: + .loc 2 233 15 + add.f32 %f27, %f25, %f26; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r86, %f27; + shfl.sync.bfly.b32 %r87, %r86, 8, 31, -1; + mov.b32 %f28, %r87; +$L__tmp5: + .loc 2 233 15 + add.f32 %f29, %f27, %f28; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r88, %f29; + shfl.sync.bfly.b32 %r89, %r88, 4, 31, -1; + mov.b32 %f30, %r89; +$L__tmp7: + .loc 2 233 15 + add.f32 %f31, %f29, %f30; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r90, %f31; + shfl.sync.bfly.b32 %r91, %r90, 2, 31, -1; + mov.b32 %f32, %r91; +$L__tmp9: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r92, %f33; + shfl.sync.bfly.b32 %r93, %r92, 1, 31, -1; + mov.b32 %f34, %r93; +$L__tmp11: + .loc 2 233 15 + 
add.f32 %f35, %f33, %f34; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p31, %r79, 0; + shr.u32 %r94, %r78, 3; + and.b32 %r95, %r94, 4; + mov.u32 %r96, global_smem; + add.s32 %r42, %r96, %r95; + mov.b32 %r43, %f35; + @%p31 st.shared.b32 [ %r42 + 0 ], %r43; + bar.sync 0; + setp.lt.s32 %p32, %r78, 2; + add.s32 %r45, %r96, %r80; + @%p32 ld.shared.b32 %r44, [ %r45 + 0 ]; + mov.b32 %f36, %r44; + shfl.sync.bfly.b32 %r97, %r44, 1, 31, -1; + mov.b32 %f37, %r97; +$L__tmp13: + .loc 2 233 15 + add.f32 %f38, %f36, %f37; +$L__tmp14: + .loc 2 243 36 + and.b32 %r98, %r78, 1; + setp.eq.b32 %p42, %r98, 1; + not.pred %p43, %p42; + and.pred %p33, %p32, %p43; + mov.b32 %r47, %f38; + @%p33 st.shared.b32 [ %r45 + 0 ], %r47; + bar.sync 0; + ld.shared.f32 %f39, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f40, %f39, 0f00000000; +$L__tmp16: + .loc 1 42 19 + sub.f32 %f41, %f9, %f13; + sub.f32 %f42, %f10, %f13; + sub.f32 %f43, %f11, %f13; + sub.f32 %f44, %f12, %f13; + .loc 1 43 20 + mul.f32 %f45, %f41, %f14; + mul.f32 %f46, %f42, %f14; + mul.f32 %f47, %f43, %f14; + mul.f32 %f48, %f44, %f14; + .loc 1 44 19 + mul.f32 %f49, %f20, %f46; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f50, %f19, %f45, %f49; + fma.rn.f32 %f51, %f21, %f47, %f50; + fma.rn.f32 %f52, %f22, %f48, %f51; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r99, %f52; + shfl.sync.bfly.b32 %r100, %r99, 16, 31, -1; + mov.b32 %f53, %r100; +$L__tmp20: + .loc 2 233 15 + add.f32 %f54, %f52, %f53; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r101, %f54; + shfl.sync.bfly.b32 %r102, %r101, 8, 31, -1; + mov.b32 %f55, %r102; +$L__tmp22: + .loc 2 233 15 + add.f32 %f56, %f54, %f55; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r103, %f56; + shfl.sync.bfly.b32 %r104, %r103, 4, 31, -1; + mov.b32 %f57, %r104; +$L__tmp24: + .loc 2 233 15 + add.f32 %f58, %f56, %f57; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r105, %f58; + shfl.sync.bfly.b32 %r106, %r105, 2, 31, -1; + mov.b32 %f59, %r106; +$L__tmp26: + .loc 2 233 15 + add.f32 %f60, %f58, %f59; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r107, %f60; + shfl.sync.bfly.b32 %r108, %r107, 1, 31, -1; + mov.b32 %f61, %r108; +$L__tmp28: + .loc 2 233 15 + add.f32 %f62, %f60, %f61; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r49, %f62; + @%p31 st.shared.b32 [ %r42 + 0 ], %r49; + bar.sync 0; + @%p32 ld.shared.b32 %r50, [ %r45 + 0 ]; + mov.b32 %f63, %r50; + shfl.sync.bfly.b32 %r109, %r50, 1, 31, -1; + mov.b32 %f64, %r109; +$L__tmp30: + .loc 2 233 15 + add.f32 %f65, %f63, %f64; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r53, %f65; + @%p33 st.shared.b32 [ %r45 + 0 ], %r53; + bar.sync 0; + ld.shared.f32 %f66, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f67, %f66, 0f00000000; + mov.b32 %r56, 1132462080; +$L__tmp33: + .loc 1 49 20 + div.full.f32 %r54, %r55, %r56; + mov.b32 %f68, %r54; + .loc 1 51 20 + neg.f32 %f69, %f40; + fma.rn.f32 %f70, %f19, 0f43800000, %f69; + fma.rn.f32 %f71, %f20, 0f43800000, %f69; + fma.rn.f32 %f72, %f21, 0f43800000, %f69; + fma.rn.f32 %f73, %f22, 0f43800000, %f69; + .loc 1 53 20 + neg.f32 %f74, %f45; + fma.rn.f32 %f75, %f74, %f67, %f70; + neg.f32 %f76, %f46; + fma.rn.f32 %f77, %f76, %f67, %f71; + neg.f32 %f78, %f47; + fma.rn.f32 %f79, %f78, %f67, %f72; + neg.f32 %f80, %f48; + fma.rn.f32 %f81, %f80, %f67, %f73; + .loc 1 55 20 + fma.rn.f32 %f82, %f68, %f75, %f15; + fma.rn.f32 %f83, %f68, %f77, %f16; + fma.rn.f32 %f84, %f68, %f79, %f17; + fma.rn.f32 %f85, %f68, %f81, %f18; + .loc 1 57 20 + setp.lt.s64 %p44, %rd13, 0; + .loc 1 60 21 + setp.eq.s64 %p45, %rd13, -1; + .loc 1 62 35 + selp.f32 %f86, 0f00000000, 
%f82, %p45; + selp.f32 %f87, 0f00000000, %f83, %p45; + selp.f32 %f88, 0f00000000, %f84, %p45; + selp.f32 %f89, 0f00000000, %f85, %p45; + .loc 1 63 51 + mov.b32 %r66, %f82; + mov.b32 %r67, %f83; + mov.b32 %r68, %f84; + mov.b32 %r69, %f85; + @%p1 st.global.v4.b32 [ %rd12 + 0 ], { %r66, %r67, %r68, %r69 }; + .loc 1 64 56 + shl.b64 %rd40, %rd13, 8; + add.s64 %rd41, %rd40, 12865792; + selp.b64 %rd42, %rd41, %rd40, %p44; + .loc 1 64 52 + or.b64 %rd43, %rd42, %rd35; + .loc 1 64 30 + shl.b64 %rd44, %rd43, 2; + add.s64 %rd22, %rd33, %rd44; + add.s64 %rd23, %rd22, 4; + add.s64 %rd24, %rd22, 8; + add.s64 %rd25, %rd22, 12; + .loc 1 64 83 + mov.b32 %r71, %f86; + mov.u32 %r70, 0x0; + @%p1 atom.global.gpu.acq_rel.add.f32 %r70, [ %rd22 + 0 ], %r71; + mov.b32 %r73, %f87; + mov.u32 %r72, 0x0; + @%p1 atom.global.gpu.acq_rel.add.f32 %r72, [ %rd23 + 0 ], %r73; + mov.b32 %r75, %f88; + mov.u32 %r74, 0x0; + @%p1 atom.global.gpu.acq_rel.add.f32 %r74, [ %rd24 + 0 ], %r75; + mov.b32 %r77, %f89; + mov.u32 %r76, 0x0; + @%p1 atom.global.gpu.acq_rel.add.f32 %r76, [ %rd25 + 0 ], %r77; + .loc 1 64 4 + ret; +$L__tmp34: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/po/cpofrksimh6penb3xp2bds523dhndqtyshsszqampnuelfbypvda.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 407 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 112 +.b8 111 +.b8 102 +.b8 114 +.b8 107 +.b8 115 +.b8 105 +.b8 109 +.b8 104 +.b8 54 +.b8 112 +.b8 101 +.b8 110 +.b8 98 +.b8 51 +.b8 120 +.b8 112 +.b8 50 +.b8 98 +.b8 100 +.b8 115 +.b8 53 +.b8 50 +.b8 51 +.b8 100 +.b8 104 +.b8 110 +.b8 100 +.b8 113 +.b8 116 +.b8 121 +.b8 115 +.b8 104 +.b8 115 +.b8 115 +.b8 122 +.b8 113 +.b8 97 +.b8 109 +.b8 112 +.b8 110 +.b8 117 +.b8 101 +.b8 108 +.b8 102 +.b8 98 +.b8 121 +.b8 112 +.b8 118 +.b8 100 +.b8 97 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 112 +.b8 111 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 
100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 41 +.b8 57 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 41 +.b8 57 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 41 +.b8 44 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 47 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 47 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 47 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 411 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 411 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.ttgir b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..a637ba49a95253d43106bd4af13cb18dde546f08 --- /dev/null +++ b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.ttgir @@ -0,0 +1,89 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked> + %cst_1 = arith.constant dense<-1> : tensor<1xi64, #blocked> + %cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked> + %cst_3 = arith.constant dense<0> : tensor<1xi64, #blocked> + %cst_4 = arith.constant dense<50257> : tensor<1xi64, #blocked> + %cst_5 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_6 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_7 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 
= tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %10 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %12 = tt.load %11, %2, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %16 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %17 = tt.splat %16 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %19 = tt.addptr %arg5, %0 : !tt.ptr, i32 + %20 = tt.splat %19 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %22 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %23 = tt.addptr %22, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %24 = tt.load %23, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %25 = tt.addptr %arg6, %0 : !tt.ptr, i32 + %26 = tt.splat %25 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %27 = tt.load %26 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked> + %28 = arith.mulf %9, %12 : tensor<256xf32, #blocked> + %29 = arith.select %2, %28, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %63 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %63 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %31 = arith.addf %30, %cst_5 : f32 + %32 = tt.broadcast %18 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %33 = arith.subf %15, %32 : tensor<256xf32, #blocked> + %34 = tt.broadcast %21 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %35 = arith.mulf %33, %34 : tensor<256xf32, #blocked> + %36 = arith.mulf %28, %35 : tensor<256xf32, #blocked> + %37 = arith.select %2, %36, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %38 = "tt.reduce"(%37) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %63 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %63 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %39 = arith.addf %38, %cst_5 : f32 + %40 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked> + %41 = arith.mulf %28, %cst_7 : tensor<256xf32, #blocked> + %42 = tt.splat %31 : (f32) -> tensor<256xf32, #blocked> + %43 = arith.subf %41, %42 : tensor<256xf32, #blocked> + %44 = tt.splat %39 : (f32) -> tensor<256xf32, #blocked> + %45 = arith.mulf %35, %44 : tensor<256xf32, #blocked> + %46 = arith.subf %43, %45 : tensor<256xf32, #blocked> + %47 = tt.broadcast %40 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %48 = arith.mulf %47, %46 : tensor<256xf32, #blocked> + %49 = arith.addf %24, %48 : tensor<256xf32, #blocked> + %50 = arith.addi %27, %cst_4 : tensor<1xi64, #blocked> + 
%51 = arith.cmpi slt, %27, %cst_3 : tensor<1xi64, #blocked> + %52 = arith.select %51, %50, %27 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked> + %53 = arith.cmpi eq, %27, %cst_1 : tensor<1xi64, #blocked> + %54 = tt.broadcast %53 : (tensor<1xi1, #blocked>) -> tensor<256xi1, #blocked> + %55 = arith.select %54, %cst_6, %49 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + tt.store %23, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %56 = arith.muli %52, %cst_2 : tensor<1xi64, #blocked> + %57 = tt.broadcast %56 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked> + %58 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked> + %59 = arith.addi %58, %57 : tensor<256xi64, #blocked> + %60 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %61 = tt.addptr %60, %59 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> + %62 = "tt.atomic_rmw"(%61, %55, %2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<256x!tt.ptr, #blocked>, tensor<256xf32, #blocked>, tensor<256xi1, #blocked>) -> tensor<256xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.ttir b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..f6a2ea0e2e58c7a574987a76993b9ac72a2d1ed7 --- /dev/null +++ b/.triton/dump/de65136a69e74dcdbbc6266b27e86b0a/triton_.ttir @@ -0,0 +1,88 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense<256> : tensor<1xi64> + %cst_2 = arith.constant dense<0> : tensor<1xi64> + %cst_3 = arith.constant dense<50257> : tensor<1xi64> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_5 = arith.constant dense<-1> : tensor<1xi64> + %cst_6 = arith.constant dense<2.560000e+02> : tensor<256xf32> + %cst_7 = arith.constant dense<2.560000e+02> : tensor<1xf32> + %cst_8 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_8 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %9 = arith.extf %8 : tensor<256xbf16> to tensor<256xf32> + %10 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %12 = tt.load %11, %2, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %5 : 
tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %16 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %17 = tt.splat %16 : (!tt.ptr) -> tensor<1x!tt.ptr> + %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %19 = tt.addptr %arg5, %0 : !tt.ptr, i32 + %20 = tt.splat %19 : (!tt.ptr) -> tensor<1x!tt.ptr> + %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %22 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %23 = tt.addptr %22, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %24 = tt.load %23, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %25 = tt.addptr %arg6, %0 : !tt.ptr, i32 + %26 = tt.splat %25 : (!tt.ptr) -> tensor<1x!tt.ptr> + %27 = tt.load %26 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64> + %28 = arith.mulf %9, %12 : tensor<256xf32> + %29 = arith.select %2, %28, %cst_4 : tensor<256xi1>, tensor<256xf32> + %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %63 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %63 : f32 + }) : (tensor<256xf32>) -> f32 + %31 = arith.addf %30, %cst_0 : f32 + %32 = tt.broadcast %18 : (tensor<1xf32>) -> tensor<256xf32> + %33 = arith.subf %15, %32 : tensor<256xf32> + %34 = tt.broadcast %21 : (tensor<1xf32>) -> tensor<256xf32> + %35 = arith.mulf %33, %34 : tensor<256xf32> + %36 = arith.mulf %28, %35 : tensor<256xf32> + %37 = arith.select %2, %36, %cst_4 : tensor<256xi1>, tensor<256xf32> + %38 = "tt.reduce"(%37) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %63 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %63 : f32 + }) : (tensor<256xf32>) -> f32 + %39 = arith.addf %38, %cst_0 : f32 + %40 = arith.divf %21, %cst_7 : tensor<1xf32> + %41 = arith.mulf %28, %cst_6 : tensor<256xf32> + %42 = tt.splat %31 : (f32) -> tensor<256xf32> + %43 = arith.subf %41, %42 : tensor<256xf32> + %44 = tt.splat %39 : (f32) -> tensor<256xf32> + %45 = arith.mulf %35, %44 : tensor<256xf32> + %46 = arith.subf %43, %45 : tensor<256xf32> + %47 = tt.broadcast %40 : (tensor<1xf32>) -> tensor<256xf32> + %48 = arith.mulf %47, %46 : tensor<256xf32> + %49 = arith.addf %24, %48 : tensor<256xf32> + %50 = arith.addi %27, %cst_3 : tensor<1xi64> + %51 = arith.cmpi slt, %27, %cst_2 : tensor<1xi64> + %52 = arith.select %51, %50, %27 : tensor<1xi1>, tensor<1xi64> + %53 = arith.cmpi eq, %27, %cst_5 : tensor<1xi64> + %54 = tt.broadcast %53 : (tensor<1xi1>) -> tensor<256xi1> + %55 = arith.select %54, %cst_4, %49 : tensor<256xi1>, tensor<256xf32> + tt.store %23, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %56 = arith.muli %52, %cst_1 : tensor<1xi64> + %57 = tt.broadcast %56 : (tensor<1xi64>) -> tensor<256xi64> + %58 = arith.extsi %1 : tensor<256xi32> to tensor<256xi64> + %59 = arith.addi %58, %57 : tensor<256xi64> + %60 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr> + %61 = tt.addptr %60, %59 : tensor<256x!tt.ptr>, tensor<256xi64> + %62 = "tt.atomic_rmw"(%61, %55, %2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<256x!tt.ptr>, tensor<256xf32>, tensor<256xi1>) -> tensor<256xf32> + tt.return + } +} diff --git a/.triton/dump/f5d48c7e9f2699ef21617a065eeff857/triton_.ptx b/.triton/dump/f5d48c7e9f2699ef21617a065eeff857/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..e8c9d794384aac738436b727e167f633410dbee0 --- /dev/null +++ 
b/.triton/dump/f5d48c7e9f2699ef21617a065eeff857/triton_.ptx @@ -0,0 +1,392 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 55, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u64 triton__0d1d2de_param_2 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<13>; + .reg .b16 %rs<5>; + .reg .b32 %r<17>; + .reg .b64 %rd<35>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd6, [triton__0d1d2de_param_1]; + ld.param.u64 %rd10, [triton__0d1d2de_param_0]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r4, %tid.x; + and.b32 %r1, %r4, 127; + shl.b32 %r2, %r1, 1; + or.b32 %r5, %r2, 1; + .loc 1 20 28 + mov.u32 %r3, %ctaid.x; + .loc 1 20 46 + mul.wide.s32 %rd1, %r3, 256; + cvt.u64.u32 %rd11, %r2; + .loc 1 21 23 + or.b64 %rd2, %rd1, %rd11; + .loc 1 24 30 + shl.b64 %rd12, %rd2, 3; + add.s64 %rd9, %rd10, %rd12; + mov.pred %p11, -1; + .loc 1 24 35 + mov.u64 %rd7, 0x0; + mov.u64 %rd8, 0x0; + @%p11 ld.global.v2.b64 { %rd7, %rd8 }, [ %rd9 + 0 ]; + .loc 1 26 19 + setp.eq.s64 %p2, %rd8, -1; + setp.eq.s64 %p3, %rd7, -1; + .loc 1 28 32 + selp.b64 %rd13, 0, %rd7, %p3; + selp.b64 %rd14, 0, %rd8, %p2; + .loc 1 29 18 + add.s64 %rd15, %rd14, 50257; + add.s64 %rd16, %rd13, 50257; + .loc 1 30 18 + setp.lt.s64 %p4, %rd14, 0; + setp.lt.s64 %p5, %rd13, 0; + .loc 1 31 32 + selp.b64 %rd4, %rd16, %rd13, %p5; + selp.b64 %rd3, %rd15, %rd14, %p4; + .loc 1 32 36 + setp.lt.u64 %p6, %rd3, 50257; + setp.lt.u64 %p7, %rd4, 50257; + mov.u32 %r6, global_smem; + add.s32 %r7, %r6, %r2; + selp.u16 %rs1, 1, 0, %p7; + st.shared.u8 [%r7], %rs1; + cvt.u64.u32 %rd5, %r5; + selp.u16 %rs2, 1, 0, %p6; + st.shared.u8 [%r7+1], %rs2; + bar.sync 0; + add.s32 %r8, %r6, %r1; + ld.shared.u8 %rs3, [%r8]; + setp.ne.s16 %p8, %rs3, 0; + ld.shared.u8 %rs4, [%r8+128]; + setp.ne.s16 %p9, %rs4, 0; + .loc 1 32 51 + and.pred %p10, %p8, %p9; + @%p10 bra $L__BB0_2; + mov.u64 %rd17, assertMessage_0; + cvta.global.u64 %rd18, %rd17; + mov.u64 %rd19, assertFile_0; + cvta.global.u64 %rd20, %rd19; + mov.u64 %rd21, assertFunc_0; + cvta.global.u64 %rd22, %rd21; + mov.b32 %r9, 883; + mov.u64 %rd23, 1; + { // callseq 1, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd18; + .param .b64 param1; + st.param.b64 [param1+0], %rd20; + .param .b32 param2; + st.param.b32 [param2+0], %r9; + .param .b64 param3; + st.param.b64 [param3+0], %rd22; + .param .b64 param4; + st.param.b64 [param4+0], %rd23; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 1 +$L__BB0_2: + .loc 1 21 23 + or.b64 %rd26, %rd1, %rd5; + .loc 1 34 
25 + shl.b64 %rd27, %rd4, 2; + add.s64 %rd28, %rd6, %rd27; + mul.lo.s64 %rd29, %rd2, 201028; + add.s64 %rd30, %rd28, %rd29; + shl.b64 %rd31, %rd3, 2; + add.s64 %rd32, %rd6, %rd31; + mul.lo.s64 %rd33, %rd26, 201028; + add.s64 %rd34, %rd32, %rd33; + .loc 1 34 51 + bar.sync 0; + shl.b32 %r12, %r2, 3; + add.s32 %r14, %r6, %r12; + st.shared.u64 [%r14], %rd30; + st.shared.u64 [%r14+8], %rd34; + bar.sync 0; + shl.b32 %r15, %r1, 3; + add.s32 %r16, %r6, %r15; + ld.shared.u64 %rd24, [%r16]; + ld.shared.u64 %rd25, [%r16+1024]; + mov.b32 %r10, -1082130432; + @%p11 st.global.b32 [ %rd24 + 0 ], { %r10 }; + @%p11 st.global.b32 [ %rd25 + 0 ], { %r10 }; + .loc 1 34 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/hl/chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 104 +.b8 108 +.b8 114 +.b8 107 +.b8 103 +.b8 112 +.b8 118 +.b8 118 +.b8 98 +.b8 100 +.b8 105 +.b8 122 +.b8 100 +.b8 122 +.b8 55 +.b8 115 +.b8 108 +.b8 108 +.b8 113 +.b8 117 +.b8 101 +.b8 116 +.b8 50 +.b8 106 +.b8 55 +.b8 122 +.b8 104 +.b8 116 +.b8 101 +.b8 115 +.b8 54 +.b8 109 +.b8 101 +.b8 104 +.b8 54 +.b8 107 +.b8 101 +.b8 110 +.b8 114 +.b8 113 +.b8 120 +.b8 111 +.b8 118 +.b8 50 +.b8 54 +.b8 109 +.b8 115 +.b8 119 +.b8 118 +.b8 119 +.b8 55 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 104 +.b8 108 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/f5d48c7e9f2699ef21617a065eeff857/triton_.ttgir b/.triton/dump/f5d48c7e9f2699ef21617a065eeff857/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..b51e06edcd7ccd95b8ea5a2d9941d08b43197ac2 --- /dev/null +++ b/.triton/dump/f5d48c7e9f2699ef21617a065eeff857/triton_.ttgir @@ -0,0 +1,38 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +#blocked1 = 
#triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<50257> : tensor<256xi64, #blocked> + %cst_0 = arith.constant dense<0> : tensor<256xi64, #blocked> + %cst_1 = arith.constant dense<-1> : tensor<256xi64, #blocked> + %cst_2 = arith.constant dense<-1.000000e+00> : tensor<256xf32, #blocked1> + %c256_i64 = arith.constant 256 : i64 + %0 = tt.get_program_id x : i32 + %1 = arith.extsi %0 : i32 to i64 + %2 = arith.muli %1, %c256_i64 : i64 + %3 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %4 = arith.extsi %3 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked> + %5 = tt.splat %2 : (i64) -> tensor<256xi64, #blocked> + %6 = arith.addi %5, %4 : tensor<256xi64, #blocked> + %7 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %8 = tt.addptr %7, %6 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> + %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xi64, #blocked> + %10 = arith.cmpi ne, %9, %cst_1 : tensor<256xi64, #blocked> + %11 = arith.select %10, %9, %cst_0 : tensor<256xi1, #blocked>, tensor<256xi64, #blocked> + %12 = arith.addi %11, %cst : tensor<256xi64, #blocked> + %13 = arith.cmpi slt, %11, %cst_0 : tensor<256xi64, #blocked> + %14 = arith.select %13, %12, %11 : tensor<256xi1, #blocked>, tensor<256xi64, #blocked> + %15 = arith.cmpi sge, %14, %cst_0 : tensor<256xi64, #blocked> + %16 = arith.cmpi slt, %14, %cst : tensor<256xi64, #blocked> + %17 = arith.andi %15, %16 : tensor<256xi1, #blocked> + %18 = triton_gpu.convert_layout %17 : (tensor<256xi1, #blocked>) -> tensor<256xi1, #blocked1> + tt.assert %18, "index out of bounds: 0 <= tmp7 < 50257", "", "_call_with_frames_removed", 883 : tensor<256xi1, #blocked1> + %19 = arith.muli %6, %cst : tensor<256xi64, #blocked> + %20 = arith.addi %14, %19 : tensor<256xi64, #blocked> + %21 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %22 = tt.addptr %21, %20 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> + %23 = triton_gpu.convert_layout %22 : (tensor<256x!tt.ptr, #blocked>) -> tensor<256x!tt.ptr, #blocked1> + tt.store %23, %cst_2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked1> + tt.return + } +} diff --git a/.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.cubin b/.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..6dad16c7861d96710e209531e575edd30a82a867 Binary files /dev/null and b/.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.cubin differ
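The dumps above share two building blocks: a row-wise sum implemented as a warp-level butterfly shuffle (the shfl.sync.bfly.b32 sequence with offsets 16/8/4/2/1) followed by a shared-memory hop across warps (st.shared / bar.sync / ld.shared), and a scatter into a gradient buffer via predicated atom.global.add.f32, where a negative token index is wrapped into [0, 50257) and guarded by a device-side assert (__assertfail), with the -1 padding index contributing zero. Below is a minimal CUDA sketch of those two patterns — not the generated code itself. Only the 50257 bound, the 256-column row width, the -1 padding convention, and the reduction schedule come from the dumps; the names scatter_add_row, grad_w, dx, token, and the one-block-per-row launch shape are illustrative assumptions.

#include <cuda_runtime.h>
#include <cassert>

__device__ float warp_reduce_sum(float v) {
    // Butterfly exchange over lane masks 16, 8, 4, 2, 1 — the same schedule
    // as the shfl.sync.bfly.b32 sequence in the PTX above.
    for (int off = 16; off > 0; off >>= 1)
        v += __shfl_xor_sync(0xffffffffu, v, off);
    return v;
}

__device__ float block_reduce_sum(float v) {
    // Per-warp partials go through shared memory, then warp 0 combines them:
    // the st.shared / bar.sync / ld.shared exchange in the dumps.
    __shared__ float smem[32];
    const int lane = threadIdx.x & 31;
    const int warp = threadIdx.x >> 5;
    v = warp_reduce_sum(v);
    if (lane == 0) smem[warp] = v;
    __syncthreads();
    const int nwarps = (blockDim.x + 31) >> 5;
    v = (warp == 0 && lane < nwarps) ? smem[lane] : 0.0f;
    if (warp == 0) v = warp_reduce_sum(v);  // full sum valid in warp 0
    return v;
}

__global__ void scatter_add_row(const long long* token, const float* dx,
                                float* grad_w, int cols /* 256 in the dumps */) {
    if (threadIdx.x >= cols) return;
    // One block per row (assumption): wrap a negative token id into
    // [0, 50257) and bounds-check it, as the tt.assert / __assertfail path does.
    const long long t = token[blockIdx.x];
    const long long idx = (t < 0) ? t + 50257 : t;
    assert(idx >= 0 && idx < 50257);
    // A -1 token is padding: its contribution is zeroed before the
    // equivalent of the predicated atom.global.add.f32 below.
    const float v = (t == -1) ? 0.0f
                              : dx[(size_t)blockIdx.x * cols + threadIdx.x];
    atomicAdd(&grad_w[(size_t)idx * cols + threadIdx.x], v);
}

The XOR butterfly leaves the sum replicated in every lane after log2(32) steps, which is why the generated code needs only the single shared-memory hop to cross warps rather than a second full tree.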