0-hero's picture
Add files using upload-large-folder tool
b2f8fe7 verified
raw
history blame
50.1 kB
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Zero-length, externally defined byte array in address space 3.
; The kernel below indexes it as an array of float slots and accesses it through
; st.shared / ld.shared inline PTX (plus plain loads/stores of element 0) to pass
; partial reduction results between threads, with @llvm.nvvm.barrier0 in between.
@global_smem = external addrspace(3) global [0 x i8]
; NUL-terminated string "__CUDA_FTZ". The kernel passes it to @__nvvm_reflect;
; a zero result selects the @llvm.nvvm.fma.rn.f value, a non-zero result selects
; the @llvm.nvvm.fma.rn.ftz.f value.
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
define void @triton__0d1d2d3d4de5(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i64 %4, i64 %5) local_unnamed_addr !dbg !7 {
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
%8 = lshr i32 %7, 5, !dbg !10
%urem = and i32 %7, 255, !dbg !10
%9 = or i32 %urem, 256, !dbg !10
%10 = or i32 %urem, 512, !dbg !10
%11 = or i32 %urem, 768, !dbg !10
%12 = or i32 %urem, 1024, !dbg !10
%13 = or i32 %urem, 1280, !dbg !10
%14 = or i32 %urem, 1536, !dbg !10
%15 = or i32 %urem, 1792, !dbg !10
%16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #5, !dbg !11
%17 = sext i32 %16 to i64, !dbg !12
%18 = insertelement <8 x i32> poison, i32 %urem, i64 0
%19 = insertelement <8 x i32> %18, i32 %9, i64 1
%20 = insertelement <8 x i32> %19, i32 %10, i64 2
%21 = insertelement <8 x i32> %20, i32 %11, i64 3
%22 = insertelement <8 x i32> %21, i32 %12, i64 4
%23 = insertelement <8 x i32> %22, i32 %13, i64 5
%24 = insertelement <8 x i32> %23, i32 %14, i64 6
%25 = insertelement <8 x i32> %24, i32 %15, i64 7
%26 = zext <8 x i32> %25 to <8 x i64>
%27 = mul nsw i64 %17, 50257, !dbg !13
%invariant.gep = getelementptr i16, ptr addrspace(1) %0, i64 %27, !dbg !14
br label %28, !dbg !14
28: ; preds = %6, %28
%29 = phi i32 [ 0, %6 ], [ %81, %28 ]
%30 = phi <8 x float> [ <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>, %6 ], [ %80, %28 ]
%31 = zext nneg i32 %29 to i64, !dbg !15
%32 = fcmp ord <8 x float> %30, zeroinitializer, !dbg !16
%33 = insertelement <8 x i64> poison, i64 %31, i64 0, !dbg !15
%34 = shufflevector <8 x i64> %33, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !15
%35 = or <8 x i64> %34, %26, !dbg !15
%36 = icmp ult <8 x i64> %35, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !20
%37 = extractelement <8 x i64> %35, i64 0, !dbg !21
%gep = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %37, !dbg !21
%38 = extractelement <8 x i64> %35, i64 1, !dbg !21
%gep21 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %38, !dbg !21
%39 = extractelement <8 x i64> %35, i64 2, !dbg !21
%gep23 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %39, !dbg !21
%40 = extractelement <8 x i64> %35, i64 3, !dbg !21
%gep25 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %40, !dbg !21
%41 = extractelement <8 x i64> %35, i64 4, !dbg !21
%gep27 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %41, !dbg !21
%42 = extractelement <8 x i64> %35, i64 5, !dbg !21
%gep29 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %42, !dbg !21
%43 = extractelement <8 x i64> %35, i64 6, !dbg !21
%gep31 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %43, !dbg !21
%44 = extractelement <8 x i64> %35, i64 7, !dbg !21
%gep33 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %44, !dbg !21
%45 = extractelement <8 x i1> %36, i64 0, !dbg !22
%46 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep, i1 %45, i16 0, i1 %45) #5, !dbg !22
%47 = extractelement <8 x i1> %36, i64 1, !dbg !22
%48 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep21, i1 %47, i16 0, i1 %47) #5, !dbg !22
%49 = extractelement <8 x i1> %36, i64 2, !dbg !22
%50 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep23, i1 %49, i16 0, i1 %49) #5, !dbg !22
%51 = extractelement <8 x i1> %36, i64 3, !dbg !22
%52 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep25, i1 %51, i16 0, i1 %51) #5, !dbg !22
%53 = extractelement <8 x i1> %36, i64 4, !dbg !22
%54 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep27, i1 %53, i16 0, i1 %53) #5, !dbg !22
%55 = extractelement <8 x i1> %36, i64 5, !dbg !22
%56 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep29, i1 %55, i16 0, i1 %55) #5, !dbg !22
%57 = extractelement <8 x i1> %36, i64 6, !dbg !22
%58 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep31, i1 %57, i16 0, i1 %57) #5, !dbg !22
%59 = extractelement <8 x i1> %36, i64 7, !dbg !22
%60 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep33, i1 %59, i16 0, i1 %59) #5, !dbg !22
%61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #5, !dbg !23
%62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #5, !dbg !23
%63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %50) #5, !dbg !23
%64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %52) #5, !dbg !23
%65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #5, !dbg !23
%66 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #5, !dbg !23
%67 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #5, !dbg !23
%68 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #5, !dbg !23
%69 = insertelement <8 x float> poison, float %61, i64 0, !dbg !24
%70 = insertelement <8 x float> %69, float %62, i64 1, !dbg !24
%71 = insertelement <8 x float> %70, float %63, i64 2, !dbg !24
%72 = insertelement <8 x float> %71, float %64, i64 3, !dbg !24
%73 = insertelement <8 x float> %72, float %65, i64 4, !dbg !24
%74 = insertelement <8 x float> %73, float %66, i64 5, !dbg !24
%75 = insertelement <8 x float> %74, float %67, i64 6, !dbg !24
%76 = insertelement <8 x float> %75, float %68, i64 7, !dbg !24
%77 = fcmp ule <8 x float> %30, %76, !dbg !24
%78 = and <8 x i1> %32, %77, !dbg !25
%79 = and <8 x i1> %36, %78, !dbg !26
%80 = select <8 x i1> %79, <8 x float> %76, <8 x float> %30, !dbg !26
%81 = add nuw nsw i32 %29, 2048, !dbg !14
%82 = icmp ult i32 %29, 48209, !dbg !14
br i1 %82, label %28, label %83, !dbg !14
83: ; preds = %28
%84 = and i32 %7, 31, !dbg !10
%85 = and i32 %8, 7, !dbg !10
%86 = extractelement <8 x float> %80, i64 0, !dbg !27
%87 = extractelement <8 x float> %80, i64 1, !dbg !27
%88 = fcmp ogt float %86, %87, !dbg !27
%89 = fcmp uno float %86, 0.000000e+00, !dbg !31
%90 = or i1 %88, %89, !dbg !32
%91 = select i1 %90, float %86, float %87, !dbg !33
%92 = extractelement <8 x float> %80, i64 2, !dbg !27
%93 = fcmp ogt float %91, %92, !dbg !27
%94 = fcmp uno float %91, 0.000000e+00, !dbg !31
%95 = or i1 %93, %94, !dbg !32
%96 = select i1 %95, float %91, float %92, !dbg !33
%97 = extractelement <8 x float> %80, i64 3, !dbg !27
%98 = fcmp ogt float %96, %97, !dbg !27
%99 = fcmp uno float %96, 0.000000e+00, !dbg !31
%100 = or i1 %98, %99, !dbg !32
%101 = select i1 %100, float %96, float %97, !dbg !33
%102 = extractelement <8 x float> %80, i64 4, !dbg !27
%103 = fcmp ogt float %101, %102, !dbg !27
%104 = fcmp uno float %101, 0.000000e+00, !dbg !31
%105 = or i1 %103, %104, !dbg !32
%106 = select i1 %105, float %101, float %102, !dbg !33
%107 = extractelement <8 x float> %80, i64 5, !dbg !27
%108 = fcmp ogt float %106, %107, !dbg !27
%109 = fcmp uno float %106, 0.000000e+00, !dbg !31
%110 = or i1 %108, %109, !dbg !32
%111 = select i1 %110, float %106, float %107, !dbg !33
%112 = extractelement <8 x float> %80, i64 6, !dbg !27
%113 = fcmp ogt float %111, %112, !dbg !27
%114 = fcmp uno float %111, 0.000000e+00, !dbg !31
%115 = or i1 %113, %114, !dbg !32
%116 = select i1 %115, float %111, float %112, !dbg !33
%117 = extractelement <8 x float> %80, i64 7, !dbg !27
%118 = fcmp ogt float %116, %117, !dbg !27
%119 = fcmp uno float %116, 0.000000e+00, !dbg !31
%120 = or i1 %118, %119, !dbg !32
%121 = select i1 %120, float %116, float %117, !dbg !33
%122 = bitcast float %121 to i32, !dbg !34
%123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 16, i32 31), !dbg !34
%124 = bitcast i32 %123 to float, !dbg !34
%125 = fcmp ogt float %121, %124, !dbg !27
%126 = fcmp uno float %121, 0.000000e+00, !dbg !31
%127 = or i1 %126, %125, !dbg !32
%128 = select i1 %127, float %121, float %124, !dbg !33
%129 = bitcast float %128 to i32, !dbg !34
%130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 8, i32 31), !dbg !34
%131 = bitcast i32 %130 to float, !dbg !34
%132 = fcmp ogt float %128, %131, !dbg !27
%133 = fcmp uno float %128, 0.000000e+00, !dbg !31
%134 = or i1 %132, %133, !dbg !32
%135 = select i1 %134, float %128, float %131, !dbg !33
%136 = bitcast float %135 to i32, !dbg !34
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 4, i32 31), !dbg !34
%138 = bitcast i32 %137 to float, !dbg !34
%139 = fcmp ogt float %135, %138, !dbg !27
%140 = fcmp uno float %135, 0.000000e+00, !dbg !31
%141 = or i1 %139, %140, !dbg !32
%142 = select i1 %141, float %135, float %138, !dbg !33
%143 = bitcast float %142 to i32, !dbg !34
%144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %143, i32 2, i32 31), !dbg !34
%145 = bitcast i32 %144 to float, !dbg !34
%146 = fcmp ogt float %142, %145, !dbg !27
%147 = fcmp uno float %142, 0.000000e+00, !dbg !31
%148 = or i1 %146, %147, !dbg !32
%149 = select i1 %148, float %142, float %145, !dbg !33
%150 = bitcast float %149 to i32, !dbg !34
%151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 1, i32 31), !dbg !34
%152 = bitcast i32 %151 to float, !dbg !34
%153 = fcmp ogt float %149, %152, !dbg !27
%154 = fcmp uno float %149, 0.000000e+00, !dbg !31
%155 = or i1 %153, %154, !dbg !32
%156 = select i1 %155, float %149, float %152, !dbg !33
%157 = icmp eq i32 %84, 0, !dbg !34
%158 = zext nneg i32 %85 to i64, !dbg !34
%159 = getelementptr float, ptr addrspace(3) @global_smem, i64 %158, !dbg !34
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %159, float %156, i1 %157) #5, !dbg !34
tail call void @llvm.nvvm.barrier0(), !dbg !34
%160 = icmp slt i32 %7, 8, !dbg !34
%161 = sext i32 %7 to i64, !dbg !34
%162 = getelementptr float, ptr addrspace(3) @global_smem, i64 %161, !dbg !34
%163 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %162, i1 %160) #5, !dbg !34
%164 = bitcast float %163 to i32, !dbg !34
%165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 4, i32 31), !dbg !34
%166 = bitcast i32 %165 to float, !dbg !34
%167 = fcmp ogt float %163, %166, !dbg !27
%168 = fcmp uno float %163, 0.000000e+00, !dbg !31
%169 = or i1 %168, %167, !dbg !32
%170 = select i1 %169, float %163, float %166, !dbg !33
%171 = bitcast float %170 to i32, !dbg !34
%172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 2, i32 31), !dbg !34
%173 = bitcast i32 %172 to float, !dbg !34
%174 = fcmp ogt float %170, %173, !dbg !27
%175 = fcmp uno float %170, 0.000000e+00, !dbg !31
%176 = or i1 %174, %175, !dbg !32
%177 = select i1 %176, float %170, float %173, !dbg !33
%178 = bitcast float %177 to i32, !dbg !34
%179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !34
%180 = bitcast i32 %179 to float, !dbg !34
%181 = fcmp ogt float %177, %180, !dbg !27
%182 = fcmp uno float %177, 0.000000e+00, !dbg !31
%183 = or i1 %181, %182, !dbg !32
%184 = select i1 %183, float %177, float %180, !dbg !33
%185 = and i32 %7, 7, !dbg !34
%186 = icmp eq i32 %185, 0, !dbg !34
%187 = and i1 %160, %186, !dbg !34
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %162, float %184, i1 %187) #5, !dbg !34
tail call void @llvm.nvvm.barrier0(), !dbg !34
%188 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !34
tail call void @llvm.nvvm.barrier0(), !dbg !36
%189 = insertelement <1 x float> undef, float %188, i64 0, !dbg !36
store <1 x float> %189, ptr addrspace(3) @global_smem, align 4, !dbg !36
tail call void @llvm.nvvm.barrier0(), !dbg !36
%190 = load i32, ptr addrspace(3) @global_smem, align 4, !dbg !36
%191 = getelementptr float, ptr addrspace(1) %1, i64 %17, !dbg !37
%192 = icmp eq i32 %urem, 0, !dbg !38
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %190, ptr addrspace(1) %191, i1 %192) #5, !dbg !38
br label %193, !dbg !39
193: ; preds = %83, %193
%194 = phi i32 [ 0, %83 ], [ %267, %193 ]
%195 = phi <8 x float> [ zeroinitializer, %83 ], [ %266, %193 ]
%196 = zext nneg i32 %194 to i64, !dbg !40
%197 = insertelement <8 x i64> poison, i64 %196, i64 0, !dbg !40
%198 = shufflevector <8 x i64> %197, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !40
%199 = or <8 x i64> %198, %26, !dbg !40
%200 = icmp ult <8 x i64> %199, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !41
%201 = extractelement <8 x i64> %199, i64 0, !dbg !42
%gep35 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %201, !dbg !42
%202 = extractelement <8 x i64> %199, i64 1, !dbg !42
%gep37 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %202, !dbg !42
%203 = extractelement <8 x i64> %199, i64 2, !dbg !42
%gep39 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %203, !dbg !42
%204 = extractelement <8 x i64> %199, i64 3, !dbg !42
%gep41 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %204, !dbg !42
%205 = extractelement <8 x i64> %199, i64 4, !dbg !42
%gep43 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %205, !dbg !42
%206 = extractelement <8 x i64> %199, i64 5, !dbg !42
%gep45 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %206, !dbg !42
%207 = extractelement <8 x i64> %199, i64 6, !dbg !42
%gep47 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %207, !dbg !42
%208 = extractelement <8 x i64> %199, i64 7, !dbg !42
%gep49 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %208, !dbg !42
%209 = extractelement <8 x i1> %200, i64 0, !dbg !43
%210 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep35, i1 %209, i16 0, i1 %209) #5, !dbg !43
%211 = extractelement <8 x i1> %200, i64 1, !dbg !43
%212 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep37, i1 %211, i16 0, i1 %211) #5, !dbg !43
%213 = extractelement <8 x i1> %200, i64 2, !dbg !43
%214 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep39, i1 %213, i16 0, i1 %213) #5, !dbg !43
%215 = extractelement <8 x i1> %200, i64 3, !dbg !43
%216 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep41, i1 %215, i16 0, i1 %215) #5, !dbg !43
%217 = extractelement <8 x i1> %200, i64 4, !dbg !43
%218 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep43, i1 %217, i16 0, i1 %217) #5, !dbg !43
%219 = extractelement <8 x i1> %200, i64 5, !dbg !43
%220 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep45, i1 %219, i16 0, i1 %219) #5, !dbg !43
%221 = extractelement <8 x i1> %200, i64 6, !dbg !43
%222 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep47, i1 %221, i16 0, i1 %221) #5, !dbg !43
%223 = extractelement <8 x i1> %200, i64 7, !dbg !43
%224 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep49, i1 %223, i16 0, i1 %223) #5, !dbg !43
%225 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %210) #5, !dbg !44
%226 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %212) #5, !dbg !44
%227 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %214) #5, !dbg !44
%228 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %216) #5, !dbg !44
%229 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %218) #5, !dbg !44
%230 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %220) #5, !dbg !44
%231 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %222) #5, !dbg !44
%232 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %224) #5, !dbg !44
%233 = fsub float %225, %188, !dbg !45
%234 = fsub float %226, %188, !dbg !45
%235 = fsub float %227, %188, !dbg !45
%236 = fsub float %228, %188, !dbg !45
%237 = fsub float %229, %188, !dbg !45
%238 = fsub float %230, %188, !dbg !45
%239 = fsub float %231, %188, !dbg !45
%240 = fsub float %232, %188, !dbg !45
%241 = fmul float %233, 0x3FF7154760000000, !dbg !46
%242 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %241) #5, !dbg !46
%243 = fmul float %234, 0x3FF7154760000000, !dbg !46
%244 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %243) #5, !dbg !46
%245 = fmul float %235, 0x3FF7154760000000, !dbg !46
%246 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %245) #5, !dbg !46
%247 = fmul float %236, 0x3FF7154760000000, !dbg !46
%248 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %247) #5, !dbg !46
%249 = fmul float %237, 0x3FF7154760000000, !dbg !46
%250 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %249) #5, !dbg !46
%251 = fmul float %238, 0x3FF7154760000000, !dbg !46
%252 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %251) #5, !dbg !46
%253 = fmul float %239, 0x3FF7154760000000, !dbg !46
%254 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %253) #5, !dbg !46
%255 = fmul float %240, 0x3FF7154760000000, !dbg !46
%256 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %255) #5, !dbg !46
%257 = insertelement <8 x float> poison, float %242, i64 0, !dbg !47
%258 = insertelement <8 x float> %257, float %244, i64 1, !dbg !47
%259 = insertelement <8 x float> %258, float %246, i64 2, !dbg !47
%260 = insertelement <8 x float> %259, float %248, i64 3, !dbg !47
%261 = insertelement <8 x float> %260, float %250, i64 4, !dbg !47
%262 = insertelement <8 x float> %261, float %252, i64 5, !dbg !47
%263 = insertelement <8 x float> %262, float %254, i64 6, !dbg !47
%264 = insertelement <8 x float> %263, float %256, i64 7, !dbg !47
%265 = select <8 x i1> %200, <8 x float> %264, <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !47
%266 = fadd <8 x float> %195, %265, !dbg !47
%267 = add nuw nsw i32 %194, 2048, !dbg !39
%268 = icmp ult i32 %194, 48209, !dbg !39
br i1 %268, label %193, label %269, !dbg !39
269: ; preds = %193
tail call void @llvm.nvvm.barrier0(), !dbg !48
%shift = shufflevector <8 x float> %266, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !52
%270 = fadd <8 x float> %266, %shift, !dbg !52
%shift95 = shufflevector <8 x float> %266, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !52
%271 = fadd <8 x float> %shift95, %270, !dbg !52
%shift96 = shufflevector <8 x float> %266, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !52
%272 = fadd <8 x float> %shift96, %271, !dbg !52
%shift97 = shufflevector <8 x float> %266, <8 x float> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !52
%273 = fadd <8 x float> %shift97, %272, !dbg !52
%shift98 = shufflevector <8 x float> %266, <8 x float> poison, <8 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !52
%274 = fadd <8 x float> %shift98, %273, !dbg !52
%shift99 = shufflevector <8 x float> %266, <8 x float> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !52
%275 = fadd <8 x float> %shift99, %274, !dbg !52
%shift100 = shufflevector <8 x float> %266, <8 x float> poison, <8 x i32> <i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !52
%276 = fadd <8 x float> %shift100, %275, !dbg !52
%277 = extractelement <8 x float> %276, i64 0, !dbg !52
%278 = bitcast float %277 to i32, !dbg !48
%279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 16, i32 31), !dbg !48
%280 = bitcast i32 %279 to float, !dbg !48
%281 = fadd float %277, %280, !dbg !52
%282 = bitcast float %281 to i32, !dbg !48
%283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %282, i32 8, i32 31), !dbg !48
%284 = bitcast i32 %283 to float, !dbg !48
%285 = fadd float %281, %284, !dbg !52
%286 = bitcast float %285 to i32, !dbg !48
%287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 4, i32 31), !dbg !48
%288 = bitcast i32 %287 to float, !dbg !48
%289 = fadd float %285, %288, !dbg !52
%290 = bitcast float %289 to i32, !dbg !48
%291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 2, i32 31), !dbg !48
%292 = bitcast i32 %291 to float, !dbg !48
%293 = fadd float %289, %292, !dbg !52
%294 = bitcast float %293 to i32, !dbg !48
%295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 1, i32 31), !dbg !48
%296 = bitcast i32 %295 to float, !dbg !48
%297 = fadd float %293, %296, !dbg !52
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %159, float %297, i1 %157) #5, !dbg !48
tail call void @llvm.nvvm.barrier0(), !dbg !48
%298 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %162, i1 %160) #5, !dbg !48
%299 = bitcast float %298 to i32, !dbg !48
%300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 4, i32 31), !dbg !48
%301 = bitcast i32 %300 to float, !dbg !48
%302 = fadd float %298, %301, !dbg !52
%303 = bitcast float %302 to i32, !dbg !48
%304 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %303, i32 2, i32 31), !dbg !48
%305 = bitcast i32 %304 to float, !dbg !48
%306 = fadd float %302, %305, !dbg !52
%307 = bitcast float %306 to i32, !dbg !48
%308 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %307, i32 1, i32 31), !dbg !48
%309 = bitcast i32 %308 to float, !dbg !48
%310 = fadd float %306, %309, !dbg !52
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %162, float %310, i1 %187) #5, !dbg !48
tail call void @llvm.nvvm.barrier0(), !dbg !48
%311 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !48
tail call void @llvm.nvvm.barrier0(), !dbg !56
%312 = insertelement <1 x float> undef, float %311, i64 0, !dbg !56
store <1 x float> %312, ptr addrspace(3) @global_smem, align 4, !dbg !56
tail call void @llvm.nvvm.barrier0(), !dbg !56
%313 = load i32, ptr addrspace(3) @global_smem, align 4, !dbg !56
%314 = getelementptr float, ptr addrspace(1) %2, i64 %17, !dbg !57
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %313, ptr addrspace(1) %314, i1 %192) #5, !dbg !58
%315 = fcmp olt float %311, 0x3810000000000000, !dbg !59
%316 = fmul float %311, 0x4160000000000000, !dbg !59
%.02.i = select i1 %315, float %316, float %311, !dbg !59
%i.i.0.i = select i1 %315, float -2.300000e+01, float 0.000000e+00, !dbg !59
%317 = bitcast float %.02.i to i32, !dbg !59
%318 = add i32 %317, -1059760811, !dbg !59
%319 = and i32 %318, -8388608, !dbg !59
%320 = sub i32 %317, %319, !dbg !59
%321 = bitcast i32 %320 to float, !dbg !59
%322 = sitofp i32 %319 to float, !dbg !59
%323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !59
%.not.i = icmp eq i32 %323, 0, !dbg !59
%324 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %322, float 0x3E80000000000000, float %i.i.0.i) #5, !dbg !59
%325 = tail call float @llvm.nvvm.fma.rn.f(float %322, float 0x3E80000000000000, float %i.i.0.i) #5, !dbg !59
%.08.i = select i1 %.not.i, float %325, float %324, !dbg !59
%326 = fadd float %321, -1.000000e+00, !dbg !59
%327 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !59
%.not1.i = icmp eq i32 %327, 0, !dbg !59
%328 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %326, float 0x3FC2073EC0000000) #5, !dbg !59
%329 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %326, float 0x3FC2073EC0000000) #5, !dbg !59
%.010.i = select i1 %.not1.i, float %329, float %328, !dbg !59
%330 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !59
%.not2.i = icmp eq i32 %330, 0, !dbg !59
%331 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i, float %326, float 0xBFBF19B980000000) #5, !dbg !59
%332 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i, float %326, float 0xBFBF19B980000000) #5, !dbg !59
%.011.i = select i1 %.not2.i, float %332, float %331, !dbg !59
%333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !59
%.not3.i = icmp eq i32 %333, 0, !dbg !59
%334 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i, float %326, float 0x3FC1E52AA0000000) #5, !dbg !59
%335 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i, float %326, float 0x3FC1E52AA0000000) #5, !dbg !59
%.012.i = select i1 %.not3.i, float %335, float %334, !dbg !59
%336 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !59
%.not4.i = icmp eq i32 %336, 0, !dbg !59
%337 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i, float %326, float 0xBFC55B1720000000) #5, !dbg !59
%338 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i, float %326, float 0xBFC55B1720000000) #5, !dbg !59
%.09.i = select i1 %.not4.i, float %338, float %337, !dbg !59
%339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !59
%.not5.i = icmp eq i32 %339, 0, !dbg !59
%340 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i, float %326, float 0x3FC99DA160000000) #5, !dbg !59
%341 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i, float %326, float 0x3FC99DA160000000) #5, !dbg !59
%.05.i = select i1 %.not5.i, float %341, float %340, !dbg !59
%342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !59
%.not6.i = icmp eq i32 %342, 0, !dbg !59
%343 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %326, float 0xBFCFFFE440000000) #5, !dbg !59
%344 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %326, float 0xBFCFFFE440000000) #5, !dbg !59
%.01.i = select i1 %.not6.i, float %344, float %343, !dbg !59
%345 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !59
%.not7.i = icmp eq i32 %345, 0, !dbg !59
%346 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i, float %326, float 0x3FD5554F00000000) #5, !dbg !59
%347 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i, float %326, float 0x3FD5554F00000000) #5, !dbg !59
%.0.i = select i1 %.not7.i, float %347, float %346, !dbg !59
%348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !59
%.not8.i = icmp eq i32 %348, 0, !dbg !59
%349 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i, float %326, float -5.000000e-01) #5, !dbg !59
%350 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i, float %326, float -5.000000e-01) #5, !dbg !59
%.07.i = select i1 %.not8.i, float %350, float %349, !dbg !59
%351 = fmul float %326, %.07.i, !dbg !59
%352 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !59
%.not9.i = icmp eq i32 %352, 0, !dbg !59
%353 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %351, float %326, float %326) #5, !dbg !59
%354 = tail call float @llvm.nvvm.fma.rn.f(float %351, float %326, float %326) #5, !dbg !59
%.06.i = select i1 %.not9.i, float %354, float %353, !dbg !59
%355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !59
%.not10.i = icmp eq i32 %355, 0, !dbg !59
%356 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i, float 0x3FE62E4300000000, float %.06.i) #5, !dbg !59
%357 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i, float 0x3FE62E4300000000, float %.06.i) #5, !dbg !59
%.04.i = select i1 %.not10.i, float %357, float %356, !dbg !59
%358 = icmp ugt i32 %317, 2139095039, !dbg !59
br i1 %358, label %__nv_fmaf_rn.exit.i.i, label %__nv_logf.exit, !dbg !59
__nv_fmaf_rn.exit.i.i: ; preds = %269
%359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !59
%.not11.i = icmp eq i32 %359, 0, !dbg !59
%360 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !59
%361 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !59
%.03.i = select i1 %.not11.i, float %361, float %360, !dbg !59
br label %__nv_logf.exit, !dbg !59
__nv_logf.exit: ; preds = %269, %__nv_fmaf_rn.exit.i.i
%r.i.0.i = phi float [ %.03.i, %__nv_fmaf_rn.exit.i.i ], [ %.04.i, %269 ], !dbg !59
%362 = fcmp oeq float %.02.i, 0.000000e+00, !dbg !59
%r.i.1.i = select i1 %362, float 0xFFF0000000000000, float %r.i.0.i, !dbg !59
%363 = extractelement <8 x i64> %26, i64 0, !dbg !60
%364 = extractelement <8 x i64> %26, i64 1, !dbg !60
%365 = extractelement <8 x i64> %26, i64 2, !dbg !60
%366 = extractelement <8 x i64> %26, i64 3, !dbg !60
%367 = extractelement <8 x i64> %26, i64 4, !dbg !60
%368 = extractelement <8 x i64> %26, i64 5, !dbg !60
%369 = extractelement <8 x i64> %26, i64 6, !dbg !60
%370 = extractelement <8 x i64> %26, i64 7, !dbg !60
br label %371, !dbg !61
371: ; preds = %__nv_logf.exit, %371
%372 = phi i32 [ 0, %__nv_logf.exit ], [ %454, %371 ]
%373 = zext nneg i32 %372 to i64, !dbg !60
%374 = or i64 %363, %373, !dbg !60
%375 = or i64 %364, %373, !dbg !60
%376 = or i64 %365, %373, !dbg !60
%377 = or i64 %366, %373, !dbg !60
%378 = or i64 %367, %373, !dbg !60
%379 = or i64 %368, %373, !dbg !60
%380 = or i64 %369, %373, !dbg !60
%381 = or i64 %370, %373, !dbg !60
%382 = icmp ult i64 %374, 50257, !dbg !62
%383 = icmp ult i64 %375, 50257, !dbg !62
%384 = icmp ult i64 %376, 50257, !dbg !62
%385 = icmp ult i64 %377, 50257, !dbg !62
%386 = icmp ult i64 %378, 50257, !dbg !62
%387 = icmp ult i64 %379, 50257, !dbg !62
%388 = icmp ult i64 %380, 50257, !dbg !62
%389 = icmp ult i64 %381, 50257, !dbg !62
%390 = add nsw i64 %374, %27, !dbg !63
%391 = add nsw i64 %375, %27, !dbg !63
%392 = add nsw i64 %376, %27, !dbg !63
%393 = add nsw i64 %377, %27, !dbg !63
%394 = add nsw i64 %378, %27, !dbg !63
%395 = add nsw i64 %379, %27, !dbg !63
%396 = add nsw i64 %380, %27, !dbg !63
%397 = add nsw i64 %381, %27, !dbg !63
%398 = getelementptr i16, ptr addrspace(1) %0, i64 %390, !dbg !64
%399 = getelementptr i16, ptr addrspace(1) %0, i64 %391, !dbg !64
%400 = getelementptr i16, ptr addrspace(1) %0, i64 %392, !dbg !64
%401 = getelementptr i16, ptr addrspace(1) %0, i64 %393, !dbg !64
%402 = getelementptr i16, ptr addrspace(1) %0, i64 %394, !dbg !64
%403 = getelementptr i16, ptr addrspace(1) %0, i64 %395, !dbg !64
%404 = getelementptr i16, ptr addrspace(1) %0, i64 %396, !dbg !64
%405 = getelementptr i16, ptr addrspace(1) %0, i64 %397, !dbg !64
%406 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %398, i1 %382, i16 0, i1 %382) #5, !dbg !65
%407 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %399, i1 %383, i16 0, i1 %383) #5, !dbg !65
%408 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %400, i1 %384, i16 0, i1 %384) #5, !dbg !65
%409 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %401, i1 %385, i16 0, i1 %385) #5, !dbg !65
%410 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %402, i1 %386, i16 0, i1 %386) #5, !dbg !65
%411 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %403, i1 %387, i16 0, i1 %387) #5, !dbg !65
%412 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %404, i1 %388, i16 0, i1 %388) #5, !dbg !65
%413 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %405, i1 %389, i16 0, i1 %389) #5, !dbg !65
%414 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %406) #5, !dbg !66
%415 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %407) #5, !dbg !66
%416 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %408) #5, !dbg !66
%417 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %409) #5, !dbg !66
%418 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %410) #5, !dbg !66
%419 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %411) #5, !dbg !66
%420 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %412) #5, !dbg !66
%421 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %413) #5, !dbg !66
%422 = fsub float %414, %188, !dbg !67
%423 = fsub float %415, %188, !dbg !67
%424 = fsub float %416, %188, !dbg !67
%425 = fsub float %417, %188, !dbg !67
%426 = fsub float %418, %188, !dbg !67
%427 = fsub float %419, %188, !dbg !67
%428 = fsub float %420, %188, !dbg !67
%429 = fsub float %421, %188, !dbg !67
%430 = fsub float %422, %r.i.1.i, !dbg !68
%431 = fsub float %423, %r.i.1.i, !dbg !68
%432 = fsub float %424, %r.i.1.i, !dbg !68
%433 = fsub float %425, %r.i.1.i, !dbg !68
%434 = fsub float %426, %r.i.1.i, !dbg !68
%435 = fsub float %427, %r.i.1.i, !dbg !68
%436 = fsub float %428, %r.i.1.i, !dbg !68
%437 = fsub float %429, %r.i.1.i, !dbg !68
%438 = getelementptr i16, ptr addrspace(1) %3, i64 %390, !dbg !69
%439 = getelementptr i16, ptr addrspace(1) %3, i64 %391, !dbg !69
%440 = getelementptr i16, ptr addrspace(1) %3, i64 %392, !dbg !69
%441 = getelementptr i16, ptr addrspace(1) %3, i64 %393, !dbg !69
%442 = getelementptr i16, ptr addrspace(1) %3, i64 %394, !dbg !69
%443 = getelementptr i16, ptr addrspace(1) %3, i64 %395, !dbg !69
%444 = getelementptr i16, ptr addrspace(1) %3, i64 %396, !dbg !69
%445 = getelementptr i16, ptr addrspace(1) %3, i64 %397, !dbg !69
%446 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %430) #5, !dbg !70
%447 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %431) #5, !dbg !70
%448 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %432) #5, !dbg !70
%449 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %433) #5, !dbg !70
%450 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %434) #5, !dbg !70
%451 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %435) #5, !dbg !70
%452 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %436) #5, !dbg !70
%453 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %437) #5, !dbg !70
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %446, ptr addrspace(1) %438, i1 %382) #5, !dbg !70
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %447, ptr addrspace(1) %439, i1 %383) #5, !dbg !70
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %448, ptr addrspace(1) %440, i1 %384) #5, !dbg !70
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %449, ptr addrspace(1) %441, i1 %385) #5, !dbg !70
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %450, ptr addrspace(1) %442, i1 %386) #5, !dbg !70
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %451, ptr addrspace(1) %443, i1 %387) #5, !dbg !70
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %452, ptr addrspace(1) %444, i1 %388) #5, !dbg !70
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %453, ptr addrspace(1) %445, i1 %389) #5, !dbg !70
%454 = add nuw nsw i32 %372, 2048, !dbg !61
%455 = icmp ult i32 %372, 48209, !dbg !61
br i1 %455, label %371, label %456, !dbg !61
456: ; preds = %371
ret void, !dbg !71
}
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2
; Function Attrs: alwaysinline nounwind
;
; float __nv_logf(float %a)
;
; Single-precision natural logarithm. The input is split as a = m * 2^e and
; the result is assembled as
;     log(a) = e * ln2 + log(m)
; where log(m) is evaluated as a polynomial in (m - 1) using Horner's scheme.
;
; Structure of the body:
;   1. Rescale inputs below 2^-126 by 2^23 and remember a -23 exponent bias.
;   2. Split the bit pattern into an exponent part and a mantissa m
;      (m ends up roughly in [2/3, 4/3), because the split point is the bit
;      pattern 0x3F2AAAAB ~= 0.6667).
;   3. Evaluate the polynomial and combine with e * ln2.
;   4. Patch up special inputs (+inf / NaN / sign bit set, and zero).
;
; Every fused multiply-add is emitted twice -- once with the flush-to-zero
; intrinsic (llvm.nvvm.fma.rn.ftz.f) and once without (llvm.nvvm.fma.rn.f) --
; and the result is chosen with a select on __nvvm_reflect(@.str), where @.str
; is the string "__CUDA_FTZ". A reflect value of 0 picks the non-ftz result.
; NOTE(review): this is presumably folded to a single fma by the NVVMReflect
; pass before code generation -- confirm against the NVPTX pipeline in use.
define float @__nv_logf(float %a) local_unnamed_addr #3 {
__nv_fmaf_rn.exit10.i:
; --- Step 1: subnormal handling -------------------------------------------
; 0x3810000000000000 is 2^-126 (smallest normal float); 0x4160000000000000 is
; 2^23. Inputs below the threshold are scaled up by 2^23 and the exponent
; accumulator starts at -23.0 to compensate; otherwise it starts at 0.0.
%0 = fcmp olt float %a, 0x3810000000000000
%1 = fmul float %a, 0x4160000000000000
%.02 = select i1 %0, float %1, float %a
%i.i.0 = select i1 %0, float -2.300000e+01, float 0.000000e+00
; --- Step 2: exponent / mantissa split on the raw bits ---------------------
; -1059760811 == -0x3F2AAAAB; -8388608 == 0xFF800000 (sign + exponent mask).
; %4 holds the exponent contribution (a multiple of 2^23) and %6 is the
; input with that contribution removed, i.e. the reduced mantissa m.
%2 = bitcast float %.02 to i32
%3 = add i32 %2, -1059760811
%4 = and i32 %3, -8388608
%5 = sub i32 %2, %4
%6 = bitcast i32 %5 to float
%7 = sitofp i32 %4 to float
; %.08 = %7 * 2^-23 + bias  (0x3E80000000000000 is 2^-23), i.e. the exponent e
; as a float, including the -23 correction from step 1 when it applies.
%8 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
%.not = icmp eq i32 %8, 0
%9 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %7, float 0x3E80000000000000, float %i.i.0) #5
%10 = tail call float @llvm.nvvm.fma.rn.f(float %7, float 0x3E80000000000000, float %i.i.0) #5
%.08 = select i1 %.not, float %10, float %9
; %11 = m - 1, the polynomial argument (called t in the comments below).
%11 = fadd float %6, -1.000000e+00
; --- Step 3: Horner evaluation of the polynomial in t ----------------------
; Each group below computes p = p * t + c for the next coefficient c.
; First step seeds the recurrence: p = c0 * t + c1.
%12 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
%.not1 = icmp eq i32 %12, 0
%13 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %11, float 0x3FC2073EC0000000) #5
%14 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %11, float 0x3FC2073EC0000000) #5
%.010 = select i1 %.not1, float %14, float %13
; p = p * t + c2
%15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
%.not2 = icmp eq i32 %15, 0
%16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010, float %11, float 0xBFBF19B980000000) #5
%17 = tail call float @llvm.nvvm.fma.rn.f(float %.010, float %11, float 0xBFBF19B980000000) #5
%.011 = select i1 %.not2, float %17, float %16
; p = p * t + c3
%18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
%.not3 = icmp eq i32 %18, 0
%19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011, float %11, float 0x3FC1E52AA0000000) #5
%20 = tail call float @llvm.nvvm.fma.rn.f(float %.011, float %11, float 0x3FC1E52AA0000000) #5
%.012 = select i1 %.not3, float %20, float %19
; p = p * t + c4
%21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
%.not4 = icmp eq i32 %21, 0
%22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012, float %11, float 0xBFC55B1720000000) #5
%23 = tail call float @llvm.nvvm.fma.rn.f(float %.012, float %11, float 0xBFC55B1720000000) #5
%.09 = select i1 %.not4, float %23, float %22
; p = p * t + c5
%24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
%.not5 = icmp eq i32 %24, 0
%25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09, float %11, float 0x3FC99DA160000000) #5
%26 = tail call float @llvm.nvvm.fma.rn.f(float %.09, float %11, float 0x3FC99DA160000000) #5
%.05 = select i1 %.not5, float %26, float %25
; p = p * t + c6
%27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
%.not6 = icmp eq i32 %27, 0
%28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %11, float 0xBFCFFFE440000000) #5
%29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %11, float 0xBFCFFFE440000000) #5
%.01 = select i1 %.not6, float %29, float %28
; p = p * t + c7
%30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
%.not7 = icmp eq i32 %30, 0
%31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01, float %11, float 0x3FD5554F00000000) #5
%32 = tail call float @llvm.nvvm.fma.rn.f(float %.01, float %11, float 0x3FD5554F00000000) #5
%.0 = select i1 %.not7, float %32, float %31
; p = p * t - 0.5
%33 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
%.not8 = icmp eq i32 %33, 0
%34 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0, float %11, float -5.000000e-01) #5
%35 = tail call float @llvm.nvvm.fma.rn.f(float %.0, float %11, float -5.000000e-01) #5
%.07 = select i1 %.not8, float %35, float %34
; log(m) ~= (t * p) * t + t
%36 = fmul float %11, %.07
%37 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
%.not9 = icmp eq i32 %37, 0
%38 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %36, float %11, float %11) #5
%39 = tail call float @llvm.nvvm.fma.rn.f(float %36, float %11, float %11) #5
%.06 = select i1 %.not9, float %39, float %38
; result = e * ln2 + log(m); 0x3FE62E4300000000 ~= 0.693147 (ln 2 as a float).
%40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
%.not10 = icmp eq i32 %40, 0
%41 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08, float 0x3FE62E4300000000, float %.06) #5
%42 = tail call float @llvm.nvvm.fma.rn.f(float %.08, float 0x3FE62E4300000000, float %.06) #5
%.04 = select i1 %.not10, float %42, float %41
; --- Step 4: special inputs ------------------------------------------------
; 2139095039 == 0x7F7FFFFF, the bit pattern of the largest finite float. The
; unsigned compare is true for +inf, NaN, and every pattern with the sign bit
; set; those take the fix-up path instead of the polynomial result.
%43 = icmp ugt i32 %2, 2139095039
br i1 %43, label %__nv_fmaf_rn.exit.i, label %__internal_accurate_logf.exit
__nv_fmaf_rn.exit.i: ; preds = %__nv_fmaf_rn.exit10.i
; Fix-up value is fma(x, +inf, +inf) (0x7FF0000000000000 is +inf): +inf for
; x = +inf, NaN for NaN, and NaN for negative x (-inf + inf).
%44 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
%.not11 = icmp eq i32 %44, 0
%45 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float 0x7FF0000000000000, float 0x7FF0000000000000) #5
%46 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float 0x7FF0000000000000, float 0x7FF0000000000000) #5
%.03 = select i1 %.not11, float %46, float %45
br label %__internal_accurate_logf.exit
__internal_accurate_logf.exit: ; preds = %__nv_fmaf_rn.exit.i, %__nv_fmaf_rn.exit10.i
; Merge the fix-up value with the regular polynomial result.
%r.i.0 = phi float [ %.03, %__nv_fmaf_rn.exit.i ], [ %.04, %__nv_fmaf_rn.exit10.i ]
; A zero input (the ordered-equal compare matches both +0.0 and -0.0)
; overrides everything above and returns -inf (0xFFF0000000000000).
%47 = fcmp oeq float %.02, 0.000000e+00
%r.i.1 = select i1 %47, float 0xFFF0000000000000, float %r.i.0
ret float %r.i.1
}
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #2 = { convergent nocallback nounwind }
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { nounwind }
!llvm.module.flags = !{!0, !1}
!llvm.dbg.cu = !{!2}
!nvvm.annotations = !{!4, !5, !5, !4}
!llvm.ident = !{!6}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!3 = !DIFile(filename: "cgu6pijtlte2d3yicmpedfh2c7wgxsaexd6ichnxwbwh4deqe6ck.py", directory: "/tmp/torchinductor_root/gu")
!4 = !{ptr @triton__0d1d2d3d4de5, !"kernel", i32 1}
!5 = !{ptr @triton__0d1d2d3d4de5, !"maxntidx", i32 256}
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4de5", linkageName: "triton__0d1d2d3d4de5", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
!9 = !{}
!10 = !DILocation(line: 24, column: 33, scope: !7)
!11 = !DILocation(line: 21, column: 28, scope: !7)
!12 = !DILocation(line: 21, column: 34, scope: !7)
!13 = !DILocation(line: 31, column: 46, scope: !7)
!14 = !DILocation(line: 27, column: 36, scope: !7)
!15 = !DILocation(line: 28, column: 27, scope: !7)
!16 = !DILocation(line: 38, column: 21, scope: !17, inlinedAt: !19)
!17 = distinct !DILexicalBlockFile(scope: !7, file: !18, discriminator: 0)
!18 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
!19 = !DILocation(line: 34, column: 45, scope: !17)
!20 = !DILocation(line: 29, column: 25, scope: !7)
!21 = !DILocation(line: 31, column: 34, scope: !7)
!22 = !DILocation(line: 31, column: 52, scope: !7)
!23 = !DILocation(line: 31, column: 103, scope: !7)
!24 = !DILocation(line: 36, column: 15, scope: !17, inlinedAt: !19)
!25 = !DILocation(line: 38, column: 16, scope: !17, inlinedAt: !19)
!26 = !DILocation(line: 0, scope: !7)
!27 = !DILocation(line: 36, column: 15, scope: !28, inlinedAt: !29)
!28 = distinct !DILexicalBlockFile(scope: !17, file: !18, discriminator: 0)
!29 = !DILocation(line: 49, column: 29, scope: !28, inlinedAt: !30)
!30 = !DILocation(line: 36, column: 38, scope: !28)
!31 = !DILocation(line: 38, column: 21, scope: !28, inlinedAt: !29)
!32 = !DILocation(line: 38, column: 16, scope: !28, inlinedAt: !29)
!33 = !DILocation(line: 39, column: 29, scope: !28, inlinedAt: !29)
!34 = !DILocation(line: 49, column: 29, scope: !17, inlinedAt: !35)
!35 = !DILocation(line: 36, column: 38, scope: !17)
!36 = !DILocation(line: 36, column: 41, scope: !7)
!37 = !DILocation(line: 37, column: 25, scope: !7)
!38 = !DILocation(line: 37, column: 36, scope: !7)
!39 = !DILocation(line: 39, column: 36, scope: !7)
!40 = !DILocation(line: 40, column: 27, scope: !7)
!41 = !DILocation(line: 41, column: 25, scope: !7)
!42 = !DILocation(line: 43, column: 34, scope: !7)
!43 = !DILocation(line: 43, column: 52, scope: !7)
!44 = !DILocation(line: 43, column: 103, scope: !7)
!45 = !DILocation(line: 45, column: 22, scope: !7)
!46 = !DILocation(line: 46, column: 22, scope: !7)
!47 = !DILocation(line: 49, column: 40, scope: !7)
!48 = !DILocation(line: 243, column: 36, scope: !49, inlinedAt: !51)
!49 = distinct !DILexicalBlockFile(scope: !7, file: !50, discriminator: 0)
!50 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!51 = !DILocation(line: 50, column: 27, scope: !49)
!52 = !DILocation(line: 233, column: 15, scope: !53, inlinedAt: !54)
!53 = distinct !DILexicalBlockFile(scope: !49, file: !50, discriminator: 0)
!54 = !DILocation(line: 243, column: 36, scope: !53, inlinedAt: !55)
!55 = !DILocation(line: 50, column: 27, scope: !53)
!56 = !DILocation(line: 50, column: 30, scope: !7)
!57 = !DILocation(line: 51, column: 25, scope: !7)
!58 = !DILocation(line: 51, column: 37, scope: !7)
!59 = !DILocation(line: 59, column: 23, scope: !7)
!60 = !DILocation(line: 53, column: 27, scope: !7)
!61 = !DILocation(line: 52, column: 36, scope: !7)
!62 = !DILocation(line: 54, column: 25, scope: !7)
!63 = !DILocation(line: 56, column: 41, scope: !7)
!64 = !DILocation(line: 56, column: 35, scope: !7)
!65 = !DILocation(line: 56, column: 53, scope: !7)
!66 = !DILocation(line: 56, column: 105, scope: !7)
!67 = !DILocation(line: 58, column: 24, scope: !7)
!68 = !DILocation(line: 60, column: 24, scope: !7)
!69 = !DILocation(line: 62, column: 29, scope: !7)
!70 = !DILocation(line: 62, column: 54, scope: !7)
!71 = !DILocation(line: 52, column: 4, scope: !7)