; 0-hero's picture
; Add files using upload-large-folder tool
; b7591d4 verified
; ModuleID = 'LLVMDialectModule'
; Module-level header: the source file name recorded for this module, followed
; by the single global variable the module declares.
; NOTE(review): the name "LLVMDialectModule" presumably comes from an MLIR
; LLVM-dialect -> LLVM IR translation step; confirm against the producing tool.
source_filename = "LLVMDialectModule"
; @global_smem is a byte array placed in address space 3. It is declared
; `external` with a zero-length type ([0 x i8]) and no initializer, so this
; module neither provides its storage nor fixes its size.
; NOTE(review): on the NVPTX target address space 3 is the shared address
; space, so this is presumably the dynamically sized shared-memory buffer whose
; size is chosen at kernel launch -- confirm against the target triple and the
; launch configuration, neither of which is stated in these lines.
@global_smem = external addrspace(3) global [0 x i8]
define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
%10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
%11 = and i32 %10, 31, !dbg !8
%12 = lshr i32 %10, 6, !dbg !8
%13 = and i32 %12, 3, !dbg !8
%14 = or i32 %13, 4, !dbg !8
%15 = or i32 %13, 8, !dbg !8
%16 = or i32 %13, 12, !dbg !8
%17 = or i32 %13, 16, !dbg !8
%18 = or i32 %13, 20, !dbg !8
%19 = or i32 %13, 24, !dbg !8
%20 = or i32 %13, 28, !dbg !8
%21 = or i32 %13, 32, !dbg !8
%22 = or i32 %13, 36, !dbg !8
%23 = or i32 %13, 40, !dbg !8
%24 = or i32 %13, 44, !dbg !8
%25 = or i32 %13, 48, !dbg !8
%26 = or i32 %13, 52, !dbg !8
%27 = or i32 %13, 56, !dbg !8
%28 = or i32 %13, 60, !dbg !8
%urem = and i32 %10, 63, !dbg !9
%29 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
%30 = sext i32 %29 to i64, !dbg !11
%31 = shl nsw i64 %30, 6, !dbg !12
%32 = zext nneg i32 %13 to i64
%33 = zext nneg i32 %14 to i64
%34 = zext nneg i32 %15 to i64
%35 = zext nneg i32 %16 to i64
%36 = zext nneg i32 %17 to i64
%37 = zext nneg i32 %18 to i64
%38 = zext nneg i32 %19 to i64
%39 = zext nneg i32 %20 to i64
%40 = zext nneg i32 %21 to i64
%41 = zext nneg i32 %22 to i64
%42 = zext nneg i32 %23 to i64
%43 = zext nneg i32 %24 to i64
%44 = zext nneg i32 %25 to i64
%45 = zext nneg i32 %26 to i64
%46 = zext nneg i32 %27 to i64
%47 = zext nneg i32 %28 to i64
%48 = or i64 %31, %32, !dbg !13
%49 = or i64 %31, %33, !dbg !13
%50 = or i64 %31, %34, !dbg !13
%51 = or i64 %31, %35, !dbg !13
%52 = or i64 %31, %36, !dbg !13
%53 = or i64 %31, %37, !dbg !13
%54 = or i64 %31, %38, !dbg !13
%55 = or i64 %31, %39, !dbg !13
%56 = or i64 %31, %40, !dbg !13
%57 = or i64 %31, %41, !dbg !13
%58 = or i64 %31, %42, !dbg !13
%59 = or i64 %31, %43, !dbg !13
%60 = or i64 %31, %44, !dbg !13
%61 = or i64 %31, %45, !dbg !13
%62 = or i64 %31, %46, !dbg !13
%63 = or i64 %31, %47, !dbg !13
%64 = getelementptr i64, ptr addrspace(1) %1, i64 %48, !dbg !14
%65 = getelementptr i64, ptr addrspace(1) %1, i64 %49, !dbg !14
%66 = getelementptr i64, ptr addrspace(1) %1, i64 %50, !dbg !14
%67 = getelementptr i64, ptr addrspace(1) %1, i64 %51, !dbg !14
%68 = getelementptr i64, ptr addrspace(1) %1, i64 %52, !dbg !14
%69 = getelementptr i64, ptr addrspace(1) %1, i64 %53, !dbg !14
%70 = getelementptr i64, ptr addrspace(1) %1, i64 %54, !dbg !14
%71 = getelementptr i64, ptr addrspace(1) %1, i64 %55, !dbg !14
%72 = getelementptr i64, ptr addrspace(1) %1, i64 %56, !dbg !14
%73 = getelementptr i64, ptr addrspace(1) %1, i64 %57, !dbg !14
%74 = getelementptr i64, ptr addrspace(1) %1, i64 %58, !dbg !14
%75 = getelementptr i64, ptr addrspace(1) %1, i64 %59, !dbg !14
%76 = getelementptr i64, ptr addrspace(1) %1, i64 %60, !dbg !14
%77 = getelementptr i64, ptr addrspace(1) %1, i64 %61, !dbg !14
%78 = getelementptr i64, ptr addrspace(1) %1, i64 %62, !dbg !14
%79 = getelementptr i64, ptr addrspace(1) %1, i64 %63, !dbg !14
%80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %64, i1 true) #3, !dbg !15
%81 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %65, i1 true) #3, !dbg !15
%82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !15
%83 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %67, i1 true) #3, !dbg !15
%84 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %68, i1 true) #3, !dbg !15
%85 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %69, i1 true) #3, !dbg !15
%86 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %70, i1 true) #3, !dbg !15
%87 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %71, i1 true) #3, !dbg !15
%88 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !15
%89 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %73, i1 true) #3, !dbg !15
%90 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %74, i1 true) #3, !dbg !15
%91 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %75, i1 true) #3, !dbg !15
%92 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %76, i1 true) #3, !dbg !15
%93 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %77, i1 true) #3, !dbg !15
%94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %78, i1 true) #3, !dbg !15
%95 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %79, i1 true) #3, !dbg !15
%96 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !16
%97 = bitcast i32 %96 to float, !dbg !16
%98 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !17
%99 = bitcast i32 %98 to float, !dbg !17
%100 = mul nsw i64 %48, 50257, !dbg !18
%101 = mul nsw i64 %49, 50257, !dbg !18
%102 = mul nsw i64 %50, 50257, !dbg !18
%103 = mul nsw i64 %51, 50257, !dbg !18
%104 = mul nsw i64 %52, 50257, !dbg !18
%105 = mul nsw i64 %53, 50257, !dbg !18
%106 = mul nsw i64 %54, 50257, !dbg !18
%107 = mul nsw i64 %55, 50257, !dbg !18
%108 = mul nsw i64 %56, 50257, !dbg !18
%109 = mul nsw i64 %57, 50257, !dbg !18
%110 = mul nsw i64 %58, 50257, !dbg !18
%111 = mul nsw i64 %59, 50257, !dbg !18
%112 = mul nsw i64 %60, 50257, !dbg !18
%113 = mul nsw i64 %61, 50257, !dbg !18
%114 = mul nsw i64 %62, 50257, !dbg !18
%115 = mul nsw i64 %63, 50257, !dbg !18
%116 = insertelement <16 x i64> poison, i64 %80, i64 0, !dbg !19
%117 = insertelement <16 x i64> %116, i64 %81, i64 1, !dbg !19
%118 = insertelement <16 x i64> %117, i64 %82, i64 2, !dbg !19
%119 = insertelement <16 x i64> %118, i64 %83, i64 3, !dbg !19
%120 = insertelement <16 x i64> %119, i64 %84, i64 4, !dbg !19
%121 = insertelement <16 x i64> %120, i64 %85, i64 5, !dbg !19
%122 = insertelement <16 x i64> %121, i64 %86, i64 6, !dbg !19
%123 = insertelement <16 x i64> %122, i64 %87, i64 7, !dbg !19
%124 = insertelement <16 x i64> %123, i64 %88, i64 8, !dbg !19
%125 = insertelement <16 x i64> %124, i64 %89, i64 9, !dbg !19
%126 = insertelement <16 x i64> %125, i64 %90, i64 10, !dbg !19
%127 = insertelement <16 x i64> %126, i64 %91, i64 11, !dbg !19
%128 = insertelement <16 x i64> %127, i64 %92, i64 12, !dbg !19
%129 = insertelement <16 x i64> %128, i64 %93, i64 13, !dbg !19
%130 = insertelement <16 x i64> %129, i64 %94, i64 14, !dbg !19
%131 = insertelement <16 x i64> %130, i64 %95, i64 15, !dbg !19
%132 = icmp eq <16 x i64> %131, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !19
%133 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %97, float %99) #3, !dbg !20
%134 = insertelement <16 x float> poison, float %133, i64 0, !dbg !21
%135 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> zeroinitializer, !dbg !21
%136 = select <16 x i1> %132, <16 x float> zeroinitializer, <16 x float> %135, !dbg !21
%137 = getelementptr float, ptr addrspace(1) %0, i64 %100
%138 = getelementptr float, ptr addrspace(1) %0, i64 %101
%139 = getelementptr float, ptr addrspace(1) %0, i64 %102
%140 = getelementptr float, ptr addrspace(1) %0, i64 %103
%141 = getelementptr float, ptr addrspace(1) %0, i64 %104
%142 = getelementptr float, ptr addrspace(1) %0, i64 %105
%143 = getelementptr float, ptr addrspace(1) %0, i64 %106
%144 = getelementptr float, ptr addrspace(1) %0, i64 %107
%145 = getelementptr float, ptr addrspace(1) %0, i64 %108
%146 = getelementptr float, ptr addrspace(1) %0, i64 %109
%147 = getelementptr float, ptr addrspace(1) %0, i64 %110
%148 = getelementptr float, ptr addrspace(1) %0, i64 %111
%149 = getelementptr float, ptr addrspace(1) %0, i64 %112
%150 = getelementptr float, ptr addrspace(1) %0, i64 %113
%151 = getelementptr float, ptr addrspace(1) %0, i64 %114
%152 = getelementptr float, ptr addrspace(1) %0, i64 %115
br label %153, !dbg !22
153: ; preds = %9, %153
%154 = phi i32 [ 0, %9 ], [ %213, %153 ]
%155 = phi <16 x float> [ zeroinitializer, %9 ], [ %212, %153 ]
%156 = or i32 %154, %urem, !dbg !23
%157 = zext nneg i32 %156 to i64, !dbg !23
%158 = icmp ult i32 %156, 50257, !dbg !24
%159 = getelementptr float, ptr addrspace(1) %137, i64 %157, !dbg !25
%160 = getelementptr float, ptr addrspace(1) %138, i64 %157, !dbg !25
%161 = getelementptr float, ptr addrspace(1) %139, i64 %157, !dbg !25
%162 = getelementptr float, ptr addrspace(1) %140, i64 %157, !dbg !25
%163 = getelementptr float, ptr addrspace(1) %141, i64 %157, !dbg !25
%164 = getelementptr float, ptr addrspace(1) %142, i64 %157, !dbg !25
%165 = getelementptr float, ptr addrspace(1) %143, i64 %157, !dbg !25
%166 = getelementptr float, ptr addrspace(1) %144, i64 %157, !dbg !25
%167 = getelementptr float, ptr addrspace(1) %145, i64 %157, !dbg !25
%168 = getelementptr float, ptr addrspace(1) %146, i64 %157, !dbg !25
%169 = getelementptr float, ptr addrspace(1) %147, i64 %157, !dbg !25
%170 = getelementptr float, ptr addrspace(1) %148, i64 %157, !dbg !25
%171 = getelementptr float, ptr addrspace(1) %149, i64 %157, !dbg !25
%172 = getelementptr float, ptr addrspace(1) %150, i64 %157, !dbg !25
%173 = getelementptr float, ptr addrspace(1) %151, i64 %157, !dbg !25
%174 = getelementptr float, ptr addrspace(1) %152, i64 %157, !dbg !25
%175 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %159, i1 %158, i32 0, i1 %158) #3, !dbg !26
%176 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %160, i1 %158, i32 0, i1 %158) #3, !dbg !26
%177 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %161, i1 %158, i32 0, i1 %158) #3, !dbg !26
%178 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %162, i1 %158, i32 0, i1 %158) #3, !dbg !26
%179 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %163, i1 %158, i32 0, i1 %158) #3, !dbg !26
%180 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %164, i1 %158, i32 0, i1 %158) #3, !dbg !26
%181 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %165, i1 %158, i32 0, i1 %158) #3, !dbg !26
%182 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %166, i1 %158, i32 0, i1 %158) #3, !dbg !26
%183 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %167, i1 %158, i32 0, i1 %158) #3, !dbg !26
%184 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %168, i1 %158, i32 0, i1 %158) #3, !dbg !26
%185 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %169, i1 %158, i32 0, i1 %158) #3, !dbg !26
%186 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %170, i1 %158, i32 0, i1 %158) #3, !dbg !26
%187 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %171, i1 %158, i32 0, i1 %158) #3, !dbg !26
%188 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %172, i1 %158, i32 0, i1 %158) #3, !dbg !26
%189 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %173, i1 %158, i32 0, i1 %158) #3, !dbg !26
%190 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %174, i1 %158, i32 0, i1 %158) #3, !dbg !26
%191 = insertelement <16 x i32> poison, i32 %175, i64 0, !dbg !26
%192 = insertelement <16 x i32> %191, i32 %176, i64 1, !dbg !26
%193 = insertelement <16 x i32> %192, i32 %177, i64 2, !dbg !26
%194 = insertelement <16 x i32> %193, i32 %178, i64 3, !dbg !26
%195 = insertelement <16 x i32> %194, i32 %179, i64 4, !dbg !26
%196 = insertelement <16 x i32> %195, i32 %180, i64 5, !dbg !26
%197 = insertelement <16 x i32> %196, i32 %181, i64 6, !dbg !26
%198 = insertelement <16 x i32> %197, i32 %182, i64 7, !dbg !26
%199 = insertelement <16 x i32> %198, i32 %183, i64 8, !dbg !26
%200 = insertelement <16 x i32> %199, i32 %184, i64 9, !dbg !26
%201 = insertelement <16 x i32> %200, i32 %185, i64 10, !dbg !26
%202 = insertelement <16 x i32> %201, i32 %186, i64 11, !dbg !26
%203 = insertelement <16 x i32> %202, i32 %187, i64 12, !dbg !26
%204 = insertelement <16 x i32> %203, i32 %188, i64 13, !dbg !26
%205 = insertelement <16 x i32> %204, i32 %189, i64 14, !dbg !26
%206 = insertelement <16 x i32> %205, i32 %190, i64 15, !dbg !26
%207 = bitcast <16 x i32> %206 to <16 x float>, !dbg !26
%208 = fmul <16 x float> %136, %207, !dbg !27
%209 = insertelement <16 x i1> poison, i1 %158, i64 0, !dbg !28
%210 = shufflevector <16 x i1> %209, <16 x i1> poison, <16 x i32> zeroinitializer, !dbg !28
%211 = select <16 x i1> %210, <16 x float> %208, <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !28
%212 = fadd <16 x float> %155, %211, !dbg !28
%213 = add nuw nsw i32 %154, 64, !dbg !22
%214 = icmp ult i32 %154, 50193, !dbg !22
br i1 %214, label %153, label %215, !dbg !22
215: ; preds = %153
%216 = lshr i32 %10, 5, !dbg !8
%217 = and i32 %216, 1, !dbg !9
%218 = extractelement <16 x float> %212, i64 0, !dbg !29
%219 = bitcast float %218 to i32, !dbg !29
%220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 16, i32 31), !dbg !29
%221 = bitcast i32 %220 to float, !dbg !29
%222 = fadd float %218, %221, !dbg !33
%223 = bitcast float %222 to i32, !dbg !29
%224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 8, i32 31), !dbg !29
%225 = bitcast i32 %224 to float, !dbg !29
%226 = fadd float %222, %225, !dbg !33
%227 = bitcast float %226 to i32, !dbg !29
%228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 4, i32 31), !dbg !29
%229 = bitcast i32 %228 to float, !dbg !29
%230 = fadd float %226, %229, !dbg !33
%231 = bitcast float %230 to i32, !dbg !29
%232 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %231, i32 2, i32 31), !dbg !29
%233 = bitcast i32 %232 to float, !dbg !29
%234 = fadd float %230, %233, !dbg !33
%235 = bitcast float %234 to i32, !dbg !29
%236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %235, i32 1, i32 31), !dbg !29
%237 = bitcast i32 %236 to float, !dbg !29
%238 = fadd float %234, %237, !dbg !33
%239 = extractelement <16 x float> %212, i64 1, !dbg !29
%240 = bitcast float %239 to i32, !dbg !29
%241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 16, i32 31), !dbg !29
%242 = bitcast i32 %241 to float, !dbg !29
%243 = fadd float %239, %242, !dbg !33
%244 = bitcast float %243 to i32, !dbg !29
%245 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 8, i32 31), !dbg !29
%246 = bitcast i32 %245 to float, !dbg !29
%247 = fadd float %243, %246, !dbg !33
%248 = bitcast float %247 to i32, !dbg !29
%249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 4, i32 31), !dbg !29
%250 = bitcast i32 %249 to float, !dbg !29
%251 = fadd float %247, %250, !dbg !33
%252 = bitcast float %251 to i32, !dbg !29
%253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 2, i32 31), !dbg !29
%254 = bitcast i32 %253 to float, !dbg !29
%255 = fadd float %251, %254, !dbg !33
%256 = bitcast float %255 to i32, !dbg !29
%257 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %256, i32 1, i32 31), !dbg !29
%258 = bitcast i32 %257 to float, !dbg !29
%259 = fadd float %255, %258, !dbg !33
%260 = extractelement <16 x float> %212, i64 2, !dbg !29
%261 = bitcast float %260 to i32, !dbg !29
%262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 16, i32 31), !dbg !29
%263 = bitcast i32 %262 to float, !dbg !29
%264 = fadd float %260, %263, !dbg !33
%265 = bitcast float %264 to i32, !dbg !29
%266 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 8, i32 31), !dbg !29
%267 = bitcast i32 %266 to float, !dbg !29
%268 = fadd float %264, %267, !dbg !33
%269 = bitcast float %268 to i32, !dbg !29
%270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !29
%271 = bitcast i32 %270 to float, !dbg !29
%272 = fadd float %268, %271, !dbg !33
%273 = bitcast float %272 to i32, !dbg !29
%274 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 2, i32 31), !dbg !29
%275 = bitcast i32 %274 to float, !dbg !29
%276 = fadd float %272, %275, !dbg !33
%277 = bitcast float %276 to i32, !dbg !29
%278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 1, i32 31), !dbg !29
%279 = bitcast i32 %278 to float, !dbg !29
%280 = fadd float %276, %279, !dbg !33
%281 = extractelement <16 x float> %212, i64 3, !dbg !29
%282 = bitcast float %281 to i32, !dbg !29
%283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %282, i32 16, i32 31), !dbg !29
%284 = bitcast i32 %283 to float, !dbg !29
%285 = fadd float %281, %284, !dbg !33
%286 = bitcast float %285 to i32, !dbg !29
%287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 8, i32 31), !dbg !29
%288 = bitcast i32 %287 to float, !dbg !29
%289 = fadd float %285, %288, !dbg !33
%290 = bitcast float %289 to i32, !dbg !29
%291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 4, i32 31), !dbg !29
%292 = bitcast i32 %291 to float, !dbg !29
%293 = fadd float %289, %292, !dbg !33
%294 = bitcast float %293 to i32, !dbg !29
%295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 2, i32 31), !dbg !29
%296 = bitcast i32 %295 to float, !dbg !29
%297 = fadd float %293, %296, !dbg !33
%298 = bitcast float %297 to i32, !dbg !29
%299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 1, i32 31), !dbg !29
%300 = bitcast i32 %299 to float, !dbg !29
%301 = fadd float %297, %300, !dbg !33
%302 = extractelement <16 x float> %212, i64 4, !dbg !29
%303 = bitcast float %302 to i32, !dbg !29
%304 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %303, i32 16, i32 31), !dbg !29
%305 = bitcast i32 %304 to float, !dbg !29
%306 = fadd float %302, %305, !dbg !33
%307 = bitcast float %306 to i32, !dbg !29
%308 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %307, i32 8, i32 31), !dbg !29
%309 = bitcast i32 %308 to float, !dbg !29
%310 = fadd float %306, %309, !dbg !33
%311 = bitcast float %310 to i32, !dbg !29
%312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %311, i32 4, i32 31), !dbg !29
%313 = bitcast i32 %312 to float, !dbg !29
%314 = fadd float %310, %313, !dbg !33
%315 = bitcast float %314 to i32, !dbg !29
%316 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %315, i32 2, i32 31), !dbg !29
%317 = bitcast i32 %316 to float, !dbg !29
%318 = fadd float %314, %317, !dbg !33
%319 = bitcast float %318 to i32, !dbg !29
%320 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %319, i32 1, i32 31), !dbg !29
%321 = bitcast i32 %320 to float, !dbg !29
%322 = fadd float %318, %321, !dbg !33
%323 = extractelement <16 x float> %212, i64 5, !dbg !29
%324 = bitcast float %323 to i32, !dbg !29
%325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 16, i32 31), !dbg !29
%326 = bitcast i32 %325 to float, !dbg !29
%327 = fadd float %323, %326, !dbg !33
%328 = bitcast float %327 to i32, !dbg !29
%329 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %328, i32 8, i32 31), !dbg !29
%330 = bitcast i32 %329 to float, !dbg !29
%331 = fadd float %327, %330, !dbg !33
%332 = bitcast float %331 to i32, !dbg !29
%333 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %332, i32 4, i32 31), !dbg !29
%334 = bitcast i32 %333 to float, !dbg !29
%335 = fadd float %331, %334, !dbg !33
%336 = bitcast float %335 to i32, !dbg !29
%337 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %336, i32 2, i32 31), !dbg !29
%338 = bitcast i32 %337 to float, !dbg !29
%339 = fadd float %335, %338, !dbg !33
%340 = bitcast float %339 to i32, !dbg !29
%341 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %340, i32 1, i32 31), !dbg !29
%342 = bitcast i32 %341 to float, !dbg !29
%343 = fadd float %339, %342, !dbg !33
%344 = extractelement <16 x float> %212, i64 6, !dbg !29
%345 = bitcast float %344 to i32, !dbg !29
%346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 16, i32 31), !dbg !29
%347 = bitcast i32 %346 to float, !dbg !29
%348 = fadd float %344, %347, !dbg !33
%349 = bitcast float %348 to i32, !dbg !29
%350 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %349, i32 8, i32 31), !dbg !29
%351 = bitcast i32 %350 to float, !dbg !29
%352 = fadd float %348, %351, !dbg !33
%353 = bitcast float %352 to i32, !dbg !29
%354 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %353, i32 4, i32 31), !dbg !29
%355 = bitcast i32 %354 to float, !dbg !29
%356 = fadd float %352, %355, !dbg !33
%357 = bitcast float %356 to i32, !dbg !29
%358 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %357, i32 2, i32 31), !dbg !29
%359 = bitcast i32 %358 to float, !dbg !29
%360 = fadd float %356, %359, !dbg !33
%361 = bitcast float %360 to i32, !dbg !29
%362 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %361, i32 1, i32 31), !dbg !29
%363 = bitcast i32 %362 to float, !dbg !29
%364 = fadd float %360, %363, !dbg !33
%365 = extractelement <16 x float> %212, i64 7, !dbg !29
%366 = bitcast float %365 to i32, !dbg !29
%367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 16, i32 31), !dbg !29
%368 = bitcast i32 %367 to float, !dbg !29
%369 = fadd float %365, %368, !dbg !33
%370 = bitcast float %369 to i32, !dbg !29
%371 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %370, i32 8, i32 31), !dbg !29
%372 = bitcast i32 %371 to float, !dbg !29
%373 = fadd float %369, %372, !dbg !33
%374 = bitcast float %373 to i32, !dbg !29
%375 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %374, i32 4, i32 31), !dbg !29
%376 = bitcast i32 %375 to float, !dbg !29
%377 = fadd float %373, %376, !dbg !33
%378 = bitcast float %377 to i32, !dbg !29
%379 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %378, i32 2, i32 31), !dbg !29
%380 = bitcast i32 %379 to float, !dbg !29
%381 = fadd float %377, %380, !dbg !33
%382 = bitcast float %381 to i32, !dbg !29
%383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 1, i32 31), !dbg !29
%384 = bitcast i32 %383 to float, !dbg !29
%385 = fadd float %381, %384, !dbg !33
%386 = extractelement <16 x float> %212, i64 8, !dbg !29
%387 = bitcast float %386 to i32, !dbg !29
%388 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %387, i32 16, i32 31), !dbg !29
%389 = bitcast i32 %388 to float, !dbg !29
%390 = fadd float %386, %389, !dbg !33
%391 = bitcast float %390 to i32, !dbg !29
%392 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %391, i32 8, i32 31), !dbg !29
%393 = bitcast i32 %392 to float, !dbg !29
%394 = fadd float %390, %393, !dbg !33
%395 = bitcast float %394 to i32, !dbg !29
%396 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %395, i32 4, i32 31), !dbg !29
%397 = bitcast i32 %396 to float, !dbg !29
%398 = fadd float %394, %397, !dbg !33
%399 = bitcast float %398 to i32, !dbg !29
%400 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %399, i32 2, i32 31), !dbg !29
%401 = bitcast i32 %400 to float, !dbg !29
%402 = fadd float %398, %401, !dbg !33
%403 = bitcast float %402 to i32, !dbg !29
%404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 1, i32 31), !dbg !29
%405 = bitcast i32 %404 to float, !dbg !29
%406 = fadd float %402, %405, !dbg !33
%407 = extractelement <16 x float> %212, i64 9, !dbg !29
%408 = bitcast float %407 to i32, !dbg !29
%409 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %408, i32 16, i32 31), !dbg !29
%410 = bitcast i32 %409 to float, !dbg !29
%411 = fadd float %407, %410, !dbg !33
%412 = bitcast float %411 to i32, !dbg !29
%413 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %412, i32 8, i32 31), !dbg !29
%414 = bitcast i32 %413 to float, !dbg !29
%415 = fadd float %411, %414, !dbg !33
%416 = bitcast float %415 to i32, !dbg !29
%417 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %416, i32 4, i32 31), !dbg !29
%418 = bitcast i32 %417 to float, !dbg !29
%419 = fadd float %415, %418, !dbg !33
%420 = bitcast float %419 to i32, !dbg !29
%421 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %420, i32 2, i32 31), !dbg !29
%422 = bitcast i32 %421 to float, !dbg !29
%423 = fadd float %419, %422, !dbg !33
%424 = bitcast float %423 to i32, !dbg !29
%425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 1, i32 31), !dbg !29
%426 = bitcast i32 %425 to float, !dbg !29
%427 = fadd float %423, %426, !dbg !33
%428 = extractelement <16 x float> %212, i64 10, !dbg !29
%429 = bitcast float %428 to i32, !dbg !29
%430 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %429, i32 16, i32 31), !dbg !29
%431 = bitcast i32 %430 to float, !dbg !29
%432 = fadd float %428, %431, !dbg !33
%433 = bitcast float %432 to i32, !dbg !29
%434 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %433, i32 8, i32 31), !dbg !29
%435 = bitcast i32 %434 to float, !dbg !29
%436 = fadd float %432, %435, !dbg !33
%437 = bitcast float %436 to i32, !dbg !29
%438 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %437, i32 4, i32 31), !dbg !29
%439 = bitcast i32 %438 to float, !dbg !29
%440 = fadd float %436, %439, !dbg !33
%441 = bitcast float %440 to i32, !dbg !29
%442 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %441, i32 2, i32 31), !dbg !29
%443 = bitcast i32 %442 to float, !dbg !29
%444 = fadd float %440, %443, !dbg !33
%445 = bitcast float %444 to i32, !dbg !29
%446 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %445, i32 1, i32 31), !dbg !29
%447 = bitcast i32 %446 to float, !dbg !29
%448 = fadd float %444, %447, !dbg !33
%449 = extractelement <16 x float> %212, i64 11, !dbg !29
%450 = bitcast float %449 to i32, !dbg !29
%451 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %450, i32 16, i32 31), !dbg !29
%452 = bitcast i32 %451 to float, !dbg !29
%453 = fadd float %449, %452, !dbg !33
%454 = bitcast float %453 to i32, !dbg !29
%455 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %454, i32 8, i32 31), !dbg !29
%456 = bitcast i32 %455 to float, !dbg !29
%457 = fadd float %453, %456, !dbg !33
%458 = bitcast float %457 to i32, !dbg !29
%459 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %458, i32 4, i32 31), !dbg !29
%460 = bitcast i32 %459 to float, !dbg !29
%461 = fadd float %457, %460, !dbg !33
%462 = bitcast float %461 to i32, !dbg !29
%463 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %462, i32 2, i32 31), !dbg !29
%464 = bitcast i32 %463 to float, !dbg !29
%465 = fadd float %461, %464, !dbg !33
%466 = bitcast float %465 to i32, !dbg !29
%467 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %466, i32 1, i32 31), !dbg !29
%468 = bitcast i32 %467 to float, !dbg !29
%469 = fadd float %465, %468, !dbg !33
%470 = extractelement <16 x float> %212, i64 12, !dbg !29
%471 = bitcast float %470 to i32, !dbg !29
%472 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %471, i32 16, i32 31), !dbg !29
%473 = bitcast i32 %472 to float, !dbg !29
%474 = fadd float %470, %473, !dbg !33
%475 = bitcast float %474 to i32, !dbg !29
%476 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %475, i32 8, i32 31), !dbg !29
%477 = bitcast i32 %476 to float, !dbg !29
%478 = fadd float %474, %477, !dbg !33
%479 = bitcast float %478 to i32, !dbg !29
%480 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %479, i32 4, i32 31), !dbg !29
%481 = bitcast i32 %480 to float, !dbg !29
%482 = fadd float %478, %481, !dbg !33
%483 = bitcast float %482 to i32, !dbg !29
%484 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %483, i32 2, i32 31), !dbg !29
%485 = bitcast i32 %484 to float, !dbg !29
%486 = fadd float %482, %485, !dbg !33
%487 = bitcast float %486 to i32, !dbg !29
%488 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %487, i32 1, i32 31), !dbg !29
%489 = bitcast i32 %488 to float, !dbg !29
%490 = fadd float %486, %489, !dbg !33
%491 = extractelement <16 x float> %212, i64 13, !dbg !29
%492 = bitcast float %491 to i32, !dbg !29
%493 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 16, i32 31), !dbg !29
%494 = bitcast i32 %493 to float, !dbg !29
%495 = fadd float %491, %494, !dbg !33
%496 = bitcast float %495 to i32, !dbg !29
%497 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %496, i32 8, i32 31), !dbg !29
%498 = bitcast i32 %497 to float, !dbg !29
%499 = fadd float %495, %498, !dbg !33
%500 = bitcast float %499 to i32, !dbg !29
%501 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %500, i32 4, i32 31), !dbg !29
%502 = bitcast i32 %501 to float, !dbg !29
%503 = fadd float %499, %502, !dbg !33
%504 = bitcast float %503 to i32, !dbg !29
%505 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %504, i32 2, i32 31), !dbg !29
%506 = bitcast i32 %505 to float, !dbg !29
%507 = fadd float %503, %506, !dbg !33
%508 = bitcast float %507 to i32, !dbg !29
%509 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %508, i32 1, i32 31), !dbg !29
%510 = bitcast i32 %509 to float, !dbg !29
%511 = fadd float %507, %510, !dbg !33
%512 = extractelement <16 x float> %212, i64 14, !dbg !29
%513 = bitcast float %512 to i32, !dbg !29
%514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 16, i32 31), !dbg !29
%515 = bitcast i32 %514 to float, !dbg !29
%516 = fadd float %512, %515, !dbg !33
%517 = bitcast float %516 to i32, !dbg !29
%518 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %517, i32 8, i32 31), !dbg !29
%519 = bitcast i32 %518 to float, !dbg !29
%520 = fadd float %516, %519, !dbg !33
%521 = bitcast float %520 to i32, !dbg !29
%522 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %521, i32 4, i32 31), !dbg !29
%523 = bitcast i32 %522 to float, !dbg !29
%524 = fadd float %520, %523, !dbg !33
%525 = bitcast float %524 to i32, !dbg !29
%526 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %525, i32 2, i32 31), !dbg !29
%527 = bitcast i32 %526 to float, !dbg !29
%528 = fadd float %524, %527, !dbg !33
%529 = bitcast float %528 to i32, !dbg !29
%530 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %529, i32 1, i32 31), !dbg !29
%531 = bitcast i32 %530 to float, !dbg !29
%532 = fadd float %528, %531, !dbg !33
%533 = extractelement <16 x float> %212, i64 15, !dbg !29
%534 = bitcast float %533 to i32, !dbg !29
%535 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %534, i32 16, i32 31), !dbg !29
%536 = bitcast i32 %535 to float, !dbg !29
%537 = fadd float %533, %536, !dbg !33
%538 = bitcast float %537 to i32, !dbg !29
%539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 8, i32 31), !dbg !29
%540 = bitcast i32 %539 to float, !dbg !29
%541 = fadd float %537, %540, !dbg !33
%542 = bitcast float %541 to i32, !dbg !29
%543 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %542, i32 4, i32 31), !dbg !29
%544 = bitcast i32 %543 to float, !dbg !29
%545 = fadd float %541, %544, !dbg !33
%546 = bitcast float %545 to i32, !dbg !29
%547 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %546, i32 2, i32 31), !dbg !29
%548 = bitcast i32 %547 to float, !dbg !29
%549 = fadd float %545, %548, !dbg !33
%550 = bitcast float %549 to i32, !dbg !29
%551 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %550, i32 1, i32 31), !dbg !29
%552 = bitcast i32 %551 to float, !dbg !29
%553 = fadd float %549, %552, !dbg !33
%554 = icmp eq i32 %11, 0, !dbg !29
%555 = shl nuw nsw i32 %13, 1, !dbg !29
%556 = or i32 %555, %217, !dbg !29
%557 = zext nneg i32 %556 to i64, !dbg !29
%558 = getelementptr float, ptr addrspace(3) @global_smem, i64 %557, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %558, float %238, i1 %554) #3, !dbg !29
%559 = shl nuw nsw i32 %14, 1, !dbg !29
%560 = or i32 %559, %217, !dbg !29
%561 = zext nneg i32 %560 to i64, !dbg !29
%562 = getelementptr float, ptr addrspace(3) @global_smem, i64 %561, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %562, float %259, i1 %554) #3, !dbg !29
%563 = shl nuw nsw i32 %15, 1, !dbg !29
%564 = or i32 %563, %217, !dbg !29
%565 = zext nneg i32 %564 to i64, !dbg !29
%566 = getelementptr float, ptr addrspace(3) @global_smem, i64 %565, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %566, float %280, i1 %554) #3, !dbg !29
%567 = shl nuw nsw i32 %16, 1, !dbg !29
%568 = or i32 %567, %217, !dbg !29
%569 = zext nneg i32 %568 to i64, !dbg !29
%570 = getelementptr float, ptr addrspace(3) @global_smem, i64 %569, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %570, float %301, i1 %554) #3, !dbg !29
%571 = shl nuw nsw i32 %17, 1, !dbg !29
%572 = or i32 %571, %217, !dbg !29
%573 = zext nneg i32 %572 to i64, !dbg !29
%574 = getelementptr float, ptr addrspace(3) @global_smem, i64 %573, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %574, float %322, i1 %554) #3, !dbg !29
%575 = shl nuw nsw i32 %18, 1, !dbg !29
%576 = or i32 %575, %217, !dbg !29
%577 = zext nneg i32 %576 to i64, !dbg !29
%578 = getelementptr float, ptr addrspace(3) @global_smem, i64 %577, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %578, float %343, i1 %554) #3, !dbg !29
%579 = shl nuw nsw i32 %19, 1, !dbg !29
%580 = or i32 %579, %217, !dbg !29
%581 = zext nneg i32 %580 to i64, !dbg !29
%582 = getelementptr float, ptr addrspace(3) @global_smem, i64 %581, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %582, float %364, i1 %554) #3, !dbg !29
%583 = shl nuw nsw i32 %20, 1, !dbg !29
%584 = or i32 %583, %217, !dbg !29
%585 = zext nneg i32 %584 to i64, !dbg !29
%586 = getelementptr float, ptr addrspace(3) @global_smem, i64 %585, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %586, float %385, i1 %554) #3, !dbg !29
%587 = shl nuw nsw i32 %21, 1, !dbg !29
%588 = or i32 %587, %217, !dbg !29
%589 = zext nneg i32 %588 to i64, !dbg !29
%590 = getelementptr float, ptr addrspace(3) @global_smem, i64 %589, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %590, float %406, i1 %554) #3, !dbg !29
%591 = shl nuw nsw i32 %22, 1, !dbg !29
%592 = or i32 %591, %217, !dbg !29
%593 = zext nneg i32 %592 to i64, !dbg !29
%594 = getelementptr float, ptr addrspace(3) @global_smem, i64 %593, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %594, float %427, i1 %554) #3, !dbg !29
%595 = shl nuw nsw i32 %23, 1, !dbg !29
%596 = or i32 %595, %217, !dbg !29
%597 = zext nneg i32 %596 to i64, !dbg !29
%598 = getelementptr float, ptr addrspace(3) @global_smem, i64 %597, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %598, float %448, i1 %554) #3, !dbg !29
%599 = shl nuw nsw i32 %24, 1, !dbg !29
%600 = or i32 %599, %217, !dbg !29
%601 = zext nneg i32 %600 to i64, !dbg !29
%602 = getelementptr float, ptr addrspace(3) @global_smem, i64 %601, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %602, float %469, i1 %554) #3, !dbg !29
%603 = shl nuw nsw i32 %25, 1, !dbg !29
%604 = or i32 %603, %217, !dbg !29
%605 = zext nneg i32 %604 to i64, !dbg !29
%606 = getelementptr float, ptr addrspace(3) @global_smem, i64 %605, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %606, float %490, i1 %554) #3, !dbg !29
%607 = shl nuw nsw i32 %26, 1, !dbg !29
%608 = or i32 %607, %217, !dbg !29
%609 = zext nneg i32 %608 to i64, !dbg !29
%610 = getelementptr float, ptr addrspace(3) @global_smem, i64 %609, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %610, float %511, i1 %554) #3, !dbg !29
%611 = shl nuw nsw i32 %27, 1, !dbg !29
%612 = or i32 %611, %217, !dbg !29
%613 = zext nneg i32 %612 to i64, !dbg !29
%614 = getelementptr float, ptr addrspace(3) @global_smem, i64 %613, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %614, float %532, i1 %554) #3, !dbg !29
%615 = shl nuw nsw i32 %28, 1, !dbg !29
%616 = or i32 %615, %217, !dbg !29
%617 = zext nneg i32 %616 to i64, !dbg !29
%618 = getelementptr float, ptr addrspace(3) @global_smem, i64 %617, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %618, float %553, i1 %554) #3, !dbg !29
tail call void @llvm.nvvm.barrier0(), !dbg !29
%619 = icmp slt i32 %10, 128, !dbg !29
%620 = sext i32 %10 to i64, !dbg !29
%621 = getelementptr float, ptr addrspace(3) @global_smem, i64 %620, !dbg !29
%622 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %621, i1 %619) #3, !dbg !29
%623 = bitcast float %622 to i32, !dbg !29
%624 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %623, i32 1, i32 31), !dbg !29
%625 = bitcast i32 %624 to float, !dbg !29
%626 = fadd float %622, %625, !dbg !33
%627 = and i32 %10, 1, !dbg !29
%628 = icmp eq i32 %627, 0, !dbg !29
%629 = and i1 %619, %628, !dbg !29
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %621, float %626, i1 %629) #3, !dbg !29
tail call void @llvm.nvvm.barrier0(), !dbg !29
%630 = zext nneg i32 %555 to i64, !dbg !29
%631 = getelementptr float, ptr addrspace(3) @global_smem, i64 %630, !dbg !29
%632 = load float, ptr addrspace(3) %631, align 4, !dbg !29
%633 = zext nneg i32 %559 to i64, !dbg !29
%634 = getelementptr float, ptr addrspace(3) @global_smem, i64 %633, !dbg !29
%635 = load float, ptr addrspace(3) %634, align 4, !dbg !29
%636 = zext nneg i32 %563 to i64, !dbg !29
%637 = getelementptr float, ptr addrspace(3) @global_smem, i64 %636, !dbg !29
%638 = load float, ptr addrspace(3) %637, align 4, !dbg !29
%639 = zext nneg i32 %567 to i64, !dbg !29
%640 = getelementptr float, ptr addrspace(3) @global_smem, i64 %639, !dbg !29
%641 = load float, ptr addrspace(3) %640, align 4, !dbg !29
%642 = zext nneg i32 %571 to i64, !dbg !29
%643 = getelementptr float, ptr addrspace(3) @global_smem, i64 %642, !dbg !29
%644 = load float, ptr addrspace(3) %643, align 4, !dbg !29
%645 = zext nneg i32 %575 to i64, !dbg !29
%646 = getelementptr float, ptr addrspace(3) @global_smem, i64 %645, !dbg !29
%647 = load float, ptr addrspace(3) %646, align 4, !dbg !29
%648 = zext nneg i32 %579 to i64, !dbg !29
%649 = getelementptr float, ptr addrspace(3) @global_smem, i64 %648, !dbg !29
%650 = load float, ptr addrspace(3) %649, align 4, !dbg !29
%651 = zext nneg i32 %583 to i64, !dbg !29
%652 = getelementptr float, ptr addrspace(3) @global_smem, i64 %651, !dbg !29
%653 = load float, ptr addrspace(3) %652, align 4, !dbg !29
%654 = zext nneg i32 %587 to i64, !dbg !29
%655 = getelementptr float, ptr addrspace(3) @global_smem, i64 %654, !dbg !29
%656 = load float, ptr addrspace(3) %655, align 4, !dbg !29
%657 = zext nneg i32 %591 to i64, !dbg !29
%658 = getelementptr float, ptr addrspace(3) @global_smem, i64 %657, !dbg !29
%659 = load float, ptr addrspace(3) %658, align 4, !dbg !29
%660 = zext nneg i32 %595 to i64, !dbg !29
%661 = getelementptr float, ptr addrspace(3) @global_smem, i64 %660, !dbg !29
%662 = load float, ptr addrspace(3) %661, align 4, !dbg !29
%663 = zext nneg i32 %599 to i64, !dbg !29
%664 = getelementptr float, ptr addrspace(3) @global_smem, i64 %663, !dbg !29
%665 = load float, ptr addrspace(3) %664, align 4, !dbg !29
%666 = zext nneg i32 %603 to i64, !dbg !29
%667 = getelementptr float, ptr addrspace(3) @global_smem, i64 %666, !dbg !29
%668 = load float, ptr addrspace(3) %667, align 4, !dbg !29
%669 = zext nneg i32 %607 to i64, !dbg !29
%670 = getelementptr float, ptr addrspace(3) @global_smem, i64 %669, !dbg !29
%671 = load float, ptr addrspace(3) %670, align 4, !dbg !29
%672 = zext nneg i32 %611 to i64, !dbg !29
%673 = getelementptr float, ptr addrspace(3) @global_smem, i64 %672, !dbg !29
%674 = load float, ptr addrspace(3) %673, align 4, !dbg !29
%675 = zext nneg i32 %615 to i64, !dbg !29
%676 = getelementptr float, ptr addrspace(3) @global_smem, i64 %675, !dbg !29
%677 = load float, ptr addrspace(3) %676, align 4, !dbg !29
%678 = extractelement <16 x float> %136, i64 0, !dbg !37
%679 = extractelement <16 x float> %136, i64 1, !dbg !37
%680 = extractelement <16 x float> %136, i64 2, !dbg !37
%681 = extractelement <16 x float> %136, i64 3, !dbg !37
%682 = extractelement <16 x float> %136, i64 4, !dbg !37
%683 = extractelement <16 x float> %136, i64 5, !dbg !37
%684 = extractelement <16 x float> %136, i64 6, !dbg !37
%685 = extractelement <16 x float> %136, i64 7, !dbg !37
%686 = extractelement <16 x float> %136, i64 8, !dbg !37
%687 = extractelement <16 x float> %136, i64 9, !dbg !37
%688 = extractelement <16 x float> %136, i64 10, !dbg !37
%689 = extractelement <16 x float> %136, i64 11, !dbg !37
%690 = extractelement <16 x float> %136, i64 12, !dbg !37
%691 = extractelement <16 x float> %136, i64 13, !dbg !37
%692 = extractelement <16 x float> %136, i64 14, !dbg !37
%693 = extractelement <16 x float> %136, i64 15, !dbg !37
br label %694, !dbg !38
694: ; preds = %215, %694
%695 = phi i32 [ 0, %215 ], [ %987, %694 ]
%696 = or i32 %695, %urem, !dbg !39
%697 = zext nneg i32 %696 to i64, !dbg !39
%698 = icmp ult i32 %696, 50257, !dbg !40
%699 = add nsw i64 %100, %697, !dbg !41
%700 = add nsw i64 %101, %697, !dbg !41
%701 = add nsw i64 %102, %697, !dbg !41
%702 = add nsw i64 %103, %697, !dbg !41
%703 = add nsw i64 %104, %697, !dbg !41
%704 = add nsw i64 %105, %697, !dbg !41
%705 = add nsw i64 %106, %697, !dbg !41
%706 = add nsw i64 %107, %697, !dbg !41
%707 = add nsw i64 %108, %697, !dbg !41
%708 = add nsw i64 %109, %697, !dbg !41
%709 = add nsw i64 %110, %697, !dbg !41
%710 = add nsw i64 %111, %697, !dbg !41
%711 = add nsw i64 %112, %697, !dbg !41
%712 = add nsw i64 %113, %697, !dbg !41
%713 = add nsw i64 %114, %697, !dbg !41
%714 = add nsw i64 %115, %697, !dbg !41
%715 = getelementptr i16, ptr addrspace(1) %4, i64 %699, !dbg !42
%716 = getelementptr i16, ptr addrspace(1) %4, i64 %700, !dbg !42
%717 = getelementptr i16, ptr addrspace(1) %4, i64 %701, !dbg !42
%718 = getelementptr i16, ptr addrspace(1) %4, i64 %702, !dbg !42
%719 = getelementptr i16, ptr addrspace(1) %4, i64 %703, !dbg !42
%720 = getelementptr i16, ptr addrspace(1) %4, i64 %704, !dbg !42
%721 = getelementptr i16, ptr addrspace(1) %4, i64 %705, !dbg !42
%722 = getelementptr i16, ptr addrspace(1) %4, i64 %706, !dbg !42
%723 = getelementptr i16, ptr addrspace(1) %4, i64 %707, !dbg !42
%724 = getelementptr i16, ptr addrspace(1) %4, i64 %708, !dbg !42
%725 = getelementptr i16, ptr addrspace(1) %4, i64 %709, !dbg !42
%726 = getelementptr i16, ptr addrspace(1) %4, i64 %710, !dbg !42
%727 = getelementptr i16, ptr addrspace(1) %4, i64 %711, !dbg !42
%728 = getelementptr i16, ptr addrspace(1) %4, i64 %712, !dbg !42
%729 = getelementptr i16, ptr addrspace(1) %4, i64 %713, !dbg !42
%730 = getelementptr i16, ptr addrspace(1) %4, i64 %714, !dbg !42
%731 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %715, i1 %698, i16 0, i1 %698) #3, !dbg !43
%732 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %716, i1 %698, i16 0, i1 %698) #3, !dbg !43
%733 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %717, i1 %698, i16 0, i1 %698) #3, !dbg !43
%734 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %718, i1 %698, i16 0, i1 %698) #3, !dbg !43
%735 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %719, i1 %698, i16 0, i1 %698) #3, !dbg !43
%736 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %720, i1 %698, i16 0, i1 %698) #3, !dbg !43
%737 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %721, i1 %698, i16 0, i1 %698) #3, !dbg !43
%738 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %722, i1 %698, i16 0, i1 %698) #3, !dbg !43
%739 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %723, i1 %698, i16 0, i1 %698) #3, !dbg !43
%740 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %724, i1 %698, i16 0, i1 %698) #3, !dbg !43
%741 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %725, i1 %698, i16 0, i1 %698) #3, !dbg !43
%742 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %726, i1 %698, i16 0, i1 %698) #3, !dbg !43
%743 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %727, i1 %698, i16 0, i1 %698) #3, !dbg !43
%744 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %728, i1 %698, i16 0, i1 %698) #3, !dbg !43
%745 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %729, i1 %698, i16 0, i1 %698) #3, !dbg !43
%746 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %730, i1 %698, i16 0, i1 %698) #3, !dbg !43
%747 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %731) #3, !dbg !44
%748 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %732) #3, !dbg !44
%749 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %733) #3, !dbg !44
%750 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %734) #3, !dbg !44
%751 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %735) #3, !dbg !44
%752 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %736) #3, !dbg !44
%753 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %737) #3, !dbg !44
%754 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %738) #3, !dbg !44
%755 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %739) #3, !dbg !44
%756 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %740) #3, !dbg !44
%757 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %741) #3, !dbg !44
%758 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %742) #3, !dbg !44
%759 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %743) #3, !dbg !44
%760 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %744) #3, !dbg !44
%761 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %745) #3, !dbg !44
%762 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %746) #3, !dbg !44
%763 = getelementptr float, ptr addrspace(1) %0, i64 %699, !dbg !45
%764 = getelementptr float, ptr addrspace(1) %0, i64 %700, !dbg !45
%765 = getelementptr float, ptr addrspace(1) %0, i64 %701, !dbg !45
%766 = getelementptr float, ptr addrspace(1) %0, i64 %702, !dbg !45
%767 = getelementptr float, ptr addrspace(1) %0, i64 %703, !dbg !45
%768 = getelementptr float, ptr addrspace(1) %0, i64 %704, !dbg !45
%769 = getelementptr float, ptr addrspace(1) %0, i64 %705, !dbg !45
%770 = getelementptr float, ptr addrspace(1) %0, i64 %706, !dbg !45
%771 = getelementptr float, ptr addrspace(1) %0, i64 %707, !dbg !45
%772 = getelementptr float, ptr addrspace(1) %0, i64 %708, !dbg !45
%773 = getelementptr float, ptr addrspace(1) %0, i64 %709, !dbg !45
%774 = getelementptr float, ptr addrspace(1) %0, i64 %710, !dbg !45
%775 = getelementptr float, ptr addrspace(1) %0, i64 %711, !dbg !45
%776 = getelementptr float, ptr addrspace(1) %0, i64 %712, !dbg !45
%777 = getelementptr float, ptr addrspace(1) %0, i64 %713, !dbg !45
%778 = getelementptr float, ptr addrspace(1) %0, i64 %714, !dbg !45
%779 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %763, i1 %698, i32 0, i1 %698) #3, !dbg !46
%780 = bitcast i32 %779 to float, !dbg !46
%781 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %764, i1 %698, i32 0, i1 %698) #3, !dbg !46
%782 = bitcast i32 %781 to float, !dbg !46
%783 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %765, i1 %698, i32 0, i1 %698) #3, !dbg !46
%784 = bitcast i32 %783 to float, !dbg !46
%785 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %766, i1 %698, i32 0, i1 %698) #3, !dbg !46
%786 = bitcast i32 %785 to float, !dbg !46
%787 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %767, i1 %698, i32 0, i1 %698) #3, !dbg !46
%788 = bitcast i32 %787 to float, !dbg !46
%789 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %768, i1 %698, i32 0, i1 %698) #3, !dbg !46
%790 = bitcast i32 %789 to float, !dbg !46
%791 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %769, i1 %698, i32 0, i1 %698) #3, !dbg !46
%792 = bitcast i32 %791 to float, !dbg !46
%793 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %770, i1 %698, i32 0, i1 %698) #3, !dbg !46
%794 = bitcast i32 %793 to float, !dbg !46
%795 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %771, i1 %698, i32 0, i1 %698) #3, !dbg !46
%796 = bitcast i32 %795 to float, !dbg !46
%797 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %772, i1 %698, i32 0, i1 %698) #3, !dbg !46
%798 = bitcast i32 %797 to float, !dbg !46
%799 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %773, i1 %698, i32 0, i1 %698) #3, !dbg !46
%800 = bitcast i32 %799 to float, !dbg !46
%801 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %774, i1 %698, i32 0, i1 %698) #3, !dbg !46
%802 = bitcast i32 %801 to float, !dbg !46
%803 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %775, i1 %698, i32 0, i1 %698) #3, !dbg !46
%804 = bitcast i32 %803 to float, !dbg !46
%805 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %776, i1 %698, i32 0, i1 %698) #3, !dbg !46
%806 = bitcast i32 %805 to float, !dbg !46
%807 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %777, i1 %698, i32 0, i1 %698) #3, !dbg !46
%808 = bitcast i32 %807 to float, !dbg !46
%809 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %778, i1 %698, i32 0, i1 %698) #3, !dbg !46
%810 = bitcast i32 %809 to float, !dbg !46
%811 = getelementptr i16, ptr addrspace(1) %5, i64 %699, !dbg !47
%812 = getelementptr i16, ptr addrspace(1) %5, i64 %700, !dbg !47
%813 = getelementptr i16, ptr addrspace(1) %5, i64 %701, !dbg !47
%814 = getelementptr i16, ptr addrspace(1) %5, i64 %702, !dbg !47
%815 = getelementptr i16, ptr addrspace(1) %5, i64 %703, !dbg !47
%816 = getelementptr i16, ptr addrspace(1) %5, i64 %704, !dbg !47
%817 = getelementptr i16, ptr addrspace(1) %5, i64 %705, !dbg !47
%818 = getelementptr i16, ptr addrspace(1) %5, i64 %706, !dbg !47
%819 = getelementptr i16, ptr addrspace(1) %5, i64 %707, !dbg !47
%820 = getelementptr i16, ptr addrspace(1) %5, i64 %708, !dbg !47
%821 = getelementptr i16, ptr addrspace(1) %5, i64 %709, !dbg !47
%822 = getelementptr i16, ptr addrspace(1) %5, i64 %710, !dbg !47
%823 = getelementptr i16, ptr addrspace(1) %5, i64 %711, !dbg !47
%824 = getelementptr i16, ptr addrspace(1) %5, i64 %712, !dbg !47
%825 = getelementptr i16, ptr addrspace(1) %5, i64 %713, !dbg !47
%826 = getelementptr i16, ptr addrspace(1) %5, i64 %714, !dbg !47
%827 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %811, i1 %698, i16 0, i1 %698) #3, !dbg !48
%828 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %812, i1 %698, i16 0, i1 %698) #3, !dbg !48
%829 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %813, i1 %698, i16 0, i1 %698) #3, !dbg !48
%830 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %814, i1 %698, i16 0, i1 %698) #3, !dbg !48
%831 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %815, i1 %698, i16 0, i1 %698) #3, !dbg !48
%832 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %816, i1 %698, i16 0, i1 %698) #3, !dbg !48
%833 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %817, i1 %698, i16 0, i1 %698) #3, !dbg !48
%834 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %818, i1 %698, i16 0, i1 %698) #3, !dbg !48
%835 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %819, i1 %698, i16 0, i1 %698) #3, !dbg !48
%836 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %820, i1 %698, i16 0, i1 %698) #3, !dbg !48
%837 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %821, i1 %698, i16 0, i1 %698) #3, !dbg !48
%838 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %822, i1 %698, i16 0, i1 %698) #3, !dbg !48
%839 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %823, i1 %698, i16 0, i1 %698) #3, !dbg !48
%840 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %824, i1 %698, i16 0, i1 %698) #3, !dbg !48
%841 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %825, i1 %698, i16 0, i1 %698) #3, !dbg !48
%842 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %826, i1 %698, i16 0, i1 %698) #3, !dbg !48
%843 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %827) #3, !dbg !49
%844 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %828) #3, !dbg !49
%845 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %829) #3, !dbg !49
%846 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %830) #3, !dbg !49
%847 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %831) #3, !dbg !49
%848 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %832) #3, !dbg !49
%849 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %833) #3, !dbg !49
%850 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %834) #3, !dbg !49
%851 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %835) #3, !dbg !49
%852 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %836) #3, !dbg !49
%853 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %837) #3, !dbg !49
%854 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %838) #3, !dbg !49
%855 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %839) #3, !dbg !49
%856 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %840) #3, !dbg !49
%857 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %841) #3, !dbg !49
%858 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %842) #3, !dbg !49
%859 = fmul float %678, %780, !dbg !37
%860 = fmul float %679, %782, !dbg !37
%861 = fmul float %680, %784, !dbg !37
%862 = fmul float %681, %786, !dbg !37
%863 = fmul float %682, %788, !dbg !37
%864 = fmul float %683, %790, !dbg !37
%865 = fmul float %684, %792, !dbg !37
%866 = fmul float %685, %794, !dbg !37
%867 = fmul float %686, %796, !dbg !37
%868 = fmul float %687, %798, !dbg !37
%869 = fmul float %688, %800, !dbg !37
%870 = fmul float %689, %802, !dbg !37
%871 = fmul float %690, %804, !dbg !37
%872 = fmul float %691, %806, !dbg !37
%873 = fmul float %692, %808, !dbg !37
%874 = fmul float %693, %810, !dbg !37
%875 = fmul float %843, 0x3FF7154760000000, !dbg !50
%876 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %875) #3, !dbg !50
%877 = fmul float %844, 0x3FF7154760000000, !dbg !50
%878 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %877) #3, !dbg !50
%879 = fmul float %845, 0x3FF7154760000000, !dbg !50
%880 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %879) #3, !dbg !50
%881 = fmul float %846, 0x3FF7154760000000, !dbg !50
%882 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %881) #3, !dbg !50
%883 = fmul float %847, 0x3FF7154760000000, !dbg !50
%884 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %883) #3, !dbg !50
%885 = fmul float %848, 0x3FF7154760000000, !dbg !50
%886 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %885) #3, !dbg !50
%887 = fmul float %849, 0x3FF7154760000000, !dbg !50
%888 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %887) #3, !dbg !50
%889 = fmul float %850, 0x3FF7154760000000, !dbg !50
%890 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %889) #3, !dbg !50
%891 = fmul float %851, 0x3FF7154760000000, !dbg !50
%892 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %891) #3, !dbg !50
%893 = fmul float %852, 0x3FF7154760000000, !dbg !50
%894 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %893) #3, !dbg !50
%895 = fmul float %853, 0x3FF7154760000000, !dbg !50
%896 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %895) #3, !dbg !50
%897 = fmul float %854, 0x3FF7154760000000, !dbg !50
%898 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %897) #3, !dbg !50
%899 = fmul float %855, 0x3FF7154760000000, !dbg !50
%900 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %899) #3, !dbg !50
%901 = fmul float %856, 0x3FF7154760000000, !dbg !50
%902 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %901) #3, !dbg !50
%903 = fmul float %857, 0x3FF7154760000000, !dbg !50
%904 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %903) #3, !dbg !50
%905 = fmul float %858, 0x3FF7154760000000, !dbg !50
%906 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %905) #3, !dbg !50
%907 = fmul float %632, %876, !dbg !51
%908 = fmul float %635, %878, !dbg !51
%909 = fmul float %638, %880, !dbg !51
%910 = fmul float %641, %882, !dbg !51
%911 = fmul float %644, %884, !dbg !51
%912 = fmul float %647, %886, !dbg !51
%913 = fmul float %650, %888, !dbg !51
%914 = fmul float %653, %890, !dbg !51
%915 = fmul float %656, %892, !dbg !51
%916 = fmul float %659, %894, !dbg !51
%917 = fmul float %662, %896, !dbg !51
%918 = fmul float %665, %898, !dbg !51
%919 = fmul float %668, %900, !dbg !51
%920 = fmul float %671, %902, !dbg !51
%921 = fmul float %674, %904, !dbg !51
%922 = fmul float %677, %906, !dbg !51
%923 = fsub float %859, %907, !dbg !52
%924 = fsub float %860, %908, !dbg !52
%925 = fsub float %861, %909, !dbg !52
%926 = fsub float %862, %910, !dbg !52
%927 = fsub float %863, %911, !dbg !52
%928 = fsub float %864, %912, !dbg !52
%929 = fsub float %865, %913, !dbg !52
%930 = fsub float %866, %914, !dbg !52
%931 = fsub float %867, %915, !dbg !52
%932 = fsub float %868, %916, !dbg !52
%933 = fsub float %869, %917, !dbg !52
%934 = fsub float %870, %918, !dbg !52
%935 = fsub float %871, %919, !dbg !52
%936 = fsub float %872, %920, !dbg !52
%937 = fsub float %873, %921, !dbg !52
%938 = fsub float %874, %922, !dbg !52
%939 = fadd float %747, %923, !dbg !53
%940 = fadd float %748, %924, !dbg !53
%941 = fadd float %749, %925, !dbg !53
%942 = fadd float %750, %926, !dbg !53
%943 = fadd float %751, %927, !dbg !53
%944 = fadd float %752, %928, !dbg !53
%945 = fadd float %753, %929, !dbg !53
%946 = fadd float %754, %930, !dbg !53
%947 = fadd float %755, %931, !dbg !53
%948 = fadd float %756, %932, !dbg !53
%949 = fadd float %757, %933, !dbg !53
%950 = fadd float %758, %934, !dbg !53
%951 = fadd float %759, %935, !dbg !53
%952 = fadd float %760, %936, !dbg !53
%953 = fadd float %761, %937, !dbg !53
%954 = fadd float %762, %938, !dbg !53
%955 = getelementptr i16, ptr addrspace(1) %6, i64 %699, !dbg !54
%956 = getelementptr i16, ptr addrspace(1) %6, i64 %700, !dbg !54
%957 = getelementptr i16, ptr addrspace(1) %6, i64 %701, !dbg !54
%958 = getelementptr i16, ptr addrspace(1) %6, i64 %702, !dbg !54
%959 = getelementptr i16, ptr addrspace(1) %6, i64 %703, !dbg !54
%960 = getelementptr i16, ptr addrspace(1) %6, i64 %704, !dbg !54
%961 = getelementptr i16, ptr addrspace(1) %6, i64 %705, !dbg !54
%962 = getelementptr i16, ptr addrspace(1) %6, i64 %706, !dbg !54
%963 = getelementptr i16, ptr addrspace(1) %6, i64 %707, !dbg !54
%964 = getelementptr i16, ptr addrspace(1) %6, i64 %708, !dbg !54
%965 = getelementptr i16, ptr addrspace(1) %6, i64 %709, !dbg !54
%966 = getelementptr i16, ptr addrspace(1) %6, i64 %710, !dbg !54
%967 = getelementptr i16, ptr addrspace(1) %6, i64 %711, !dbg !54
%968 = getelementptr i16, ptr addrspace(1) %6, i64 %712, !dbg !54
%969 = getelementptr i16, ptr addrspace(1) %6, i64 %713, !dbg !54
%970 = getelementptr i16, ptr addrspace(1) %6, i64 %714, !dbg !54
%971 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %939) #3, !dbg !55
%972 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %940) #3, !dbg !55
%973 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %941) #3, !dbg !55
%974 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %942) #3, !dbg !55
%975 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %943) #3, !dbg !55
%976 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %944) #3, !dbg !55
%977 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %945) #3, !dbg !55
%978 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %946) #3, !dbg !55
%979 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %947) #3, !dbg !55
%980 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %948) #3, !dbg !55
%981 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %949) #3, !dbg !55
%982 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %950) #3, !dbg !55
%983 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %951) #3, !dbg !55
%984 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %952) #3, !dbg !55
%985 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %953) #3, !dbg !55
%986 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %954) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %971, ptr addrspace(1) %955, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %972, ptr addrspace(1) %956, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %973, ptr addrspace(1) %957, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %974, ptr addrspace(1) %958, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %975, ptr addrspace(1) %959, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %976, ptr addrspace(1) %960, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %977, ptr addrspace(1) %961, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %978, ptr addrspace(1) %962, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %979, ptr addrspace(1) %963, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %980, ptr addrspace(1) %964, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %981, ptr addrspace(1) %965, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %982, ptr addrspace(1) %966, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %983, ptr addrspace(1) %967, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %984, ptr addrspace(1) %968, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %985, ptr addrspace(1) %969, i1 %698) #3, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %986, ptr addrspace(1) %970, i1 %698) #3, !dbg !55
%987 = add nuw nsw i32 %695, 64, !dbg !38
%988 = icmp ult i32 %695, 50193, !dbg !38
br i1 %988, label %694, label %989, !dbg !38
989: ; preds = %694
ret void, !dbg !56
}
; NVVM intrinsic that reads the PTX special register %tid.x.
; The kernel calls it once at entry and derives its per-thread indices from the
; result by masking and shifting (and 31, lshr 6, and 63).
; Takes no operands and returns an i32 that is marked noundef.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; NVVM warp-shuffle intrinsic (shfl.sync, butterfly mode) on an i32 payload.
; Takes four i32 operands and returns an i32.
; NOTE(review): operand order is presumably (member mask, value, lane mask,
; clamp) as in the NVPTX intrinsic definition -- confirm against the LLVM docs.
; NOTE(review): presumably used for the reduction whose debug locations point
; into triton/language/standard.py (!29-!36) -- verify at the call sites.
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
; NVVM barrier intrinsic (barrier 0). Takes no operands and returns nothing.
; It carries `convergent` and no memory(...) restriction (attribute group #2),
; so the optimizer must treat it as possibly reading/writing any memory and
; must not make the call control-dependent on additional values.
; NOTE(review): presumably pairs with accesses to @global_smem -- the call
; sites should be checked to confirm.
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2
; Attribute groups referenced by the declarations and call sites in this module.
;
; #0 -- attached to @llvm.nvvm.read.ptx.sreg.tid.x: touches no memory
;       (memory(none)) and is speculatable, so calls may be freely hoisted.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; #1 -- attached to @llvm.nvvm.shfl.sync.bfly.i32: convergent, and limited to
;       memory that is inaccessible to the module.
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
; #2 -- attached to @llvm.nvvm.barrier0: convergent with no memory(...)
;       restriction.
attributes #2 = { convergent nocallback nounwind }
; #3 -- attached to the inline-asm call sites in the kernel (e.g. the
;       %ctaid.x read, bf16<->f32 cvt, ex2.approx and predicated st.global).
attributes #3 = { nounwind }
; ---- Module-level named metadata -------------------------------------------
; Module flags: only the debug-info version flag (!0).
!llvm.module.flags = !{!0}
; Single debug compile unit (!1).
!llvm.dbg.cu = !{!1}
; NVVM annotations for the kernel function.
; NOTE(review): !3 and !4 are each listed twice; this looks like redundant
; generator output -- confirm it is harmless before de-duplicating.
!nvvm.annotations = !{!3, !4, !4, !3}
; Flag "Debug Info Version" = 3, with merge behavior 2.
!0 = !{i32 2, !"Debug Info Version", i32 3}
; Compile unit produced by "triton"; full debug info, marked optimized.
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
; Source file of the kernel (a generated .py under /tmp/torchinductor_root).
!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
; Marks @triton__0d1d2d3d4d5d6d7de8 as a kernel entry point.
!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
; "maxntidx" = 256 for the kernel.
; NOTE(review): presumably the launch-bound on threads per block in x; the
; kernel only uses tid.x bits 0-7 (and 63, lshr 6 / and 3) -- confirm.
!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
; Debug subprogram for the kernel; definition starts at line 18 of file !2.
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
; Subroutine type with an empty type list (!7): no parameter or return types
; are described in the debug info.
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
; ---- Source locations referenced by !dbg attachments -----------------------
; Nodes whose scope is !5 give a line/column in the kernel source file !2.
;
; !8-!13: attached to the index setup at kernel entry -- the tid.x read and
; its masks/shifts (!8, !9), the %ctaid.x read (!10), its sign-extension
; (!11), the shift left by 6 (!12) and the OR that combines both (!13).
!8 = !DILocation(line: 22, column: 44, scope: !5)
!9 = !DILocation(line: 24, column: 33, scope: !5)
!10 = !DILocation(line: 21, column: 28, scope: !5)
!11 = !DILocation(line: 21, column: 34, scope: !5)
!12 = !DILocation(line: 21, column: 46, scope: !5)
!13 = !DILocation(line: 22, column: 23, scope: !5)
!14 = !DILocation(line: 26, column: 30, scope: !5)
!15 = !DILocation(line: 26, column: 35, scope: !5)
!16 = !DILocation(line: 27, column: 19, scope: !5)
!17 = !DILocation(line: 29, column: 19, scope: !5)
!18 = !DILocation(line: 36, column: 46, scope: !5)
!19 = !DILocation(line: 38, column: 23, scope: !5)
!20 = !DILocation(line: 39, column: 22, scope: !5)
!21 = !DILocation(line: 41, column: 37, scope: !5)
!22 = !DILocation(line: 32, column: 36, scope: !5)
!23 = !DILocation(line: 33, column: 27, scope: !5)
!24 = !DILocation(line: 34, column: 25, scope: !5)
!25 = !DILocation(line: 36, column: 34, scope: !5)
!26 = !DILocation(line: 36, column: 52, scope: !5)
!27 = !DILocation(line: 42, column: 23, scope: !5)
!28 = !DILocation(line: 45, column: 40, scope: !5)
; !29-!36: locations scoped to lexical-block-file nodes for
; triton/language/standard.py (file !31), chained through inlinedAt to
; line 46, column 27.
; NOTE(review): line 46 is presumably the call site in the kernel source, but
; !32/!36 are scoped to the standard.py block-file nodes -- confirm.
!29 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32)
!30 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!32 = !DILocation(line: 46, column: 27, scope: !30)
!33 = !DILocation(line: 233, column: 15, scope: !34, inlinedAt: !35)
!34 = distinct !DILexicalBlockFile(scope: !30, file: !31, discriminator: 0)
!35 = !DILocation(line: 243, column: 36, scope: !34, inlinedAt: !36)
!36 = !DILocation(line: 46, column: 27, scope: !34)
; !37-!56: locations used in the kernel's final loop. Attachments that can be
; seen on the loop's closing instructions:
;   !37 -- fmul producing the first product term
;   !38 -- loop counter increment, bound compare and back-edge branch
;   !49 -- cvt.f32.bf16 inline asm (bf16 -> f32)
;   !50 -- fmul by 0x3FF7154760000000 followed by ex2.approx.f32
;   !51 -- fmul with the ex2 result
;   !52 -- fsub of the two products
;   !53 -- fadd accumulating the difference
;   !54 -- getelementptr off kernel argument %6
;   !55 -- cvt.rn.bf16.f32 and the predicated st.global.b16
;   !56 -- ret void after the loop exits
!37 = !DILocation(line: 63, column: 24, scope: !5)
!38 = !DILocation(line: 51, column: 36, scope: !5)
!39 = !DILocation(line: 52, column: 27, scope: !5)
!40 = !DILocation(line: 53, column: 25, scope: !5)
!41 = !DILocation(line: 55, column: 41, scope: !5)
!42 = !DILocation(line: 55, column: 35, scope: !5)
!43 = !DILocation(line: 55, column: 53, scope: !5)
!44 = !DILocation(line: 55, column: 105, scope: !5)
!45 = !DILocation(line: 56, column: 35, scope: !5)
!46 = !DILocation(line: 56, column: 53, scope: !5)
!47 = !DILocation(line: 57, column: 35, scope: !5)
!48 = !DILocation(line: 57, column: 53, scope: !5)
!49 = !DILocation(line: 57, column: 105, scope: !5)
!50 = !DILocation(line: 65, column: 23, scope: !5)
!51 = !DILocation(line: 66, column: 24, scope: !5)
!52 = !DILocation(line: 67, column: 24, scope: !5)
!53 = !DILocation(line: 69, column: 24, scope: !5)
!54 = !DILocation(line: 70, column: 29, scope: !5)
!55 = !DILocation(line: 70, column: 54, scope: !5)
!56 = !DILocation(line: 51, column: 4, scope: !5)