; ModuleID = 'LLVMDialectModule' source_filename = "LLVMDialectModule" @global_smem = external addrspace(3) global [0 x i8] define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 { %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 %11 = and i32 %10, 31, !dbg !8 %12 = lshr i32 %10, 6, !dbg !8 %13 = and i32 %12, 3, !dbg !8 %14 = or i32 %13, 4, !dbg !8 %15 = or i32 %13, 8, !dbg !8 %16 = or i32 %13, 12, !dbg !8 %17 = or i32 %13, 16, !dbg !8 %18 = or i32 %13, 20, !dbg !8 %19 = or i32 %13, 24, !dbg !8 %20 = or i32 %13, 28, !dbg !8 %21 = or i32 %13, 32, !dbg !8 %22 = or i32 %13, 36, !dbg !8 %23 = or i32 %13, 40, !dbg !8 %24 = or i32 %13, 44, !dbg !8 %25 = or i32 %13, 48, !dbg !8 %26 = or i32 %13, 52, !dbg !8 %27 = or i32 %13, 56, !dbg !8 %28 = or i32 %13, 60, !dbg !8 %urem = and i32 %10, 63, !dbg !9 %29 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 %30 = sext i32 %29 to i64, !dbg !11 %31 = shl nsw i64 %30, 6, !dbg !12 %32 = zext nneg i32 %13 to i64 %33 = zext nneg i32 %14 to i64 %34 = zext nneg i32 %15 to i64 %35 = zext nneg i32 %16 to i64 %36 = zext nneg i32 %17 to i64 %37 = zext nneg i32 %18 to i64 %38 = zext nneg i32 %19 to i64 %39 = zext nneg i32 %20 to i64 %40 = zext nneg i32 %21 to i64 %41 = zext nneg i32 %22 to i64 %42 = zext nneg i32 %23 to i64 %43 = zext nneg i32 %24 to i64 %44 = zext nneg i32 %25 to i64 %45 = zext nneg i32 %26 to i64 %46 = zext nneg i32 %27 to i64 %47 = zext nneg i32 %28 to i64 %48 = or i64 %31, %32, !dbg !13 %49 = or i64 %31, %33, !dbg !13 %50 = or i64 %31, %34, !dbg !13 %51 = or i64 %31, %35, !dbg !13 %52 = or i64 %31, %36, !dbg !13 %53 = or i64 %31, %37, !dbg !13 %54 = or i64 %31, %38, !dbg !13 %55 = or i64 %31, %39, !dbg !13 %56 = or i64 %31, %40, !dbg !13 %57 = or i64 %31, %41, !dbg !13 %58 = or i64 %31, %42, !dbg !13 %59 = or i64 %31, 
%43, !dbg !13 %60 = or i64 %31, %44, !dbg !13 %61 = or i64 %31, %45, !dbg !13 %62 = or i64 %31, %46, !dbg !13 %63 = or i64 %31, %47, !dbg !13 %64 = getelementptr i64, ptr addrspace(1) %1, i64 %48, !dbg !14 %65 = getelementptr i64, ptr addrspace(1) %1, i64 %49, !dbg !14 %66 = getelementptr i64, ptr addrspace(1) %1, i64 %50, !dbg !14 %67 = getelementptr i64, ptr addrspace(1) %1, i64 %51, !dbg !14 %68 = getelementptr i64, ptr addrspace(1) %1, i64 %52, !dbg !14 %69 = getelementptr i64, ptr addrspace(1) %1, i64 %53, !dbg !14 %70 = getelementptr i64, ptr addrspace(1) %1, i64 %54, !dbg !14 %71 = getelementptr i64, ptr addrspace(1) %1, i64 %55, !dbg !14 %72 = getelementptr i64, ptr addrspace(1) %1, i64 %56, !dbg !14 %73 = getelementptr i64, ptr addrspace(1) %1, i64 %57, !dbg !14 %74 = getelementptr i64, ptr addrspace(1) %1, i64 %58, !dbg !14 %75 = getelementptr i64, ptr addrspace(1) %1, i64 %59, !dbg !14 %76 = getelementptr i64, ptr addrspace(1) %1, i64 %60, !dbg !14 %77 = getelementptr i64, ptr addrspace(1) %1, i64 %61, !dbg !14 %78 = getelementptr i64, ptr addrspace(1) %1, i64 %62, !dbg !14 %79 = getelementptr i64, ptr addrspace(1) %1, i64 %63, !dbg !14 %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %64, i1 true) #3, !dbg !15 %81 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %65, i1 true) #3, !dbg !15 %82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !15 %83 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %67, i1 true) #3, !dbg !15 %84 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %68, i1 true) #3, !dbg !15 %85 
= tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %69, i1 true) #3, !dbg !15 %86 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %70, i1 true) #3, !dbg !15 %87 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %71, i1 true) #3, !dbg !15 %88 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !15 %89 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %73, i1 true) #3, !dbg !15 %90 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %74, i1 true) #3, !dbg !15 %91 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %75, i1 true) #3, !dbg !15 %92 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %76, i1 true) #3, !dbg !15 %93 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %77, i1 true) #3, !dbg !15 %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %78, i1 true) #3, !dbg !15 %95 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %79, i1 true) #3, !dbg !15 %96 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !16 %97 = bitcast i32 %96 to 
float, !dbg !16 %98 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !17 %99 = bitcast i32 %98 to float, !dbg !17 %100 = mul nsw i64 %48, 50257, !dbg !18 %101 = mul nsw i64 %49, 50257, !dbg !18 %102 = mul nsw i64 %50, 50257, !dbg !18 %103 = mul nsw i64 %51, 50257, !dbg !18 %104 = mul nsw i64 %52, 50257, !dbg !18 %105 = mul nsw i64 %53, 50257, !dbg !18 %106 = mul nsw i64 %54, 50257, !dbg !18 %107 = mul nsw i64 %55, 50257, !dbg !18 %108 = mul nsw i64 %56, 50257, !dbg !18 %109 = mul nsw i64 %57, 50257, !dbg !18 %110 = mul nsw i64 %58, 50257, !dbg !18 %111 = mul nsw i64 %59, 50257, !dbg !18 %112 = mul nsw i64 %60, 50257, !dbg !18 %113 = mul nsw i64 %61, 50257, !dbg !18 %114 = mul nsw i64 %62, 50257, !dbg !18 %115 = mul nsw i64 %63, 50257, !dbg !18 %116 = insertelement <16 x i64> poison, i64 %80, i64 0, !dbg !19 %117 = insertelement <16 x i64> %116, i64 %81, i64 1, !dbg !19 %118 = insertelement <16 x i64> %117, i64 %82, i64 2, !dbg !19 %119 = insertelement <16 x i64> %118, i64 %83, i64 3, !dbg !19 %120 = insertelement <16 x i64> %119, i64 %84, i64 4, !dbg !19 %121 = insertelement <16 x i64> %120, i64 %85, i64 5, !dbg !19 %122 = insertelement <16 x i64> %121, i64 %86, i64 6, !dbg !19 %123 = insertelement <16 x i64> %122, i64 %87, i64 7, !dbg !19 %124 = insertelement <16 x i64> %123, i64 %88, i64 8, !dbg !19 %125 = insertelement <16 x i64> %124, i64 %89, i64 9, !dbg !19 %126 = insertelement <16 x i64> %125, i64 %90, i64 10, !dbg !19 %127 = insertelement <16 x i64> %126, i64 %91, i64 11, !dbg !19 %128 = insertelement <16 x i64> %127, i64 %92, i64 12, !dbg !19 %129 = insertelement <16 x i64> %128, i64 %93, i64 13, !dbg !19 %130 = insertelement <16 x i64> %129, i64 %94, i64 14, !dbg !19 %131 = insertelement <16 x i64> %130, i64 %95, i64 15, !dbg !19 %132 = icmp eq <16 x i64> %131, , !dbg !19 %133 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %97, float %99) #3, 
!dbg !20 %134 = insertelement <16 x float> poison, float %133, i64 0, !dbg !21 %135 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> zeroinitializer, !dbg !21 %136 = select <16 x i1> %132, <16 x float> zeroinitializer, <16 x float> %135, !dbg !21 %137 = getelementptr float, ptr addrspace(1) %0, i64 %100 %138 = getelementptr float, ptr addrspace(1) %0, i64 %101 %139 = getelementptr float, ptr addrspace(1) %0, i64 %102 %140 = getelementptr float, ptr addrspace(1) %0, i64 %103 %141 = getelementptr float, ptr addrspace(1) %0, i64 %104 %142 = getelementptr float, ptr addrspace(1) %0, i64 %105 %143 = getelementptr float, ptr addrspace(1) %0, i64 %106 %144 = getelementptr float, ptr addrspace(1) %0, i64 %107 %145 = getelementptr float, ptr addrspace(1) %0, i64 %108 %146 = getelementptr float, ptr addrspace(1) %0, i64 %109 %147 = getelementptr float, ptr addrspace(1) %0, i64 %110 %148 = getelementptr float, ptr addrspace(1) %0, i64 %111 %149 = getelementptr float, ptr addrspace(1) %0, i64 %112 %150 = getelementptr float, ptr addrspace(1) %0, i64 %113 %151 = getelementptr float, ptr addrspace(1) %0, i64 %114 %152 = getelementptr float, ptr addrspace(1) %0, i64 %115 br label %153, !dbg !22 153: ; preds = %9, %153 %154 = phi i32 [ 0, %9 ], [ %213, %153 ] %155 = phi <16 x float> [ zeroinitializer, %9 ], [ %212, %153 ] %156 = or i32 %154, %urem, !dbg !23 %157 = zext nneg i32 %156 to i64, !dbg !23 %158 = icmp ult i32 %156, 50257, !dbg !24 %159 = getelementptr float, ptr addrspace(1) %137, i64 %157, !dbg !25 %160 = getelementptr float, ptr addrspace(1) %138, i64 %157, !dbg !25 %161 = getelementptr float, ptr addrspace(1) %139, i64 %157, !dbg !25 %162 = getelementptr float, ptr addrspace(1) %140, i64 %157, !dbg !25 %163 = getelementptr float, ptr addrspace(1) %141, i64 %157, !dbg !25 %164 = getelementptr float, ptr addrspace(1) %142, i64 %157, !dbg !25 %165 = getelementptr float, ptr addrspace(1) %143, i64 %157, !dbg !25 %166 = getelementptr float, ptr 
addrspace(1) %144, i64 %157, !dbg !25 %167 = getelementptr float, ptr addrspace(1) %145, i64 %157, !dbg !25 %168 = getelementptr float, ptr addrspace(1) %146, i64 %157, !dbg !25 %169 = getelementptr float, ptr addrspace(1) %147, i64 %157, !dbg !25 %170 = getelementptr float, ptr addrspace(1) %148, i64 %157, !dbg !25 %171 = getelementptr float, ptr addrspace(1) %149, i64 %157, !dbg !25 %172 = getelementptr float, ptr addrspace(1) %150, i64 %157, !dbg !25 %173 = getelementptr float, ptr addrspace(1) %151, i64 %157, !dbg !25 %174 = getelementptr float, ptr addrspace(1) %152, i64 %157, !dbg !25 %175 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %159, i1 %158, i32 0, i1 %158) #3, !dbg !26 %176 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %160, i1 %158, i32 0, i1 %158) #3, !dbg !26 %177 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %161, i1 %158, i32 0, i1 %158) #3, !dbg !26 %178 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %162, i1 %158, i32 0, i1 %158) #3, !dbg !26 %179 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %163, i1 %158, i32 0, i1 %158) #3, !dbg !26 %180 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %164, i1 %158, i32 0, i1 %158) #3, !dbg !26 %181 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 
];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %165, i1 %158, i32 0, i1 %158) #3, !dbg !26 %182 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %166, i1 %158, i32 0, i1 %158) #3, !dbg !26 %183 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %167, i1 %158, i32 0, i1 %158) #3, !dbg !26 %184 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %168, i1 %158, i32 0, i1 %158) #3, !dbg !26 %185 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %169, i1 %158, i32 0, i1 %158) #3, !dbg !26 %186 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %170, i1 %158, i32 0, i1 %158) #3, !dbg !26 %187 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %171, i1 %158, i32 0, i1 %158) #3, !dbg !26 %188 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %172, i1 %158, i32 0, i1 %158) #3, !dbg !26 %189 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %173, i1 %158, i32 0, i1 %158) #3, !dbg !26 %190 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %174, i1 
%158, i32 0, i1 %158) #3, !dbg !26 %191 = insertelement <16 x i32> poison, i32 %175, i64 0, !dbg !26 %192 = insertelement <16 x i32> %191, i32 %176, i64 1, !dbg !26 %193 = insertelement <16 x i32> %192, i32 %177, i64 2, !dbg !26 %194 = insertelement <16 x i32> %193, i32 %178, i64 3, !dbg !26 %195 = insertelement <16 x i32> %194, i32 %179, i64 4, !dbg !26 %196 = insertelement <16 x i32> %195, i32 %180, i64 5, !dbg !26 %197 = insertelement <16 x i32> %196, i32 %181, i64 6, !dbg !26 %198 = insertelement <16 x i32> %197, i32 %182, i64 7, !dbg !26 %199 = insertelement <16 x i32> %198, i32 %183, i64 8, !dbg !26 %200 = insertelement <16 x i32> %199, i32 %184, i64 9, !dbg !26 %201 = insertelement <16 x i32> %200, i32 %185, i64 10, !dbg !26 %202 = insertelement <16 x i32> %201, i32 %186, i64 11, !dbg !26 %203 = insertelement <16 x i32> %202, i32 %187, i64 12, !dbg !26 %204 = insertelement <16 x i32> %203, i32 %188, i64 13, !dbg !26 %205 = insertelement <16 x i32> %204, i32 %189, i64 14, !dbg !26 %206 = insertelement <16 x i32> %205, i32 %190, i64 15, !dbg !26 %207 = bitcast <16 x i32> %206 to <16 x float>, !dbg !26 %208 = fmul <16 x float> %136, %207, !dbg !27 %209 = insertelement <16 x i1> poison, i1 %158, i64 0, !dbg !28 %210 = shufflevector <16 x i1> %209, <16 x i1> poison, <16 x i32> zeroinitializer, !dbg !28 %211 = select <16 x i1> %210, <16 x float> %208, <16 x float> , !dbg !28 %212 = fadd <16 x float> %155, %211, !dbg !28 %213 = add nuw nsw i32 %154, 64, !dbg !22 %214 = icmp ult i32 %154, 50193, !dbg !22 br i1 %214, label %153, label %215, !dbg !22 215: ; preds = %153 %216 = lshr i32 %10, 5, !dbg !8 %217 = and i32 %216, 1, !dbg !9 %218 = extractelement <16 x float> %212, i64 0, !dbg !29 %219 = bitcast float %218 to i32, !dbg !29 %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 16, i32 31), !dbg !29 %221 = bitcast i32 %220 to float, !dbg !29 %222 = fadd float %218, %221, !dbg !33 %223 = bitcast float %222 to i32, !dbg !29 %224 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 8, i32 31), !dbg !29 %225 = bitcast i32 %224 to float, !dbg !29 %226 = fadd float %222, %225, !dbg !33 %227 = bitcast float %226 to i32, !dbg !29 %228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 4, i32 31), !dbg !29 %229 = bitcast i32 %228 to float, !dbg !29 %230 = fadd float %226, %229, !dbg !33 %231 = bitcast float %230 to i32, !dbg !29 %232 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %231, i32 2, i32 31), !dbg !29 %233 = bitcast i32 %232 to float, !dbg !29 %234 = fadd float %230, %233, !dbg !33 %235 = bitcast float %234 to i32, !dbg !29 %236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %235, i32 1, i32 31), !dbg !29 %237 = bitcast i32 %236 to float, !dbg !29 %238 = fadd float %234, %237, !dbg !33 %239 = extractelement <16 x float> %212, i64 1, !dbg !29 %240 = bitcast float %239 to i32, !dbg !29 %241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 16, i32 31), !dbg !29 %242 = bitcast i32 %241 to float, !dbg !29 %243 = fadd float %239, %242, !dbg !33 %244 = bitcast float %243 to i32, !dbg !29 %245 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 8, i32 31), !dbg !29 %246 = bitcast i32 %245 to float, !dbg !29 %247 = fadd float %243, %246, !dbg !33 %248 = bitcast float %247 to i32, !dbg !29 %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 4, i32 31), !dbg !29 %250 = bitcast i32 %249 to float, !dbg !29 %251 = fadd float %247, %250, !dbg !33 %252 = bitcast float %251 to i32, !dbg !29 %253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 2, i32 31), !dbg !29 %254 = bitcast i32 %253 to float, !dbg !29 %255 = fadd float %251, %254, !dbg !33 %256 = bitcast float %255 to i32, !dbg !29 %257 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %256, i32 1, i32 31), !dbg !29 %258 = bitcast i32 %257 to float, !dbg !29 %259 = fadd float %255, %258, !dbg !33 %260 = extractelement <16 x float> 
%212, i64 2, !dbg !29 %261 = bitcast float %260 to i32, !dbg !29 %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 16, i32 31), !dbg !29 %263 = bitcast i32 %262 to float, !dbg !29 %264 = fadd float %260, %263, !dbg !33 %265 = bitcast float %264 to i32, !dbg !29 %266 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 8, i32 31), !dbg !29 %267 = bitcast i32 %266 to float, !dbg !29 %268 = fadd float %264, %267, !dbg !33 %269 = bitcast float %268 to i32, !dbg !29 %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !29 %271 = bitcast i32 %270 to float, !dbg !29 %272 = fadd float %268, %271, !dbg !33 %273 = bitcast float %272 to i32, !dbg !29 %274 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 2, i32 31), !dbg !29 %275 = bitcast i32 %274 to float, !dbg !29 %276 = fadd float %272, %275, !dbg !33 %277 = bitcast float %276 to i32, !dbg !29 %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 1, i32 31), !dbg !29 %279 = bitcast i32 %278 to float, !dbg !29 %280 = fadd float %276, %279, !dbg !33 %281 = extractelement <16 x float> %212, i64 3, !dbg !29 %282 = bitcast float %281 to i32, !dbg !29 %283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %282, i32 16, i32 31), !dbg !29 %284 = bitcast i32 %283 to float, !dbg !29 %285 = fadd float %281, %284, !dbg !33 %286 = bitcast float %285 to i32, !dbg !29 %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 8, i32 31), !dbg !29 %288 = bitcast i32 %287 to float, !dbg !29 %289 = fadd float %285, %288, !dbg !33 %290 = bitcast float %289 to i32, !dbg !29 %291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 4, i32 31), !dbg !29 %292 = bitcast i32 %291 to float, !dbg !29 %293 = fadd float %289, %292, !dbg !33 %294 = bitcast float %293 to i32, !dbg !29 %295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 2, i32 31), !dbg !29 %296 = bitcast i32 %295 to 
float, !dbg !29 %297 = fadd float %293, %296, !dbg !33 %298 = bitcast float %297 to i32, !dbg !29 %299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 1, i32 31), !dbg !29 %300 = bitcast i32 %299 to float, !dbg !29 %301 = fadd float %297, %300, !dbg !33 %302 = extractelement <16 x float> %212, i64 4, !dbg !29 %303 = bitcast float %302 to i32, !dbg !29 %304 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %303, i32 16, i32 31), !dbg !29 %305 = bitcast i32 %304 to float, !dbg !29 %306 = fadd float %302, %305, !dbg !33 %307 = bitcast float %306 to i32, !dbg !29 %308 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %307, i32 8, i32 31), !dbg !29 %309 = bitcast i32 %308 to float, !dbg !29 %310 = fadd float %306, %309, !dbg !33 %311 = bitcast float %310 to i32, !dbg !29 %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %311, i32 4, i32 31), !dbg !29 %313 = bitcast i32 %312 to float, !dbg !29 %314 = fadd float %310, %313, !dbg !33 %315 = bitcast float %314 to i32, !dbg !29 %316 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %315, i32 2, i32 31), !dbg !29 %317 = bitcast i32 %316 to float, !dbg !29 %318 = fadd float %314, %317, !dbg !33 %319 = bitcast float %318 to i32, !dbg !29 %320 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %319, i32 1, i32 31), !dbg !29 %321 = bitcast i32 %320 to float, !dbg !29 %322 = fadd float %318, %321, !dbg !33 %323 = extractelement <16 x float> %212, i64 5, !dbg !29 %324 = bitcast float %323 to i32, !dbg !29 %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 16, i32 31), !dbg !29 %326 = bitcast i32 %325 to float, !dbg !29 %327 = fadd float %323, %326, !dbg !33 %328 = bitcast float %327 to i32, !dbg !29 %329 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %328, i32 8, i32 31), !dbg !29 %330 = bitcast i32 %329 to float, !dbg !29 %331 = fadd float %327, %330, !dbg !33 %332 = bitcast float %331 to i32, !dbg !29 %333 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %332, i32 4, i32 31), !dbg !29 %334 = bitcast i32 %333 to float, !dbg !29 %335 = fadd float %331, %334, !dbg !33 %336 = bitcast float %335 to i32, !dbg !29 %337 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %336, i32 2, i32 31), !dbg !29 %338 = bitcast i32 %337 to float, !dbg !29 %339 = fadd float %335, %338, !dbg !33 %340 = bitcast float %339 to i32, !dbg !29 %341 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %340, i32 1, i32 31), !dbg !29 %342 = bitcast i32 %341 to float, !dbg !29 %343 = fadd float %339, %342, !dbg !33 %344 = extractelement <16 x float> %212, i64 6, !dbg !29 %345 = bitcast float %344 to i32, !dbg !29 %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 16, i32 31), !dbg !29 %347 = bitcast i32 %346 to float, !dbg !29 %348 = fadd float %344, %347, !dbg !33 %349 = bitcast float %348 to i32, !dbg !29 %350 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %349, i32 8, i32 31), !dbg !29 %351 = bitcast i32 %350 to float, !dbg !29 %352 = fadd float %348, %351, !dbg !33 %353 = bitcast float %352 to i32, !dbg !29 %354 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %353, i32 4, i32 31), !dbg !29 %355 = bitcast i32 %354 to float, !dbg !29 %356 = fadd float %352, %355, !dbg !33 %357 = bitcast float %356 to i32, !dbg !29 %358 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %357, i32 2, i32 31), !dbg !29 %359 = bitcast i32 %358 to float, !dbg !29 %360 = fadd float %356, %359, !dbg !33 %361 = bitcast float %360 to i32, !dbg !29 %362 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %361, i32 1, i32 31), !dbg !29 %363 = bitcast i32 %362 to float, !dbg !29 %364 = fadd float %360, %363, !dbg !33 %365 = extractelement <16 x float> %212, i64 7, !dbg !29 %366 = bitcast float %365 to i32, !dbg !29 %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 16, i32 31), !dbg !29 %368 = bitcast i32 %367 to float, !dbg !29 %369 = fadd 
float %365, %368, !dbg !33 %370 = bitcast float %369 to i32, !dbg !29 %371 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %370, i32 8, i32 31), !dbg !29 %372 = bitcast i32 %371 to float, !dbg !29 %373 = fadd float %369, %372, !dbg !33 %374 = bitcast float %373 to i32, !dbg !29 %375 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %374, i32 4, i32 31), !dbg !29 %376 = bitcast i32 %375 to float, !dbg !29 %377 = fadd float %373, %376, !dbg !33 %378 = bitcast float %377 to i32, !dbg !29 %379 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %378, i32 2, i32 31), !dbg !29 %380 = bitcast i32 %379 to float, !dbg !29 %381 = fadd float %377, %380, !dbg !33 %382 = bitcast float %381 to i32, !dbg !29 %383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 1, i32 31), !dbg !29 %384 = bitcast i32 %383 to float, !dbg !29 %385 = fadd float %381, %384, !dbg !33 %386 = extractelement <16 x float> %212, i64 8, !dbg !29 %387 = bitcast float %386 to i32, !dbg !29 %388 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %387, i32 16, i32 31), !dbg !29 %389 = bitcast i32 %388 to float, !dbg !29 %390 = fadd float %386, %389, !dbg !33 %391 = bitcast float %390 to i32, !dbg !29 %392 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %391, i32 8, i32 31), !dbg !29 %393 = bitcast i32 %392 to float, !dbg !29 %394 = fadd float %390, %393, !dbg !33 %395 = bitcast float %394 to i32, !dbg !29 %396 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %395, i32 4, i32 31), !dbg !29 %397 = bitcast i32 %396 to float, !dbg !29 %398 = fadd float %394, %397, !dbg !33 %399 = bitcast float %398 to i32, !dbg !29 %400 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %399, i32 2, i32 31), !dbg !29 %401 = bitcast i32 %400 to float, !dbg !29 %402 = fadd float %398, %401, !dbg !33 %403 = bitcast float %402 to i32, !dbg !29 %404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 1, i32 31), !dbg !29 %405 = bitcast i32 %404 
to float, !dbg !29 %406 = fadd float %402, %405, !dbg !33 %407 = extractelement <16 x float> %212, i64 9, !dbg !29 %408 = bitcast float %407 to i32, !dbg !29 %409 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %408, i32 16, i32 31), !dbg !29 %410 = bitcast i32 %409 to float, !dbg !29 %411 = fadd float %407, %410, !dbg !33 %412 = bitcast float %411 to i32, !dbg !29 %413 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %412, i32 8, i32 31), !dbg !29 %414 = bitcast i32 %413 to float, !dbg !29 %415 = fadd float %411, %414, !dbg !33 %416 = bitcast float %415 to i32, !dbg !29 %417 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %416, i32 4, i32 31), !dbg !29 %418 = bitcast i32 %417 to float, !dbg !29 %419 = fadd float %415, %418, !dbg !33 %420 = bitcast float %419 to i32, !dbg !29 %421 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %420, i32 2, i32 31), !dbg !29 %422 = bitcast i32 %421 to float, !dbg !29 %423 = fadd float %419, %422, !dbg !33 %424 = bitcast float %423 to i32, !dbg !29 %425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 1, i32 31), !dbg !29 %426 = bitcast i32 %425 to float, !dbg !29 %427 = fadd float %423, %426, !dbg !33 %428 = extractelement <16 x float> %212, i64 10, !dbg !29 %429 = bitcast float %428 to i32, !dbg !29 %430 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %429, i32 16, i32 31), !dbg !29 %431 = bitcast i32 %430 to float, !dbg !29 %432 = fadd float %428, %431, !dbg !33 %433 = bitcast float %432 to i32, !dbg !29 %434 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %433, i32 8, i32 31), !dbg !29 %435 = bitcast i32 %434 to float, !dbg !29 %436 = fadd float %432, %435, !dbg !33 %437 = bitcast float %436 to i32, !dbg !29 %438 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %437, i32 4, i32 31), !dbg !29 %439 = bitcast i32 %438 to float, !dbg !29 %440 = fadd float %436, %439, !dbg !33 %441 = bitcast float %440 to i32, !dbg !29 %442 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %441, i32 2, i32 31), !dbg !29 %443 = bitcast i32 %442 to float, !dbg !29 %444 = fadd float %440, %443, !dbg !33 %445 = bitcast float %444 to i32, !dbg !29 %446 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %445, i32 1, i32 31), !dbg !29 %447 = bitcast i32 %446 to float, !dbg !29 %448 = fadd float %444, %447, !dbg !33 %449 = extractelement <16 x float> %212, i64 11, !dbg !29 %450 = bitcast float %449 to i32, !dbg !29 %451 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %450, i32 16, i32 31), !dbg !29 %452 = bitcast i32 %451 to float, !dbg !29 %453 = fadd float %449, %452, !dbg !33 %454 = bitcast float %453 to i32, !dbg !29 %455 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %454, i32 8, i32 31), !dbg !29 %456 = bitcast i32 %455 to float, !dbg !29 %457 = fadd float %453, %456, !dbg !33 %458 = bitcast float %457 to i32, !dbg !29 %459 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %458, i32 4, i32 31), !dbg !29 %460 = bitcast i32 %459 to float, !dbg !29 %461 = fadd float %457, %460, !dbg !33 %462 = bitcast float %461 to i32, !dbg !29 %463 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %462, i32 2, i32 31), !dbg !29 %464 = bitcast i32 %463 to float, !dbg !29 %465 = fadd float %461, %464, !dbg !33 %466 = bitcast float %465 to i32, !dbg !29 %467 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %466, i32 1, i32 31), !dbg !29 %468 = bitcast i32 %467 to float, !dbg !29 %469 = fadd float %465, %468, !dbg !33 %470 = extractelement <16 x float> %212, i64 12, !dbg !29 %471 = bitcast float %470 to i32, !dbg !29 %472 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %471, i32 16, i32 31), !dbg !29 %473 = bitcast i32 %472 to float, !dbg !29 %474 = fadd float %470, %473, !dbg !33 %475 = bitcast float %474 to i32, !dbg !29 %476 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %475, i32 8, i32 31), !dbg !29 %477 = bitcast i32 %476 to float, !dbg !29 %478 = fadd 
float %474, %477, !dbg !33 %479 = bitcast float %478 to i32, !dbg !29 %480 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %479, i32 4, i32 31), !dbg !29 %481 = bitcast i32 %480 to float, !dbg !29 %482 = fadd float %478, %481, !dbg !33 %483 = bitcast float %482 to i32, !dbg !29 %484 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %483, i32 2, i32 31), !dbg !29 %485 = bitcast i32 %484 to float, !dbg !29 %486 = fadd float %482, %485, !dbg !33 %487 = bitcast float %486 to i32, !dbg !29 %488 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %487, i32 1, i32 31), !dbg !29 %489 = bitcast i32 %488 to float, !dbg !29 %490 = fadd float %486, %489, !dbg !33 %491 = extractelement <16 x float> %212, i64 13, !dbg !29 %492 = bitcast float %491 to i32, !dbg !29 %493 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 16, i32 31), !dbg !29 %494 = bitcast i32 %493 to float, !dbg !29 %495 = fadd float %491, %494, !dbg !33 %496 = bitcast float %495 to i32, !dbg !29 %497 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %496, i32 8, i32 31), !dbg !29 %498 = bitcast i32 %497 to float, !dbg !29 %499 = fadd float %495, %498, !dbg !33 %500 = bitcast float %499 to i32, !dbg !29 %501 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %500, i32 4, i32 31), !dbg !29 %502 = bitcast i32 %501 to float, !dbg !29 %503 = fadd float %499, %502, !dbg !33 %504 = bitcast float %503 to i32, !dbg !29 %505 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %504, i32 2, i32 31), !dbg !29 %506 = bitcast i32 %505 to float, !dbg !29 %507 = fadd float %503, %506, !dbg !33 %508 = bitcast float %507 to i32, !dbg !29 %509 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %508, i32 1, i32 31), !dbg !29 %510 = bitcast i32 %509 to float, !dbg !29 %511 = fadd float %507, %510, !dbg !33 %512 = extractelement <16 x float> %212, i64 14, !dbg !29 %513 = bitcast float %512 to i32, !dbg !29 %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, 
i32 %513, i32 16, i32 31), !dbg !29 %515 = bitcast i32 %514 to float, !dbg !29 %516 = fadd float %512, %515, !dbg !33 %517 = bitcast float %516 to i32, !dbg !29 %518 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %517, i32 8, i32 31), !dbg !29 %519 = bitcast i32 %518 to float, !dbg !29 %520 = fadd float %516, %519, !dbg !33 %521 = bitcast float %520 to i32, !dbg !29 %522 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %521, i32 4, i32 31), !dbg !29 %523 = bitcast i32 %522 to float, !dbg !29 %524 = fadd float %520, %523, !dbg !33 %525 = bitcast float %524 to i32, !dbg !29 %526 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %525, i32 2, i32 31), !dbg !29 %527 = bitcast i32 %526 to float, !dbg !29 %528 = fadd float %524, %527, !dbg !33 %529 = bitcast float %528 to i32, !dbg !29 %530 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %529, i32 1, i32 31), !dbg !29 %531 = bitcast i32 %530 to float, !dbg !29 %532 = fadd float %528, %531, !dbg !33 %533 = extractelement <16 x float> %212, i64 15, !dbg !29 %534 = bitcast float %533 to i32, !dbg !29 %535 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %534, i32 16, i32 31), !dbg !29 %536 = bitcast i32 %535 to float, !dbg !29 %537 = fadd float %533, %536, !dbg !33 %538 = bitcast float %537 to i32, !dbg !29 %539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 8, i32 31), !dbg !29 %540 = bitcast i32 %539 to float, !dbg !29 %541 = fadd float %537, %540, !dbg !33 %542 = bitcast float %541 to i32, !dbg !29 %543 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %542, i32 4, i32 31), !dbg !29 %544 = bitcast i32 %543 to float, !dbg !29 %545 = fadd float %541, %544, !dbg !33 %546 = bitcast float %545 to i32, !dbg !29 %547 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %546, i32 2, i32 31), !dbg !29 %548 = bitcast i32 %547 to float, !dbg !29 %549 = fadd float %545, %548, !dbg !33 %550 = bitcast float %549 to i32, !dbg !29 %551 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %550, i32 1, i32 31), !dbg !29 %552 = bitcast i32 %551 to float, !dbg !29 %553 = fadd float %549, %552, !dbg !33 %554 = icmp eq i32 %11, 0, !dbg !29 %555 = shl nuw nsw i32 %13, 1, !dbg !29 %556 = or i32 %555, %217, !dbg !29 %557 = zext nneg i32 %556 to i64, !dbg !29 %558 = getelementptr float, ptr addrspace(3) @global_smem, i64 %557, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %558, float %238, i1 %554) #3, !dbg !29 %559 = shl nuw nsw i32 %14, 1, !dbg !29 %560 = or i32 %559, %217, !dbg !29 %561 = zext nneg i32 %560 to i64, !dbg !29 %562 = getelementptr float, ptr addrspace(3) @global_smem, i64 %561, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %562, float %259, i1 %554) #3, !dbg !29 %563 = shl nuw nsw i32 %15, 1, !dbg !29 %564 = or i32 %563, %217, !dbg !29 %565 = zext nneg i32 %564 to i64, !dbg !29 %566 = getelementptr float, ptr addrspace(3) @global_smem, i64 %565, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %566, float %280, i1 %554) #3, !dbg !29 %567 = shl nuw nsw i32 %16, 1, !dbg !29 %568 = or i32 %567, %217, !dbg !29 %569 = zext nneg i32 %568 to i64, !dbg !29 %570 = getelementptr float, ptr addrspace(3) @global_smem, i64 %569, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %570, float %301, i1 %554) #3, !dbg !29 %571 = shl nuw nsw i32 %17, 1, !dbg !29 %572 = or i32 %571, %217, !dbg !29 %573 = zext nneg i32 %572 to i64, !dbg !29 %574 = getelementptr float, ptr addrspace(3) @global_smem, i64 %573, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %574, float %322, i1 %554) #3, !dbg !29 %575 = shl nuw nsw i32 %18, 1, !dbg !29 %576 = or i32 %575, %217, !dbg !29 %577 = zext nneg i32 %576 to i64, !dbg !29 %578 = getelementptr float, ptr addrspace(3) 
@global_smem, i64 %577, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %578, float %343, i1 %554) #3, !dbg !29 %579 = shl nuw nsw i32 %19, 1, !dbg !29 %580 = or i32 %579, %217, !dbg !29 %581 = zext nneg i32 %580 to i64, !dbg !29 %582 = getelementptr float, ptr addrspace(3) @global_smem, i64 %581, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %582, float %364, i1 %554) #3, !dbg !29 %583 = shl nuw nsw i32 %20, 1, !dbg !29 %584 = or i32 %583, %217, !dbg !29 %585 = zext nneg i32 %584 to i64, !dbg !29 %586 = getelementptr float, ptr addrspace(3) @global_smem, i64 %585, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %586, float %385, i1 %554) #3, !dbg !29 %587 = shl nuw nsw i32 %21, 1, !dbg !29 %588 = or i32 %587, %217, !dbg !29 %589 = zext nneg i32 %588 to i64, !dbg !29 %590 = getelementptr float, ptr addrspace(3) @global_smem, i64 %589, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %590, float %406, i1 %554) #3, !dbg !29 %591 = shl nuw nsw i32 %22, 1, !dbg !29 %592 = or i32 %591, %217, !dbg !29 %593 = zext nneg i32 %592 to i64, !dbg !29 %594 = getelementptr float, ptr addrspace(3) @global_smem, i64 %593, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %594, float %427, i1 %554) #3, !dbg !29 %595 = shl nuw nsw i32 %23, 1, !dbg !29 %596 = or i32 %595, %217, !dbg !29 %597 = zext nneg i32 %596 to i64, !dbg !29 %598 = getelementptr float, ptr addrspace(3) @global_smem, i64 %597, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %598, float %448, i1 %554) #3, !dbg !29 %599 = shl nuw nsw i32 %24, 1, !dbg !29 %600 = or i32 %599, %217, !dbg !29 %601 = zext nneg i32 %600 to i64, !dbg !29 %602 = getelementptr float, ptr addrspace(3) @global_smem, i64 %601, !dbg 
!29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %602, float %469, i1 %554) #3, !dbg !29 %603 = shl nuw nsw i32 %25, 1, !dbg !29 %604 = or i32 %603, %217, !dbg !29 %605 = zext nneg i32 %604 to i64, !dbg !29 %606 = getelementptr float, ptr addrspace(3) @global_smem, i64 %605, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %606, float %490, i1 %554) #3, !dbg !29 %607 = shl nuw nsw i32 %26, 1, !dbg !29 %608 = or i32 %607, %217, !dbg !29 %609 = zext nneg i32 %608 to i64, !dbg !29 %610 = getelementptr float, ptr addrspace(3) @global_smem, i64 %609, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %610, float %511, i1 %554) #3, !dbg !29 %611 = shl nuw nsw i32 %27, 1, !dbg !29 %612 = or i32 %611, %217, !dbg !29 %613 = zext nneg i32 %612 to i64, !dbg !29 %614 = getelementptr float, ptr addrspace(3) @global_smem, i64 %613, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %614, float %532, i1 %554) #3, !dbg !29 %615 = shl nuw nsw i32 %28, 1, !dbg !29 %616 = or i32 %615, %217, !dbg !29 %617 = zext nneg i32 %616 to i64, !dbg !29 %618 = getelementptr float, ptr addrspace(3) @global_smem, i64 %617, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %618, float %553, i1 %554) #3, !dbg !29 tail call void @llvm.nvvm.barrier0(), !dbg !29 %619 = icmp slt i32 %10, 128, !dbg !29 %620 = sext i32 %10 to i64, !dbg !29 %621 = getelementptr float, ptr addrspace(3) @global_smem, i64 %620, !dbg !29 %622 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %621, i1 %619) #3, !dbg !29 %623 = bitcast float %622 to i32, !dbg !29 %624 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %623, i32 1, i32 31), !dbg !29 %625 = bitcast i32 %624 to float, !dbg !29 %626 = fadd float %622, %625, !dbg !33 
%627 = and i32 %10, 1, !dbg !29 %628 = icmp eq i32 %627, 0, !dbg !29 %629 = and i1 %619, %628, !dbg !29 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %621, float %626, i1 %629) #3, !dbg !29 tail call void @llvm.nvvm.barrier0(), !dbg !29 %630 = zext nneg i32 %555 to i64, !dbg !29 %631 = getelementptr float, ptr addrspace(3) @global_smem, i64 %630, !dbg !29 %632 = load float, ptr addrspace(3) %631, align 4, !dbg !29 %633 = zext nneg i32 %559 to i64, !dbg !29 %634 = getelementptr float, ptr addrspace(3) @global_smem, i64 %633, !dbg !29 %635 = load float, ptr addrspace(3) %634, align 4, !dbg !29 %636 = zext nneg i32 %563 to i64, !dbg !29 %637 = getelementptr float, ptr addrspace(3) @global_smem, i64 %636, !dbg !29 %638 = load float, ptr addrspace(3) %637, align 4, !dbg !29 %639 = zext nneg i32 %567 to i64, !dbg !29 %640 = getelementptr float, ptr addrspace(3) @global_smem, i64 %639, !dbg !29 %641 = load float, ptr addrspace(3) %640, align 4, !dbg !29 %642 = zext nneg i32 %571 to i64, !dbg !29 %643 = getelementptr float, ptr addrspace(3) @global_smem, i64 %642, !dbg !29 %644 = load float, ptr addrspace(3) %643, align 4, !dbg !29 %645 = zext nneg i32 %575 to i64, !dbg !29 %646 = getelementptr float, ptr addrspace(3) @global_smem, i64 %645, !dbg !29 %647 = load float, ptr addrspace(3) %646, align 4, !dbg !29 %648 = zext nneg i32 %579 to i64, !dbg !29 %649 = getelementptr float, ptr addrspace(3) @global_smem, i64 %648, !dbg !29 %650 = load float, ptr addrspace(3) %649, align 4, !dbg !29 %651 = zext nneg i32 %583 to i64, !dbg !29 %652 = getelementptr float, ptr addrspace(3) @global_smem, i64 %651, !dbg !29 %653 = load float, ptr addrspace(3) %652, align 4, !dbg !29 %654 = zext nneg i32 %587 to i64, !dbg !29 %655 = getelementptr float, ptr addrspace(3) @global_smem, i64 %654, !dbg !29 %656 = load float, ptr addrspace(3) %655, align 4, !dbg !29 %657 = zext nneg i32 %591 to i64, !dbg !29 %658 = getelementptr float, ptr addrspace(3) 
@global_smem, i64 %657, !dbg !29 %659 = load float, ptr addrspace(3) %658, align 4, !dbg !29 %660 = zext nneg i32 %595 to i64, !dbg !29 %661 = getelementptr float, ptr addrspace(3) @global_smem, i64 %660, !dbg !29 %662 = load float, ptr addrspace(3) %661, align 4, !dbg !29 %663 = zext nneg i32 %599 to i64, !dbg !29 %664 = getelementptr float, ptr addrspace(3) @global_smem, i64 %663, !dbg !29 %665 = load float, ptr addrspace(3) %664, align 4, !dbg !29 %666 = zext nneg i32 %603 to i64, !dbg !29 %667 = getelementptr float, ptr addrspace(3) @global_smem, i64 %666, !dbg !29 %668 = load float, ptr addrspace(3) %667, align 4, !dbg !29 %669 = zext nneg i32 %607 to i64, !dbg !29 %670 = getelementptr float, ptr addrspace(3) @global_smem, i64 %669, !dbg !29 %671 = load float, ptr addrspace(3) %670, align 4, !dbg !29 %672 = zext nneg i32 %611 to i64, !dbg !29 %673 = getelementptr float, ptr addrspace(3) @global_smem, i64 %672, !dbg !29 %674 = load float, ptr addrspace(3) %673, align 4, !dbg !29 %675 = zext nneg i32 %615 to i64, !dbg !29 %676 = getelementptr float, ptr addrspace(3) @global_smem, i64 %675, !dbg !29 %677 = load float, ptr addrspace(3) %676, align 4, !dbg !29 %678 = extractelement <16 x float> %136, i64 0, !dbg !37 %679 = extractelement <16 x float> %136, i64 1, !dbg !37 %680 = extractelement <16 x float> %136, i64 2, !dbg !37 %681 = extractelement <16 x float> %136, i64 3, !dbg !37 %682 = extractelement <16 x float> %136, i64 4, !dbg !37 %683 = extractelement <16 x float> %136, i64 5, !dbg !37 %684 = extractelement <16 x float> %136, i64 6, !dbg !37 %685 = extractelement <16 x float> %136, i64 7, !dbg !37 %686 = extractelement <16 x float> %136, i64 8, !dbg !37 %687 = extractelement <16 x float> %136, i64 9, !dbg !37 %688 = extractelement <16 x float> %136, i64 10, !dbg !37 %689 = extractelement <16 x float> %136, i64 11, !dbg !37 %690 = extractelement <16 x float> %136, i64 12, !dbg !37 %691 = extractelement <16 x float> %136, i64 13, !dbg !37 %692 = 
extractelement <16 x float> %136, i64 14, !dbg !37 %693 = extractelement <16 x float> %136, i64 15, !dbg !37 br label %694, !dbg !38 694: ; preds = %215, %694 %695 = phi i32 [ 0, %215 ], [ %987, %694 ] %696 = or i32 %695, %urem, !dbg !39 %697 = zext nneg i32 %696 to i64, !dbg !39 %698 = icmp ult i32 %696, 50257, !dbg !40 %699 = add nsw i64 %100, %697, !dbg !41 %700 = add nsw i64 %101, %697, !dbg !41 %701 = add nsw i64 %102, %697, !dbg !41 %702 = add nsw i64 %103, %697, !dbg !41 %703 = add nsw i64 %104, %697, !dbg !41 %704 = add nsw i64 %105, %697, !dbg !41 %705 = add nsw i64 %106, %697, !dbg !41 %706 = add nsw i64 %107, %697, !dbg !41 %707 = add nsw i64 %108, %697, !dbg !41 %708 = add nsw i64 %109, %697, !dbg !41 %709 = add nsw i64 %110, %697, !dbg !41 %710 = add nsw i64 %111, %697, !dbg !41 %711 = add nsw i64 %112, %697, !dbg !41 %712 = add nsw i64 %113, %697, !dbg !41 %713 = add nsw i64 %114, %697, !dbg !41 %714 = add nsw i64 %115, %697, !dbg !41 %715 = getelementptr i16, ptr addrspace(1) %4, i64 %699, !dbg !42 %716 = getelementptr i16, ptr addrspace(1) %4, i64 %700, !dbg !42 %717 = getelementptr i16, ptr addrspace(1) %4, i64 %701, !dbg !42 %718 = getelementptr i16, ptr addrspace(1) %4, i64 %702, !dbg !42 %719 = getelementptr i16, ptr addrspace(1) %4, i64 %703, !dbg !42 %720 = getelementptr i16, ptr addrspace(1) %4, i64 %704, !dbg !42 %721 = getelementptr i16, ptr addrspace(1) %4, i64 %705, !dbg !42 %722 = getelementptr i16, ptr addrspace(1) %4, i64 %706, !dbg !42 %723 = getelementptr i16, ptr addrspace(1) %4, i64 %707, !dbg !42 %724 = getelementptr i16, ptr addrspace(1) %4, i64 %708, !dbg !42 %725 = getelementptr i16, ptr addrspace(1) %4, i64 %709, !dbg !42 %726 = getelementptr i16, ptr addrspace(1) %4, i64 %710, !dbg !42 %727 = getelementptr i16, ptr addrspace(1) %4, i64 %711, !dbg !42 %728 = getelementptr i16, ptr addrspace(1) %4, i64 %712, !dbg !42 %729 = getelementptr i16, ptr addrspace(1) %4, i64 %713, !dbg !42 %730 = getelementptr i16, ptr addrspace(1) %4, 
i64 %714, !dbg !42 %731 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %715, i1 %698, i16 0, i1 %698) #3, !dbg !43 %732 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %716, i1 %698, i16 0, i1 %698) #3, !dbg !43 %733 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %717, i1 %698, i16 0, i1 %698) #3, !dbg !43 %734 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %718, i1 %698, i16 0, i1 %698) #3, !dbg !43 %735 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %719, i1 %698, i16 0, i1 %698) #3, !dbg !43 %736 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %720, i1 %698, i16 0, i1 %698) #3, !dbg !43 %737 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %721, i1 %698, i16 0, i1 %698) #3, !dbg !43 %738 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %722, i1 %698, i16 0, i1 %698) #3, !dbg !43 %739 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %723, i1 %698, i16 0, i1 %698) #3, !dbg !43 %740 = tail call i16 asm sideeffect "mov.u16 
$0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %724, i1 %698, i16 0, i1 %698) #3, !dbg !43 %741 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %725, i1 %698, i16 0, i1 %698) #3, !dbg !43 %742 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %726, i1 %698, i16 0, i1 %698) #3, !dbg !43 %743 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %727, i1 %698, i16 0, i1 %698) #3, !dbg !43 %744 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %728, i1 %698, i16 0, i1 %698) #3, !dbg !43 %745 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %729, i1 %698, i16 0, i1 %698) #3, !dbg !43 %746 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %730, i1 %698, i16 0, i1 %698) #3, !dbg !43 %747 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %731) #3, !dbg !44 %748 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %732) #3, !dbg !44 %749 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %733) #3, !dbg !44 %750 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %734) #3, !dbg !44 %751 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %735) #3, !dbg !44 %752 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %736) #3, !dbg !44 %753 = tail call float asm "cvt.f32.bf16 $0, $1;", 
"=r,h"(i16 %737) #3, !dbg !44 %754 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %738) #3, !dbg !44 %755 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %739) #3, !dbg !44 %756 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %740) #3, !dbg !44 %757 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %741) #3, !dbg !44 %758 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %742) #3, !dbg !44 %759 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %743) #3, !dbg !44 %760 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %744) #3, !dbg !44 %761 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %745) #3, !dbg !44 %762 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %746) #3, !dbg !44 %763 = getelementptr float, ptr addrspace(1) %0, i64 %699, !dbg !45 %764 = getelementptr float, ptr addrspace(1) %0, i64 %700, !dbg !45 %765 = getelementptr float, ptr addrspace(1) %0, i64 %701, !dbg !45 %766 = getelementptr float, ptr addrspace(1) %0, i64 %702, !dbg !45 %767 = getelementptr float, ptr addrspace(1) %0, i64 %703, !dbg !45 %768 = getelementptr float, ptr addrspace(1) %0, i64 %704, !dbg !45 %769 = getelementptr float, ptr addrspace(1) %0, i64 %705, !dbg !45 %770 = getelementptr float, ptr addrspace(1) %0, i64 %706, !dbg !45 %771 = getelementptr float, ptr addrspace(1) %0, i64 %707, !dbg !45 %772 = getelementptr float, ptr addrspace(1) %0, i64 %708, !dbg !45 %773 = getelementptr float, ptr addrspace(1) %0, i64 %709, !dbg !45 %774 = getelementptr float, ptr addrspace(1) %0, i64 %710, !dbg !45 %775 = getelementptr float, ptr addrspace(1) %0, i64 %711, !dbg !45 %776 = getelementptr float, ptr addrspace(1) %0, i64 %712, !dbg !45 %777 = getelementptr float, ptr addrspace(1) %0, i64 %713, !dbg !45 %778 = getelementptr float, ptr addrspace(1) %0, i64 %714, !dbg !45 %779 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", 
"=r,l,b,r,b"(ptr addrspace(1) %763, i1 %698, i32 0, i1 %698) #3, !dbg !46 %780 = bitcast i32 %779 to float, !dbg !46 %781 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %764, i1 %698, i32 0, i1 %698) #3, !dbg !46 %782 = bitcast i32 %781 to float, !dbg !46 %783 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %765, i1 %698, i32 0, i1 %698) #3, !dbg !46 %784 = bitcast i32 %783 to float, !dbg !46 %785 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %766, i1 %698, i32 0, i1 %698) #3, !dbg !46 %786 = bitcast i32 %785 to float, !dbg !46 %787 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %767, i1 %698, i32 0, i1 %698) #3, !dbg !46 %788 = bitcast i32 %787 to float, !dbg !46 %789 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %768, i1 %698, i32 0, i1 %698) #3, !dbg !46 %790 = bitcast i32 %789 to float, !dbg !46 %791 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %769, i1 %698, i32 0, i1 %698) #3, !dbg !46 %792 = bitcast i32 %791 to float, !dbg !46 %793 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %770, i1 %698, i32 0, i1 %698) #3, !dbg !46 %794 = bitcast i32 %793 to float, !dbg !46 %795 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 
ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %771, i1 %698, i32 0, i1 %698) #3, !dbg !46 %796 = bitcast i32 %795 to float, !dbg !46 %797 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %772, i1 %698, i32 0, i1 %698) #3, !dbg !46 %798 = bitcast i32 %797 to float, !dbg !46 %799 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %773, i1 %698, i32 0, i1 %698) #3, !dbg !46 %800 = bitcast i32 %799 to float, !dbg !46 %801 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %774, i1 %698, i32 0, i1 %698) #3, !dbg !46 %802 = bitcast i32 %801 to float, !dbg !46 %803 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %775, i1 %698, i32 0, i1 %698) #3, !dbg !46 %804 = bitcast i32 %803 to float, !dbg !46 %805 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %776, i1 %698, i32 0, i1 %698) #3, !dbg !46 %806 = bitcast i32 %805 to float, !dbg !46 %807 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %777, i1 %698, i32 0, i1 %698) #3, !dbg !46 %808 = bitcast i32 %807 to float, !dbg !46 %809 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %778, i1 %698, i32 0, i1 %698) #3, !dbg !46 %810 = bitcast i32 %809 to float, !dbg 
!46 %811 = getelementptr i16, ptr addrspace(1) %5, i64 %699, !dbg !47 %812 = getelementptr i16, ptr addrspace(1) %5, i64 %700, !dbg !47 %813 = getelementptr i16, ptr addrspace(1) %5, i64 %701, !dbg !47 %814 = getelementptr i16, ptr addrspace(1) %5, i64 %702, !dbg !47 %815 = getelementptr i16, ptr addrspace(1) %5, i64 %703, !dbg !47 %816 = getelementptr i16, ptr addrspace(1) %5, i64 %704, !dbg !47 %817 = getelementptr i16, ptr addrspace(1) %5, i64 %705, !dbg !47 %818 = getelementptr i16, ptr addrspace(1) %5, i64 %706, !dbg !47 %819 = getelementptr i16, ptr addrspace(1) %5, i64 %707, !dbg !47 %820 = getelementptr i16, ptr addrspace(1) %5, i64 %708, !dbg !47 %821 = getelementptr i16, ptr addrspace(1) %5, i64 %709, !dbg !47 %822 = getelementptr i16, ptr addrspace(1) %5, i64 %710, !dbg !47 %823 = getelementptr i16, ptr addrspace(1) %5, i64 %711, !dbg !47 %824 = getelementptr i16, ptr addrspace(1) %5, i64 %712, !dbg !47 %825 = getelementptr i16, ptr addrspace(1) %5, i64 %713, !dbg !47 %826 = getelementptr i16, ptr addrspace(1) %5, i64 %714, !dbg !47 %827 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %811, i1 %698, i16 0, i1 %698) #3, !dbg !48 %828 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %812, i1 %698, i16 0, i1 %698) #3, !dbg !48 %829 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %813, i1 %698, i16 0, i1 %698) #3, !dbg !48 %830 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %814, i1 %698, i16 0, i1 %698) #3, !dbg !48 %831 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 
ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %815, i1 %698, i16 0, i1 %698) #3, !dbg !48 %832 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %816, i1 %698, i16 0, i1 %698) #3, !dbg !48 %833 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %817, i1 %698, i16 0, i1 %698) #3, !dbg !48 %834 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %818, i1 %698, i16 0, i1 %698) #3, !dbg !48 %835 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %819, i1 %698, i16 0, i1 %698) #3, !dbg !48 %836 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %820, i1 %698, i16 0, i1 %698) #3, !dbg !48 %837 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %821, i1 %698, i16 0, i1 %698) #3, !dbg !48 %838 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %822, i1 %698, i16 0, i1 %698) #3, !dbg !48 %839 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %823, i1 %698, i16 0, i1 %698) #3, !dbg !48 %840 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 
mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %824, i1 %698, i16 0, i1 %698) #3, !dbg !48 %841 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %825, i1 %698, i16 0, i1 %698) #3, !dbg !48 %842 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %826, i1 %698, i16 0, i1 %698) #3, !dbg !48 %843 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %827) #3, !dbg !49 %844 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %828) #3, !dbg !49 %845 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %829) #3, !dbg !49 %846 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %830) #3, !dbg !49 %847 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %831) #3, !dbg !49 %848 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %832) #3, !dbg !49 %849 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %833) #3, !dbg !49 %850 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %834) #3, !dbg !49 %851 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %835) #3, !dbg !49 %852 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %836) #3, !dbg !49 %853 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %837) #3, !dbg !49 %854 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %838) #3, !dbg !49 %855 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %839) #3, !dbg !49 %856 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %840) #3, !dbg !49 %857 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %841) #3, !dbg !49 %858 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %842) #3, !dbg !49 %859 = fmul float %678, %780, !dbg !37 %860 = fmul float %679, %782, !dbg !37 %861 = fmul float %680, %784, !dbg !37 %862 = fmul float %681, %786, !dbg !37 %863 = fmul float %682, 
%788, !dbg !37 %864 = fmul float %683, %790, !dbg !37 %865 = fmul float %684, %792, !dbg !37 %866 = fmul float %685, %794, !dbg !37 %867 = fmul float %686, %796, !dbg !37 %868 = fmul float %687, %798, !dbg !37 %869 = fmul float %688, %800, !dbg !37 %870 = fmul float %689, %802, !dbg !37 %871 = fmul float %690, %804, !dbg !37 %872 = fmul float %691, %806, !dbg !37 %873 = fmul float %692, %808, !dbg !37 %874 = fmul float %693, %810, !dbg !37 %875 = fmul float %843, 0x3FF7154760000000, !dbg !50 %876 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %875) #3, !dbg !50 %877 = fmul float %844, 0x3FF7154760000000, !dbg !50 %878 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %877) #3, !dbg !50 %879 = fmul float %845, 0x3FF7154760000000, !dbg !50 %880 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %879) #3, !dbg !50 %881 = fmul float %846, 0x3FF7154760000000, !dbg !50 %882 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %881) #3, !dbg !50 %883 = fmul float %847, 0x3FF7154760000000, !dbg !50 %884 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %883) #3, !dbg !50 %885 = fmul float %848, 0x3FF7154760000000, !dbg !50 %886 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %885) #3, !dbg !50 %887 = fmul float %849, 0x3FF7154760000000, !dbg !50 %888 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %887) #3, !dbg !50 %889 = fmul float %850, 0x3FF7154760000000, !dbg !50 %890 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %889) #3, !dbg !50 %891 = fmul float %851, 0x3FF7154760000000, !dbg !50 %892 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %891) #3, !dbg !50 %893 = fmul float %852, 0x3FF7154760000000, !dbg !50 %894 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %893) #3, !dbg !50 %895 = fmul float %853, 0x3FF7154760000000, !dbg !50 %896 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %895) #3, !dbg !50 %897 = fmul float %854, 
0x3FF7154760000000, !dbg !50 %898 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %897) #3, !dbg !50 %899 = fmul float %855, 0x3FF7154760000000, !dbg !50 %900 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %899) #3, !dbg !50 %901 = fmul float %856, 0x3FF7154760000000, !dbg !50 %902 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %901) #3, !dbg !50 %903 = fmul float %857, 0x3FF7154760000000, !dbg !50 %904 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %903) #3, !dbg !50 %905 = fmul float %858, 0x3FF7154760000000, !dbg !50 %906 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %905) #3, !dbg !50 %907 = fmul float %632, %876, !dbg !51 %908 = fmul float %635, %878, !dbg !51 %909 = fmul float %638, %880, !dbg !51 %910 = fmul float %641, %882, !dbg !51 %911 = fmul float %644, %884, !dbg !51 %912 = fmul float %647, %886, !dbg !51 %913 = fmul float %650, %888, !dbg !51 %914 = fmul float %653, %890, !dbg !51 %915 = fmul float %656, %892, !dbg !51 %916 = fmul float %659, %894, !dbg !51 %917 = fmul float %662, %896, !dbg !51 %918 = fmul float %665, %898, !dbg !51 %919 = fmul float %668, %900, !dbg !51 %920 = fmul float %671, %902, !dbg !51 %921 = fmul float %674, %904, !dbg !51 %922 = fmul float %677, %906, !dbg !51 %923 = fsub float %859, %907, !dbg !52 %924 = fsub float %860, %908, !dbg !52 %925 = fsub float %861, %909, !dbg !52 %926 = fsub float %862, %910, !dbg !52 %927 = fsub float %863, %911, !dbg !52 %928 = fsub float %864, %912, !dbg !52 %929 = fsub float %865, %913, !dbg !52 %930 = fsub float %866, %914, !dbg !52 %931 = fsub float %867, %915, !dbg !52 %932 = fsub float %868, %916, !dbg !52 %933 = fsub float %869, %917, !dbg !52 %934 = fsub float %870, %918, !dbg !52 %935 = fsub float %871, %919, !dbg !52 %936 = fsub float %872, %920, !dbg !52 %937 = fsub float %873, %921, !dbg !52 %938 = fsub float %874, %922, !dbg !52 %939 = fadd float %747, %923, !dbg !53 %940 = fadd float %748, %924, !dbg !53 %941 = 
fadd float %749, %925, !dbg !53 %942 = fadd float %750, %926, !dbg !53 %943 = fadd float %751, %927, !dbg !53 %944 = fadd float %752, %928, !dbg !53 %945 = fadd float %753, %929, !dbg !53 %946 = fadd float %754, %930, !dbg !53 %947 = fadd float %755, %931, !dbg !53 %948 = fadd float %756, %932, !dbg !53 %949 = fadd float %757, %933, !dbg !53 %950 = fadd float %758, %934, !dbg !53 %951 = fadd float %759, %935, !dbg !53 %952 = fadd float %760, %936, !dbg !53 %953 = fadd float %761, %937, !dbg !53 %954 = fadd float %762, %938, !dbg !53 %955 = getelementptr i16, ptr addrspace(1) %6, i64 %699, !dbg !54 %956 = getelementptr i16, ptr addrspace(1) %6, i64 %700, !dbg !54 %957 = getelementptr i16, ptr addrspace(1) %6, i64 %701, !dbg !54 %958 = getelementptr i16, ptr addrspace(1) %6, i64 %702, !dbg !54 %959 = getelementptr i16, ptr addrspace(1) %6, i64 %703, !dbg !54 %960 = getelementptr i16, ptr addrspace(1) %6, i64 %704, !dbg !54 %961 = getelementptr i16, ptr addrspace(1) %6, i64 %705, !dbg !54 %962 = getelementptr i16, ptr addrspace(1) %6, i64 %706, !dbg !54 %963 = getelementptr i16, ptr addrspace(1) %6, i64 %707, !dbg !54 %964 = getelementptr i16, ptr addrspace(1) %6, i64 %708, !dbg !54 %965 = getelementptr i16, ptr addrspace(1) %6, i64 %709, !dbg !54 %966 = getelementptr i16, ptr addrspace(1) %6, i64 %710, !dbg !54 %967 = getelementptr i16, ptr addrspace(1) %6, i64 %711, !dbg !54 %968 = getelementptr i16, ptr addrspace(1) %6, i64 %712, !dbg !54 %969 = getelementptr i16, ptr addrspace(1) %6, i64 %713, !dbg !54 %970 = getelementptr i16, ptr addrspace(1) %6, i64 %714, !dbg !54 %971 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %939) #3, !dbg !55 %972 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %940) #3, !dbg !55 %973 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %941) #3, !dbg !55 %974 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %942) #3, !dbg !55 %975 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float 
%943) #3, !dbg !55 %976 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %944) #3, !dbg !55 %977 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %945) #3, !dbg !55 %978 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %946) #3, !dbg !55 %979 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %947) #3, !dbg !55 %980 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %948) #3, !dbg !55 %981 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %949) #3, !dbg !55 %982 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %950) #3, !dbg !55 %983 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %951) #3, !dbg !55 %984 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %952) #3, !dbg !55 %985 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %953) #3, !dbg !55 %986 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %954) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %971, ptr addrspace(1) %955, i1 %698) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %972, ptr addrspace(1) %956, i1 %698) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %973, ptr addrspace(1) %957, i1 %698) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %974, ptr addrspace(1) %958, i1 %698) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %975, ptr addrspace(1) %959, i1 %698) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %976, ptr addrspace(1) %960, i1 %698) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %977, ptr addrspace(1) %961, i1 %698) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %978, ptr addrspace(1) %962, i1 
%698) #3, !dbg !55
  ; NOTE: one statement per line from here on. An LLVM ';' comment runs to the
  ; end of the physical line, so the "; preds" and "; Function Attrs" comments
  ; below must never share a line with code that follows them.
  ;
  ; Remaining guarded 16-bit stores. Every inline-asm statement is
  ;   "@$2 st.global.b16 [ $1 + 0 ], { $0 };"
  ; with $0 = the i16 value, $1 = the addrspace(1) destination pointer and
  ; $2 = the i1 guard, which is %698 for all of the stores listed here.
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %979, ptr addrspace(1) %963, i1 %698) #3, !dbg !55
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %980, ptr addrspace(1) %964, i1 %698) #3, !dbg !55
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %981, ptr addrspace(1) %965, i1 %698) #3, !dbg !55
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %982, ptr addrspace(1) %966, i1 %698) #3, !dbg !55
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %983, ptr addrspace(1) %967, i1 %698) #3, !dbg !55
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %984, ptr addrspace(1) %968, i1 %698) #3, !dbg !55
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %985, ptr addrspace(1) %969, i1 %698) #3, !dbg !55
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %986, ptr addrspace(1) %970, i1 %698) #3, !dbg !55

  ; Loop latch: %987 = %695 + 64; branch back to block %694 while
  ; %695 < 50193 (unsigned compare), otherwise continue to block %989.
  ; NOTE(review): %695 is defined outside this excerpt -- presumably the phi
  ; in %694 that consumes %987; confirm against the loop header.
  %987 = add nuw nsw i32 %695, 64, !dbg !38
  %988 = icmp ult i32 %695, 50193, !dbg !38
  br i1 %988, label %694, label %989, !dbg !38

989:                                              ; preds = %694
  ret void, !dbg !56
}

; Declarations of the NVVM intrinsics named in this module.

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1

; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2

; Attribute groups: #0-#2 are attached to the declarations above; #3 is the
; group attached to the inline-asm call sites in the kernel body.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #2 = { convergent nocallback nounwind }
attributes #3 = { nounwind }

; Named module metadata. !nvvm.annotations references !3 ("kernel" = 1) and
; !4 ("maxntidx" = 256) for @triton__0d1d2d3d4d5d6d7de8; each node appears
; twice in the list, kept exactly as emitted.
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
!nvvm.annotations = !{!3, !4, !4, !3}

; Module flag, compile unit, source file and kernel annotation nodes.
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}

; Debug-info description of the kernel function (scope !5, file !2).
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}

; Source locations referenced by the !dbg attachments. Locations scoped to !5
; point into file !2; !29-!36 go through lexical-block scopes whose file is
; !31 (standard.py) and carry inlinedAt links to line 46, column 27.
!8 = !DILocation(line: 22, column: 44, scope: !5)
!9 = !DILocation(line: 24, column: 33, scope: !5)
!10 = !DILocation(line: 21, column: 28, scope: !5)
!11 = !DILocation(line: 21, column: 34, scope: !5)
!12 = !DILocation(line: 21, column: 46, scope: !5)
!13 = !DILocation(line: 22, column: 23, scope: !5)
!14 = !DILocation(line: 26, column: 30, scope: !5)
!15 = !DILocation(line: 26, column: 35, scope: !5)
!16 = !DILocation(line: 27, column: 19, scope: !5)
!17 = !DILocation(line: 29, column: 19, scope: !5)
!18 = !DILocation(line: 36, column: 46, scope: !5)
!19 = !DILocation(line: 38, column: 23, scope: !5)
!20 = !DILocation(line: 39, column: 22, scope: !5)
!21 = !DILocation(line: 41, column: 37, scope: !5)
!22 = !DILocation(line: 32, column: 36, scope: !5)
!23 = !DILocation(line: 33, column: 27, scope: !5)
!24 = !DILocation(line: 34, column: 25, scope: !5)
!25 = !DILocation(line: 36, column: 34, scope: !5)
!26 = !DILocation(line: 36, column: 52, scope: !5)
!27 = !DILocation(line: 42, column: 23, scope: !5)
!28 = !DILocation(line: 45, column: 40, scope: !5)
!29 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32)
!30 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!32 = !DILocation(line: 46, column: 27, scope: !30)
!33 = !DILocation(line: 233, column: 15, scope: !34, inlinedAt: !35)
!34 = distinct !DILexicalBlockFile(scope: !30, file: !31, discriminator: 0)
!35 = !DILocation(line: 243, column: 36, scope: !34, inlinedAt: !36)
!36 = !DILocation(line: 46, column: 27, scope: !34)
!37 = !DILocation(line: 63, column: 24, scope: !5)
!38 = !DILocation(line: 51, column: 36, scope: !5)
!39 = !DILocation(line: 52, column: 27, scope: !5)
!40 = !DILocation(line: 53, column: 25, scope: !5)
!41 = !DILocation(line: 55, column: 41, scope: !5)
!42 = !DILocation(line: 55, column: 35, scope: !5)
!43 = !DILocation(line: 55, column: 53, scope: !5)
!44 = !DILocation(line: 55, column: 105, scope: !5)
!45 = !DILocation(line: 56, column: 35, scope: !5)
!46 = !DILocation(line: 56, column: 53, scope: !5)
!47 = !DILocation(line: 57, column: 35, scope: !5)
!48 = !DILocation(line: 57, column: 53, scope: !5)
!49 = !DILocation(line: 57, column: 105, scope: !5)
!50 = !DILocation(line: 65, column: 23, scope: !5)
!51 = !DILocation(line: 66, column: 24, scope: !5)
!52 = !DILocation(line: 67, column: 24, scope: !5)
!53 = !DILocation(line: 69, column: 24, scope: !5)
!54 = !DILocation(line: 70, column: 29, scope: !5)
!55 = !DILocation(line: 70, column: 54, scope: !5)
!56 = !DILocation(line: 51, column: 4, scope: !5)