|
|
|
; Source file name recorded for this IR module. |
source_filename = "LLVMDialectModule" |
|
|
|
; String operands for the device-side assertion handler @__assertfail. |
; Each (message, file, func) triple is handed to @__assertfail as bare `ptr` |
; arguments with no accompanying length, so every string carries an explicit |
; \00 terminator and its array type is sized to include that byte. Without the |
; terminator the handler would read past the end of the array. |
; |
; Triple "_1": message refers to tmp16. Its use is not in the visible part of |
; the file -- presumably a second bounds assertion further down; verify. |
@assertFunc_1 = internal constant [26 x i8] c"_call_with_frames_removed\00" |
|
@assertFile_1 = internal constant [39 x i8] c"<frozen importlib._bootstrap_external>\00" |
|
@assertMessage_1 = internal constant [40 x i8] c"index out of bounds: 0 <= tmp16 < 50257\00" |
|
; Triple "_0": message refers to tmp3. Passed to @__assertfail by the kernel |
; when its range check against 50256 fails. |
@assertFunc_0 = internal constant [26 x i8] c"_call_with_frames_removed\00" |
|
@assertFile_0 = internal constant [39 x i8] c"<frozen importlib._bootstrap_external>\00" |
|
@assertMessage_0 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp3 < 50257\00" |
|
; Externally defined, zero-length byte array in address space 3 (the address |
; space the NVPTX backend uses for shared memory). The real extent is not |
; stated here -- presumably supplied at kernel launch; confirm with the launcher. |
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8] |
|
; NUL-terminated string "__CUDA_FTZ". No use is visible in this part of the |
; file -- presumably a lookup key consumed by linked device-library code; verify. |
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 |
|
|
|
; Device-side assertion failure handler (defined outside this module). |
; The visible call site passes, in order: message string, file string, |
; line number (i32), function-name string, and an i64 that is 1 there |
; (NOTE(review): looks like the character size in bytes -- confirm). |
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr |
|
|
|
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 { |
|
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 |
|
%10 = lshr i32 %9, 5, !dbg !10 |
|
%11 = and i32 %10, 7, !dbg !10 |
|
%12 = and i32 %9, 15, !dbg !10 |
|
%13 = shl i32 %9, 3, !dbg !11 |
|
%14 = and i32 %13, 248, !dbg !11 |
|
%15 = or i32 %14, 4, !dbg !11 |
|
%urem = and i32 %9, 255, !dbg !11 |
|
%16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12 |
|
%17 = shl i32 %16, 4, !dbg !13 |
|
%18 = or i32 %17, %11, !dbg !14 |
|
%19 = or i32 %18, 8, !dbg !14 |
|
%20 = or i32 %17, %12, !dbg !14 |
|
%21 = sext i32 %18 to i64, !dbg !15 |
|
%22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !15 |
|
%23 = sext i32 %19 to i64, !dbg !15 |
|
%24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !15 |
|
%25 = sext i32 %20 to i64, !dbg !15 |
|
%26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !15 |
|
%27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 |
|
%28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 |
|
%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 |
|
%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 |
|
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 |
|
%32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 |
|
%33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 |
|
%34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 |
|
%35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 |
|
%36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 |
|
%37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 |
|
%38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 |
|
%39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 |
|
%40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 |
|
%41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 |
|
%42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 |
|
%43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !16 |
|
%44 = srem i32 %18, 512, !dbg !17 |
|
%45 = srem i32 %19, 512, !dbg !17 |
|
%46 = shl nsw i32 %44, 8, !dbg !18 |
|
%47 = shl nsw i32 %45, 8, !dbg !18 |
|
%48 = or i32 %46, %14, !dbg !19 |
|
%49 = or i32 %46, %15, !dbg !19 |
|
%50 = or i32 %47, %14, !dbg !19 |
|
%51 = or i32 %47, %15, !dbg !19 |
|
%52 = sext i32 %48 to i64, !dbg !20 |
|
%53 = getelementptr float, ptr addrspace(1) %2, i64 %52, !dbg !20 |
|
%54 = sext i32 %49 to i64, !dbg !20 |
|
%55 = getelementptr float, ptr addrspace(1) %2, i64 %54, !dbg !20 |
|
%56 = sext i32 %50 to i64, !dbg !20 |
|
%57 = getelementptr float, ptr addrspace(1) %2, i64 %56, !dbg !20 |
|
%58 = sext i32 %51 to i64, !dbg !20 |
|
%59 = getelementptr float, ptr addrspace(1) %2, i64 %58, !dbg !20 |
|
%60 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21 |
|
%61 = extractvalue { i32, i32, i32, i32 } %60, 0, !dbg !21 |
|
%62 = extractvalue { i32, i32, i32, i32 } %60, 1, !dbg !21 |
|
%63 = extractvalue { i32, i32, i32, i32 } %60, 2, !dbg !21 |
|
%64 = extractvalue { i32, i32, i32, i32 } %60, 3, !dbg !21 |
|
%65 = bitcast i32 %61 to float, !dbg !21 |
|
%66 = bitcast i32 %62 to float, !dbg !21 |
|
%67 = bitcast i32 %63 to float, !dbg !21 |
|
%68 = bitcast i32 %64 to float, !dbg !21 |
|
%69 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %55, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21 |
|
%70 = extractvalue { i32, i32, i32, i32 } %69, 0, !dbg !21 |
|
%71 = extractvalue { i32, i32, i32, i32 } %69, 1, !dbg !21 |
|
%72 = extractvalue { i32, i32, i32, i32 } %69, 2, !dbg !21 |
|
%73 = extractvalue { i32, i32, i32, i32 } %69, 3, !dbg !21 |
|
%74 = bitcast i32 %70 to float, !dbg !21 |
|
%75 = bitcast i32 %71 to float, !dbg !21 |
|
%76 = bitcast i32 %72 to float, !dbg !21 |
|
%77 = bitcast i32 %73 to float, !dbg !21 |
|
%78 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21 |
|
%79 = extractvalue { i32, i32, i32, i32 } %78, 0, !dbg !21 |
|
%80 = extractvalue { i32, i32, i32, i32 } %78, 1, !dbg !21 |
|
%81 = extractvalue { i32, i32, i32, i32 } %78, 2, !dbg !21 |
|
%82 = extractvalue { i32, i32, i32, i32 } %78, 3, !dbg !21 |
|
%83 = bitcast i32 %79 to float, !dbg !21 |
|
%84 = bitcast i32 %80 to float, !dbg !21 |
|
%85 = bitcast i32 %81 to float, !dbg !21 |
|
%86 = bitcast i32 %82 to float, !dbg !21 |
|
%87 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %59, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21 |
|
%88 = extractvalue { i32, i32, i32, i32 } %87, 0, !dbg !21 |
|
%89 = extractvalue { i32, i32, i32, i32 } %87, 1, !dbg !21 |
|
%90 = extractvalue { i32, i32, i32, i32 } %87, 2, !dbg !21 |
|
%91 = extractvalue { i32, i32, i32, i32 } %87, 3, !dbg !21 |
|
%92 = bitcast i32 %88 to float, !dbg !21 |
|
%93 = bitcast i32 %89 to float, !dbg !21 |
|
%94 = bitcast i32 %90 to float, !dbg !21 |
|
%95 = bitcast i32 %91 to float, !dbg !21 |
|
%96 = shl i32 %18, 8, !dbg !22 |
|
%97 = shl i32 %19, 8, !dbg !22 |
|
%98 = or i32 %96, %14, !dbg !23 |
|
%99 = or i32 %97, %14, !dbg !23 |
|
%100 = sext i32 %98 to i64, !dbg !24 |
|
%101 = getelementptr i16, ptr addrspace(1) %3, i64 %100, !dbg !24 |
|
%102 = sext i32 %99 to i64, !dbg !24 |
|
%103 = getelementptr i16, ptr addrspace(1) %3, i64 %102, !dbg !24 |
|
%104 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %101, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25 |
|
%105 = extractvalue { i32, i32, i32, i32 } %104, 0, !dbg !25 |
|
%106 = extractvalue { i32, i32, i32, i32 } %104, 1, !dbg !25 |
|
%107 = extractvalue { i32, i32, i32, i32 } %104, 2, !dbg !25 |
|
%108 = extractvalue { i32, i32, i32, i32 } %104, 3, !dbg !25 |
|
%109 = trunc i32 %105 to i16, !dbg !25 |
|
%extelt.offset = lshr i32 %105, 16, !dbg !25 |
|
%110 = trunc i32 %extelt.offset to i16, !dbg !25 |
|
%111 = trunc i32 %106 to i16, !dbg !25 |
|
%extelt.offset1 = lshr i32 %106, 16, !dbg !25 |
|
%112 = trunc i32 %extelt.offset1 to i16, !dbg !25 |
|
%113 = trunc i32 %107 to i16, !dbg !25 |
|
%extelt.offset2 = lshr i32 %107, 16, !dbg !25 |
|
%114 = trunc i32 %extelt.offset2 to i16, !dbg !25 |
|
%115 = trunc i32 %108 to i16, !dbg !25 |
|
%extelt.offset3 = lshr i32 %108, 16, !dbg !25 |
|
%116 = trunc i32 %extelt.offset3 to i16, !dbg !25 |
|
%117 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %103, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25 |
|
%118 = extractvalue { i32, i32, i32, i32 } %117, 0, !dbg !25 |
|
%119 = extractvalue { i32, i32, i32, i32 } %117, 1, !dbg !25 |
|
%120 = extractvalue { i32, i32, i32, i32 } %117, 2, !dbg !25 |
|
%121 = extractvalue { i32, i32, i32, i32 } %117, 3, !dbg !25 |
|
%122 = trunc i32 %118 to i16, !dbg !25 |
|
%extelt.offset4 = lshr i32 %118, 16, !dbg !25 |
|
%123 = trunc i32 %extelt.offset4 to i16, !dbg !25 |
|
%124 = trunc i32 %119 to i16, !dbg !25 |
|
%extelt.offset5 = lshr i32 %119, 16, !dbg !25 |
|
%125 = trunc i32 %extelt.offset5 to i16, !dbg !25 |
|
%126 = trunc i32 %120 to i16, !dbg !25 |
|
%extelt.offset6 = lshr i32 %120, 16, !dbg !25 |
|
%127 = trunc i32 %extelt.offset6 to i16, !dbg !25 |
|
%128 = trunc i32 %121 to i16, !dbg !25 |
|
%extelt.offset7 = lshr i32 %121, 16, !dbg !25 |
|
%129 = trunc i32 %extelt.offset7 to i16, !dbg !25 |
|
%130 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %109) #6, !dbg !26 |
|
%131 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %110) #6, !dbg !26 |
|
%132 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %111) #6, !dbg !26 |
|
%133 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %112) #6, !dbg !26 |
|
%134 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #6, !dbg !26 |
|
%135 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #6, !dbg !26 |
|
%136 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %115) #6, !dbg !26 |
|
%137 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %116) #6, !dbg !26 |
|
%138 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %122) #6, !dbg !26 |
|
%139 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %123) #6, !dbg !26 |
|
%140 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %124) #6, !dbg !26 |
|
%141 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %125) #6, !dbg !26 |
|
%142 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %126) #6, !dbg !26 |
|
%143 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %127) #6, !dbg !26 |
|
%144 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %128) #6, !dbg !26 |
|
%145 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %129) #6, !dbg !26 |
|
%146 = add i64 %43, 50257, !dbg !27 |
|
%147 = icmp slt i64 %27, 0, !dbg !28 |
|
%148 = icmp slt i64 %35, 0, !dbg !28 |
|
%149 = icmp slt i64 %43, 0, !dbg !28 |
|
%150 = select i1 %149, i64 %146, i64 %43, !dbg !29 |
|
%151 = icmp ugt i64 %150, 50256, !dbg !30 |
|
br i1 %151, label %152, label %153, !dbg !31 |
|
|
|
152: |
|
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !31 |
|
br label %153, !dbg !31 |
|
|
|
153: |
|
%154 = shl i64 %27, 8, !dbg !32 |
|
%155 = add i64 %154, 12865792, !dbg !32 |
|
%156 = select i1 %147, i64 %155, i64 %154, !dbg !32 |
|
%157 = shl i64 %35, 8, !dbg !32 |
|
%158 = add i64 %157, 12865792, !dbg !32 |
|
%159 = select i1 %148, i64 %158, i64 %157, !dbg !32 |
|
%160 = zext nneg i32 %14 to i64 |
|
%161 = zext nneg i32 %15 to i64 |
|
%162 = or i64 %156, %160, !dbg !33 |
|
%163 = or i64 %156, %161, !dbg !33 |
|
%164 = or i64 %159, %160, !dbg !33 |
|
%165 = or i64 %159, %161, !dbg !33 |
|
%166 = getelementptr float, ptr addrspace(1) %1, i64 %162, !dbg !34 |
|
%167 = getelementptr float, ptr addrspace(1) %1, i64 %163, !dbg !34 |
|
%168 = getelementptr float, ptr addrspace(1) %1, i64 %164, !dbg !34 |
|
%169 = getelementptr float, ptr addrspace(1) %1, i64 %165, !dbg !34 |
|
%170 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %166, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35 |
|
%171 = extractvalue { i32, i32, i32, i32 } %170, 0, !dbg !35 |
|
%172 = extractvalue { i32, i32, i32, i32 } %170, 1, !dbg !35 |
|
%173 = extractvalue { i32, i32, i32, i32 } %170, 2, !dbg !35 |
|
%174 = extractvalue { i32, i32, i32, i32 } %170, 3, !dbg !35 |
|
%175 = bitcast i32 %171 to float, !dbg !35 |
|
%176 = bitcast i32 %172 to float, !dbg !35 |
|
%177 = bitcast i32 %173 to float, !dbg !35 |
|
%178 = bitcast i32 %174 to float, !dbg !35 |
|
%179 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %167, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35 |
|
%180 = extractvalue { i32, i32, i32, i32 } %179, 0, !dbg !35 |
|
%181 = extractvalue { i32, i32, i32, i32 } %179, 1, !dbg !35 |
|
%182 = extractvalue { i32, i32, i32, i32 } %179, 2, !dbg !35 |
|
%183 = extractvalue { i32, i32, i32, i32 } %179, 3, !dbg !35 |
|
%184 = bitcast i32 %180 to float, !dbg !35 |
|
%185 = bitcast i32 %181 to float, !dbg !35 |
|
%186 = bitcast i32 %182 to float, !dbg !35 |
|
%187 = bitcast i32 %183 to float, !dbg !35 |
|
%188 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %168, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35 |
|
%189 = extractvalue { i32, i32, i32, i32 } %188, 0, !dbg !35 |
|
%190 = extractvalue { i32, i32, i32, i32 } %188, 1, !dbg !35 |
|
%191 = extractvalue { i32, i32, i32, i32 } %188, 2, !dbg !35 |
|
%192 = extractvalue { i32, i32, i32, i32 } %188, 3, !dbg !35 |
|
%193 = bitcast i32 %189 to float, !dbg !35 |
|
%194 = bitcast i32 %190 to float, !dbg !35 |
|
%195 = bitcast i32 %191 to float, !dbg !35 |
|
%196 = bitcast i32 %192 to float, !dbg !35 |
|
%197 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %169, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35 |
|
%198 = extractvalue { i32, i32, i32, i32 } %197, 0, !dbg !35 |
|
%199 = extractvalue { i32, i32, i32, i32 } %197, 1, !dbg !35 |
|
%200 = extractvalue { i32, i32, i32, i32 } %197, 2, !dbg !35 |
|
%201 = extractvalue { i32, i32, i32, i32 } %197, 3, !dbg !35 |
|
%202 = bitcast i32 %198 to float, !dbg !35 |
|
%203 = bitcast i32 %199 to float, !dbg !35 |
|
%204 = bitcast i32 %200 to float, !dbg !35 |
|
%205 = bitcast i32 %201 to float, !dbg !35 |
|
%206 = fadd float %65, %175, !dbg !36 |
|
%207 = fadd float %66, %176, !dbg !36 |
|
%208 = fadd float %67, %177, !dbg !36 |
|
%209 = fadd float %68, %178, !dbg !36 |
|
%210 = fadd float %74, %184, !dbg !36 |
|
%211 = fadd float %75, %185, !dbg !36 |
|
%212 = fadd float %76, %186, !dbg !36 |
|
%213 = fadd float %77, %187, !dbg !36 |
|
%214 = fadd float %83, %193, !dbg !36 |
|
%215 = fadd float %84, %194, !dbg !36 |
|
%216 = fadd float %85, %195, !dbg !36 |
|
%217 = fadd float %86, %196, !dbg !36 |
|
%218 = fadd float %92, %202, !dbg !36 |
|
%219 = fadd float %93, %203, !dbg !36 |
|
%220 = fadd float %94, %204, !dbg !36 |
|
%221 = fadd float %95, %205, !dbg !36 |
|
%222 = fadd float %130, %206, !dbg !37 |
|
%223 = fadd float %131, %207, !dbg !37 |
|
%224 = fadd float %132, %208, !dbg !37 |
|
%225 = fadd float %133, %209, !dbg !37 |
|
%226 = fadd float %134, %210, !dbg !37 |
|
%227 = fadd float %135, %211, !dbg !37 |
|
%228 = fadd float %136, %212, !dbg !37 |
|
%229 = fadd float %137, %213, !dbg !37 |
|
%230 = fadd float %138, %214, !dbg !37 |
|
%231 = fadd float %139, %215, !dbg !37 |
|
%232 = fadd float %140, %216, !dbg !37 |
|
%233 = fadd float %141, %217, !dbg !37 |
|
%234 = fadd float %142, %218, !dbg !37 |
|
%235 = fadd float %143, %219, !dbg !37 |
|
%236 = fadd float %144, %220, !dbg !37 |
|
%237 = fadd float %145, %221, !dbg !37 |
|
%238 = fadd float %222, 0.000000e+00, !dbg !38 |
|
%239 = fadd float %223, 0.000000e+00, !dbg !38 |
|
%240 = fadd float %224, 0.000000e+00, !dbg !38 |
|
%241 = fadd float %225, 0.000000e+00, !dbg !38 |
|
%242 = fadd float %226, 0.000000e+00, !dbg !38 |
|
%243 = fadd float %227, 0.000000e+00, !dbg !38 |
|
%244 = fadd float %228, 0.000000e+00, !dbg !38 |
|
%245 = fadd float %229, 0.000000e+00, !dbg !38 |
|
%246 = fadd float %230, 0.000000e+00, !dbg !38 |
|
%247 = fadd float %231, 0.000000e+00, !dbg !38 |
|
%248 = fadd float %232, 0.000000e+00, !dbg !38 |
|
%249 = fadd float %233, 0.000000e+00, !dbg !38 |
|
%250 = fadd float %234, 0.000000e+00, !dbg !38 |
|
%251 = fadd float %235, 0.000000e+00, !dbg !38 |
|
%252 = fadd float %236, 0.000000e+00, !dbg !38 |
|
%253 = fadd float %237, 0.000000e+00, !dbg !38 |
|
%254 = fsub float %222, %238, !dbg !42 |
|
%255 = fsub float %223, %239, !dbg !42 |
|
%256 = fsub float %224, %240, !dbg !42 |
|
%257 = fsub float %225, %241, !dbg !42 |
|
%258 = fsub float %226, %242, !dbg !42 |
|
%259 = fsub float %227, %243, !dbg !42 |
|
%260 = fsub float %228, %244, !dbg !42 |
|
%261 = fsub float %229, %245, !dbg !42 |
|
%262 = fsub float %230, %246, !dbg !42 |
|
%263 = fsub float %231, %247, !dbg !42 |
|
%264 = fsub float %232, %248, !dbg !42 |
|
%265 = fsub float %233, %249, !dbg !42 |
|
%266 = fsub float %234, %250, !dbg !42 |
|
%267 = fsub float %235, %251, !dbg !42 |
|
%268 = fsub float %236, %252, !dbg !42 |
|
%269 = fsub float %237, %253, !dbg !42 |
|
%270 = fmul float %222, %254, !dbg !43 |
|
%271 = fmul float %223, %255, !dbg !43 |
|
%272 = fmul float %224, %256, !dbg !43 |
|
%273 = fmul float %225, %257, !dbg !43 |
|
%274 = fmul float %226, %258, !dbg !43 |
|
%275 = fmul float %227, %259, !dbg !43 |
|
%276 = fmul float %228, %260, !dbg !43 |
|
%277 = fmul float %229, %261, !dbg !43 |
|
%278 = fmul float %230, %262, !dbg !43 |
|
%279 = fmul float %231, %263, !dbg !43 |
|
%280 = fmul float %232, %264, !dbg !43 |
|
%281 = fmul float %233, %265, !dbg !43 |
|
%282 = fmul float %234, %266, !dbg !43 |
|
%283 = fmul float %235, %267, !dbg !43 |
|
%284 = fmul float %236, %268, !dbg !43 |
|
%285 = fmul float %237, %269, !dbg !43 |
|
%286 = fadd float %270, 0.000000e+00, !dbg !44 |
|
%287 = fadd float %271, 0.000000e+00, !dbg !44 |
|
%288 = fadd float %272, 0.000000e+00, !dbg !44 |
|
%289 = fadd float %273, 0.000000e+00, !dbg !44 |
|
%290 = fadd float %274, 0.000000e+00, !dbg !44 |
|
%291 = fadd float %275, 0.000000e+00, !dbg !44 |
|
%292 = fadd float %276, 0.000000e+00, !dbg !44 |
|
%293 = fadd float %277, 0.000000e+00, !dbg !44 |
|
%294 = fadd float %278, 0.000000e+00, !dbg !44 |
|
%295 = fadd float %279, 0.000000e+00, !dbg !44 |
|
%296 = fadd float %280, 0.000000e+00, !dbg !44 |
|
%297 = fadd float %281, 0.000000e+00, !dbg !44 |
|
%298 = fadd float %282, 0.000000e+00, !dbg !44 |
|
%299 = fadd float %283, 0.000000e+00, !dbg !44 |
|
%300 = fadd float %284, 0.000000e+00, !dbg !44 |
|
%301 = fadd float %285, 0.000000e+00, !dbg !44 |
|
%302 = fsub float %239, %238, !dbg !45 |
|
%303 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49 |
|
%304 = fmul float %303, %302, !dbg !50 |
|
%305 = fadd float %238, %304, !dbg !51 |
|
%306 = fadd float %286, %287, !dbg !52 |
|
%307 = fmul float %302, %302, !dbg !53 |
|
%308 = fmul float %303, %307, !dbg !54 |
|
%309 = fadd float %308, %306, !dbg !55 |
|
%310 = fsub float %240, %305, !dbg !45 |
|
%311 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49 |
|
%312 = fmul float %311, %310, !dbg !50 |
|
%313 = fadd float %305, %312, !dbg !51 |
|
%314 = fadd float %288, %309, !dbg !52 |
|
%315 = fmul float %310, %310, !dbg !53 |
|
%316 = fmul float %315, 2.000000e+00, !dbg !56 |
|
%317 = fmul float %311, %316, !dbg !54 |
|
%318 = fadd float %314, %317, !dbg !55 |
|
%319 = fsub float %241, %313, !dbg !45 |
|
%320 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49 |
|
%321 = fmul float %320, %319, !dbg !50 |
|
%322 = fadd float %313, %321, !dbg !51 |
|
%323 = fadd float %289, %318, !dbg !52 |
|
%324 = fmul float %319, %319, !dbg !53 |
|
%325 = fmul float %324, 3.000000e+00, !dbg !56 |
|
%326 = fmul float %320, %325, !dbg !54 |
|
%327 = fadd float %323, %326, !dbg !55 |
|
%328 = fsub float %242, %322, !dbg !45 |
|
%329 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !49 |
|
%330 = fmul float %329, %328, !dbg !50 |
|
%331 = fadd float %322, %330, !dbg !51 |
|
%332 = fadd float %290, %327, !dbg !52 |
|
%333 = fmul float %328, %328, !dbg !53 |
|
%334 = fmul float %333, 4.000000e+00, !dbg !56 |
|
%335 = fmul float %329, %334, !dbg !54 |
|
%336 = fadd float %332, %335, !dbg !55 |
|
%337 = fsub float %243, %331, !dbg !45 |
|
%338 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !49 |
|
%339 = fmul float %338, %337, !dbg !50 |
|
%340 = fadd float %331, %339, !dbg !51 |
|
%341 = fadd float %291, %336, !dbg !52 |
|
%342 = fmul float %337, %337, !dbg !53 |
|
%343 = fmul float %342, 5.000000e+00, !dbg !56 |
|
%344 = fmul float %338, %343, !dbg !54 |
|
%345 = fadd float %341, %344, !dbg !55 |
|
%346 = fsub float %244, %340, !dbg !45 |
|
%347 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !49 |
|
%348 = fmul float %347, %346, !dbg !50 |
|
%349 = fadd float %340, %348, !dbg !51 |
|
%350 = fadd float %292, %345, !dbg !52 |
|
%351 = fmul float %346, %346, !dbg !53 |
|
%352 = fmul float %351, 6.000000e+00, !dbg !56 |
|
%353 = fmul float %347, %352, !dbg !54 |
|
%354 = fadd float %350, %353, !dbg !55 |
|
%355 = fsub float %245, %349, !dbg !45 |
|
%356 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !49 |
|
%357 = fmul float %356, %355, !dbg !50 |
|
%358 = fadd float %349, %357, !dbg !51 |
|
%359 = fadd float %293, %354, !dbg !52 |
|
%360 = fmul float %355, %355, !dbg !53 |
|
%361 = fmul float %360, 7.000000e+00, !dbg !56 |
|
%362 = fmul float %356, %361, !dbg !54 |
|
%363 = fadd float %359, %362, !dbg !55 |
|
%364 = fsub float %247, %246, !dbg !45 |
|
%365 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49 |
|
%366 = fmul float %364, %365, !dbg !50 |
|
%367 = fadd float %246, %366, !dbg !51 |
|
%368 = fadd float %294, %295, !dbg !52 |
|
%369 = fmul float %364, %364, !dbg !53 |
|
%370 = fmul float %369, %365, !dbg !54 |
|
%371 = fadd float %368, %370, !dbg !55 |
|
%372 = fsub float %248, %367, !dbg !45 |
|
%373 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49 |
|
%374 = fmul float %373, %372, !dbg !50 |
|
%375 = fadd float %367, %374, !dbg !51 |
|
%376 = fadd float %296, %371, !dbg !52 |
|
%377 = fmul float %372, %372, !dbg !53 |
|
%378 = fmul float %377, 2.000000e+00, !dbg !56 |
|
%379 = fmul float %373, %378, !dbg !54 |
|
%380 = fadd float %376, %379, !dbg !55 |
|
%381 = fsub float %249, %375, !dbg !45 |
|
%382 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49 |
|
%383 = fmul float %382, %381, !dbg !50 |
|
%384 = fadd float %375, %383, !dbg !51 |
|
%385 = fadd float %297, %380, !dbg !52 |
|
%386 = fmul float %381, %381, !dbg !53 |
|
%387 = fmul float %386, 3.000000e+00, !dbg !56 |
|
%388 = fmul float %382, %387, !dbg !54 |
|
%389 = fadd float %385, %388, !dbg !55 |
|
%390 = fsub float %250, %384, !dbg !45 |
|
%391 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !49 |
|
%392 = fmul float %391, %390, !dbg !50 |
|
%393 = fadd float %384, %392, !dbg !51 |
|
%394 = fadd float %298, %389, !dbg !52 |
|
%395 = fmul float %390, %390, !dbg !53 |
|
%396 = fmul float %395, 4.000000e+00, !dbg !56 |
|
%397 = fmul float %391, %396, !dbg !54 |
|
%398 = fadd float %394, %397, !dbg !55 |
|
%399 = fsub float %251, %393, !dbg !45 |
|
%400 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !49 |
|
%401 = fmul float %400, %399, !dbg !50 |
|
%402 = fadd float %393, %401, !dbg !51 |
|
%403 = fadd float %299, %398, !dbg !52 |
|
%404 = fmul float %399, %399, !dbg !53 |
|
%405 = fmul float %404, 5.000000e+00, !dbg !56 |
|
%406 = fmul float %400, %405, !dbg !54 |
|
%407 = fadd float %403, %406, !dbg !55 |
|
%408 = fsub float %252, %402, !dbg !45 |
|
%409 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !49 |
|
%410 = fmul float %409, %408, !dbg !50 |
|
%411 = fadd float %402, %410, !dbg !51 |
|
%412 = fadd float %300, %407, !dbg !52 |
|
%413 = fmul float %408, %408, !dbg !53 |
|
%414 = fmul float %413, 6.000000e+00, !dbg !56 |
|
%415 = fmul float %409, %414, !dbg !54 |
|
%416 = fadd float %412, %415, !dbg !55 |
|
%417 = fsub float %253, %411, !dbg !45 |
|
%418 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !49 |
|
%419 = fmul float %418, %417, !dbg !50 |
|
%420 = fadd float %411, %419, !dbg !51 |
|
%421 = fadd float %301, %416, !dbg !52 |
|
%422 = fmul float %417, %417, !dbg !53 |
|
%423 = fmul float %422, 7.000000e+00, !dbg !56 |
|
%424 = fmul float %418, %423, !dbg !54 |
|
%425 = fadd float %421, %424, !dbg !55 |
|
%426 = bitcast float %358 to i32, !dbg !57 |
|
%427 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %426, i32 16, i32 31), !dbg !57 |
|
%428 = bitcast i32 %427 to float, !dbg !57 |
|
%429 = bitcast float %363 to i32, !dbg !57 |
|
%430 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %429, i32 16, i32 31), !dbg !57 |
|
%431 = bitcast i32 %430 to float, !dbg !57 |
|
%432 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !57 |
|
%433 = bitcast i32 %432 to float, !dbg !57 |
|
%434 = fsub float %428, %358, !dbg !45 |
|
%435 = fadd float %433, 8.000000e+00, !dbg !59 |
|
%436 = fcmp oeq float %435, 0.000000e+00, !dbg !60 |
|
%437 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %433, float %435) #6, !dbg !49 |
|
%438 = select i1 %436, float 0.000000e+00, float %437, !dbg !61 |
|
%439 = fmul float %438, %434, !dbg !50 |
|
%440 = fadd float %358, %439, !dbg !51 |
|
%441 = fadd float %363, %431, !dbg !52 |
|
%442 = fmul float %434, %434, !dbg !53 |
|
%443 = fmul float %442, 8.000000e+00, !dbg !56 |
|
%444 = fmul float %438, %443, !dbg !54 |
|
%445 = fadd float %441, %444, !dbg !55 |
|
%446 = bitcast float %440 to i32, !dbg !57 |
|
%447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 8, i32 31), !dbg !57 |
|
%448 = bitcast i32 %447 to float, !dbg !57 |
|
%449 = bitcast float %445 to i32, !dbg !57 |
|
%450 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %449, i32 8, i32 31), !dbg !57 |
|
%451 = bitcast i32 %450 to float, !dbg !57 |
|
%452 = bitcast float %435 to i32, !dbg !57 |
|
%453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 8, i32 31), !dbg !57 |
|
%454 = bitcast i32 %453 to float, !dbg !57 |
|
%455 = fsub float %448, %440, !dbg !45 |
|
%456 = fadd float %435, %454, !dbg !59 |
|
%457 = fcmp oeq float %456, 0.000000e+00, !dbg !60 |
|
%458 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %454, float %456) #6, !dbg !49 |
|
%459 = select i1 %457, float 0.000000e+00, float %458, !dbg !61 |
|
%460 = fmul float %459, %455, !dbg !50 |
|
%461 = fadd float %440, %460, !dbg !51 |
|
%462 = fadd float %445, %451, !dbg !52 |
|
%463 = fmul float %455, %455, !dbg !53 |
|
%464 = fmul float %435, %463, !dbg !56 |
|
%465 = fmul float %459, %464, !dbg !54 |
|
%466 = fadd float %462, %465, !dbg !55 |
|
%467 = bitcast float %461 to i32, !dbg !57 |
|
%468 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %467, i32 4, i32 31), !dbg !57 |
|
%469 = bitcast i32 %468 to float, !dbg !57 |
|
%470 = bitcast float %466 to i32, !dbg !57 |
|
%471 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %470, i32 4, i32 31), !dbg !57 |
|
%472 = bitcast i32 %471 to float, !dbg !57 |
|
%473 = bitcast float %456 to i32, !dbg !57 |
|
%474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 4, i32 31), !dbg !57 |
|
%475 = bitcast i32 %474 to float, !dbg !57 |
|
%476 = fsub float %469, %461, !dbg !45 |
|
%477 = fadd float %456, %475, !dbg !59 |
|
%478 = fcmp oeq float %477, 0.000000e+00, !dbg !60 |
|
%479 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %475, float %477) #6, !dbg !49 |
|
%480 = select i1 %478, float 0.000000e+00, float %479, !dbg !61 |
|
%481 = fmul float %480, %476, !dbg !50 |
|
%482 = fadd float %461, %481, !dbg !51 |
|
%483 = fadd float %466, %472, !dbg !52 |
|
%484 = fmul float %476, %476, !dbg !53 |
|
%485 = fmul float %456, %484, !dbg !56 |
|
%486 = fmul float %480, %485, !dbg !54 |
|
%487 = fadd float %483, %486, !dbg !55 |
|
%488 = bitcast float %482 to i32, !dbg !57 |
|
%489 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %488, i32 2, i32 31), !dbg !57 |
|
%490 = bitcast i32 %489 to float, !dbg !57 |
|
%491 = bitcast float %487 to i32, !dbg !57 |
|
%492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 2, i32 31), !dbg !57 |
|
%493 = bitcast i32 %492 to float, !dbg !57 |
|
%494 = bitcast float %477 to i32, !dbg !57 |
|
%495 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %494, i32 2, i32 31), !dbg !57 |
|
%496 = bitcast i32 %495 to float, !dbg !57 |
|
%497 = fsub float %490, %482, !dbg !45 |
|
%498 = fadd float %477, %496, !dbg !59 |
|
%499 = fcmp oeq float %498, 0.000000e+00, !dbg !60 |
|
%500 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %496, float %498) #6, !dbg !49 |
|
%501 = select i1 %499, float 0.000000e+00, float %500, !dbg !61 |
|
%502 = fmul float %497, %501, !dbg !50 |
|
%503 = fadd float %482, %502, !dbg !51 |
|
%504 = fadd float %487, %493, !dbg !52 |
|
%505 = fmul float %497, %497, !dbg !53 |
|
%506 = fmul float %477, %505, !dbg !56 |
|
%507 = fmul float %501, %506, !dbg !54 |
|
%508 = fadd float %504, %507, !dbg !55 |
|
%509 = bitcast float %503 to i32, !dbg !57 |
|
%510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %509, i32 1, i32 31), !dbg !57 |
|
%511 = bitcast float %508 to i32, !dbg !57 |
|
%512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 1, i32 31), !dbg !57 |
|
%513 = bitcast float %498 to i32, !dbg !57 |
|
%514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 1, i32 31), !dbg !57 |
|
%515 = bitcast i32 %514 to float, !dbg !57 |
|
%516 = fadd float %498, %515, !dbg !59 |
|
%517 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %515, float %516) #6, !dbg !49 |
|
%518 = bitcast float %420 to i32, !dbg !57 |
|
%519 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %518, i32 16, i32 31), !dbg !57 |
|
%520 = bitcast i32 %519 to float, !dbg !57 |
|
%521 = bitcast float %425 to i32, !dbg !57 |
|
%522 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %521, i32 16, i32 31), !dbg !57 |
|
%523 = bitcast i32 %522 to float, !dbg !57 |
|
%524 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !57 |
|
%525 = bitcast i32 %524 to float, !dbg !57 |
|
%526 = fsub float %520, %420, !dbg !45 |
|
%527 = fadd float %525, 8.000000e+00, !dbg !59 |
|
%528 = fcmp oeq float %527, 0.000000e+00, !dbg !60 |
|
%529 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %525, float %527) #6, !dbg !49 |
|
%530 = select i1 %528, float 0.000000e+00, float %529, !dbg !61 |
|
%531 = fmul float %526, %530, !dbg !50 |
|
%532 = fadd float %420, %531, !dbg !51 |
|
%533 = fadd float %425, %523, !dbg !52 |
|
%534 = fmul float %526, %526, !dbg !53 |
|
%535 = fmul float %534, 8.000000e+00, !dbg !56 |
|
%536 = fmul float %535, %530, !dbg !54 |
|
%537 = fadd float %533, %536, !dbg !55 |
|
%538 = bitcast float %532 to i32, !dbg !57 |
|
%539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 8, i32 31), !dbg !57 |
|
%540 = bitcast i32 %539 to float, !dbg !57 |
|
%541 = bitcast float %537 to i32, !dbg !57 |
|
%542 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 8, i32 31), !dbg !57 |
|
%543 = bitcast i32 %542 to float, !dbg !57 |
|
%544 = bitcast float %527 to i32, !dbg !57 |
|
%545 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %544, i32 8, i32 31), !dbg !57 |
|
%546 = bitcast i32 %545 to float, !dbg !57 |
|
%547 = fsub float %540, %532, !dbg !45 |
|
%548 = fadd float %527, %546, !dbg !59 |
|
%549 = fcmp oeq float %548, 0.000000e+00, !dbg !60 |
|
%550 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %546, float %548) #6, !dbg !49 |
|
%551 = select i1 %549, float 0.000000e+00, float %550, !dbg !61 |
|
%552 = fmul float %547, %551, !dbg !50 |
|
%553 = fadd float %532, %552, !dbg !51 |
|
%554 = fadd float %537, %543, !dbg !52 |
|
%555 = fmul float %547, %547, !dbg !53 |
|
%556 = fmul float %527, %555, !dbg !56 |
|
%557 = fmul float %551, %556, !dbg !54 |
|
%558 = fadd float %554, %557, !dbg !55 |
|
%559 = bitcast float %553 to i32, !dbg !57 |
|
%560 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %559, i32 4, i32 31), !dbg !57 |
|
%561 = bitcast i32 %560 to float, !dbg !57 |
|
%562 = bitcast float %558 to i32, !dbg !57 |
|
%563 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %562, i32 4, i32 31), !dbg !57 |
|
%564 = bitcast i32 %563 to float, !dbg !57 |
|
%565 = bitcast float %548 to i32, !dbg !57 |
|
%566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %565, i32 4, i32 31), !dbg !57 |
|
%567 = bitcast i32 %566 to float, !dbg !57 |
|
%568 = fsub float %561, %553, !dbg !45 |
|
%569 = fadd float %548, %567, !dbg !59 |
|
%570 = fcmp oeq float %569, 0.000000e+00, !dbg !60 |
|
%571 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %567, float %569) #6, !dbg !49 |
|
%572 = select i1 %570, float 0.000000e+00, float %571, !dbg !61 |
|
%573 = fmul float %568, %572, !dbg !50 |
|
%574 = fadd float %553, %573, !dbg !51 |
|
%575 = fadd float %558, %564, !dbg !52 |
|
%576 = fmul float %568, %568, !dbg !53 |
|
%577 = fmul float %548, %576, !dbg !56 |
|
%578 = fmul float %572, %577, !dbg !54 |
|
%579 = fadd float %575, %578, !dbg !55 |
|
%580 = bitcast float %574 to i32, !dbg !57 |
|
%581 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %580, i32 2, i32 31), !dbg !57 |
|
%582 = bitcast i32 %581 to float, !dbg !57 |
|
%583 = bitcast float %579 to i32, !dbg !57 |
|
%584 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %583, i32 2, i32 31), !dbg !57 |
|
%585 = bitcast i32 %584 to float, !dbg !57 |
|
%586 = bitcast float %569 to i32, !dbg !57 |
|
%587 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %586, i32 2, i32 31), !dbg !57 |
|
%588 = bitcast i32 %587 to float, !dbg !57 |
|
%589 = fsub float %582, %574, !dbg !45 |
|
%590 = fadd float %569, %588, !dbg !59 |
|
%591 = fcmp oeq float %590, 0.000000e+00, !dbg !60 |
|
%592 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %588, float %590) #6, !dbg !49 |
|
%593 = select i1 %591, float 0.000000e+00, float %592, !dbg !61 |
|
%594 = fmul float %589, %593, !dbg !50 |
|
%595 = fadd float %574, %594, !dbg !51 |
|
%596 = fadd float %579, %585, !dbg !52 |
|
%597 = fmul float %589, %589, !dbg !53 |
|
%598 = fmul float %569, %597, !dbg !56 |
|
%599 = fmul float %593, %598, !dbg !54 |
|
%600 = fadd float %596, %599, !dbg !55 |
|
%601 = bitcast float %595 to i32, !dbg !57 |
|
%602 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %601, i32 1, i32 31), !dbg !57 |
|
%603 = bitcast float %600 to i32, !dbg !57 |
|
%604 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %603, i32 1, i32 31), !dbg !57 |
|
%605 = bitcast float %590 to i32, !dbg !57 |
|
%606 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %605, i32 1, i32 31), !dbg !57 |
|
%607 = bitcast i32 %606 to float, !dbg !57 |
|
%608 = fadd float %590, %607, !dbg !59 |
|
%609 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %607, float %608) #6, !dbg !49 |
|
%610 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62 |
|
%611 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %55, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62 |
|
%612 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62 |
|
%613 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %59, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62 |
|
%614 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %101, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63 |
|
%615 = extractvalue { i32, i32, i32, i32 } %614, 0, !dbg !63 |
|
%616 = extractvalue { i32, i32, i32, i32 } %614, 1, !dbg !63 |
|
%617 = extractvalue { i32, i32, i32, i32 } %614, 2, !dbg !63 |
|
%618 = extractvalue { i32, i32, i32, i32 } %614, 3, !dbg !63 |
|
%619 = trunc i32 %615 to i16, !dbg !63 |
|
%extelt.offset8 = lshr i32 %615, 16, !dbg !63 |
|
%620 = trunc i32 %extelt.offset8 to i16, !dbg !63 |
|
%621 = trunc i32 %616 to i16, !dbg !63 |
|
%extelt.offset9 = lshr i32 %616, 16, !dbg !63 |
|
%622 = trunc i32 %extelt.offset9 to i16, !dbg !63 |
|
%623 = trunc i32 %617 to i16, !dbg !63 |
|
%extelt.offset10 = lshr i32 %617, 16, !dbg !63 |
|
%624 = trunc i32 %extelt.offset10 to i16, !dbg !63 |
|
%625 = trunc i32 %618 to i16, !dbg !63 |
|
%extelt.offset11 = lshr i32 %618, 16, !dbg !63 |
|
%626 = trunc i32 %extelt.offset11 to i16, !dbg !63 |
|
%627 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %103, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63 |
|
%628 = extractvalue { i32, i32, i32, i32 } %627, 0, !dbg !63 |
|
%629 = extractvalue { i32, i32, i32, i32 } %627, 1, !dbg !63 |
|
%630 = extractvalue { i32, i32, i32, i32 } %627, 2, !dbg !63 |
|
%631 = extractvalue { i32, i32, i32, i32 } %627, 3, !dbg !63 |
|
%632 = trunc i32 %628 to i16, !dbg !63 |
|
%extelt.offset12 = lshr i32 %628, 16, !dbg !63 |
|
%633 = trunc i32 %extelt.offset12 to i16, !dbg !63 |
|
%634 = trunc i32 %629 to i16, !dbg !63 |
|
%extelt.offset13 = lshr i32 %629, 16, !dbg !63 |
|
%635 = trunc i32 %extelt.offset13 to i16, !dbg !63 |
|
%636 = trunc i32 %630 to i16, !dbg !63 |
|
%extelt.offset14 = lshr i32 %630, 16, !dbg !63 |
|
%637 = trunc i32 %extelt.offset14 to i16, !dbg !63 |
|
%638 = trunc i32 %631 to i16, !dbg !63 |
|
%extelt.offset15 = lshr i32 %631, 16, !dbg !63 |
|
%639 = trunc i32 %extelt.offset15 to i16, !dbg !63 |
|
%640 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %619) #6, !dbg !64 |
|
%641 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %620) #6, !dbg !64 |
|
%642 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %621) #6, !dbg !64 |
|
%643 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %622) #6, !dbg !64 |
|
%644 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %623) #6, !dbg !64 |
|
%645 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %624) #6, !dbg !64 |
|
%646 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %625) #6, !dbg !64 |
|
%647 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %626) #6, !dbg !64 |
|
%648 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %632) #6, !dbg !64 |
|
%649 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %633) #6, !dbg !64 |
|
%650 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %634) #6, !dbg !64 |
|
%651 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %635) #6, !dbg !64 |
|
%652 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %636) #6, !dbg !64 |
|
%653 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %637) #6, !dbg !64 |
|
%654 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %638) #6, !dbg !64 |
|
%655 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %639) #6, !dbg !64 |
|
%656 = zext nneg i32 %urem to i64, !dbg !65 |
|
%657 = getelementptr float, ptr addrspace(1) %4, i64 %656, !dbg !65 |
|
%658 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %657, i1 true, i32 0, i1 true) #6, !dbg !66 |
|
br i1 %151, label %659, label %660, !dbg !67 |
|
|
|
659: |
|
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !67 |
|
br label %660, !dbg !67 |
|
|
|
660: |
|
%661 = bitcast i32 %604 to float, !dbg !57 |
|
%662 = fadd float %600, %661, !dbg !52 |
|
%663 = bitcast i32 %602 to float, !dbg !57 |
|
%664 = fsub float %663, %595, !dbg !45 |
|
%665 = fmul float %664, %664, !dbg !53 |
|
%666 = fmul float %590, %665, !dbg !56 |
|
%667 = fcmp oeq float %608, 0.000000e+00, !dbg !60 |
|
%668 = select i1 %667, float 0.000000e+00, float %609, !dbg !61 |
|
%669 = fmul float %668, %666, !dbg !54 |
|
%670 = fadd float %662, %669, !dbg !55 |
|
%671 = bitcast i32 %512 to float, !dbg !57 |
|
%672 = fadd float %508, %671, !dbg !52 |
|
%673 = bitcast i32 %510 to float, !dbg !57 |
|
%674 = fsub float %673, %503, !dbg !45 |
|
%675 = fmul float %674, %674, !dbg !53 |
|
%676 = fmul float %498, %675, !dbg !56 |
|
%677 = fcmp oeq float %516, 0.000000e+00, !dbg !60 |
|
%678 = select i1 %677, float 0.000000e+00, float %517, !dbg !61 |
|
%679 = fmul float %678, %676, !dbg !54 |
|
%680 = fadd float %672, %679, !dbg !55 |
|
%681 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %166, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 |
|
%682 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %167, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 |
|
%683 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %168, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 |
|
%684 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %169, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 |
|
%685 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 |
|
%686 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 |
|
%687 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 |
|
%688 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 |
|
%689 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 |
|
%690 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 |
|
%691 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 |
|
%692 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 |
|
%693 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 |
|
%694 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 |
|
%695 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 |
|
%696 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 |
|
%697 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 |
|
%698 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 |
|
%699 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 |
|
%700 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 |
|
%701 = fadd float %685, 0x3EE4F8B580000000, !dbg !70 |
|
%702 = fadd float %693, 0x3EE4F8B580000000, !dbg !70 |
|
%703 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%.not.i = icmp eq i32 %703, 0, !dbg !71 |
|
br i1 %.not.i, label %706, label %704, !dbg !71 |
|
|
|
704: |
|
%705 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %701), !dbg !71 |
|
br label %__nv_rsqrtf.exit, !dbg !71 |
|
|
|
706: |
|
%707 = tail call float @llvm.nvvm.rsqrt.approx.f(float %701), !dbg !71 |
|
br label %__nv_rsqrtf.exit, !dbg !71 |
|
|
|
__nv_rsqrtf.exit: |
|
%.0.i = phi float [ %705, %704 ], [ %707, %706 ], !dbg !71 |
|
%708 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%709 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%712 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%714 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%715 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%.not.i37 = icmp eq i32 %715, 0, !dbg !71 |
|
br i1 %.not.i37, label %718, label %716, !dbg !71 |
|
|
|
716: |
|
%717 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %702), !dbg !71 |
|
br label %__nv_rsqrtf.exit39, !dbg !71 |
|
|
|
718: |
|
%719 = tail call float @llvm.nvvm.rsqrt.approx.f(float %702), !dbg !71 |
|
br label %__nv_rsqrtf.exit39, !dbg !71 |
|
|
|
__nv_rsqrtf.exit39: |
|
%.0.i38 = phi float [ %717, %716 ], [ %719, %718 ], !dbg !71 |
|
%720 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%723 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%725 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 |
|
%727 = extractvalue { i32, i32, i32, i32 } %684, 3, !dbg !68 |
|
%728 = bitcast i32 %727 to float, !dbg !68 |
|
%729 = extractvalue { i32, i32, i32, i32 } %613, 3, !dbg !62 |
|
%730 = bitcast i32 %729 to float, !dbg !62 |
|
%731 = fadd float %730, %728, !dbg !72 |
|
%732 = fadd float %655, %731, !dbg !73 |
|
%733 = fmul float %664, %668, !dbg !50 |
|
%734 = fadd float %595, %733, !dbg !51 |
|
%735 = fsub float %732, %734, !dbg !74 |
|
%736 = extractvalue { i32, i32, i32, i32 } %684, 2, !dbg !68 |
|
%737 = bitcast i32 %736 to float, !dbg !68 |
|
%738 = extractvalue { i32, i32, i32, i32 } %613, 2, !dbg !62 |
|
%739 = bitcast i32 %738 to float, !dbg !62 |
|
%740 = fadd float %739, %737, !dbg !72 |
|
%741 = fadd float %654, %740, !dbg !73 |
|
%742 = fsub float %741, %734, !dbg !74 |
|
%743 = extractvalue { i32, i32, i32, i32 } %684, 1, !dbg !68 |
|
%744 = bitcast i32 %743 to float, !dbg !68 |
|
%745 = extractvalue { i32, i32, i32, i32 } %613, 1, !dbg !62 |
|
%746 = bitcast i32 %745 to float, !dbg !62 |
|
%747 = fadd float %746, %744, !dbg !72 |
|
%748 = fadd float %653, %747, !dbg !73 |
|
%749 = fsub float %748, %734, !dbg !74 |
|
%750 = extractvalue { i32, i32, i32, i32 } %684, 0, !dbg !68 |
|
%751 = bitcast i32 %750 to float, !dbg !68 |
|
%752 = extractvalue { i32, i32, i32, i32 } %613, 0, !dbg !62 |
|
%753 = bitcast i32 %752 to float, !dbg !62 |
|
%754 = fadd float %753, %751, !dbg !72 |
|
%755 = fadd float %652, %754, !dbg !73 |
|
%756 = fsub float %755, %734, !dbg !74 |
|
%757 = extractvalue { i32, i32, i32, i32 } %683, 3, !dbg !68 |
|
%758 = bitcast i32 %757 to float, !dbg !68 |
|
%759 = extractvalue { i32, i32, i32, i32 } %612, 3, !dbg !62 |
|
%760 = bitcast i32 %759 to float, !dbg !62 |
|
%761 = fadd float %760, %758, !dbg !72 |
|
%762 = fadd float %651, %761, !dbg !73 |
|
%763 = fsub float %762, %734, !dbg !74 |
|
%764 = extractvalue { i32, i32, i32, i32 } %683, 2, !dbg !68 |
|
%765 = bitcast i32 %764 to float, !dbg !68 |
|
%766 = extractvalue { i32, i32, i32, i32 } %612, 2, !dbg !62 |
|
%767 = bitcast i32 %766 to float, !dbg !62 |
|
%768 = fadd float %767, %765, !dbg !72 |
|
%769 = fadd float %650, %768, !dbg !73 |
|
%770 = fsub float %769, %734, !dbg !74 |
|
%771 = extractvalue { i32, i32, i32, i32 } %683, 1, !dbg !68 |
|
%772 = bitcast i32 %771 to float, !dbg !68 |
|
%773 = extractvalue { i32, i32, i32, i32 } %612, 1, !dbg !62 |
|
%774 = bitcast i32 %773 to float, !dbg !62 |
|
%775 = fadd float %774, %772, !dbg !72 |
|
%776 = fadd float %649, %775, !dbg !73 |
|
%777 = fsub float %776, %734, !dbg !74 |
|
%778 = extractvalue { i32, i32, i32, i32 } %683, 0, !dbg !68 |
|
%779 = bitcast i32 %778 to float, !dbg !68 |
|
%780 = extractvalue { i32, i32, i32, i32 } %612, 0, !dbg !62 |
|
%781 = bitcast i32 %780 to float, !dbg !62 |
|
%782 = fadd float %781, %779, !dbg !72 |
|
%783 = fadd float %648, %782, !dbg !73 |
|
%784 = fsub float %783, %734, !dbg !74 |
|
%785 = extractvalue { i32, i32, i32, i32 } %682, 3, !dbg !68 |
|
%786 = bitcast i32 %785 to float, !dbg !68 |
|
%787 = extractvalue { i32, i32, i32, i32 } %611, 3, !dbg !62 |
|
%788 = bitcast i32 %787 to float, !dbg !62 |
|
%789 = fadd float %788, %786, !dbg !72 |
|
%790 = fadd float %647, %789, !dbg !73 |
|
%791 = fmul float %674, %678, !dbg !50 |
|
%792 = fadd float %503, %791, !dbg !51 |
|
%793 = fsub float %790, %792, !dbg !74 |
|
%794 = extractvalue { i32, i32, i32, i32 } %682, 2, !dbg !68 |
|
%795 = bitcast i32 %794 to float, !dbg !68 |
|
%796 = extractvalue { i32, i32, i32, i32 } %611, 2, !dbg !62 |
|
%797 = bitcast i32 %796 to float, !dbg !62 |
|
%798 = fadd float %797, %795, !dbg !72 |
|
%799 = fadd float %646, %798, !dbg !73 |
|
%800 = fsub float %799, %792, !dbg !74 |
|
%801 = extractvalue { i32, i32, i32, i32 } %682, 1, !dbg !68 |
|
%802 = bitcast i32 %801 to float, !dbg !68 |
|
%803 = extractvalue { i32, i32, i32, i32 } %611, 1, !dbg !62 |
|
%804 = bitcast i32 %803 to float, !dbg !62 |
|
%805 = fadd float %804, %802, !dbg !72 |
|
%806 = fadd float %645, %805, !dbg !73 |
|
%807 = fsub float %806, %792, !dbg !74 |
|
%808 = extractvalue { i32, i32, i32, i32 } %682, 0, !dbg !68 |
|
%809 = bitcast i32 %808 to float, !dbg !68 |
|
%810 = extractvalue { i32, i32, i32, i32 } %611, 0, !dbg !62 |
|
%811 = bitcast i32 %810 to float, !dbg !62 |
|
%812 = fadd float %811, %809, !dbg !72 |
|
%813 = fadd float %644, %812, !dbg !73 |
|
%814 = fsub float %813, %792, !dbg !74 |
|
%815 = extractvalue { i32, i32, i32, i32 } %681, 3, !dbg !68 |
|
%816 = bitcast i32 %815 to float, !dbg !68 |
|
%817 = extractvalue { i32, i32, i32, i32 } %610, 3, !dbg !62 |
|
%818 = bitcast i32 %817 to float, !dbg !62 |
|
%819 = fadd float %818, %816, !dbg !72 |
|
%820 = fadd float %643, %819, !dbg !73 |
|
%821 = fsub float %820, %792, !dbg !74 |
|
%822 = extractvalue { i32, i32, i32, i32 } %681, 2, !dbg !68 |
|
%823 = bitcast i32 %822 to float, !dbg !68 |
|
%824 = extractvalue { i32, i32, i32, i32 } %610, 2, !dbg !62 |
|
%825 = bitcast i32 %824 to float, !dbg !62 |
|
%826 = fadd float %825, %823, !dbg !72 |
|
%827 = fadd float %642, %826, !dbg !73 |
|
%828 = fsub float %827, %792, !dbg !74 |
|
%829 = extractvalue { i32, i32, i32, i32 } %681, 1, !dbg !68 |
|
%830 = bitcast i32 %829 to float, !dbg !68 |
|
%831 = extractvalue { i32, i32, i32, i32 } %610, 1, !dbg !62 |
|
%832 = bitcast i32 %831 to float, !dbg !62 |
|
%833 = fadd float %832, %830, !dbg !72 |
|
%834 = fadd float %641, %833, !dbg !73 |
|
%835 = fsub float %834, %792, !dbg !74 |
|
%836 = extractvalue { i32, i32, i32, i32 } %681, 0, !dbg !68 |
|
%837 = bitcast i32 %836 to float, !dbg !68 |
|
%838 = extractvalue { i32, i32, i32, i32 } %610, 0, !dbg !62 |
|
%839 = bitcast i32 %838 to float, !dbg !62 |
|
%840 = fadd float %839, %837, !dbg !72 |
|
%841 = fadd float %640, %840, !dbg !73 |
|
%842 = fsub float %841, %792, !dbg !74 |
|
%843 = fmul float %842, %.0.i, !dbg !75 |
|
%844 = fmul float %835, %.0.i, !dbg !75 |
|
%845 = fmul float %828, %.0.i, !dbg !75 |
|
%846 = fmul float %821, %.0.i, !dbg !75 |
|
%847 = fmul float %814, %.0.i, !dbg !75 |
|
%848 = fmul float %807, %.0.i, !dbg !75 |
|
%849 = fmul float %800, %.0.i, !dbg !75 |
|
%850 = fmul float %793, %.0.i, !dbg !75 |
|
%851 = fmul float %784, %.0.i38, !dbg !75 |
|
%852 = fmul float %777, %.0.i38, !dbg !75 |
|
%853 = fmul float %770, %.0.i38, !dbg !75 |
|
%854 = fmul float %763, %.0.i38, !dbg !75 |
|
%855 = fmul float %756, %.0.i38, !dbg !75 |
|
%856 = fmul float %749, %.0.i38, !dbg !75 |
|
%857 = fmul float %742, %.0.i38, !dbg !75 |
|
%858 = fmul float %735, %.0.i38, !dbg !75 |
|
%859 = getelementptr float, ptr addrspace(3) @global_smem, i64 %656, !dbg !76 |
|
store i32 %658, ptr addrspace(3) %859, align 4, !dbg !76 |
|
tail call void @llvm.nvvm.barrier0(), !dbg !76 |
|
%860 = getelementptr float, ptr addrspace(3) @global_smem, i64 %160, !dbg !76 |
|
%861 = load float, ptr addrspace(3) %860, align 32, !dbg !76 |
|
%862 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 1, !dbg !76 |
|
%863 = load float, ptr addrspace(3) %862, align 4, !dbg !76 |
|
%864 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 2, !dbg !76 |
|
%865 = load float, ptr addrspace(3) %864, align 8, !dbg !76 |
|
%866 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 3, !dbg !76 |
|
%867 = load float, ptr addrspace(3) %866, align 4, !dbg !76 |
|
%868 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 4, !dbg !76 |
|
%869 = load float, ptr addrspace(3) %868, align 16, !dbg !76 |
|
%870 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 5, !dbg !76 |
|
%871 = load float, ptr addrspace(3) %870, align 4, !dbg !76 |
|
%872 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 6, !dbg !76 |
|
%873 = load float, ptr addrspace(3) %872, align 8, !dbg !76 |
|
%874 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 7, !dbg !76 |
|
%875 = load float, ptr addrspace(3) %874, align 4, !dbg !76 |
|
%876 = fmul float %843, %861, !dbg !76 |
|
%877 = fmul float %844, %863, !dbg !76 |
|
%878 = fmul float %845, %865, !dbg !76 |
|
%879 = fmul float %846, %867, !dbg !76 |
|
%880 = fmul float %847, %869, !dbg !76 |
|
%881 = fmul float %848, %871, !dbg !76 |
|
%882 = fmul float %849, %873, !dbg !76 |
|
%883 = fmul float %850, %875, !dbg !76 |
|
%884 = fmul float %851, %861, !dbg !76 |
|
%885 = fmul float %852, %863, !dbg !76 |
|
%886 = fmul float %853, %865, !dbg !76 |
|
%887 = fmul float %854, %867, !dbg !76 |
|
%888 = fmul float %855, %869, !dbg !76 |
|
%889 = fmul float %856, %871, !dbg !76 |
|
%890 = fmul float %857, %873, !dbg !76 |
|
%891 = fmul float %858, %875, !dbg !76 |
|
%892 = getelementptr i16, ptr addrspace(1) %5, i64 %100, !dbg !77 |
|
%893 = getelementptr i16, ptr addrspace(1) %5, i64 %102, !dbg !77 |
|
%894 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %876) #6, !dbg !78 |
|
%895 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %877) #6, !dbg !78 |
|
%896 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %878) #6, !dbg !78 |
|
%897 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %879) #6, !dbg !78 |
|
%898 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %880) #6, !dbg !78 |
|
%899 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %881) #6, !dbg !78 |
|
%900 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %882) #6, !dbg !78 |
|
%901 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %883) #6, !dbg !78 |
|
%902 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %884) #6, !dbg !78 |
|
%903 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %885) #6, !dbg !78 |
|
%904 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %886) #6, !dbg !78 |
|
%905 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %887) #6, !dbg !78 |
|
%906 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %888) #6, !dbg !78 |
|
%907 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %889) #6, !dbg !78 |
|
%908 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %890) #6, !dbg !78 |
|
%909 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %891) #6, !dbg !78 |
|
%910 = insertelement <2 x i16> undef, i16 %894, i64 0, !dbg !78 |
|
%911 = insertelement <2 x i16> %910, i16 %895, i64 1, !dbg !78 |
|
%912 = bitcast <2 x i16> %911 to i32, !dbg !78 |
|
%913 = insertelement <2 x i16> undef, i16 %896, i64 0, !dbg !78 |
|
%914 = insertelement <2 x i16> %913, i16 %897, i64 1, !dbg !78 |
|
%915 = bitcast <2 x i16> %914 to i32, !dbg !78 |
|
%916 = insertelement <2 x i16> undef, i16 %898, i64 0, !dbg !78 |
|
%917 = insertelement <2 x i16> %916, i16 %899, i64 1, !dbg !78 |
|
%918 = bitcast <2 x i16> %917 to i32, !dbg !78 |
|
%919 = insertelement <2 x i16> undef, i16 %900, i64 0, !dbg !78 |
|
%920 = insertelement <2 x i16> %919, i16 %901, i64 1, !dbg !78 |
|
%921 = bitcast <2 x i16> %920 to i32, !dbg !78 |
|
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %912, i32 %915, i32 %918, i32 %921, ptr addrspace(1) %892, i1 true) #6, !dbg !78 |
|
%922 = insertelement <2 x i16> undef, i16 %902, i64 0, !dbg !78 |
|
%923 = insertelement <2 x i16> %922, i16 %903, i64 1, !dbg !78 |
|
%924 = bitcast <2 x i16> %923 to i32, !dbg !78 |
|
%925 = insertelement <2 x i16> undef, i16 %904, i64 0, !dbg !78 |
|
%926 = insertelement <2 x i16> %925, i16 %905, i64 1, !dbg !78 |
|
%927 = bitcast <2 x i16> %926 to i32, !dbg !78 |
|
%928 = insertelement <2 x i16> undef, i16 %906, i64 0, !dbg !78 |
|
%929 = insertelement <2 x i16> %928, i16 %907, i64 1, !dbg !78 |
|
%930 = bitcast <2 x i16> %929 to i32, !dbg !78 |
|
%931 = insertelement <2 x i16> undef, i16 %908, i64 0, !dbg !78 |
|
%932 = insertelement <2 x i16> %931, i16 %909, i64 1, !dbg !78 |
|
%933 = bitcast <2 x i16> %932 to i32, !dbg !78 |
|
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %924, i32 %927, i32 %930, i32 %933, ptr addrspace(1) %893, i1 true) #6, !dbg !78 |
|
ret void, !dbg !79 |
|
} |
|
|
|
|
|
; NVVM intrinsic reading the PTX special register tid.x; the kernel calls it once at entry (%9). |
; Attribute group #0 marks it speculatable and memory(none). |
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 |





; NVVM shfl.sync.bfly intrinsic on i32 values; attribute group #1 marks it convergent. |
; Its call sites are not in this part of the file. |
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 |





; NVVM barrier0 intrinsic; attribute group #2 marks it convergent. |
; Its call sites are not in this part of the file. |
declare void @llvm.nvvm.barrier0() #2 |
|
|
|
|
|
; __nv_rsqrtf(x): single-precision reciprocal square root helper. |
; |
; Calls @__nvvm_reflect with @.str (the string "__CUDA_FTZ") and dispatches on the result: |
;   non-zero -> @llvm.nvvm.rsqrt.approx.ftz.f |
;   zero     -> @llvm.nvvm.rsqrt.approx.f |
; Attribute group #3 makes this function alwaysinline. |
; NOTE(review): module flag !1 sets "nvvm-reflect-ftz" to 1; presumably the NVVMReflect pass |
; folds the query to a constant so only one arm survives - confirm against the pass pipeline. |

define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { |

entry: |

  %ftz.flag = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 |

  %use.ftz = icmp ne i32 %ftz.flag, 0 |

  br i1 %use.ftz, label %ftz, label %no.ftz |



no.ftz: |

  ; __CUDA_FTZ query returned 0: use the non-FTZ approximation. |

  %r.plain = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) |

  br label %done |



ftz: |

  ; __CUDA_FTZ query returned non-zero: use the FTZ approximation. |

  %r.ftz = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) |

  br label %done |



done: |

  ; Merge the result from whichever arm executed. |

  %r = phi float [ %r.plain, %no.ftz ], [ %r.ftz, %ftz ] |

  ret float %r |

} |
|
|
|
; Compile-time configuration query; @__nv_rsqrtf calls it with the "__CUDA_FTZ" string (@.str). |
; NOTE(review): no body is provided in this part of the file; presumably every call is |
; resolved by the NVVMReflect pass before code generation - confirm. |
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 |





; rsqrt.approx intrinsic, FTZ variant; taken by @__nv_rsqrtf when the __CUDA_FTZ query is non-zero. |
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 |





; rsqrt.approx intrinsic, non-FTZ variant; taken by @__nv_rsqrtf when the __CUDA_FTZ query is zero. |
declare float @llvm.nvvm.rsqrt.approx.f(float) #5 |
|
|
|
; Attribute groups referenced by the declarations, definitions and call sites in this module. |

; #0: used by the declaration of @llvm.nvvm.read.ptx.sreg.tid.x. |
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } |

; #1: used by the declaration of @llvm.nvvm.shfl.sync.bfly.i32. |
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } |

; #2: used by the declaration of @llvm.nvvm.barrier0. |
attributes #2 = { convergent nocallback nounwind } |

; #3: used by the definition of @__nv_rsqrtf (note alwaysinline). |
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } |

; #4: used by the declaration of @__nvvm_reflect; same string attributes as #3 without alwaysinline/nounwind. |
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } |

; #5: used by the declarations of the two @llvm.nvvm.rsqrt.approx.* intrinsics. |
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } |

; #6: call-site attribute group, attached to the inline-asm calls in the kernel and to the @__nvvm_reflect call. |
attributes #6 = { nounwind } |
|
|
|
; Named module-level metadata: module flags, the debug compile unit, NVVM annotations and the producer ident. |

!llvm.module.flags = !{!0, !1} |

!llvm.dbg.cu = !{!2} |

; NOTE(review): !4 and !5 are each listed twice; presumably a harmless artifact of how the |
; module was assembled - confirm before de-duplicating. |
!nvvm.annotations = !{!4, !5, !5, !4} |

!llvm.ident = !{!6} |



; Module flag: debug info version 3. |
!0 = !{i32 2, !"Debug Info Version", i32 3} |

; Module flag: "nvvm-reflect-ftz" set to 1; relates to the "__CUDA_FTZ" query made in @__nv_rsqrtf. |
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} |

; Compile unit for the debug info; producer is recorded as "triton". |
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) |

; Source file the kernel's !DILocation line numbers refer to. |
!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn") |

; NVVM annotation tagging @triton__0d1d2d3d4d5d6de7de as a kernel. |
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1} |

; NVVM annotation: "maxntidx" of 256 for the same kernel. |
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256} |

; NOTE(review): this ident presumably comes from bitcode linked into the module (the |
; @__nv_rsqrtf helper) rather than from Triton itself - confirm. |
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} |

; Debug subprogram attached to the kernel definition (!dbg !7), declared at line 18 of !3. |
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) |

; Subroutine type with an empty type list (!9). |
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) |

!9 = !{} |
|
; Source-location table referenced by the !dbg attachments on the kernel's instructions. |
; Entries scoped directly to !7 give line/column positions inside the kernel subprogram. |

!10 = !DILocation(line: 22, column: 44, scope: !7) |

!11 = !DILocation(line: 24, column: 33, scope: !7) |

!12 = !DILocation(line: 21, column: 28, scope: !7) |

!13 = !DILocation(line: 21, column: 33, scope: !7) |

!14 = !DILocation(line: 22, column: 23, scope: !7) |

!15 = !DILocation(line: 26, column: 30, scope: !7) |

!16 = !DILocation(line: 26, column: 35, scope: !7) |

!17 = !DILocation(line: 27, column: 18, scope: !7) |

!18 = !DILocation(line: 35, column: 44, scope: !7) |

!19 = !DILocation(line: 35, column: 40, scope: !7) |

!20 = !DILocation(line: 35, column: 34, scope: !7) |

!21 = !DILocation(line: 35, column: 50, scope: !7) |

!22 = !DILocation(line: 36, column: 44, scope: !7) |

!23 = !DILocation(line: 36, column: 40, scope: !7) |

!24 = !DILocation(line: 36, column: 34, scope: !7) |

!25 = !DILocation(line: 36, column: 50, scope: !7) |

!26 = !DILocation(line: 36, column: 101, scope: !7) |

!27 = !DILocation(line: 37, column: 22, scope: !7) |

!28 = !DILocation(line: 38, column: 22, scope: !7) |

!29 = !DILocation(line: 39, column: 36, scope: !7) |

!30 = !DILocation(line: 40, column: 40, scope: !7) |

!31 = !DILocation(line: 40, column: 55, scope: !7) |

!32 = !DILocation(line: 41, column: 44, scope: !7) |

!33 = !DILocation(line: 41, column: 40, scope: !7) |

!34 = !DILocation(line: 41, column: 34, scope: !7) |

!35 = !DILocation(line: 41, column: 52, scope: !7) |

!36 = !DILocation(line: 42, column: 22, scope: !7) |

!37 = !DILocation(line: 44, column: 22, scope: !7) |

; Locations below with scope !39 or !46 sit in lexical-block-file scopes whose file is |
; triton_helpers.py (!40); their inlinedAt fields chain back to the location they were inlined at. |
!38 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41) |

!39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0) |

!40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") |

; Inlining site for !38 and !42-!44. |
; NOTE(review): scoped to !39 (file !40), though line 47 presumably refers to the kernel source - confirm. |
!41 = !DILocation(line: 47, column: 41, scope: !39) |

!42 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41) |

!43 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41) |

!44 = !DILocation(line: 101, column: 13, scope: !39, inlinedAt: !41) |

; Two-level inlining chain: these entries are inlinedAt !47, which is itself inlinedAt !48. |
!45 = !DILocation(line: 108, column: 21, scope: !46, inlinedAt: !47) |

!46 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0) |

!47 = !DILocation(line: 120, column: 46, scope: !46, inlinedAt: !48) |

!48 = !DILocation(line: 53, column: 44, scope: !46) |

!49 = !DILocation(line: 110, column: 60, scope: !46, inlinedAt: !47) |

!50 = !DILocation(line: 112, column: 25, scope: !46, inlinedAt: !47) |

!51 = !DILocation(line: 112, column: 17, scope: !46, inlinedAt: !47) |

!52 = !DILocation(line: 113, column: 15, scope: !46, inlinedAt: !47) |

!53 = !DILocation(line: 113, column: 30, scope: !46, inlinedAt: !47) |

!54 = !DILocation(line: 113, column: 49, scope: !46, inlinedAt: !47) |

!55 = !DILocation(line: 113, column: 22, scope: !46, inlinedAt: !47) |

!56 = !DILocation(line: 113, column: 38, scope: !46, inlinedAt: !47) |

; Same line/column pair as !47/!48, but recorded in scope !39 instead of !46. |
!57 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !58) |

!58 = !DILocation(line: 53, column: 44, scope: !39) |

!59 = !DILocation(line: 109, column: 28, scope: !46, inlinedAt: !47) |

!60 = !DILocation(line: 110, column: 39, scope: !46, inlinedAt: !47) |

!61 = !DILocation(line: 110, column: 49, scope: !46, inlinedAt: !47) |

; Back to locations scoped directly to the kernel subprogram (!7). |
!62 = !DILocation(line: 62, column: 51, scope: !7) |

!63 = !DILocation(line: 63, column: 51, scope: !7) |

!64 = !DILocation(line: 63, column: 103, scope: !7) |

!65 = !DILocation(line: 64, column: 35, scope: !7) |

!66 = !DILocation(line: 64, column: 40, scope: !7) |

!67 = !DILocation(line: 68, column: 57, scope: !7) |

!68 = !DILocation(line: 69, column: 54, scope: !7) |

!69 = !DILocation(line: 75, column: 24, scope: !7) |

!70 = !DILocation(line: 77, column: 24, scope: !7) |

!71 = !DILocation(line: 78, column: 30, scope: !7) |

!72 = !DILocation(line: 70, column: 24, scope: !7) |

!73 = !DILocation(line: 72, column: 24, scope: !7) |

!74 = !DILocation(line: 73, column: 24, scope: !7) |

!75 = !DILocation(line: 79, column: 24, scope: !7) |

!76 = !DILocation(line: 80, column: 24, scope: !7) |

; !77 is attached to the final output-pointer getelementptrs on %5; !78 to the bf16 conversions, |
; the <2 x i16> packing and the two vectorized global stores. |
!77 = !DILocation(line: 82, column: 29, scope: !7) |

!78 = !DILocation(line: 82, column: 52, scope: !7) |

; Attached to the kernel's final `ret void`. |
!79 = !DILocation(line: 58, column: 4, scope: !7) |
|
|