|
// |
|
// Generated by LLVM NVPTX Back-End |
|
// |
|
|
|
.version 8.2 |
|
.target sm_89 |
|
.address_size 64 |
|
|
|
// .globl triton__0d1d2d3d4d5d6d7de8 |
|
.extern .shared .align 1 .b8 global_smem[] |
|
|
|
// ----------------------------------------------------------------------- |
// Kernel triton__0d1d2d3d4d5d6d7de8 (at most 256 threads per CTA). |
// |
// Each CTA handles 8 consecutive rows (row0 = ctaid.x * 8) of 50257 |
// columns, walked in 512-column tiles; every thread covers columns |
// (tid & 255) and (tid & 255) + 256 of each tile. |
// |
// Phase 1: per-row scale = 0.0 if the 64-bit entry of param_1 for that |
//          row equals -1, otherwise (scalar at param_2) / (scalar at |
//          param_3); accumulate scale * param_0[row][col] over all columns. |
// Phase 2: reduce the partial sums across the CTA (warp shuffles plus |
//          global_smem) to obtain one f32 sum per row. |
// Phase 3: for every element, |
//          param_6 = bf16( f32(param_4) + scale * param_0 |
//                          - row_sum * exp(f32(param_5)) ). |
// |
// param_0 is read as f32; param_4 / param_5 are read and param_6 is |
// written as bf16. param_7 and param_8 are never read in this body. |
// NOTE(review): the shape suggests a log-softmax / cross-entropy style |
// backward with -1 as an "ignore" marker -- confirm against the Triton |
// source named by the `.file 1` directive. |
// ----------------------------------------------------------------------- |
.visible .entry triton__0d1d2d3d4d5d6d7de8( |
|
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_0, |
|
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_1, |
|
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_2, |
|
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_3, |
|
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_4, |
|
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_5, |
|
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_6, |
|
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_7, |
|
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_8 |
|
) |
|
.maxntid 256, 1, 1 |
|
{ |
|
.reg .pred %p<176> |
|
.reg .b16 %rs<129> |
|
.reg .b32 %r<238> |
|
.reg .f32 %f<393> |
|
.reg .b64 %rd<166> |
|
.loc 1 18 0 |
|
$L__func_begin0: |
|
.loc 1 18 0 |
|
|
|
// Fetch the seven parameters used below (param_0 .. param_6). |
ld.param.u64 %rd39, [triton__0d1d2d3d4d5d6d7de8_param_6] |
|
ld.param.u64 %rd38, [triton__0d1d2d3d4d5d6d7de8_param_5] |
|
ld.param.u64 %rd37, [triton__0d1d2d3d4d5d6d7de8_param_4] |
|
ld.param.u64 %rd36, [triton__0d1d2d3d4d5d6d7de8_param_0] |
|
$L__tmp0: |
|
.loc 1 22 44 |
|
mov.u32 %r1, %tid.x |
|
ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6d7de8_param_1] |
|
// %r2 = tid.x / 32 (warp index; masked to 0..7 later). |
shr.u32 %r2, %r1, 5 |
|
ld.param.u64 %rd56, [triton__0d1d2d3d4d5d6d7de8_param_2] |
|
.loc 1 24 33 |
|
// %r9 / %r10: the two column offsets of this thread inside a 512-wide tile. |
and.b32 %r9, %r1, 255 |
|
ld.param.u64 %rd57, [triton__0d1d2d3d4d5d6d7de8_param_3] |
|
or.b32 %r10, %r9, 256 |
|
.loc 1 21 28 |
|
mov.u32 %r3, %ctaid.x |
|
.loc 1 21 34 |
|
cvt.s64.s32 %rd1, %r3 |
|
.loc 1 21 46 |
|
// %rd60 = ctaid.x * 8 = index of the first row owned by this CTA. |
mul.wide.s32 %rd60, %r3, 8 |
|
.loc 1 22 23 |
|
// %rd61 = row0 | 1 = index of the second row (row0 is a multiple of 8). |
or.b64 %rd61, %rd60, 1 |
|
cvt.u64.u32 %rd2, %r9 |
|
cvt.u64.u32 %rd3, %r10 |
|
.loc 1 26 30 |
|
// Byte addresses of the 8 consecutive 64-bit entries param_1[row0 .. row0+7]. |
shl.b64 %rd62, %rd60, 3 |
|
add.s64 %rd41, %rd59, %rd62 |
|
add.s64 %rd43, %rd41, 8 |
|
add.s64 %rd45, %rd41, 16 |
|
add.s64 %rd47, %rd41, 24 |
|
add.s64 %rd49, %rd41, 32 |
|
add.s64 %rd51, %rd41, 40 |
|
add.s64 %rd53, %rd41, 48 |
|
add.s64 %rd55, %rd41, 56 |
|
// %p1 is constant true: the loads below are effectively unconditional. |
mov.pred %p1, -1 |
|
.loc 1 26 35 |
|
mov.u64 %rd40, 0x0 |
|
@%p1 ld.global.L1::evict_last.b64 { %rd40 }, [ %rd41 + 0 ] |
|
mov.u64 %rd42, 0x0 |
|
@%p1 ld.global.L1::evict_last.b64 { %rd42 }, [ %rd43 + 0 ] |
|
mov.u64 %rd44, 0x0 |
|
@%p1 ld.global.L1::evict_last.b64 { %rd44 }, [ %rd45 + 0 ] |
|
mov.u64 %rd46, 0x0 |
|
@%p1 ld.global.L1::evict_last.b64 { %rd46 }, [ %rd47 + 0 ] |
|
mov.u64 %rd48, 0x0 |
|
@%p1 ld.global.L1::evict_last.b64 { %rd48 }, [ %rd49 + 0 ] |
|
mov.u64 %rd50, 0x0 |
|
@%p1 ld.global.L1::evict_last.b64 { %rd50 }, [ %rd51 + 0 ] |
|
mov.u64 %rd52, 0x0 |
|
@%p1 ld.global.L1::evict_last.b64 { %rd52 }, [ %rd53 + 0 ] |
|
mov.u64 %rd54, 0x0 |
|
@%p1 ld.global.L1::evict_last.b64 { %rd54 }, [ %rd55 + 0 ] |
|
.loc 1 27 19 |
|
// One 32-bit scalar each from param_2 (numerator) and param_3 (denominator). |
mov.u32 %r7, 0x0 |
|
@%p1 ld.global.b32 { %r7 }, [ %rd56 + 0 ] |
|
.loc 1 29 19 |
|
mov.u32 %r8, 0x0 |
|
@%p1 ld.global.b32 { %r8 }, [ %rd57 + 0 ] |
|
.loc 1 36 46 |
|
// Element offsets of row0 and row0+1 in a 50257-column matrix |
// (402056 = 8 * 50257, so %rd4 = row0 * 50257). |
mul.wide.s32 %rd4, %r3, 402056 |
|
mul.lo.s64 %rd5, %rd61, 50257 |
|
.loc 1 38 23 |
|
// %p11..%p18: true where the row's 64-bit entry equals -1. |
setp.eq.s64 %p11, %rd40, -1 |
|
setp.eq.s64 %p12, %rd42, -1 |
|
setp.eq.s64 %p13, %rd44, -1 |
|
setp.eq.s64 %p14, %rd46, -1 |
|
setp.eq.s64 %p15, %rd48, -1 |
|
setp.eq.s64 %p16, %rd50, -1 |
|
setp.eq.s64 %p17, %rd52, -1 |
|
setp.eq.s64 %p18, %rd54, -1 |
|
.loc 1 39 22 |
|
// %f89 = (scalar from param_2) / (scalar from param_3). |
div.full.f32 %r6, %r7, %r8 |
|
mov.b32 %f89, %r6 |
|
.loc 1 41 37 |
|
// Per-row scale %f1..%f8: 0.0 where the entry was -1, otherwise %f89. |
selp.f32 %f8, 0f00000000, %f89, %p18 |
|
selp.f32 %f7, 0f00000000, %f89, %p17 |
|
selp.f32 %f6, 0f00000000, %f89, %p16 |
|
selp.f32 %f5, 0f00000000, %f89, %p15 |
|
selp.f32 %f4, 0f00000000, %f89, %p14 |
|
selp.f32 %f3, 0f00000000, %f89, %p13 |
|
selp.f32 %f2, 0f00000000, %f89, %p12 |
|
selp.f32 %f1, 0f00000000, %f89, %p11 |
|
// Zero the 16 accumulators %f377..%f392 (8 rows x 2 columns per thread) |
// and the tile offset %rd157; precompute the byte offsets of rows 0 and 1. |
mov.f32 %f377, 0f00000000 |
|
mov.u64 %rd157, 0 |
|
shl.b64 %rd83, %rd4, 2 |
|
shl.b64 %rd86, %rd5, 2 |
|
mov.f32 %f378, %f377 |
|
mov.f32 %f379, %f377 |
|
mov.f32 %f380, %f377 |
|
mov.f32 %f381, %f377 |
|
mov.f32 %f382, %f377 |
|
mov.f32 %f383, %f377 |
|
mov.f32 %f384, %f377 |
|
mov.f32 %f385, %f377 |
|
mov.f32 %f386, %f377 |
|
mov.f32 %f387, %f377 |
|
mov.f32 %f388, %f377 |
|
mov.f32 %f389, %f377 |
|
mov.f32 %f390, %f377 |
|
mov.f32 %f391, %f377 |
|
mov.f32 %f392, %f377 |
|
// ---- Phase 1 loop: accumulate scale * param_0[row][col] per tile ---- |
$L__BB0_1: |
|
.loc 1 33 27 |
|
// Absolute column indices for this tile (tile offset is a multiple of 512, |
// so OR with the <512 thread offsets acts as an add). |
or.b64 %rd79, %rd157, %rd2 |
|
or.b64 %rd80, %rd157, %rd3 |
|
.loc 1 34 25 |
|
// Column-in-range masks: %p20 for the first column, %p22 for the second. |
setp.lt.u64 %p22, %rd80, 50257 |
|
setp.lt.u64 %p20, %rd79, 50257 |
|
.loc 1 36 34 |
|
// f32 addresses in param_0: row 0 via %rd83, row 1 via %rd86, rows 2..7 |
// by adding multiples of 201028 bytes (= 50257 * 4) to the row-1 address. |
shl.b64 %rd81, %rd79, 2 |
|
add.s64 %rd82, %rd36, %rd81 |
|
add.s64 %rd63, %rd82, %rd83 |
|
shl.b64 %rd84, %rd80, 2 |
|
add.s64 %rd85, %rd36, %rd84 |
|
add.s64 %rd64, %rd85, %rd83 |
|
add.s64 %rd65, %rd82, %rd86 |
|
add.s64 %rd66, %rd85, %rd86 |
|
add.s64 %rd67, %rd65, 201028 |
|
add.s64 %rd68, %rd66, 201028 |
|
add.s64 %rd69, %rd65, 402056 |
|
add.s64 %rd70, %rd66, 402056 |
|
add.s64 %rd71, %rd65, 603084 |
|
add.s64 %rd72, %rd66, 603084 |
|
add.s64 %rd73, %rd65, 804112 |
|
add.s64 %rd74, %rd66, 804112 |
|
add.s64 %rd75, %rd65, 1005140 |
|
add.s64 %rd76, %rd66, 1005140 |
|
add.s64 %rd77, %rd65, 1206168 |
|
add.s64 %rd78, %rd66, 1206168 |
|
// %r173 = 0 is the fill value for masked-off lanes; it is also reused by |
// the f32 loads of the second loop ($L__BB0_3) further down. |
mov.b32 %r173, 0 |
|
.loc 1 36 52 |
|
// Masked loads: each register gets the loaded value when its column is in |
// range and %r173 (0) otherwise. |
mov.u32 %r11, 0x0 |
|
@%p20 ld.global.L1::evict_last.b32 { %r11 }, [ %rd63 + 0 ] |
|
@!%p20 mov.u32 %r11, %r173 |
|
mov.u32 %r13, 0x0 |
|
@%p22 ld.global.L1::evict_last.b32 { %r13 }, [ %rd64 + 0 ] |
|
@!%p22 mov.u32 %r13, %r173 |
|
mov.u32 %r15, 0x0 |
|
@%p20 ld.global.L1::evict_last.b32 { %r15 }, [ %rd65 + 0 ] |
|
@!%p20 mov.u32 %r15, %r173 |
|
mov.u32 %r17, 0x0 |
|
@%p22 ld.global.L1::evict_last.b32 { %r17 }, [ %rd66 + 0 ] |
|
@!%p22 mov.u32 %r17, %r173 |
|
mov.u32 %r19, 0x0 |
|
@%p20 ld.global.L1::evict_last.b32 { %r19 }, [ %rd67 + 0 ] |
|
@!%p20 mov.u32 %r19, %r173 |
|
mov.u32 %r21, 0x0 |
|
@%p22 ld.global.L1::evict_last.b32 { %r21 }, [ %rd68 + 0 ] |
|
@!%p22 mov.u32 %r21, %r173 |
|
mov.u32 %r23, 0x0 |
|
@%p20 ld.global.L1::evict_last.b32 { %r23 }, [ %rd69 + 0 ] |
|
@!%p20 mov.u32 %r23, %r173 |
|
mov.u32 %r25, 0x0 |
|
@%p22 ld.global.L1::evict_last.b32 { %r25 }, [ %rd70 + 0 ] |
|
@!%p22 mov.u32 %r25, %r173 |
|
mov.u32 %r27, 0x0 |
|
@%p20 ld.global.L1::evict_last.b32 { %r27 }, [ %rd71 + 0 ] |
|
@!%p20 mov.u32 %r27, %r173 |
|
mov.u32 %r29, 0x0 |
|
@%p22 ld.global.L1::evict_last.b32 { %r29 }, [ %rd72 + 0 ] |
|
@!%p22 mov.u32 %r29, %r173 |
|
mov.u32 %r31, 0x0 |
|
@%p20 ld.global.L1::evict_last.b32 { %r31 }, [ %rd73 + 0 ] |
|
@!%p20 mov.u32 %r31, %r173 |
|
mov.u32 %r33, 0x0 |
|
@%p22 ld.global.L1::evict_last.b32 { %r33 }, [ %rd74 + 0 ] |
|
@!%p22 mov.u32 %r33, %r173 |
|
mov.u32 %r35, 0x0 |
|
@%p20 ld.global.L1::evict_last.b32 { %r35 }, [ %rd75 + 0 ] |
|
@!%p20 mov.u32 %r35, %r173 |
|
mov.u32 %r37, 0x0 |
|
@%p22 ld.global.L1::evict_last.b32 { %r37 }, [ %rd76 + 0 ] |
|
@!%p22 mov.u32 %r37, %r173 |
|
mov.u32 %r39, 0x0 |
|
@%p20 ld.global.L1::evict_last.b32 { %r39 }, [ %rd77 + 0 ] |
|
@!%p20 mov.u32 %r39, %r173 |
|
mov.u32 %r41, 0x0 |
|
@%p22 ld.global.L1::evict_last.b32 { %r41 }, [ %rd78 + 0 ] |
|
@!%p22 mov.u32 %r41, %r173 |
|
mov.b32 %f90, %r41 |
|
mov.b32 %f91, %r39 |
|
mov.b32 %f92, %r37 |
|
mov.b32 %f93, %r35 |
|
mov.b32 %f94, %r33 |
|
mov.b32 %f95, %r31 |
|
mov.b32 %f96, %r29 |
|
mov.b32 %f97, %r27 |
|
mov.b32 %f98, %r25 |
|
mov.b32 %f99, %r23 |
|
mov.b32 %f100, %r21 |
|
mov.b32 %f101, %r19 |
|
mov.b32 %f102, %r17 |
|
mov.b32 %f103, %r15 |
|
mov.b32 %f104, %r13 |
|
mov.b32 %f105, %r11 |
|
.loc 1 42 23 |
|
// Multiply every loaded value by the scale of its row (%f1 = row 0 .. %f8 = row 7). |
mul.f32 %f106, %f1, %f105 |
|
mul.f32 %f107, %f1, %f104 |
|
mul.f32 %f108, %f2, %f103 |
|
mul.f32 %f109, %f2, %f102 |
|
mul.f32 %f110, %f3, %f101 |
|
mul.f32 %f111, %f3, %f100 |
|
mul.f32 %f112, %f4, %f99 |
|
mul.f32 %f113, %f4, %f98 |
|
mul.f32 %f114, %f5, %f97 |
|
mul.f32 %f115, %f5, %f96 |
|
mul.f32 %f116, %f6, %f95 |
|
mul.f32 %f117, %f6, %f94 |
|
mul.f32 %f118, %f7, %f93 |
|
mul.f32 %f119, %f7, %f92 |
|
mul.f32 %f120, %f8, %f91 |
|
mul.f32 %f121, %f8, %f90 |
|
.loc 1 45 40 |
|
// Out-of-range columns contribute -0.0 (0f80000000) to the sums. |
selp.f32 %f122, %f121, 0f80000000, %p22 |
|
selp.f32 %f123, %f120, 0f80000000, %p20 |
|
selp.f32 %f124, %f119, 0f80000000, %p22 |
|
selp.f32 %f125, %f118, 0f80000000, %p20 |
|
selp.f32 %f126, %f117, 0f80000000, %p22 |
|
selp.f32 %f127, %f116, 0f80000000, %p20 |
|
selp.f32 %f128, %f115, 0f80000000, %p22 |
|
selp.f32 %f129, %f114, 0f80000000, %p20 |
|
selp.f32 %f130, %f113, 0f80000000, %p22 |
|
selp.f32 %f131, %f112, 0f80000000, %p20 |
|
selp.f32 %f132, %f111, 0f80000000, %p22 |
|
selp.f32 %f133, %f110, 0f80000000, %p20 |
|
selp.f32 %f134, %f109, 0f80000000, %p22 |
|
selp.f32 %f135, %f108, 0f80000000, %p20 |
|
selp.f32 %f136, %f107, 0f80000000, %p22 |
|
selp.f32 %f137, %f106, 0f80000000, %p20 |
|
add.f32 %f377, %f377, %f137 |
|
add.f32 %f378, %f378, %f136 |
|
add.f32 %f379, %f379, %f135 |
|
add.f32 %f380, %f380, %f134 |
|
add.f32 %f381, %f381, %f133 |
|
add.f32 %f382, %f382, %f132 |
|
add.f32 %f383, %f383, %f131 |
|
add.f32 %f384, %f384, %f130 |
|
add.f32 %f385, %f385, %f129 |
|
add.f32 %f386, %f386, %f128 |
|
add.f32 %f387, %f387, %f127 |
|
add.f32 %f388, %f388, %f126 |
|
add.f32 %f389, %f389, %f125 |
|
add.f32 %f390, %f390, %f124 |
|
add.f32 %f391, %f391, %f123 |
|
add.f32 %f392, %f392, %f122 |
|
.loc 1 32 36 |
|
// Advance by one 512-column tile; loop again while the offset of the tile |
// just processed (%r44) is below 49745 (= 50257 - 512). |
add.s64 %rd157, %rd157, 512 |
|
cvt.u32.u64 %r43, %rd157 |
|
add.s32 %r44, %r43, -512 |
|
setp.lt.u32 %p51, %r44, 49745 |
|
@%p51 bra $L__BB0_1 |
|
.loc 1 22 44 |
|
// ---- Phase 2: CTA-wide reduction of the per-thread partial sums ---- |
// %r65 = lane index within the warp, %r66 = warp index (0..7). |
and.b32 %r65, %r1, 31 |
|
.loc 1 24 33 |
|
and.b32 %r66, %r2, 7 |
|
$L__tmp1: |
|
.loc 2 233 15 |
|
// Fold each row's two column accumulators into one value per row. |
add.f32 %f138, %f377, %f378 |
|
add.f32 %f139, %f379, %f380 |
|
add.f32 %f140, %f381, %f382 |
|
add.f32 %f141, %f383, %f384 |
|
add.f32 %f142, %f385, %f386 |
|
add.f32 %f143, %f387, %f388 |
|
add.f32 %f144, %f389, %f390 |
|
add.f32 %f145, %f391, %f392 |
|
$L__tmp2: |
|
.loc 2 243 36 |
|
// Row 0: butterfly shuffle-add with lane offsets 16, 8, 4, 2, 1; the |
// warp-wide sum ends up in %f155. |
mov.b32 %r67, %f138 |
|
shfl.sync.bfly.b32 %r68, %r67, 16, 31, -1 |
|
mov.b32 %f146, %r68 |
|
$L__tmp3: |
|
.loc 2 233 15 |
|
add.f32 %f147, %f138, %f146 |
|
$L__tmp4: |
|
.loc 2 243 36 |
|
mov.b32 %r69, %f147 |
|
shfl.sync.bfly.b32 %r70, %r69, 8, 31, -1 |
|
mov.b32 %f148, %r70 |
|
$L__tmp5: |
|
.loc 2 233 15 |
|
add.f32 %f149, %f147, %f148 |
|
$L__tmp6: |
|
.loc 2 243 36 |
|
mov.b32 %r71, %f149 |
|
shfl.sync.bfly.b32 %r72, %r71, 4, 31, -1 |
|
mov.b32 %f150, %r72 |
|
$L__tmp7: |
|
.loc 2 233 15 |
|
add.f32 %f151, %f149, %f150 |
|
$L__tmp8: |
|
.loc 2 243 36 |
|
mov.b32 %r73, %f151 |
|
shfl.sync.bfly.b32 %r74, %r73, 2, 31, -1 |
|
mov.b32 %f152, %r74 |
|
$L__tmp9: |
|
.loc 2 233 15 |
|
add.f32 %f153, %f151, %f152 |
|
$L__tmp10: |
|
.loc 2 243 36 |
|
mov.b32 %r75, %f153 |
|
shfl.sync.bfly.b32 %r76, %r75, 1, 31, -1 |
|
mov.b32 %f154, %r76 |
|
$L__tmp11: |
|
.loc 2 233 15 |
|
add.f32 %f155, %f153, %f154 |
|
$L__tmp12: |
|
.loc 2 243 36 |
|
// Row 1: same shuffle chain, result in %f165. |
mov.b32 %r77, %f139 |
|
shfl.sync.bfly.b32 %r78, %r77, 16, 31, -1 |
|
mov.b32 %f156, %r78 |
|
$L__tmp13: |
|
.loc 2 233 15 |
|
add.f32 %f157, %f139, %f156 |
|
$L__tmp14: |
|
.loc 2 243 36 |
|
mov.b32 %r79, %f157 |
|
shfl.sync.bfly.b32 %r80, %r79, 8, 31, -1 |
|
mov.b32 %f158, %r80 |
|
$L__tmp15: |
|
.loc 2 233 15 |
|
add.f32 %f159, %f157, %f158 |
|
$L__tmp16: |
|
.loc 2 243 36 |
|
mov.b32 %r81, %f159 |
|
shfl.sync.bfly.b32 %r82, %r81, 4, 31, -1 |
|
mov.b32 %f160, %r82 |
|
$L__tmp17: |
|
.loc 2 233 15 |
|
add.f32 %f161, %f159, %f160 |
|
$L__tmp18: |
|
.loc 2 243 36 |
|
mov.b32 %r83, %f161 |
|
shfl.sync.bfly.b32 %r84, %r83, 2, 31, -1 |
|
mov.b32 %f162, %r84 |
|
$L__tmp19: |
|
.loc 2 233 15 |
|
add.f32 %f163, %f161, %f162 |
|
$L__tmp20: |
|
.loc 2 243 36 |
|
mov.b32 %r85, %f163 |
|
shfl.sync.bfly.b32 %r86, %r85, 1, 31, -1 |
|
mov.b32 %f164, %r86 |
|
$L__tmp21: |
|
.loc 2 233 15 |
|
add.f32 %f165, %f163, %f164 |
|
$L__tmp22: |
|
.loc 2 243 36 |
|
// Row 2: result in %f175. |
mov.b32 %r87, %f140 |
|
shfl.sync.bfly.b32 %r88, %r87, 16, 31, -1 |
|
mov.b32 %f166, %r88 |
|
$L__tmp23: |
|
.loc 2 233 15 |
|
add.f32 %f167, %f140, %f166 |
|
$L__tmp24: |
|
.loc 2 243 36 |
|
mov.b32 %r89, %f167 |
|
shfl.sync.bfly.b32 %r90, %r89, 8, 31, -1 |
|
mov.b32 %f168, %r90 |
|
$L__tmp25: |
|
.loc 2 233 15 |
|
add.f32 %f169, %f167, %f168 |
|
$L__tmp26: |
|
.loc 2 243 36 |
|
mov.b32 %r91, %f169 |
|
shfl.sync.bfly.b32 %r92, %r91, 4, 31, -1 |
|
mov.b32 %f170, %r92 |
|
$L__tmp27: |
|
.loc 2 233 15 |
|
add.f32 %f171, %f169, %f170 |
|
$L__tmp28: |
|
.loc 2 243 36 |
|
mov.b32 %r93, %f171 |
|
shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1 |
|
mov.b32 %f172, %r94 |
|
$L__tmp29: |
|
.loc 2 233 15 |
|
add.f32 %f173, %f171, %f172 |
|
$L__tmp30: |
|
.loc 2 243 36 |
|
mov.b32 %r95, %f173 |
|
shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1 |
|
mov.b32 %f174, %r96 |
|
$L__tmp31: |
|
.loc 2 233 15 |
|
add.f32 %f175, %f173, %f174 |
|
$L__tmp32: |
|
.loc 2 243 36 |
|
// Row 3: result in %f185. |
mov.b32 %r97, %f141 |
|
shfl.sync.bfly.b32 %r98, %r97, 16, 31, -1 |
|
mov.b32 %f176, %r98 |
|
$L__tmp33: |
|
.loc 2 233 15 |
|
add.f32 %f177, %f141, %f176 |
|
$L__tmp34: |
|
.loc 2 243 36 |
|
mov.b32 %r99, %f177 |
|
shfl.sync.bfly.b32 %r100, %r99, 8, 31, -1 |
|
mov.b32 %f178, %r100 |
|
$L__tmp35: |
|
.loc 2 233 15 |
|
add.f32 %f179, %f177, %f178 |
|
$L__tmp36: |
|
.loc 2 243 36 |
|
mov.b32 %r101, %f179 |
|
shfl.sync.bfly.b32 %r102, %r101, 4, 31, -1 |
|
mov.b32 %f180, %r102 |
|
$L__tmp37: |
|
.loc 2 233 15 |
|
add.f32 %f181, %f179, %f180 |
|
$L__tmp38: |
|
.loc 2 243 36 |
|
mov.b32 %r103, %f181 |
|
shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1 |
|
mov.b32 %f182, %r104 |
|
$L__tmp39: |
|
.loc 2 233 15 |
|
add.f32 %f183, %f181, %f182 |
|
$L__tmp40: |
|
.loc 2 243 36 |
|
mov.b32 %r105, %f183 |
|
shfl.sync.bfly.b32 %r106, %r105, 1, 31, -1 |
|
mov.b32 %f184, %r106 |
|
$L__tmp41: |
|
.loc 2 233 15 |
|
add.f32 %f185, %f183, %f184 |
|
$L__tmp42: |
|
.loc 2 243 36 |
|
// Row 4: result in %f195. |
mov.b32 %r107, %f142 |
|
shfl.sync.bfly.b32 %r108, %r107, 16, 31, -1 |
|
mov.b32 %f186, %r108 |
|
$L__tmp43: |
|
.loc 2 233 15 |
|
add.f32 %f187, %f142, %f186 |
|
$L__tmp44: |
|
.loc 2 243 36 |
|
mov.b32 %r109, %f187 |
|
shfl.sync.bfly.b32 %r110, %r109, 8, 31, -1 |
|
mov.b32 %f188, %r110 |
|
$L__tmp45: |
|
.loc 2 233 15 |
|
add.f32 %f189, %f187, %f188 |
|
$L__tmp46: |
|
.loc 2 243 36 |
|
mov.b32 %r111, %f189 |
|
shfl.sync.bfly.b32 %r112, %r111, 4, 31, -1 |
|
mov.b32 %f190, %r112 |
|
$L__tmp47: |
|
.loc 2 233 15 |
|
add.f32 %f191, %f189, %f190 |
|
$L__tmp48: |
|
.loc 2 243 36 |
|
mov.b32 %r113, %f191 |
|
shfl.sync.bfly.b32 %r114, %r113, 2, 31, -1 |
|
mov.b32 %f192, %r114 |
|
$L__tmp49: |
|
.loc 2 233 15 |
|
add.f32 %f193, %f191, %f192 |
|
$L__tmp50: |
|
.loc 2 243 36 |
|
mov.b32 %r115, %f193 |
|
shfl.sync.bfly.b32 %r116, %r115, 1, 31, -1 |
|
mov.b32 %f194, %r116 |
|
$L__tmp51: |
|
.loc 2 233 15 |
|
add.f32 %f195, %f193, %f194 |
|
$L__tmp52: |
|
.loc 2 243 36 |
|
// Row 5: result in %f205. |
mov.b32 %r117, %f143 |
|
shfl.sync.bfly.b32 %r118, %r117, 16, 31, -1 |
|
mov.b32 %f196, %r118 |
|
$L__tmp53: |
|
.loc 2 233 15 |
|
add.f32 %f197, %f143, %f196 |
|
$L__tmp54: |
|
.loc 2 243 36 |
|
mov.b32 %r119, %f197 |
|
shfl.sync.bfly.b32 %r120, %r119, 8, 31, -1 |
|
mov.b32 %f198, %r120 |
|
$L__tmp55: |
|
.loc 2 233 15 |
|
add.f32 %f199, %f197, %f198 |
|
$L__tmp56: |
|
.loc 2 243 36 |
|
mov.b32 %r121, %f199 |
|
shfl.sync.bfly.b32 %r122, %r121, 4, 31, -1 |
|
mov.b32 %f200, %r122 |
|
$L__tmp57: |
|
.loc 2 233 15 |
|
add.f32 %f201, %f199, %f200 |
|
$L__tmp58: |
|
.loc 2 243 36 |
|
mov.b32 %r123, %f201 |
|
shfl.sync.bfly.b32 %r124, %r123, 2, 31, -1 |
|
mov.b32 %f202, %r124 |
|
$L__tmp59: |
|
.loc 2 233 15 |
|
add.f32 %f203, %f201, %f202 |
|
$L__tmp60: |
|
.loc 2 243 36 |
|
mov.b32 %r125, %f203 |
|
shfl.sync.bfly.b32 %r126, %r125, 1, 31, -1 |
|
mov.b32 %f204, %r126 |
|
$L__tmp61: |
|
.loc 2 233 15 |
|
add.f32 %f205, %f203, %f204 |
|
$L__tmp62: |
|
.loc 2 243 36 |
|
// Row 6: result in %f215. |
mov.b32 %r127, %f144 |
|
shfl.sync.bfly.b32 %r128, %r127, 16, 31, -1 |
|
mov.b32 %f206, %r128 |
|
$L__tmp63: |
|
.loc 2 233 15 |
|
add.f32 %f207, %f144, %f206 |
|
$L__tmp64: |
|
.loc 2 243 36 |
|
mov.b32 %r129, %f207 |
|
shfl.sync.bfly.b32 %r130, %r129, 8, 31, -1 |
|
mov.b32 %f208, %r130 |
|
$L__tmp65: |
|
.loc 2 233 15 |
|
add.f32 %f209, %f207, %f208 |
|
$L__tmp66: |
|
.loc 2 243 36 |
|
mov.b32 %r131, %f209 |
|
shfl.sync.bfly.b32 %r132, %r131, 4, 31, -1 |
|
mov.b32 %f210, %r132 |
|
$L__tmp67: |
|
.loc 2 233 15 |
|
add.f32 %f211, %f209, %f210 |
|
$L__tmp68: |
|
.loc 2 243 36 |
|
mov.b32 %r133, %f211 |
|
shfl.sync.bfly.b32 %r134, %r133, 2, 31, -1 |
|
mov.b32 %f212, %r134 |
|
$L__tmp69: |
|
.loc 2 233 15 |
|
add.f32 %f213, %f211, %f212 |
|
$L__tmp70: |
|
.loc 2 243 36 |
|
mov.b32 %r135, %f213 |
|
shfl.sync.bfly.b32 %r136, %r135, 1, 31, -1 |
|
mov.b32 %f214, %r136 |
|
$L__tmp71: |
|
.loc 2 233 15 |
|
add.f32 %f215, %f213, %f214 |
|
$L__tmp72: |
|
.loc 2 243 36 |
|
// Row 7: result in %f225. |
mov.b32 %r137, %f145 |
|
shfl.sync.bfly.b32 %r138, %r137, 16, 31, -1 |
|
mov.b32 %f216, %r138 |
|
$L__tmp73: |
|
.loc 2 233 15 |
|
add.f32 %f217, %f145, %f216 |
|
$L__tmp74: |
|
.loc 2 243 36 |
|
mov.b32 %r139, %f217 |
|
shfl.sync.bfly.b32 %r140, %r139, 8, 31, -1 |
|
mov.b32 %f218, %r140 |
|
$L__tmp75: |
|
.loc 2 233 15 |
|
add.f32 %f219, %f217, %f218 |
|
$L__tmp76: |
|
.loc 2 243 36 |
|
mov.b32 %r141, %f219 |
|
shfl.sync.bfly.b32 %r142, %r141, 4, 31, -1 |
|
mov.b32 %f220, %r142 |
|
$L__tmp77: |
|
.loc 2 233 15 |
|
add.f32 %f221, %f219, %f220 |
|
$L__tmp78: |
|
.loc 2 243 36 |
|
mov.b32 %r143, %f221 |
|
shfl.sync.bfly.b32 %r144, %r143, 2, 31, -1 |
|
mov.b32 %f222, %r144 |
|
$L__tmp79: |
|
.loc 2 233 15 |
|
add.f32 %f223, %f221, %f222 |
|
$L__tmp80: |
|
.loc 2 243 36 |
|
mov.b32 %r145, %f223 |
|
shfl.sync.bfly.b32 %r146, %r145, 1, 31, -1 |
|
mov.b32 %f224, %r146 |
|
$L__tmp81: |
|
.loc 2 233 15 |
|
add.f32 %f225, %f223, %f224 |
|
$L__tmp82: |
|
.loc 2 243 36 |
|
// Lane 0 of every warp (%p52) publishes its 8 row sums to shared memory at |
// global_smem + row * 32 + warp * 4 (8 rows x 8 warps x 4 bytes). |
setp.eq.s32 %p52, %r65, 0 |
|
shl.b32 %r147, %r66, 2 |
|
mov.u32 %r148, global_smem |
|
add.s32 %r45, %r148, %r147 |
|
mov.b32 %r46, %f155 |
|
@%p52 st.shared.b32 [ %r45 + 0 ], %r46 |
|
add.s32 %r47, %r45, 32 |
|
mov.b32 %r48, %f165 |
|
@%p52 st.shared.b32 [ %r47 + 0 ], %r48 |
|
add.s32 %r49, %r45, 64 |
|
mov.b32 %r50, %f175 |
|
@%p52 st.shared.b32 [ %r49 + 0 ], %r50 |
|
add.s32 %r51, %r45, 96 |
|
mov.b32 %r52, %f185 |
|
@%p52 st.shared.b32 [ %r51 + 0 ], %r52 |
|
add.s32 %r53, %r45, 128 |
|
mov.b32 %r54, %f195 |
|
@%p52 st.shared.b32 [ %r53 + 0 ], %r54 |
|
add.s32 %r55, %r45, 160 |
|
mov.b32 %r56, %f205 |
|
@%p52 st.shared.b32 [ %r55 + 0 ], %r56 |
|
add.s32 %r57, %r45, 192 |
|
mov.b32 %r58, %f215 |
|
@%p52 st.shared.b32 [ %r57 + 0 ], %r58 |
|
add.s32 %r59, %r45, 224 |
|
mov.b32 %r60, %f225 |
|
@%p52 st.shared.b32 [ %r59 + 0 ], %r60 |
|
bar.sync 0 |
|
// Threads 0..63 (%p60) each reload one of the 64 partial sums. For other |
// threads %r61 is not written by this load; their shuffled results are |
// never stored because the store predicate %p61 requires %p60. |
setp.lt.s32 %p60, %r1, 64 |
|
shl.b32 %r149, %r1, 2 |
|
add.s32 %r62, %r148, %r149 |
|
@%p60 ld.shared.b32 %r61, [ %r62 + 0 ] |
|
mov.b32 %f226, %r61 |
|
// Shuffle offsets 4, 2, 1 sum each group of 8 consecutive lanes, i.e. the |
// 8 per-warp partials belonging to one row. |
shfl.sync.bfly.b32 %r150, %r61, 4, 31, -1 |
|
mov.b32 %f227, %r150 |
|
$L__tmp83: |
|
.loc 2 233 15 |
|
add.f32 %f228, %f226, %f227 |
|
$L__tmp84: |
|
.loc 2 243 36 |
|
mov.b32 %r151, %f228 |
|
shfl.sync.bfly.b32 %r152, %r151, 2, 31, -1 |
|
mov.b32 %f229, %r152 |
|
$L__tmp85: |
|
.loc 2 233 15 |
|
add.f32 %f230, %f228, %f229 |
|
$L__tmp86: |
|
.loc 2 243 36 |
|
mov.b32 %r153, %f230 |
|
shfl.sync.bfly.b32 %r154, %r153, 1, 31, -1 |
|
mov.b32 %f231, %r154 |
|
$L__tmp87: |
|
.loc 2 233 15 |
|
add.f32 %f232, %f230, %f231 |
|
$L__tmp88: |
|
.loc 2 243 36 |
|
// Threads with tid < 64 and tid % 8 == 0 write the row total back to |
// global_smem + tid * 4, i.e. global_smem + row * 32. |
and.b32 %r155, %r1, 7 |
|
setp.eq.s32 %p62, %r155, 0 |
|
and.pred %p61, %p60, %p62 |
|
mov.b32 %r64, %f232 |
|
@%p61 st.shared.b32 [ %r62 + 0 ], %r64 |
|
bar.sync 0 |
|
// Every thread reads the 8 row totals: %f57 = row 0 .. %f64 = row 7. |
ld.shared.f32 %f57, [global_smem] |
|
ld.shared.f32 %f58, [global_smem+32] |
|
ld.shared.f32 %f59, [global_smem+64] |
|
ld.shared.f32 %f60, [global_smem+96] |
|
ld.shared.f32 %f61, [global_smem+128] |
|
ld.shared.f32 %f62, [global_smem+160] |
|
ld.shared.f32 %f63, [global_smem+192] |
|
ld.shared.f32 %f64, [global_smem+224] |
|
$L__tmp89: |
|
.loc 1 51 36 |
|
// ---- Phase 3 setup: running pointers for the element-wise pass ---- |
// %rd10 = ctaid.x * 804112 is the CTA's byte offset into the bf16 buffers |
// (8 rows * 50257 * 2 bytes); 1608224 is the same offset for f32 param_0. |
// %rd158..%rd161 / %rd163 / %rd164 point into param_4, param_5 and param_6 |
// at this thread's two columns; %rd162 is the CTA's base in param_0. |
mul.lo.s64 %rd10, %rd1, 804112 |
|
shl.b64 %rd88, %rd3, 1 |
|
add.s64 %rd164, %rd39, %rd88 |
|
add.s64 %rd163, %rd38, %rd88 |
|
shl.b64 %rd13, %rd3, 2 |
|
mul.lo.s64 %rd89, %rd1, 1608224 |
|
add.s64 %rd162, %rd36, %rd89 |
|
add.s64 %rd161, %rd37, %rd88 |
|
shl.b64 %rd90, %rd2, 1 |
|
add.s64 %rd160, %rd39, %rd90 |
|
add.s64 %rd159, %rd38, %rd90 |
|
shl.b64 %rd18, %rd2, 2 |
|
add.s64 %rd158, %rd37, %rd90 |
|
mov.u64 %rd165, 0 |
|
// %rs2 = 0 is the fill value for masked-off bf16 loads. |
mov.u16 %rs2, 0 |
|
// ---- Phase 3 loop: compute and store the output, one tile per pass ---- |
$L__BB0_3: |
|
.loc 1 52 27 |
|
add.s64 %rd155, %rd2, %rd165 |
|
.loc 1 53 25 |
|
add.s64 %rd156, %rd3, %rd165 |
|
// Column-in-range masks: %p63 for the first column, %p65 for the second. |
setp.lt.u64 %p63, %rd155, 50257 |
|
setp.lt.u64 %p65, %rd156, 50257 |
|
.loc 1 55 35 |
|
// bf16 addresses in param_4 for rows 0..7 (row stride 100514 = 50257 * 2 bytes). |
add.s64 %rd91, %rd158, %rd10 |
|
add.s64 %rd92, %rd161, %rd10 |
|
add.s64 %rd93, %rd91, 100514 |
|
add.s64 %rd94, %rd92, 100514 |
|
add.s64 %rd95, %rd91, 201028 |
|
add.s64 %rd96, %rd92, 201028 |
|
add.s64 %rd97, %rd91, 301542 |
|
add.s64 %rd98, %rd92, 301542 |
|
add.s64 %rd99, %rd91, 402056 |
|
add.s64 %rd100, %rd92, 402056 |
|
add.s64 %rd101, %rd91, 502570 |
|
add.s64 %rd102, %rd92, 502570 |
|
add.s64 %rd103, %rd91, 603084 |
|
add.s64 %rd104, %rd92, 603084 |
|
add.s64 %rd105, %rd91, 703598 |
|
.loc 1 55 53 |
|
add.s64 %rd106, %rd92, 703598 |
|
mov.u16 %rs1, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd91 + 0 ] |
|
@!%p63 mov.u16 %rs1, %rs2 |
|
mov.u16 %rs3, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd92 + 0 ] |
|
@!%p65 mov.u16 %rs3, %rs2 |
|
mov.u16 %rs5, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd93 + 0 ] |
|
@!%p63 mov.u16 %rs5, %rs2 |
|
mov.u16 %rs7, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd94 + 0 ] |
|
@!%p65 mov.u16 %rs7, %rs2 |
|
mov.u16 %rs9, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs9 }, [ %rd95 + 0 ] |
|
@!%p63 mov.u16 %rs9, %rs2 |
|
mov.u16 %rs11, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs11 }, [ %rd96 + 0 ] |
|
@!%p65 mov.u16 %rs11, %rs2 |
|
mov.u16 %rs13, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd97 + 0 ] |
|
@!%p63 mov.u16 %rs13, %rs2 |
|
mov.u16 %rs15, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd98 + 0 ] |
|
@!%p65 mov.u16 %rs15, %rs2 |
|
mov.u16 %rs17, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd99 + 0 ] |
|
@!%p63 mov.u16 %rs17, %rs2 |
|
mov.u16 %rs19, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd100 + 0 ] |
|
@!%p65 mov.u16 %rs19, %rs2 |
|
mov.u16 %rs21, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs21 }, [ %rd101 + 0 ] |
|
@!%p63 mov.u16 %rs21, %rs2 |
|
mov.u16 %rs23, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs23 }, [ %rd102 + 0 ] |
|
@!%p65 mov.u16 %rs23, %rs2 |
|
mov.u16 %rs25, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd103 + 0 ] |
|
@!%p63 mov.u16 %rs25, %rs2 |
|
mov.u16 %rs27, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd104 + 0 ] |
|
@!%p65 mov.u16 %rs27, %rs2 |
|
mov.u16 %rs29, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd105 + 0 ] |
|
@!%p63 mov.u16 %rs29, %rs2 |
|
mov.u16 %rs31, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd106 + 0 ] |
|
@!%p65 mov.u16 %rs31, %rs2 |
|
.loc 1 55 105 |
|
// Widen the param_4 values from bf16 to f32 (%f265..%f280). |
cvt.f32.bf16 %r156, %rs1 |
|
mov.b32 %f265, %r156 |
|
cvt.f32.bf16 %r157, %rs3 |
|
mov.b32 %f266, %r157 |
|
cvt.f32.bf16 %r158, %rs5 |
|
mov.b32 %f267, %r158 |
|
cvt.f32.bf16 %r159, %rs7 |
|
mov.b32 %f268, %r159 |
|
cvt.f32.bf16 %r160, %rs9 |
|
mov.b32 %f269, %r160 |
|
cvt.f32.bf16 %r161, %rs11 |
|
mov.b32 %f270, %r161 |
|
cvt.f32.bf16 %r162, %rs13 |
|
mov.b32 %f271, %r162 |
|
cvt.f32.bf16 %r163, %rs15 |
|
mov.b32 %f272, %r163 |
|
cvt.f32.bf16 %r164, %rs17 |
|
mov.b32 %f273, %r164 |
|
cvt.f32.bf16 %r165, %rs19 |
|
mov.b32 %f274, %r165 |
|
cvt.f32.bf16 %r166, %rs21 |
|
mov.b32 %f275, %r166 |
|
cvt.f32.bf16 %r167, %rs23 |
|
mov.b32 %f276, %r167 |
|
cvt.f32.bf16 %r168, %rs25 |
|
mov.b32 %f277, %r168 |
|
cvt.f32.bf16 %r169, %rs27 |
|
mov.b32 %f278, %r169 |
|
cvt.f32.bf16 %r170, %rs29 |
|
mov.b32 %f279, %r170 |
|
cvt.f32.bf16 %r171, %rs31 |
|
mov.b32 %f280, %r171 |
|
.loc 1 56 35 |
|
// f32 addresses in param_0 for rows 0..7 (row stride 201028 = 50257 * 4 bytes). |
add.s64 %rd107, %rd162, %rd18 |
|
add.s64 %rd108, %rd162, %rd13 |
|
add.s64 %rd109, %rd107, 201028 |
|
add.s64 %rd110, %rd108, 201028 |
|
add.s64 %rd111, %rd107, 402056 |
|
add.s64 %rd112, %rd108, 402056 |
|
add.s64 %rd113, %rd107, 603084 |
|
add.s64 %rd114, %rd108, 603084 |
|
add.s64 %rd115, %rd107, 804112 |
|
add.s64 %rd116, %rd108, 804112 |
|
add.s64 %rd117, %rd107, 1005140 |
|
add.s64 %rd118, %rd108, 1005140 |
|
add.s64 %rd119, %rd107, 1206168 |
|
add.s64 %rd120, %rd108, 1206168 |
|
add.s64 %rd121, %rd107, 1407196 |
|
.loc 1 56 53 |
|
add.s64 %rd122, %rd108, 1407196 |
|
// The fallback value %r173 still holds the 0 assigned in the first loop body. |
mov.u32 %r172, 0x0 |
|
@%p63 ld.global.L1::evict_first.b32 { %r172 }, [ %rd107 + 0 ] |
|
@!%p63 mov.u32 %r172, %r173 |
|
mov.b32 %f281, %r172 |
|
mov.u32 %r174, 0x0 |
|
@%p65 ld.global.L1::evict_first.b32 { %r174 }, [ %rd108 + 0 ] |
|
@!%p65 mov.u32 %r174, %r173 |
|
mov.b32 %f282, %r174 |
|
mov.u32 %r176, 0x0 |
|
@%p63 ld.global.L1::evict_first.b32 { %r176 }, [ %rd109 + 0 ] |
|
@!%p63 mov.u32 %r176, %r173 |
|
mov.b32 %f283, %r176 |
|
mov.u32 %r178, 0x0 |
|
@%p65 ld.global.L1::evict_first.b32 { %r178 }, [ %rd110 + 0 ] |
|
@!%p65 mov.u32 %r178, %r173 |
|
mov.b32 %f284, %r178 |
|
mov.u32 %r180, 0x0 |
|
@%p63 ld.global.L1::evict_first.b32 { %r180 }, [ %rd111 + 0 ] |
|
@!%p63 mov.u32 %r180, %r173 |
|
mov.b32 %f285, %r180 |
|
mov.u32 %r182, 0x0 |
|
@%p65 ld.global.L1::evict_first.b32 { %r182 }, [ %rd112 + 0 ] |
|
@!%p65 mov.u32 %r182, %r173 |
|
mov.b32 %f286, %r182 |
|
mov.u32 %r184, 0x0 |
|
@%p63 ld.global.L1::evict_first.b32 { %r184 }, [ %rd113 + 0 ] |
|
@!%p63 mov.u32 %r184, %r173 |
|
mov.b32 %f287, %r184 |
|
mov.u32 %r186, 0x0 |
|
@%p65 ld.global.L1::evict_first.b32 { %r186 }, [ %rd114 + 0 ] |
|
@!%p65 mov.u32 %r186, %r173 |
|
mov.b32 %f288, %r186 |
|
mov.u32 %r188, 0x0 |
|
@%p63 ld.global.L1::evict_first.b32 { %r188 }, [ %rd115 + 0 ] |
|
@!%p63 mov.u32 %r188, %r173 |
|
mov.b32 %f289, %r188 |
|
mov.u32 %r190, 0x0 |
|
@%p65 ld.global.L1::evict_first.b32 { %r190 }, [ %rd116 + 0 ] |
|
@!%p65 mov.u32 %r190, %r173 |
|
mov.b32 %f290, %r190 |
|
mov.u32 %r192, 0x0 |
|
@%p63 ld.global.L1::evict_first.b32 { %r192 }, [ %rd117 + 0 ] |
|
@!%p63 mov.u32 %r192, %r173 |
|
mov.b32 %f291, %r192 |
|
mov.u32 %r194, 0x0 |
|
@%p65 ld.global.L1::evict_first.b32 { %r194 }, [ %rd118 + 0 ] |
|
@!%p65 mov.u32 %r194, %r173 |
|
mov.b32 %f292, %r194 |
|
mov.u32 %r196, 0x0 |
|
@%p63 ld.global.L1::evict_first.b32 { %r196 }, [ %rd119 + 0 ] |
|
@!%p63 mov.u32 %r196, %r173 |
|
mov.b32 %f293, %r196 |
|
mov.u32 %r198, 0x0 |
|
@%p65 ld.global.L1::evict_first.b32 { %r198 }, [ %rd120 + 0 ] |
|
@!%p65 mov.u32 %r198, %r173 |
|
mov.b32 %f294, %r198 |
|
mov.u32 %r200, 0x0 |
|
@%p63 ld.global.L1::evict_first.b32 { %r200 }, [ %rd121 + 0 ] |
|
@!%p63 mov.u32 %r200, %r173 |
|
mov.b32 %f295, %r200 |
|
mov.u32 %r202, 0x0 |
|
@%p65 ld.global.L1::evict_first.b32 { %r202 }, [ %rd122 + 0 ] |
|
@!%p65 mov.u32 %r202, %r173 |
|
mov.b32 %f296, %r202 |
|
.loc 1 57 35 |
|
// bf16 addresses in param_5 for rows 0..7. |
add.s64 %rd123, %rd159, %rd10 |
|
add.s64 %rd124, %rd163, %rd10 |
|
add.s64 %rd125, %rd123, 100514 |
|
add.s64 %rd126, %rd124, 100514 |
|
add.s64 %rd127, %rd123, 201028 |
|
add.s64 %rd128, %rd124, 201028 |
|
add.s64 %rd129, %rd123, 301542 |
|
add.s64 %rd130, %rd124, 301542 |
|
add.s64 %rd131, %rd123, 402056 |
|
add.s64 %rd132, %rd124, 402056 |
|
add.s64 %rd133, %rd123, 502570 |
|
add.s64 %rd134, %rd124, 502570 |
|
add.s64 %rd135, %rd123, 603084 |
|
add.s64 %rd136, %rd124, 603084 |
|
add.s64 %rd137, %rd123, 703598 |
|
.loc 1 57 53 |
|
add.s64 %rd138, %rd124, 703598 |
|
mov.u16 %rs49, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs49 }, [ %rd123 + 0 ] |
|
@!%p63 mov.u16 %rs49, %rs2 |
|
mov.u16 %rs51, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs51 }, [ %rd124 + 0 ] |
|
@!%p65 mov.u16 %rs51, %rs2 |
|
mov.u16 %rs53, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs53 }, [ %rd125 + 0 ] |
|
@!%p63 mov.u16 %rs53, %rs2 |
|
mov.u16 %rs55, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs55 }, [ %rd126 + 0 ] |
|
@!%p65 mov.u16 %rs55, %rs2 |
|
mov.u16 %rs57, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs57 }, [ %rd127 + 0 ] |
|
@!%p63 mov.u16 %rs57, %rs2 |
|
mov.u16 %rs59, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs59 }, [ %rd128 + 0 ] |
|
@!%p65 mov.u16 %rs59, %rs2 |
|
mov.u16 %rs61, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs61 }, [ %rd129 + 0 ] |
|
@!%p63 mov.u16 %rs61, %rs2 |
|
mov.u16 %rs63, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs63 }, [ %rd130 + 0 ] |
|
@!%p65 mov.u16 %rs63, %rs2 |
|
mov.u16 %rs65, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs65 }, [ %rd131 + 0 ] |
|
@!%p63 mov.u16 %rs65, %rs2 |
|
mov.u16 %rs67, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs67 }, [ %rd132 + 0 ] |
|
@!%p65 mov.u16 %rs67, %rs2 |
|
mov.u16 %rs69, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs69 }, [ %rd133 + 0 ] |
|
@!%p63 mov.u16 %rs69, %rs2 |
|
mov.u16 %rs71, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs71 }, [ %rd134 + 0 ] |
|
@!%p65 mov.u16 %rs71, %rs2 |
|
mov.u16 %rs73, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs73 }, [ %rd135 + 0 ] |
|
@!%p63 mov.u16 %rs73, %rs2 |
|
mov.u16 %rs75, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs75 }, [ %rd136 + 0 ] |
|
@!%p65 mov.u16 %rs75, %rs2 |
|
mov.u16 %rs77, 0x0 |
|
@%p63 ld.global.L1::evict_first.b16 { %rs77 }, [ %rd137 + 0 ] |
|
@!%p63 mov.u16 %rs77, %rs2 |
|
mov.u16 %rs79, 0x0 |
|
@%p65 ld.global.L1::evict_first.b16 { %rs79 }, [ %rd138 + 0 ] |
|
@!%p65 mov.u16 %rs79, %rs2 |
|
.loc 1 57 105 |
|
// Widen the param_5 values from bf16 to f32 (%f297..%f312). |
cvt.f32.bf16 %r204, %rs49 |
|
mov.b32 %f297, %r204 |
|
cvt.f32.bf16 %r205, %rs51 |
|
mov.b32 %f298, %r205 |
|
cvt.f32.bf16 %r206, %rs53 |
|
mov.b32 %f299, %r206 |
|
cvt.f32.bf16 %r207, %rs55 |
|
mov.b32 %f300, %r207 |
|
cvt.f32.bf16 %r208, %rs57 |
|
mov.b32 %f301, %r208 |
|
cvt.f32.bf16 %r209, %rs59 |
|
mov.b32 %f302, %r209 |
|
cvt.f32.bf16 %r210, %rs61 |
|
mov.b32 %f303, %r210 |
|
cvt.f32.bf16 %r211, %rs63 |
|
mov.b32 %f304, %r211 |
|
cvt.f32.bf16 %r212, %rs65 |
|
mov.b32 %f305, %r212 |
|
cvt.f32.bf16 %r213, %rs67 |
|
mov.b32 %f306, %r213 |
|
cvt.f32.bf16 %r214, %rs69 |
|
mov.b32 %f307, %r214 |
|
cvt.f32.bf16 %r215, %rs71 |
|
mov.b32 %f308, %r215 |
|
cvt.f32.bf16 %r216, %rs73 |
|
mov.b32 %f309, %r216 |
|
cvt.f32.bf16 %r217, %rs75 |
|
mov.b32 %f310, %r217 |
|
cvt.f32.bf16 %r218, %rs77 |
|
mov.b32 %f311, %r218 |
|
cvt.f32.bf16 %r219, %rs79 |
|
mov.b32 %f312, %r219 |
|
.loc 1 65 23 |
|
// exp(x) evaluated as ex2.approx(x * log2(e)): 0f3FB8AA3B is the f32 bit |
// pattern of approximately 1.442695. |
mul.f32 %f234, %f297, 0f3FB8AA3B |
|
ex2.approx.f32 %f233, %f234 |
|
mul.f32 %f236, %f298, 0f3FB8AA3B |
|
ex2.approx.f32 %f235, %f236 |
|
mul.f32 %f238, %f299, 0f3FB8AA3B |
|
ex2.approx.f32 %f237, %f238 |
|
mul.f32 %f240, %f300, 0f3FB8AA3B |
|
ex2.approx.f32 %f239, %f240 |
|
mul.f32 %f242, %f301, 0f3FB8AA3B |
|
ex2.approx.f32 %f241, %f242 |
|
mul.f32 %f244, %f302, 0f3FB8AA3B |
|
ex2.approx.f32 %f243, %f244 |
|
mul.f32 %f246, %f303, 0f3FB8AA3B |
|
ex2.approx.f32 %f245, %f246 |
|
mul.f32 %f248, %f304, 0f3FB8AA3B |
|
ex2.approx.f32 %f247, %f248 |
|
mul.f32 %f250, %f305, 0f3FB8AA3B |
|
ex2.approx.f32 %f249, %f250 |
|
mul.f32 %f252, %f306, 0f3FB8AA3B |
|
ex2.approx.f32 %f251, %f252 |
|
mul.f32 %f254, %f307, 0f3FB8AA3B |
|
ex2.approx.f32 %f253, %f254 |
|
mul.f32 %f256, %f308, 0f3FB8AA3B |
|
ex2.approx.f32 %f255, %f256 |
|
mul.f32 %f258, %f309, 0f3FB8AA3B |
|
ex2.approx.f32 %f257, %f258 |
|
mul.f32 %f260, %f310, 0f3FB8AA3B |
|
ex2.approx.f32 %f259, %f260 |
|
mul.f32 %f262, %f311, 0f3FB8AA3B |
|
ex2.approx.f32 %f261, %f262 |
|
mul.f32 %f264, %f312, 0f3FB8AA3B |
|
ex2.approx.f32 %f263, %f264 |
|
.loc 1 66 24 |
|
// row_sum * exp(param_5 value), using the row totals %f57..%f64 from Phase 2. |
mul.f32 %f313, %f57, %f233 |
|
mul.f32 %f314, %f57, %f235 |
|
mul.f32 %f315, %f58, %f237 |
|
mul.f32 %f316, %f58, %f239 |
|
mul.f32 %f317, %f59, %f241 |
|
mul.f32 %f318, %f59, %f243 |
|
mul.f32 %f319, %f60, %f245 |
|
mul.f32 %f320, %f60, %f247 |
|
mul.f32 %f321, %f61, %f249 |
|
mul.f32 %f322, %f61, %f251 |
|
mul.f32 %f323, %f62, %f253 |
|
mul.f32 %f324, %f62, %f255 |
|
mul.f32 %f325, %f63, %f257 |
|
mul.f32 %f326, %f63, %f259 |
|
mul.f32 %f327, %f64, %f261 |
|
mul.f32 %f328, %f64, %f263 |
|
.loc 1 67 24 |
|
// scale * param_0 value - row_sum * exp(...), as a negate followed by an fma. |
neg.f32 %f329, %f313 |
|
fma.rn.f32 %f330, %f1, %f281, %f329 |
|
neg.f32 %f331, %f314 |
|
fma.rn.f32 %f332, %f1, %f282, %f331 |
|
neg.f32 %f333, %f315 |
|
fma.rn.f32 %f334, %f2, %f283, %f333 |
|
neg.f32 %f335, %f316 |
|
fma.rn.f32 %f336, %f2, %f284, %f335 |
|
neg.f32 %f337, %f317 |
|
fma.rn.f32 %f338, %f3, %f285, %f337 |
|
neg.f32 %f339, %f318 |
|
fma.rn.f32 %f340, %f3, %f286, %f339 |
|
neg.f32 %f341, %f319 |
|
fma.rn.f32 %f342, %f4, %f287, %f341 |
|
neg.f32 %f343, %f320 |
|
fma.rn.f32 %f344, %f4, %f288, %f343 |
|
neg.f32 %f345, %f321 |
|
fma.rn.f32 %f346, %f5, %f289, %f345 |
|
neg.f32 %f347, %f322 |
|
fma.rn.f32 %f348, %f5, %f290, %f347 |
|
neg.f32 %f349, %f323 |
|
fma.rn.f32 %f350, %f6, %f291, %f349 |
|
neg.f32 %f351, %f324 |
|
fma.rn.f32 %f352, %f6, %f292, %f351 |
|
neg.f32 %f353, %f325 |
|
fma.rn.f32 %f354, %f7, %f293, %f353 |
|
neg.f32 %f355, %f326 |
|
fma.rn.f32 %f356, %f7, %f294, %f355 |
|
neg.f32 %f357, %f327 |
|
fma.rn.f32 %f358, %f8, %f295, %f357 |
|
neg.f32 %f359, %f328 |
|
fma.rn.f32 %f360, %f8, %f296, %f359 |
|
.loc 1 69 24 |
|
// Add the widened param_4 value to obtain the final f32 result. |
add.f32 %f361, %f265, %f330 |
|
add.f32 %f362, %f266, %f332 |
|
add.f32 %f363, %f267, %f334 |
|
add.f32 %f364, %f268, %f336 |
|
add.f32 %f365, %f269, %f338 |
|
add.f32 %f366, %f270, %f340 |
|
add.f32 %f367, %f271, %f342 |
|
add.f32 %f368, %f272, %f344 |
|
add.f32 %f369, %f273, %f346 |
|
add.f32 %f370, %f274, %f348 |
|
add.f32 %f371, %f275, %f350 |
|
add.f32 %f372, %f276, %f352 |
|
add.f32 %f373, %f277, %f354 |
|
add.f32 %f374, %f278, %f356 |
|
add.f32 %f375, %f279, %f358 |
|
add.f32 %f376, %f280, %f360 |
|
.loc 1 70 29 |
|
// bf16 destination addresses in param_6 for rows 0..7. |
add.s64 %rd139, %rd160, %rd10 |
|
add.s64 %rd140, %rd164, %rd10 |
|
add.s64 %rd141, %rd139, 100514 |
|
add.s64 %rd142, %rd140, 100514 |
|
add.s64 %rd143, %rd139, 201028 |
|
add.s64 %rd144, %rd140, 201028 |
|
add.s64 %rd145, %rd139, 301542 |
|
add.s64 %rd146, %rd140, 301542 |
|
add.s64 %rd147, %rd139, 402056 |
|
add.s64 %rd148, %rd140, 402056 |
|
add.s64 %rd149, %rd139, 502570 |
|
add.s64 %rd150, %rd140, 502570 |
|
add.s64 %rd151, %rd139, 603084 |
|
add.s64 %rd152, %rd140, 603084 |
|
add.s64 %rd153, %rd139, 703598 |
|
.loc 1 70 54 |
|
add.s64 %rd154, %rd140, 703598 |
|
// Narrow the results to bf16 with round-to-nearest (cvt.rn). |
mov.b32 %r220, %f361 |
|
cvt.rn.bf16.f32 %rs97, %r220 |
|
mov.b32 %r221, %f362 |
|
cvt.rn.bf16.f32 %rs98, %r221 |
|
mov.b32 %r222, %f363 |
|
cvt.rn.bf16.f32 %rs99, %r222 |
|
mov.b32 %r223, %f364 |
|
cvt.rn.bf16.f32 %rs100, %r223 |
|
mov.b32 %r224, %f365 |
|
cvt.rn.bf16.f32 %rs101, %r224 |
|
mov.b32 %r225, %f366 |
|
cvt.rn.bf16.f32 %rs102, %r225 |
|
mov.b32 %r226, %f367 |
|
cvt.rn.bf16.f32 %rs103, %r226 |
|
mov.b32 %r227, %f368 |
|
cvt.rn.bf16.f32 %rs104, %r227 |
|
mov.b32 %r228, %f369 |
|
cvt.rn.bf16.f32 %rs105, %r228 |
|
mov.b32 %r229, %f370 |
|
cvt.rn.bf16.f32 %rs106, %r229 |
|
mov.b32 %r230, %f371 |
|
cvt.rn.bf16.f32 %rs107, %r230 |
|
mov.b32 %r231, %f372 |
|
cvt.rn.bf16.f32 %rs108, %r231 |
|
mov.b32 %r232, %f373 |
|
cvt.rn.bf16.f32 %rs109, %r232 |
|
mov.b32 %r233, %f374 |
|
cvt.rn.bf16.f32 %rs110, %r233 |
|
mov.b32 %r234, %f375 |
|
cvt.rn.bf16.f32 %rs111, %r234 |
|
mov.b32 %r235, %f376 |
|
cvt.rn.bf16.f32 %rs112, %r235 |
|
// Masked stores: only in-range columns are written to param_6. |
@%p63 st.global.b16 [ %rd139 + 0 ], { %rs97 } |
|
@%p65 st.global.b16 [ %rd140 + 0 ], { %rs98 } |
|
@%p63 st.global.b16 [ %rd141 + 0 ], { %rs99 } |
|
@%p65 st.global.b16 [ %rd142 + 0 ], { %rs100 } |
|
@%p63 st.global.b16 [ %rd143 + 0 ], { %rs101 } |
|
@%p65 st.global.b16 [ %rd144 + 0 ], { %rs102 } |
|
@%p63 st.global.b16 [ %rd145 + 0 ], { %rs103 } |
|
@%p65 st.global.b16 [ %rd146 + 0 ], { %rs104 } |
|
@%p63 st.global.b16 [ %rd147 + 0 ], { %rs105 } |
|
@%p65 st.global.b16 [ %rd148 + 0 ], { %rs106 } |
|
@%p63 st.global.b16 [ %rd149 + 0 ], { %rs107 } |
|
@%p65 st.global.b16 [ %rd150 + 0 ], { %rs108 } |
|
@%p63 st.global.b16 [ %rd151 + 0 ], { %rs109 } |
|
@%p65 st.global.b16 [ %rd152 + 0 ], { %rs110 } |
|
@%p63 st.global.b16 [ %rd153 + 0 ], { %rs111 } |
|
@%p65 st.global.b16 [ %rd154 + 0 ], { %rs112 } |
|
.loc 1 51 36 |
|
// Advance the tile offset by 512 columns and bump the running pointers by |
// one tile (1024 bytes for the bf16 buffers, 2048 bytes for f32 param_0). |
// Same exit test as the first loop: continue while the offset of the tile |
// just processed (%r237) is below 49745. |
add.s64 %rd165, %rd165, 512 |
|
cvt.u32.u64 %r236, %rd165 |
|
add.s32 %r237, %r236, -512 |
|
add.s64 %rd164, %rd164, 1024 |
|
add.s64 %rd163, %rd163, 1024 |
|
add.s64 %rd162, %rd162, 2048 |
|
add.s64 %rd161, %rd161, 1024 |
|
add.s64 %rd160, %rd160, 1024 |
|
add.s64 %rd159, %rd159, 1024 |
|
add.s64 %rd158, %rd158, 1024 |
|
setp.lt.u32 %p175, %r237, 49745 |
|
@%p175 bra $L__BB0_3 |
|
.loc 1 51 4 |
|
ret |
|
$L__tmp90: |
|
$L__func_end0: |
|
|
|
|
|
} |
|
.file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py" |
|
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" |
|
.section .debug_abbrev |
// DWARF abbreviation table: five DIE templates (codes 1-5), each a tag, a has-children flag and (attribute, form) pairs ended by 0,0.
{ |
// Abbreviation code 1
.b8 1 |
// DW_TAG_compile_unit
.b8 17 |
// DW_CHILDREN_yes
.b8 1 |
// DW_AT_producer, DW_FORM_string
.b8 37 |

.b8 8 |
// DW_AT_language, DW_FORM_data2
.b8 19 |

.b8 5 |
// DW_AT_name, DW_FORM_string
.b8 3 |

.b8 8 |
// DW_AT_stmt_list, DW_FORM_data4
.b8 16 |

.b8 6 |
// DW_AT_comp_dir, DW_FORM_string
.b8 27 |

.b8 8 |
// DW_AT_GNU_pubnames (0x2134 as a two-byte ULEB128: 180, 66), DW_FORM_flag
.b8 180 |

.b8 66 |

.b8 12 |
// DW_AT_low_pc, DW_FORM_addr
.b8 17 |

.b8 1 |
// DW_AT_high_pc, DW_FORM_addr
.b8 18 |

.b8 1 |
// End of abbreviation 1 (attribute 0, form 0)
.b8 0 |

.b8 0 |
// Abbreviation code 2
.b8 2 |
// DW_TAG_subprogram
.b8 46 |
// DW_CHILDREN_no
.b8 0 |
// DW_AT_MIPS_linkage_name (0x2007 as a two-byte ULEB128: 135, 64), DW_FORM_string
.b8 135 |

.b8 64 |

.b8 8 |
// DW_AT_name, DW_FORM_string
.b8 3 |

.b8 8 |
// DW_AT_decl_file, DW_FORM_data1
.b8 58 |

.b8 11 |
// DW_AT_decl_line, DW_FORM_data1
.b8 59 |

.b8 11 |
// DW_AT_external, DW_FORM_flag
.b8 63 |

.b8 12 |
// DW_AT_inline, DW_FORM_data1
.b8 32 |

.b8 11 |
// End of abbreviation 2
.b8 0 |

.b8 0 |
// Abbreviation code 3
.b8 3 |
// DW_TAG_subprogram
.b8 46 |
// DW_CHILDREN_yes
.b8 1 |
// DW_AT_low_pc, DW_FORM_addr
.b8 17 |

.b8 1 |
// DW_AT_high_pc, DW_FORM_addr
.b8 18 |

.b8 1 |
// DW_AT_frame_base, DW_FORM_block1
.b8 64 |

.b8 10 |
// DW_AT_abstract_origin, DW_FORM_ref4
.b8 49 |

.b8 19 |
// End of abbreviation 3
.b8 0 |

.b8 0 |
// Abbreviation code 4
.b8 4 |
// DW_TAG_inlined_subroutine
.b8 29 |
// DW_CHILDREN_yes
.b8 1 |
// DW_AT_abstract_origin, DW_FORM_ref4
.b8 49 |

.b8 19 |
// DW_AT_low_pc, DW_FORM_addr
.b8 17 |

.b8 1 |
// DW_AT_high_pc, DW_FORM_addr
.b8 18 |

.b8 1 |
// DW_AT_call_file, DW_FORM_data1
.b8 88 |

.b8 11 |
// DW_AT_call_line, DW_FORM_data1
.b8 89 |

.b8 11 |
// DW_AT_call_column, DW_FORM_data1
.b8 87 |

.b8 11 |
// End of abbreviation 4
.b8 0 |

.b8 0 |
// Abbreviation code 5
.b8 5 |
// DW_TAG_inlined_subroutine
.b8 29 |
// DW_CHILDREN_no
.b8 0 |
// DW_AT_abstract_origin, DW_FORM_ref4
.b8 49 |

.b8 19 |
// DW_AT_low_pc, DW_FORM_addr
.b8 17 |

.b8 1 |
// DW_AT_high_pc, DW_FORM_addr
.b8 18 |

.b8 1 |
// DW_AT_call_file, DW_FORM_data1
.b8 88 |

.b8 11 |
// DW_AT_call_line, DW_FORM_data1
.b8 89 |

.b8 11 |
// DW_AT_call_column, DW_FORM_data1
.b8 87 |

.b8 11 |
// End of abbreviation 5
.b8 0 |

.b8 0 |
// End of the abbreviation table (abbreviation code 0)
.b8 0 |

} |
|
.section .debug_info |
// DWARF v2 compile unit: one compile-unit DIE, an abstract and a concrete subprogram DIE for the kernel, and three inlined-subroutine DIEs.
{ |
// CU header: unit_length = 278 bytes following this 4-byte field
.b32 278 |
// CU header: DWARF version 2 (16-bit value, low byte first)
.b8 2 |

.b8 0 |
// CU header: offset of this unit's abbreviations in .debug_abbrev
.b32 .debug_abbrev |
// CU header: target address size = 8 bytes
.b8 8 |
// DIE at offset 11: abbreviation 1 (DW_TAG_compile_unit, has children)
.b8 1 |
// DW_AT_producer = "triton" (NUL-terminated)
.b8 116 |

.b8 114 |

.b8 105 |

.b8 116 |

.b8 111 |

.b8 110 |

.b8 0 |
// DW_AT_language = 0x0002 (DW_LANG_C)
.b8 2 |

.b8 0 |
// DW_AT_name = "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py" (NUL-terminated)
.b8 99 |

.b8 107 |

.b8 122 |

.b8 103 |

.b8 108 |

.b8 55 |

.b8 116 |

.b8 104 |

.b8 98 |

.b8 52 |

.b8 120 |

.b8 100 |

.b8 102 |

.b8 107 |

.b8 102 |

.b8 110 |

.b8 100 |

.b8 50 |

.b8 116 |

.b8 105 |

.b8 100 |

.b8 107 |

.b8 115 |

.b8 54 |

.b8 109 |

.b8 116 |

.b8 53 |

.b8 102 |

.b8 51 |

.b8 104 |

.b8 97 |

.b8 117 |

.b8 119 |

.b8 102 |

.b8 121 |

.b8 106 |

.b8 102 |

.b8 108 |

.b8 98 |

.b8 116 |

.b8 122 |

.b8 121 |

.b8 101 |

.b8 112 |

.b8 111 |

.b8 53 |

.b8 111 |

.b8 120 |

.b8 107 |

.b8 118 |

.b8 104 |

.b8 107 |

.b8 46 |

.b8 112 |

.b8 121 |

.b8 0 |
// DW_AT_stmt_list = offset of this unit's line table in .debug_line
.b32 .debug_line |
// DW_AT_comp_dir = "/tmp/torchinductor_root/kz" (NUL-terminated)
.b8 47 |

.b8 116 |

.b8 109 |

.b8 112 |

.b8 47 |

.b8 116 |

.b8 111 |

.b8 114 |

.b8 99 |

.b8 104 |

.b8 105 |

.b8 110 |

.b8 100 |

.b8 117 |

.b8 99 |

.b8 116 |

.b8 111 |

.b8 114 |

.b8 95 |

.b8 114 |

.b8 111 |

.b8 111 |

.b8 116 |

.b8 47 |

.b8 107 |

.b8 122 |

.b8 0 |
// DW_AT_GNU_pubnames = 1 (flag set)
.b8 1 |
// DW_AT_low_pc = start-of-function label
.b64 $L__func_begin0 |
// DW_AT_high_pc = end-of-function label
.b64 $L__func_end0 |
// DIE at offset 125: abbreviation 2 (DW_TAG_subprogram, no children); target of every ".b32 125" reference below
.b8 2 |
// DW_AT_MIPS_linkage_name = "triton__0d1d2d3d4d5d6d7de8" (NUL-terminated)
.b8 116 |

.b8 114 |

.b8 105 |

.b8 116 |

.b8 111 |

.b8 110 |

.b8 95 |

.b8 95 |

.b8 48 |

.b8 100 |

.b8 49 |

.b8 100 |

.b8 50 |

.b8 100 |

.b8 51 |

.b8 100 |

.b8 52 |

.b8 100 |

.b8 53 |

.b8 100 |

.b8 54 |

.b8 100 |

.b8 55 |

.b8 100 |

.b8 101 |

.b8 56 |

.b8 0 |
// DW_AT_name = "triton__0d1d2d3d4d5d6d7de8" (NUL-terminated)
.b8 116 |

.b8 114 |

.b8 105 |

.b8 116 |

.b8 111 |

.b8 110 |

.b8 95 |

.b8 95 |

.b8 48 |

.b8 100 |

.b8 49 |

.b8 100 |

.b8 50 |

.b8 100 |

.b8 51 |

.b8 100 |

.b8 52 |

.b8 100 |

.b8 53 |

.b8 100 |

.b8 54 |

.b8 100 |

.b8 55 |

.b8 100 |

.b8 101 |

.b8 56 |

.b8 0 |
// DW_AT_decl_file = 1 (index of the ".file 1" entry)
.b8 1 |
// DW_AT_decl_line = 18
.b8 18 |
// DW_AT_external = 1 (flag set)
.b8 1 |
// DW_AT_inline = 1 (DW_INL_inlined)
.b8 1 |
// DIE: abbreviation 3 (DW_TAG_subprogram, has children)
.b8 3 |
// DW_AT_low_pc
.b64 $L__func_begin0 |
// DW_AT_high_pc
.b64 $L__func_end0 |
// DW_AT_frame_base = 1-byte block containing 156 (0x9c, DW_OP_call_frame_cfa)
.b8 1 |

.b8 156 |
// DW_AT_abstract_origin = reference to the DIE at offset 125
.b32 125 |
// DIE: abbreviation 4 (DW_TAG_inlined_subroutine, has children)
.b8 4 |
// DW_AT_abstract_origin = reference to the DIE at offset 125
.b32 125 |
// DW_AT_low_pc
.b64 $L__tmp1 |
// DW_AT_high_pc
.b64 $L__tmp88 |
// DW_AT_call_file = 2 (index of the ".file 2" entry)
.b8 2 |
// DW_AT_call_line = 46
.b8 46 |
// DW_AT_call_column = 27
.b8 27 |
// DIE: abbreviation 5 (DW_TAG_inlined_subroutine, no children); child of the preceding abbreviation-4 DIE
.b8 5 |
// DW_AT_abstract_origin = reference to the DIE at offset 125
.b32 125 |
// DW_AT_low_pc
.b64 $L__tmp1 |
// DW_AT_high_pc
.b64 $L__tmp88 |
// DW_AT_call_file = 2
.b8 2 |
// DW_AT_call_line = 243
.b8 243 |
// DW_AT_call_column = 36
.b8 36 |
// Null entry: ends the children of the abbreviation-4 DIE
.b8 0 |
// DIE: abbreviation 5 (DW_TAG_inlined_subroutine, no children)
.b8 5 |
// DW_AT_abstract_origin = reference to the DIE at offset 125
.b32 125 |
// DW_AT_low_pc
.b64 $L__tmp2 |
// DW_AT_high_pc
.b64 $L__tmp89 |
// DW_AT_call_file = 2
.b8 2 |
// DW_AT_call_line = 46
.b8 46 |
// DW_AT_call_column = 27
.b8 27 |
// Null entry: ends the children of the abbreviation-3 subprogram DIE
.b8 0 |
// Null entry: ends the children of the compile-unit DIE
.b8 0 |

} |
|
.section .debug_pubnames |
// DWARF name lookup table: maps the kernel name to its DIE offset inside the .debug_info unit.
{ |
// unit_length, computed as the distance between the start and end labels below
.b32 $L__pubNames_end0-$L__pubNames_start0 |

$L__pubNames_start0: |
// Table version 2 (16-bit value, low byte first)
.b8 2 |

.b8 0 |
// Offset of the described compile unit in .debug_info
.b32 .debug_info |
// Size of that .debug_info unit: 282 bytes (its unit_length of 278 plus the 4-byte length field)
.b32 282 |
// Entry: offset 125 of the named DIE within the unit
.b32 125 |
// Entry name = "triton__0d1d2d3d4d5d6d7de8" (NUL-terminated)
.b8 116 |

.b8 114 |

.b8 105 |

.b8 116 |

.b8 111 |

.b8 110 |

.b8 95 |

.b8 95 |

.b8 48 |

.b8 100 |

.b8 49 |

.b8 100 |

.b8 50 |

.b8 100 |

.b8 51 |

.b8 100 |

.b8 52 |

.b8 100 |

.b8 53 |

.b8 100 |

.b8 54 |

.b8 100 |

.b8 55 |

.b8 100 |

.b8 101 |

.b8 56 |

.b8 0 |
// Terminator: a zero offset ends the entry list
.b32 0 |

$L__pubNames_end0: |

} |
|
.section .debug_pubtypes |
// DWARF type lookup table for the same .debug_info unit; it holds a header and a terminator but no entries.
{ |
// unit_length, computed as the distance between the start and end labels below
.b32 $L__pubTypes_end0-$L__pubTypes_start0 |

$L__pubTypes_start0: |
// Table version 2 (16-bit value, low byte first)
.b8 2 |

.b8 0 |
// Offset of the described compile unit in .debug_info
.b32 .debug_info |
// Size of that .debug_info unit: 282 bytes
.b32 282 |
// Terminator: a zero offset ends the (empty) entry list
.b32 0 |

$L__pubTypes_end0: |

} |
|
// Location-list section, emitted empty: no data appears between the braces.
.section .debug_loc { } |
|
|