emvKernel_fp32_param_0+48]; ld.param.f32 %f18, [gemvKernel_fp32_param_0+44]; ld.param.u32 %r41, [gemvKernel_fp32_param_0+40]; ld.param.u64 %rd9, [gemvKernel_fp32_param_0+24]; ld.param.u64 %rd8, [gemvKernel_fp32_param_0+16]; ld.param.u64 %rd7, [gemvKernel_fp32_param_0+8]; ld.param.u64 %rd6, [gemvKernel_fp32_param_0]; cvta.to.global.u64 %rd1, %rd6; cvta.to.global.u64 %rd2, %rd7; mov.u32 %r44, %ntid.y; mov.u32 %r45, %ctaid.x; mov.u32 %r46, %tid.y; mad.lo.s32 %r1, %r45, %r44, %r46; setp.ge.s32 %p2, %r1, %r42; @%p2 bra $L__BB0_22; setp.lt.s32 %p3, %r41, 1; mov.f32 %f46, 0f00000000; mov.u32 %r3, %tid.x; @%p3 bra $L__BB0_18; mov.u32 %r5, WARP_SZ; and.b32 %r81, %r41, 3; add.s32 %r48, %r41, -1; setp.lt.u32 %p4, %r48, 3; mov.f32 %f46, 0f00000000; mov.u32 %r78, 0; @%p4 bra $L__BB0_13; add.s32 %r76, %r5, %r3; shl.b32 %r8, %r5, 2; mul.lo.s32 %r9, %r43, %r1; add.s32 %r75, %r76, %r9; shl.b32 %r50, %r5, 1; add.s32 %r74, %r3, %r50; add.s32 %r51, %r3, %r9; add.s32 %r73, %r51, %r50; mad.lo.s32 %r72, %r5, 3, %r3; sub.s32 %r14, %r81, %r41; mov.u32 %r71, %r3; $L__BB0_4: setp.ge.s32 %p5, %r71, %r43; @%p5 bra $L__BB0_6; add.s32 %r52, %r9, %r71; mul.wide.s32 %rd10, %r52, 4; add.s64 %rd11, %rd1, %rd10; mul.wide.s32 %rd12, %r71, 4; add.s64 %rd13, %rd2, %rd12; ld.global.f32 %f24, [%rd13]; ld.global.f32 %f25, [%rd11]; fma.rn.ftz.f32 %f46, %f25, %f24, %f46; $L__BB0_6: setp.ge.s32 %p6, %r76, %r43; @%p6 bra $L__BB0_8; mul.wide.s32 %rd14, %r75, 4; add.s64 %rd15, %rd1, %rd14; mul.wide.s32 %rd16, %r76, 4; add.s64 %rd17, %rd2, %rd16; ld.global.f32 %f26, [%rd17]; ld.global.f32 %f27, [%rd15]; fma.rn.ftz.f32 %f46, %f27, %f26, %f46; $L__BB0_8: setp.ge.s32 %p7, %r74, %r43; @%p7 bra $L__BB0_10; mul.wide.s32 %rd18, %r73, 4; add.s64 %rd19, %rd1, %rd18; mul.wide.s32 %rd20, %r74, 4; add.s64 %rd21, %rd2, %rd20; ld.global.f32 %f28, [%rd21]; ld.global.f32 %f29, [%rd19]; fma.rn.ftz.f32 %f46, %f29, %f28, %f46; $L__BB0_10: setp.ge.s32 %p8, %r72, %r43; @%p8 bra $L__BB0_12; add.s32 %r53, %r9, %r72; mul.wide.s32 %rd22, %r53, 4; add.s64 %rd23, %rd1, %rd22; mul.wide.s32 %rd24, %r72, 4; add.s64 %rd25, %rd2, %rd24; ld.global.f32 %f30, [%rd25]; ld.global.f32 %f31, [%rd23]; fma.rn.ftz.f32 %f46, %f31, %f30, %f46; $L__BB0_12: add.s32 %r76, %r76, %r8; add.s32 %r75, %r75, %r8; add.s32 %r74, %r74, %r8; add.s32 %r73, %r73, %r8; add.s32 %r72, %r72, %r8; add.s32 %r71, %r71, %r8; add.s32 %r78, %r78, 4; add.s32 %r54, %r14, %r78; setp.ne.s32 %p9, %r54, 0; @%p9 bra $L__BB0_4; $L__BB0_13: setp.eq.s32 %p10, %r81, 0; @%p10 bra $L__BB0_18; mad.lo.s32 %r80, %r5, %r78, %r3; mad.lo.s32 %r79, %r43, %r1, %r80; $L__BB0_15: .pragma "nounroll"; setp.ge.s32 %p11, %r80, %r43; @%p11 bra $L__BB0_17; mul.wide.s32 %rd26, %r79, 4; add.s64 %rd27, %rd1, %rd26; mul.wide.s32 %rd28, %r80, 4; add.s64 %rd29, %rd2, %rd28; ld.global.f32 %f32, [%rd29]; ld.global.f32 %f33, [%rd27]; fma.rn.ftz.f32 %f46, %f33, %f32, %f46; $L__BB0_17: add.s32 %r80, %r80, %r5; add.s32 %r79, %r79, %r5; add.s32 %r81, %r81, -1; setp.ne.s32 %p12, %r81, 0; @%p12 bra $L__BB0_15; $L__BB0_18: mov.b32 %r55, %f46; mov.u32 %r56, 31; mov.u32 %r57, 16; mov.u32 %r58, -1; shfl.sync.bfly.b32 %r59|%p13, %r55, %r57, %r56, %r58; mov.b32 %f34, %r59; add.ftz.f32 %f35, %f46, %f34; mov.b32 %r60, %f35; mov.u32 %r61, 8; shfl.sync.bfly.b32 %r62|%p14, %r60, %r61, %r56, %r58; mov.b32 %f36, %r62; add.ftz.f32 %f37, %f35, %f36; mov.b32 %r63, %f37; mov.u32 %r64, 4; shfl.sync.bfly.b32 %r65|%p15, %r63, %r64, %r56, %r58; mov.b32 %f38, %r65; add.ftz.f32 %f39, %f37, %f38; mov.b32 %r66, %f39; mov.u32 %r67, 2; shfl.sync.bfly.b32 %r68|%p16, %r66, %r67, %r56, %r58; mov.b32 %f40, %r68; add.ftz.f32 %f16, %f39, %f40; mov.b32 %r69, %f16; mov.u32 %r70, 1; shfl.sync.bfly.b32 %r38|%p1, %r69, %r70, %r56, %r58; setp.ne.s32 %p17, %r3, 0; @%p17 bra $L__BB0_22; setp.eq.s64 %p18, %rd8, 0; mov.b32 %f41, %r38; add.ftz.f32 %f42, %f16, %f41; mul.ftz.f32 %f17, %f18, %f42; cvt.s64.s32 %rd4, %r1; cvta.to.global.u64 %rd30, %rd9; mul.wide.s32 %rd31, %r1, 4; add.s64 %rd5, %rd30, %rd31; @%p18 bra $L__BB0_21; cvta.to.global.u64 %rd32, %rd8; shl.b64 %rd33, %rd4, 2; add.s64 %rd34, %rd32, %rd33; ld.global.f32 %f43, [%rd34]; fma.rn.ftz.f32 %f44, %f19, %f43, %f17; st.global.f32 [%rd5], %f44; bra.uni $L__BB0_22; $L__BB0_21: st.global.f32 [%rd5], %f17; $L__BB0_22: ret; }