mov.b64 %rd1, fmha_v2_flash_attention_fp16_128_16_S_40_sm86_kernel_nl_param_0; ld.param.u32 %r1, [fmha_v2_flash_attention_fp16_128_16_S_40_sm86_kernel_nl_param_0+56]; cvt.s64.s32 %rd2, %r1; ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_128_16_S_40_sm86_kernel_nl_param_0+52]; mov.u32 %r439, %ctaid.z; shl.b32 %r3, %r439, 7; setp.le.s32 %p5, %r1, %r3; @%p5 bra $L__BB0_86; mov.u32 %r440, %tid.x; mov.u32 %r441, %ctaid.y; mov.u32 %r442, %ctaid.x; mul.lo.s32 %r443, %r1, %r441; mad.lo.s32 %r444, %r443, %r2, %r442; shr.s32 %r445, %r440, 31; shr.u32 %r446, %r445, 27; add.s32 %r447, %r440, %r446; and.b32 %r448, %r447, -32; sub.s32 %r449, %r440, %r448; shr.u32 %r450, %r445, 25; add.s32 %r451, %r440, %r450; shr.s32 %r452, %r451, 7; shl.b32 %r453, %r452, 4; shr.s32 %r454, %r449, 31; shr.u32 %r455, %r454, 30; add.s32 %r456, %r449, %r455; and.b32 %r457, %r456, 2147483644; sub.s32 %r458, %r449, %r457; shl.b32 %r459, %r458, 1; add.s32 %r1356, %r459, %r453; ld.param.u64 %rd3, [%rd1+16]; ld.param.u64 %rd4, [%rd1+40]; shr.s32 %r460, %r447, 5; shr.s32 %r461, %r447, 31; shr.u32 %r462, %r461, 30; add.s32 %r463, %r460, %r462; and.b32 %r464, %r463, 268435452; sub.s32 %r465, %r460, %r464; shl.b32 %r466, %r465, 4; shr.s32 %r467, %r456, 2; add.s32 %r5, %r466, %r467; shr.u32 %r468, %r445, 29; add.s32 %r469, %r440, %r468; and.b32 %r470, %r469, -8; sub.s32 %r6, %r440, %r470; setp.gt.s32 %p6, %r6, 4; shr.s32 %r7, %r469, 3; add.s32 %r471, %r7, %r3; cvt.s64.s32 %rd5, %r471; ld.param.u64 %rd6, [%rd1+24]; mul.lo.s64 %rd37, %rd6, %rd5; mul.lo.s32 %r472, %r444, 3; mul.wide.s32 %rd38, %r472, 80; shl.b32 %r473, %r6, 4; cvt.s64.s32 %rd39, %r473; add.s64 %rd40, %rd38, %rd39; add.s64 %rd41, %rd40, %rd37; ld.param.u64 %rd42, [%rd1]; add.s64 %rd7, %rd42, %rd41; shr.s32 %r474, %r469, 31; shr.u32 %r475, %r474, 29; add.s32 %r476, %r7, %r475; and.b32 %r477, %r476, 268435448; sub.s32 %r478, %r7, %r477; xor.b32 %r479, %r478, %r6; shl.b32 %r480, %r7, 7; shl.b32 %r481, %r479, 4; mov.u32 %r482, 31; mov.u32 %r483, 0; mov.u32 %r484, -1; shfl.sync.idx.b32 %r8|%p1, %r483, %r483, %r482, %r484; shfl.sync.idx.b32 %r20|%p7, %r483, %r483, %r482, %r484; and.b32 %r485, %r440, 96; shr.u32 %r486, %r485, 1; and.b32 %r487, %r440, 15; or.b32 %r488, %r486, %r487; and.b32 %r489, %r440, 7; shl.b32 %r490, %r440, 4; and.b32 %r491, %r490, 112; and.b32 %r492, %r440, 16; xor.b32 %r493, %r491, %r492; cvt.s64.s32 %rd43, %r7; mul.lo.s64 %rd44, %rd6, %rd43; shfl.sync.idx.b32 %r9|%p2, %r483, %r483, %r482, %r484; shfl.sync.idx.b32 %r10|%p3, %r483, %r483, %r482, %r484; shr.u32 %r494, %r492, 1; or.b32 %r495, %r494, %r489; and.b32 %r496, %r440, 8; shr.u32 %r497, %r496, 3; xor.b32 %r498, %r497, %r489; add.s64 %rd45, %rd40, %rd44; add.s64 %rd46, %rd42, %rd45; shfl.sync.idx.b32 %r499|%p8, %r483, %r483, %r482, %r484; shfl.sync.idx.b32 %r11|%p4, %r483, %r483, %r482, %r484; ld.param.u64 %rd10, [%rd1+32]; ld.param.u64 %rd11, [%rd1+8]; ld.param.u32 %r12, [%rd1+60]; sub.s32 %r500, %r1, %r3; min.s32 %r13, %r500, 128; shr.s32 %r501, %r12, 31; shr.u32 %r502, %r501, 29; add.s32 %r503, %r12, %r502; shr.s32 %r14, %r503, 3; shl.b32 %r504, %r440, 7; and.b32 %r505, %r504, 1920; shl.b32 %r506, %r498, 4; shl.b32 %r507, %r495, 7; shl.b32 %r508, %r488, 7; add.s32 %r15, %r481, %r480; or.b32 %r16, %r508, %r493; add.s64 %rd182, %rd46, 80; or.b32 %r17, %r507, %r506; add.s64 %rd183, %rd46, 160; or.b32 %r18, %r493, %r505; mov.u32 %r509, _ZN25fused_multihead_attention5smem_E; add.s32 %r510, %r15, %r509; add.s32 %r19, %r510, 16384; @%p6 bra $L__BB0_3; shl.b64 %rd55, %rd6, 4; add.s32 %r527, %r7, 112; setp.lt.s32 %p9, %r527, %r13; add.s32 %r528, %r7, 96; setp.lt.s32 %p10, %r528, %r13; add.s32 %r529, %r7, 80; setp.lt.s32 %p11, %r529, %r13; add.s32
%r530, %r7, 64; setp.lt.s32 %p12, %r530, %r13; add.s32 %r531, %r7, 48; setp.lt.s32 %p13, %r531, %r13; add.s32 %r532, %r7, 32; setp.lt.s32 %p14, %r532, %r13; add.s32 %r533, %r7, 16; setp.lt.s32 %p15, %r533, %r13; selp.b32 %r522, 16, 0, %p11; add.s32 %r511, %r510, %r20; add.s32 %r513, %r511, 2048; add.s32 %r515, %r511, 4096; add.s32 %r517, %r511, 6144; add.s32 %r519, %r511, 8192; add.s32 %r521, %r511, 10240; add.s32 %r523, %r511, 12288; add.s32 %r525, %r511, 14336; setp.lt.s32 %p16, %r7, %r13; selp.b32 %r512, 16, 0, %p16; // begin inline asm cp.async.cg.shared.global [%r511], [%rd7], 16, %r512; // end inline asm selp.b32 %r514, 16, 0, %p15; add.s64 %rd48, %rd7, %rd55; // begin inline asm cp.async.cg.shared.global [%r513], [%rd48], 16, %r514; // end inline asm selp.b32 %r516, 16, 0, %p14; add.s64 %rd49, %rd48, %rd55; // begin inline asm cp.async.cg.shared.global [%r515], [%rd49], 16, %r516; // end inline asm selp.b32 %r518, 16, 0, %p13; add.s64 %rd50, %rd49, %rd55; // begin inline asm cp.async.cg.shared.global [%r517], [%rd50], 16, %r518; // end inline asm selp.b32 %r520, 16, 0, %p12; add.s64 %rd51, %rd50, %rd55; // begin inline asm cp.async.cg.shared.global [%r519], [%rd51], 16, %r520; // end inline asm add.s64 %rd52, %rd51, %rd55; // begin inline asm cp.async.cg.shared.global [%r521], [%rd52], 16, %r522; // end inline asm selp.b32 %r524, 16, 0, %p10; add.s64 %rd53, %rd52, %rd55; // begin inline asm cp.async.cg.shared.global [%r523], [%rd53], 16, %r524; // end inline asm selp.b32 %r526, 16, 0, %p9; add.s64 %rd54, %rd53, %rd55; // begin inline asm cp.async.cg.shared.global [%r525], [%rd54], 16, %r526; // end inline asm $L__BB0_3: @%p6 bra $L__BB0_5; min.s32 %r540, %r1, 16; setp.lt.s32 %p18, %r7, %r540; selp.b32 %r539, 16, 0, %p18; add.s32 %r536, %r19, %r10; // begin inline asm cp.async.cg.shared.global [%r536], [%rd182], 16, %r539; // end inline asm add.s32 %r543, %r510, %r11; add.s32 %r538, %r543, 18432; // begin inline asm cp.async.cg.shared.global [%r538], [%rd183], 16, %r539; // end inline asm $L__BB0_5: setp.lt.s32 %p19, %r6, 5; // begin inline asm cp.async.commit_group; // end inline asm @%p19 bra $L__BB0_7; add.s32 %r544, %r510, %r20; add.s32 %r549, %r544, 2048; add.s32 %r554, %r544, 4096; add.s32 %r559, %r544, 6144; add.s32 %r564, %r544, 8192; add.s32 %r569, %r544, 10240; add.s32 %r574, %r544, 12288; add.s32 %r579, %r544, 14336; mov.u32 %r593, 0; // begin inline asm st.shared.v4.b32 [%r544], {%r593, %r593, %r593, %r593}; // end inline asm // begin inline asm st.shared.v4.b32 [%r549], {%r593, %r593, %r593, %r593}; // end inline asm // begin inline asm st.shared.v4.b32 [%r554], {%r593, %r593, %r593, %r593}; // end inline asm // begin inline asm st.shared.v4.b32 [%r559], {%r593, %r593, %r593, %r593}; // end inline asm // begin inline asm st.shared.v4.b32 [%r564], {%r593, %r593, %r593, %r593}; // end inline asm // begin inline asm st.shared.v4.b32 [%r569], {%r593, %r593, %r593, %r593}; // end inline asm // begin inline asm st.shared.v4.b32 [%r574], {%r593, %r593, %r593, %r593}; // end inline asm // begin inline asm st.shared.v4.b32 [%r579], {%r593, %r593, %r593, %r593}; // end inline asm add.s32 %r584, %r19, %r10; // begin inline asm st.shared.v4.b32 [%r584], {%r593, %r593, %r593, %r593}; // end inline asm add.s32 %r596, %r510, %r11; add.s32 %r589, %r596, 18432; // begin inline asm st.shared.v4.b32 [%r589], {%r593, %r593, %r593, %r593}; // end inline asm $L__BB0_7: // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s32 %r706, %r8, %r509; add.s32 %r601, %r706, 
%r16; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r597, %r598, %r599, %r600}, [%r601]; // end inline asm add.s32 %r606, %r601, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r602, %r603, %r604, %r605}, [%r606]; // end inline asm xor.b32 %r707, %r16, 32; add.s32 %r611, %r706, %r707; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r607, %r608, %r609, %r610}, [%r611]; // end inline asm add.s32 %r616, %r611, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r612, %r613, %r614, %r615}, [%r616]; // end inline asm xor.b32 %r708, %r16, 64; add.s32 %r621, %r706, %r708; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r617, %r618, %r619, %r620}, [%r621]; // end inline asm add.s32 %r626, %r621, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r622, %r623, %r624, %r625}, [%r626]; // end inline asm add.s32 %r709, %r9, %r509; add.s32 %r47, %r709, 16384; add.s32 %r631, %r47, %r17; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1361, %r1362, %r1363, %r1364}, [%r631]; // end inline asm xor.b32 %r710, %r17, 32; add.s32 %r636, %r47, %r710; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1365, %r1366, %r1367, %r1368}, [%r636]; // end inline asm xor.b32 %r711, %r17, 64; add.s32 %r641, %r47, %r711; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1369, %r1370, %r1371, %r1372}, [%r641]; // end inline asm add.s32 %r712, %r509, 18432; add.s32 %r646, %r18, %r712; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1384, %r1383, %r1382, %r1381}, [%r646]; // end inline asm xor.b32 %r713, %r18, 32; add.s32 %r651, %r713, %r712; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1380, %r1379, %r1378, %r1377}, [%r651]; // end inline asm xor.b32 %r714, %r18, 64; add.s32 %r656, %r714, %r712; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1376, %r1375, %r1374, %r1373}, [%r656]; // end inline asm ld.param.f32 %f1, [%rd1+64]; // begin inline asm mov.u32 %r1432, 0; // end inline asm // begin inline asm mov.u32 %r1431, 0; // end inline asm // begin inline asm mov.u32 %r1430, 0; // end inline asm // begin inline asm mov.u32 %r1429, 0; // end inline asm // begin inline asm mov.u32 %r1428, 0; // end inline asm // begin inline asm mov.u32 %r1427, 0; // end inline asm // begin inline asm mov.u32 %r1426, 0; // end inline asm // begin inline asm mov.u32 %r1425, 0; // end inline asm // begin inline asm mov.u32 %r1424, 0; // end inline asm // begin inline asm mov.u32 %r1423, 0; // end inline asm // begin inline asm mov.u32 %r1422, 0; // end inline asm // begin inline asm mov.u32 %r1421, 0; // end inline asm // begin inline asm mov.u32 %r1420, 0; // end inline asm // begin inline asm mov.u32 %r1419, 0; // end inline asm // begin inline asm mov.u32 %r1418, 0; // end inline asm // begin inline asm mov.u32 %r1417, 0; // end inline asm // begin inline asm mov.u32 %r1416, 0; // end inline asm // begin inline asm mov.u32 %r1415, 0; // end inline asm // begin inline asm mov.u32 %r1414, 0; // end inline asm // begin inline asm mov.u32 %r1413, 0; // end inline asm // begin inline asm mov.u32 %r1412, 0; // end inline asm // begin inline asm mov.u32 %r1411, 0; // end inline asm // begin inline asm mov.u32 %r1410, 0; // end inline asm // begin inline asm mov.u32 %r1409, 0; // end inline asm // begin inline asm mov.u32 %r1408, 0; // end inline asm // begin inline asm mov.u32 %r1407, 0; // end inline asm // begin inline asm mov.u32 
%r1406, 0; // end inline asm // begin inline asm mov.u32 %r1405, 0; // end inline asm // begin inline asm mov.u32 %r1404, 0; // end inline asm // begin inline asm mov.u32 %r1403, 0; // end inline asm // begin inline asm mov.u32 %r1402, 0; // end inline asm // begin inline asm mov.u32 %r1401, 0; // end inline asm // begin inline asm mov.u32 %r1400, 0; // end inline asm // begin inline asm mov.u32 %r1399, 0; // end inline asm // begin inline asm mov.u32 %r1398, 0; // end inline asm // begin inline asm mov.u32 %r1397, 0; // end inline asm // begin inline asm mov.u32 %r1396, 0; // end inline asm // begin inline asm mov.u32 %r1395, 0; // end inline asm // begin inline asm mov.u32 %r1394, 0; // end inline asm // begin inline asm mov.u32 %r1393, 0; // end inline asm // begin inline asm mov.u32 %r1392, 0; // end inline asm // begin inline asm mov.u32 %r1391, 0; // end inline asm // begin inline asm mov.u32 %r1390, 0; // end inline asm // begin inline asm mov.u32 %r1389, 0; // end inline asm // begin inline asm mov.u32 %r1388, 0; // end inline asm // begin inline asm mov.u32 %r1387, 0; // end inline asm // begin inline asm mov.u32 %r1386, 0; // end inline asm // begin inline asm mov.u32 %r1385, 0; // end inline asm add.s32 %r715, %r1, 15; shr.s32 %r716, %r715, 31; shr.u32 %r717, %r716, 28; add.s32 %r718, %r715, %r717; and.b32 %r120, %r718, -16; setp.lt.s32 %p20, %r1, 1; @%p20 bra $L__BB0_64; ld.param.u8 %rs1, [%rd1+160]; add.s32 %r169, %r19, %r10; add.s32 %r722, %r510, %r11; add.s32 %r170, %r722, 18432; cvt.s64.s32 %rd14, %r5; cvt.s64.s32 %rd15, %r1356; add.s32 %r723, %r1356, 1; cvt.s64.s32 %rd16, %r723; add.s32 %r724, %r1356, 8; cvt.s64.s32 %rd17, %r724; add.s32 %r725, %r1356, 9; cvt.s64.s32 %rd18, %r725; add.s32 %r171, %r5, 8; add.s32 %r172, %r5, 64; add.s32 %r173, %r5, 72; mov.u32 %r1355, 0; mov.f32 %f698, 0fFF800000; mov.f32 %f694, 0f00000000; mov.f32 %f695, %f694; mov.f32 %f696, %f694; mov.f32 %f697, %f694; mov.f32 %f699, %f698; mov.f32 %f700, %f698; mov.f32 %f701, %f698; mov.u32 %r1360, %r1; mov.u32 %r1359, %r1; $L__BB0_9: add.s32 %r726, %r1355, 16; setp.ge.s32 %p21, %r726, %r120; @%p21 bra $L__BB0_16; bar.sync 0; shl.b64 %rd58, %rd6, 4; add.s64 %rd182, %rd182, %rd58; add.s32 %r1360, %r1360, -16; @%p6 bra $L__BB0_12; min.s32 %r729, %r1360, 16; setp.lt.s32 %p23, %r7, %r729; selp.b32 %r728, 16, 0, %p23; // begin inline asm cp.async.cg.shared.global [%r169], [%rd182], 16, %r728; // end inline asm $L__BB0_12: add.s64 %rd183, %rd183, %rd58; add.s32 %r1359, %r1359, -16; @%p6 bra $L__BB0_14; min.s32 %r732, %r1359, 16; setp.lt.s32 %p25, %r7, %r732; selp.b32 %r731, 16, 0, %p25; // begin inline asm cp.async.cg.shared.global [%r170], [%rd183], 16, %r731; // end inline asm $L__BB0_14: // begin inline asm cp.async.commit_group; // end inline asm @%p19 bra $L__BB0_16; mov.u32 %r742, 0; // begin inline asm st.shared.v4.b32 [%r169], {%r742, %r742, %r742, %r742}; // end inline asm // begin inline asm st.shared.v4.b32 [%r170], {%r742, %r742, %r742, %r742}; // end inline asm $L__BB0_16: setp.eq.s16 %p27, %rs1, 0; @%p27 bra $L__BB0_49; mov.u32 %r1281, %ctaid.x; mov.u32 %r1280, %ctaid.y; mov.u32 %r1279, %ctaid.z; shl.b32 %r1278, %r1279, 7; ld.param.u32 %r1277, [fmha_v2_flash_attention_fp16_128_16_S_40_sm86_kernel_nl_param_0+52]; cvt.s64.s32 %rd62, %r1278; add.s64 %rd63, %rd14, %rd62; setp.ge.u64 %p28, %rd63, %rd2; mul.lo.s32 %r745, %r1, %r1278; cvt.s64.s32 %rd64, %r745; cvt.u64.u32 %rd25, %r1355; add.s64 %rd65, %rd64, %rd25; add.s64 %rd26, %rd65, %rd15; mul.lo.s64 %rd66, %rd14, %rd2; add.s64 %rd67, %rd26, %rd66; 
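// Annotation (inferred from context, not part of the original dump): the guarded
// ld.global.u16 sequence that follows appears to load an optional fp16
// attention-bias/mask tile, enabled by the byte flag %rs1 read from param+160.
// Each of the 16 scalar loads is predicated on row/column bounds checks
// (%p28..%p75) and falls back to 0; the loaded values (%rs131..%rs146) are
// later converted to f32 and added to the Q*K^T logits before the softmax.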
add.s64 %rd27, %rd15, %rd25; setp.ge.u64 %p29, %rd27, %rd2; shl.b64 %rd68, %rd67, 1; mad.lo.s32 %r748, %r1277, %r1280, %r1281; cvt.s64.s32 %rd69, %r748; mul.lo.s64 %rd70, %rd4, %rd69; add.s64 %rd71, %rd70, %rd68; cvta.to.global.u64 %rd72, %rd3; add.s64 %rd28, %rd72, %rd71; mov.u16 %rs132, 0; or.pred %p30, %p29, %p28; mov.u16 %rs131, %rs132; @%p30 bra $L__BB0_19; ld.global.u16 %rs131, [%rd28]; $L__BB0_19: add.s64 %rd29, %rd16, %rd25; setp.ge.u64 %p32, %rd29, %rd2; or.pred %p33, %p32, %p28; @%p33 bra $L__BB0_21; ld.global.u16 %rs132, [%rd28+2]; $L__BB0_21: add.s64 %rd30, %rd17, %rd25; setp.ge.u64 %p35, %rd30, %rd2; mov.u16 %rs134, 0; or.pred %p36, %p35, %p28; mov.u16 %rs133, %rs134; @%p36 bra $L__BB0_23; ld.global.u16 %rs133, [%rd28+16]; $L__BB0_23: add.s64 %rd31, %rd18, %rd25; setp.ge.u64 %p38, %rd31, %rd2; or.pred %p39, %p38, %p28; @%p39 bra $L__BB0_25; ld.global.u16 %rs134, [%rd28+18]; $L__BB0_25: cvt.s64.s32 %rd80, %r171; add.s64 %rd81, %rd80, %rd62; setp.ge.u64 %p40, %rd81, %rd2; mul.lo.s64 %rd82, %rd80, %rd2; add.s64 %rd83, %rd26, %rd82; shl.b64 %rd84, %rd83, 1; add.s64 %rd87, %rd70, %rd84; add.s64 %rd32, %rd72, %rd87; mov.u16 %rs136, 0; or.pred %p42, %p29, %p40; mov.u16 %rs135, %rs136; @%p42 bra $L__BB0_27; ld.global.u16 %rs135, [%rd32]; $L__BB0_27: or.pred %p45, %p32, %p40; @%p45 bra $L__BB0_29; ld.global.u16 %rs136, [%rd32+2]; $L__BB0_29: mov.u16 %rs138, 0; or.pred %p48, %p35, %p40; mov.u16 %rs137, %rs138; @%p48 bra $L__BB0_31; ld.global.u16 %rs137, [%rd32+16]; $L__BB0_31: or.pred %p51, %p38, %p40; @%p51 bra $L__BB0_33; ld.global.u16 %rs138, [%rd32+18]; $L__BB0_33: cvt.s64.s32 %rd99, %r172; add.s64 %rd100, %rd99, %rd62; setp.ge.u64 %p52, %rd100, %rd2; mul.lo.s64 %rd101, %rd99, %rd2; add.s64 %rd102, %rd26, %rd101; shl.b64 %rd103, %rd102, 1; add.s64 %rd106, %rd70, %rd103; add.s64 %rd33, %rd72, %rd106; mov.u16 %rs140, 0; or.pred %p54, %p29, %p52; mov.u16 %rs139, %rs140; @%p54 bra $L__BB0_35; ld.global.u16 %rs139, [%rd33]; $L__BB0_35: or.pred %p57, %p32, %p52; @%p57 bra $L__BB0_37; ld.global.u16 %rs140, [%rd33+2]; $L__BB0_37: mov.u16 %rs142, 0; or.pred %p60, %p35, %p52; mov.u16 %rs141, %rs142; @%p60 bra $L__BB0_39; ld.global.u16 %rs141, [%rd33+16]; $L__BB0_39: or.pred %p63, %p38, %p52; @%p63 bra $L__BB0_41; ld.global.u16 %rs142, [%rd33+18]; $L__BB0_41: cvt.s64.s32 %rd118, %r173; add.s64 %rd119, %rd118, %rd62; setp.ge.u64 %p64, %rd119, %rd2; mul.lo.s64 %rd120, %rd118, %rd2; add.s64 %rd121, %rd26, %rd120; shl.b64 %rd122, %rd121, 1; add.s64 %rd125, %rd70, %rd122; add.s64 %rd34, %rd72, %rd125; mov.u16 %rs144, 0; or.pred %p66, %p29, %p64; mov.u16 %rs143, %rs144; @%p66 bra $L__BB0_43; ld.global.u16 %rs143, [%rd34]; $L__BB0_43: or.pred %p69, %p32, %p64; @%p69 bra $L__BB0_45; ld.global.u16 %rs144, [%rd34+2]; $L__BB0_45: mov.u16 %rs146, 0; or.pred %p72, %p35, %p64; mov.u16 %rs145, %rs146; @%p72 bra $L__BB0_47; ld.global.u16 %rs145, [%rd34+16]; $L__BB0_47: or.pred %p75, %p38, %p64; @%p75 bra $L__BB0_49; ld.global.u16 %rs146, [%rd34+18]; $L__BB0_49: // begin inline asm mov.u32 %r788, 0; // end inline asm // begin inline asm mov.u32 %r789, 0; // end inline asm // begin inline asm mov.u32 %r790, 0; // end inline asm // begin inline asm mov.u32 %r791, 0; // end inline asm // begin inline asm mov.u32 %r792, 0; // end inline asm // begin inline asm mov.u32 %r793, 0; // end inline asm // begin inline asm mov.u32 %r794, 0; // end inline asm // begin inline asm mov.u32 %r795, 0; // end inline asm // begin inline asm mov.u32 %r796, 0; // end inline asm // begin inline asm mov.u32 %r797, 0; // end inline 
asm // begin inline asm mov.u32 %r798, 0; // end inline asm // begin inline asm mov.u32 %r799, 0; // end inline asm // begin inline asm mov.u32 %r800, 0; // end inline asm // begin inline asm mov.u32 %r801, 0; // end inline asm // begin inline asm mov.u32 %r802, 0; // end inline asm // begin inline asm mov.u32 %r803, 0; // end inline asm mov.b32 %f182, %r788; mov.b32 %f183, %r789; mov.b32 %f184, %r790; mov.b32 %f185, %r791; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f182, %f183, %f184, %f185}, {%r597, %r598, %r599, %r600}, {%r1361, %r1362}, {%f182, %f183, %f184, %f185}; // end inline asm mov.b32 %f190, %r792; mov.b32 %f191, %r793; mov.b32 %f192, %r794; mov.b32 %f193, %r795; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f190, %f191, %f192, %f193}, {%r597, %r598, %r599, %r600}, {%r1363, %r1364}, {%f190, %f191, %f192, %f193}; // end inline asm mov.b32 %f198, %r796; mov.b32 %f199, %r797; mov.b32 %f200, %r798; mov.b32 %f201, %r799; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r602, %r603, %r604, %r605}, {%r1361, %r1362}, {%f198, %f199, %f200, %f201}; // end inline asm mov.b32 %f206, %r800; mov.b32 %f207, %r801; mov.b32 %f208, %r802; mov.b32 %f209, %r803; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r602, %r603, %r604, %r605}, {%r1363, %r1364}, {%f206, %f207, %f208, %f209}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f182, %f183, %f184, %f185}, {%r607, %r608, %r609, %r610}, {%r1365, %r1366}, {%f182, %f183, %f184, %f185}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f190, %f191, %f192, %f193}, {%r607, %r608, %r609, %r610}, {%r1367, %r1368}, {%f190, %f191, %f192, %f193}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r612, %r613, %r614, %r615}, {%r1365, %r1366}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r612, %r613, %r614, %r615}, {%r1367, %r1368}, {%f206, %f207, %f208, %f209}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f182, %f183, %f184, %f185}, {%r617, %r618, %r619, %r620}, {%r1369, %r1370}, {%f182, %f183, %f184, %f185}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f190, %f191, %f192, %f193}, {%r617, %r618, %r619, %r620}, {%r1371, %r1372}, {%f190, %f191, %f192, %f193}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r622, %r623, %r624, %r625}, {%r1369, %r1370}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r622, %r623, %r624, %r625}, {%r1371, %r1372}, {%f206, %f207, %f208, %f209}; // end inline asm mul.ftz.f32 %f246, %f1, %f182; mul.ftz.f32 %f247, %f1, %f183; mul.ftz.f32 %f248, %f1, %f190; mul.ftz.f32 %f249, %f1, %f191; mul.ftz.f32 %f250, %f1, %f184; mul.ftz.f32 %f251, %f1, %f185; mul.ftz.f32 %f252, %f1, %f192; mul.ftz.f32 %f253, %f1, %f193; mul.ftz.f32 %f254, %f1, %f198; mul.ftz.f32 %f255, %f1, %f199; mul.ftz.f32 %f256, %f1, %f206; mul.ftz.f32 %f257, %f1, %f207; mul.ftz.f32 %f258, %f1, %f200; mul.ftz.f32 %f259, %f1, %f201; mul.ftz.f32 %f260, %f1, %f208; mul.ftz.f32 %f261, %f1, %f209; setp.lt.s32 
%p76, %r1356, %r1; selp.f32 %f717, %f246, 0fFF800000, %p76; add.s32 %r876, %r1356, 1; setp.lt.s32 %p77, %r876, %r1; selp.f32 %f716, %f247, 0fFF800000, %p77; add.s32 %r877, %r1356, 8; setp.lt.s32 %p78, %r877, %r1; selp.f32 %f715, %f248, 0fFF800000, %p78; add.s32 %r878, %r1356, 9; setp.lt.s32 %p79, %r878, %r1; selp.f32 %f714, %f249, 0fFF800000, %p79; selp.f32 %f713, %f250, 0fFF800000, %p76; selp.f32 %f712, %f251, 0fFF800000, %p77; selp.f32 %f711, %f252, 0fFF800000, %p78; selp.f32 %f710, %f253, 0fFF800000, %p79; selp.f32 %f709, %f254, 0fFF800000, %p76; selp.f32 %f708, %f255, 0fFF800000, %p77; selp.f32 %f707, %f256, 0fFF800000, %p78; selp.f32 %f706, %f257, 0fFF800000, %p79; selp.f32 %f705, %f258, 0fFF800000, %p76; selp.f32 %f704, %f259, 0fFF800000, %p77; selp.f32 %f703, %f260, 0fFF800000, %p78; selp.f32 %f702, %f261, 0fFF800000, %p79; @%p27 bra $L__BB0_51; // begin inline asm cvt.f32.f16 %f262, %rs131; // end inline asm add.ftz.f32 %f717, %f262, %f717; // begin inline asm cvt.f32.f16 %f263, %rs132; // end inline asm add.ftz.f32 %f716, %f263, %f716; // begin inline asm cvt.f32.f16 %f264, %rs133; // end inline asm add.ftz.f32 %f715, %f264, %f715; // begin inline asm cvt.f32.f16 %f265, %rs134; // end inline asm add.ftz.f32 %f714, %f265, %f714; // begin inline asm cvt.f32.f16 %f266, %rs135; // end inline asm add.ftz.f32 %f713, %f266, %f713; // begin inline asm cvt.f32.f16 %f267, %rs136; // end inline asm add.ftz.f32 %f712, %f267, %f712; // begin inline asm cvt.f32.f16 %f268, %rs137; // end inline asm add.ftz.f32 %f711, %f268, %f711; // begin inline asm cvt.f32.f16 %f269, %rs138; // end inline asm add.ftz.f32 %f710, %f269, %f710; // begin inline asm cvt.f32.f16 %f270, %rs139; // end inline asm add.ftz.f32 %f709, %f270, %f709; // begin inline asm cvt.f32.f16 %f271, %rs140; // end inline asm add.ftz.f32 %f708, %f271, %f708; // begin inline asm cvt.f32.f16 %f272, %rs141; // end inline asm add.ftz.f32 %f707, %f272, %f707; // begin inline asm cvt.f32.f16 %f273, %rs142; // end inline asm add.ftz.f32 %f706, %f273, %f706; // begin inline asm cvt.f32.f16 %f274, %rs143; // end inline asm add.ftz.f32 %f705, %f274, %f705; // begin inline asm cvt.f32.f16 %f275, %rs144; // end inline asm add.ftz.f32 %f704, %f275, %f704; // begin inline asm cvt.f32.f16 %f276, %rs145; // end inline asm add.ftz.f32 %f703, %f276, %f703; // begin inline asm cvt.f32.f16 %f277, %rs146; // end inline asm add.ftz.f32 %f702, %f277, %f702; $L__BB0_51: add.s32 %r1282, %r1355, 16; setp.ge.s32 %p143, %r1282, %r120; setp.gt.ftz.f32 %p82, %f717, %f716; selp.f32 %f278, %f717, %f716, %p82; setp.gt.ftz.f32 %p83, %f278, %f715; selp.f32 %f279, %f278, %f715, %p83; setp.gt.ftz.f32 %p84, %f279, %f714; selp.f32 %f280, %f279, %f714, %p84; setp.gt.ftz.f32 %p85, %f713, %f712; selp.f32 %f281, %f713, %f712, %p85; setp.gt.ftz.f32 %p86, %f281, %f711; selp.f32 %f282, %f281, %f711, %p86; setp.gt.ftz.f32 %p87, %f282, %f710; selp.f32 %f283, %f282, %f710, %p87; setp.gt.ftz.f32 %p88, %f709, %f708; selp.f32 %f284, %f709, %f708, %p88; setp.gt.ftz.f32 %p89, %f284, %f707; selp.f32 %f285, %f284, %f707, %p89; setp.gt.ftz.f32 %p90, %f285, %f706; selp.f32 %f286, %f285, %f706, %p90; setp.gt.ftz.f32 %p91, %f705, %f704; selp.f32 %f287, %f705, %f704, %p91; setp.gt.ftz.f32 %p92, %f287, %f703; selp.f32 %f288, %f287, %f703, %p92; setp.gt.ftz.f32 %p93, %f288, %f702; selp.f32 %f289, %f288, %f702, %p93; mov.b32 %r880, %f280; mov.u32 %r881, 31; mov.u32 %r882, 1; mov.u32 %r883, -1; shfl.sync.bfly.b32 %r884|%p94, %r880, %r882, %r881, %r883; mov.b32 %f290, %r884; setp.gt.ftz.f32 %p95, 
%f280, %f290; selp.f32 %f291, %f280, %f290, %p95; mov.b32 %r885, %f291; mov.u32 %r886, 2; shfl.sync.bfly.b32 %r887|%p96, %r885, %r886, %r881, %r883; mov.b32 %f292, %r887; setp.gt.ftz.f32 %p97, %f291, %f292; selp.f32 %f293, %f291, %f292, %p97; mov.b32 %r888, %f283; shfl.sync.bfly.b32 %r889|%p98, %r888, %r882, %r881, %r883; mov.b32 %f294, %r889; setp.gt.ftz.f32 %p99, %f283, %f294; selp.f32 %f295, %f283, %f294, %p99; mov.b32 %r890, %f295; shfl.sync.bfly.b32 %r891|%p100, %r890, %r886, %r881, %r883; mov.b32 %f296, %r891; setp.gt.ftz.f32 %p101, %f295, %f296; selp.f32 %f297, %f295, %f296, %p101; mov.b32 %r892, %f286; shfl.sync.bfly.b32 %r893|%p102, %r892, %r882, %r881, %r883; mov.b32 %f298, %r893; setp.gt.ftz.f32 %p103, %f286, %f298; selp.f32 %f299, %f286, %f298, %p103; mov.b32 %r894, %f299; shfl.sync.bfly.b32 %r895|%p104, %r894, %r886, %r881, %r883; mov.b32 %f300, %r895; setp.gt.ftz.f32 %p105, %f299, %f300; selp.f32 %f301, %f299, %f300, %p105; mov.b32 %r896, %f289; shfl.sync.bfly.b32 %r897|%p106, %r896, %r882, %r881, %r883; mov.b32 %f302, %r897; setp.gt.ftz.f32 %p107, %f289, %f302; selp.f32 %f303, %f289, %f302, %p107; mov.b32 %r898, %f303; shfl.sync.bfly.b32 %r899|%p108, %r898, %r886, %r881, %r883; mov.b32 %f304, %r899; setp.gt.ftz.f32 %p109, %f303, %f304; selp.f32 %f305, %f303, %f304, %p109; max.ftz.f32 %f58, %f293, %f701; max.ftz.f32 %f59, %f297, %f700; max.ftz.f32 %f60, %f301, %f699; max.ftz.f32 %f61, %f305, %f698; sub.ftz.f32 %f306, %f717, %f58; mul.ftz.f32 %f307, %f306, 0f3FB8AA3B; ex2.approx.ftz.f32 %f62, %f307; sub.ftz.f32 %f308, %f716, %f58; mul.ftz.f32 %f309, %f308, 0f3FB8AA3B; ex2.approx.ftz.f32 %f63, %f309; sub.ftz.f32 %f310, %f715, %f58; mul.ftz.f32 %f311, %f310, 0f3FB8AA3B; ex2.approx.ftz.f32 %f64, %f311; sub.ftz.f32 %f312, %f714, %f58; mul.ftz.f32 %f313, %f312, 0f3FB8AA3B; ex2.approx.ftz.f32 %f65, %f313; sub.ftz.f32 %f314, %f713, %f59; mul.ftz.f32 %f315, %f314, 0f3FB8AA3B; ex2.approx.ftz.f32 %f66, %f315; sub.ftz.f32 %f316, %f712, %f59; mul.ftz.f32 %f317, %f316, 0f3FB8AA3B; ex2.approx.ftz.f32 %f67, %f317; sub.ftz.f32 %f318, %f711, %f59; mul.ftz.f32 %f319, %f318, 0f3FB8AA3B; ex2.approx.ftz.f32 %f68, %f319; sub.ftz.f32 %f320, %f710, %f59; mul.ftz.f32 %f321, %f320, 0f3FB8AA3B; ex2.approx.ftz.f32 %f69, %f321; sub.ftz.f32 %f322, %f709, %f60; mul.ftz.f32 %f323, %f322, 0f3FB8AA3B; ex2.approx.ftz.f32 %f70, %f323; sub.ftz.f32 %f324, %f708, %f60; mul.ftz.f32 %f325, %f324, 0f3FB8AA3B; ex2.approx.ftz.f32 %f71, %f325; sub.ftz.f32 %f326, %f707, %f60; mul.ftz.f32 %f327, %f326, 0f3FB8AA3B; ex2.approx.ftz.f32 %f72, %f327; sub.ftz.f32 %f328, %f706, %f60; mul.ftz.f32 %f329, %f328, 0f3FB8AA3B; ex2.approx.ftz.f32 %f73, %f329; sub.ftz.f32 %f330, %f705, %f61; mul.ftz.f32 %f331, %f330, 0f3FB8AA3B; ex2.approx.ftz.f32 %f74, %f331; sub.ftz.f32 %f332, %f704, %f61; mul.ftz.f32 %f333, %f332, 0f3FB8AA3B; ex2.approx.ftz.f32 %f75, %f333; sub.ftz.f32 %f334, %f703, %f61; mul.ftz.f32 %f335, %f334, 0f3FB8AA3B; ex2.approx.ftz.f32 %f76, %f335; sub.ftz.f32 %f336, %f702, %f61; mul.ftz.f32 %f337, %f336, 0f3FB8AA3B; ex2.approx.ftz.f32 %f77, %f337; add.ftz.f32 %f338, %f62, %f63; add.ftz.f32 %f339, %f338, 0f00000000; add.ftz.f32 %f340, %f64, %f65; add.ftz.f32 %f341, %f340, 0f00000000; add.ftz.f32 %f342, %f339, %f341; add.ftz.f32 %f343, %f66, %f67; add.ftz.f32 %f344, %f343, 0f00000000; add.ftz.f32 %f345, %f68, %f69; add.ftz.f32 %f346, %f345, 0f00000000; add.ftz.f32 %f347, %f344, %f346; add.ftz.f32 %f348, %f70, %f71; add.ftz.f32 %f349, %f348, 0f00000000; add.ftz.f32 %f350, %f72, %f73; add.ftz.f32 %f351, %f350, 0f00000000; 
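// Annotation: the shfl.sync.bfly pairs below reduce the per-thread partial
// sums across each quad, mirroring the row-max reduction above. The constant
// 0f3FB8AA3B is log2(e), so ex2.approx.ftz(x * log2(e)) computes exp(x);
// combined with the running row maxima (%f58..%f61) and the exp-rescale of
// the previous running sum, this is the standard online-softmax update.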
add.ftz.f32 %f352, %f349, %f351; add.ftz.f32 %f353, %f74, %f75; add.ftz.f32 %f354, %f353, 0f00000000; add.ftz.f32 %f355, %f76, %f77; add.ftz.f32 %f356, %f355, 0f00000000; add.ftz.f32 %f357, %f354, %f356; mov.b32 %r900, %f342; shfl.sync.bfly.b32 %r901|%p110, %r900, %r882, %r881, %r883; mov.b32 %f358, %r901; add.ftz.f32 %f359, %f342, %f358; mov.b32 %r902, %f359; shfl.sync.bfly.b32 %r903|%p111, %r902, %r886, %r881, %r883; mov.b32 %f360, %r903; add.ftz.f32 %f361, %f359, %f360; mov.b32 %r904, %f347; shfl.sync.bfly.b32 %r905|%p112, %r904, %r882, %r881, %r883; mov.b32 %f362, %r905; add.ftz.f32 %f363, %f347, %f362; mov.b32 %r906, %f363; shfl.sync.bfly.b32 %r907|%p113, %r906, %r886, %r881, %r883; mov.b32 %f364, %r907; add.ftz.f32 %f365, %f363, %f364; mov.b32 %r908, %f352; shfl.sync.bfly.b32 %r909|%p114, %r908, %r882, %r881, %r883; mov.b32 %f366, %r909; add.ftz.f32 %f367, %f352, %f366; mov.b32 %r910, %f367; shfl.sync.bfly.b32 %r911|%p115, %r910, %r886, %r881, %r883; mov.b32 %f368, %r911; add.ftz.f32 %f369, %f367, %f368; mov.b32 %r912, %f357; shfl.sync.bfly.b32 %r913|%p116, %r912, %r882, %r881, %r883; mov.b32 %f370, %r913; add.ftz.f32 %f371, %f357, %f370; mov.b32 %r914, %f371; shfl.sync.bfly.b32 %r915|%p117, %r914, %r886, %r881, %r883; mov.b32 %f372, %r915; add.ftz.f32 %f373, %f371, %f372; sub.ftz.f32 %f374, %f701, %f58; mul.ftz.f32 %f375, %f374, 0f3FB8AA3B; ex2.approx.ftz.f32 %f376, %f375; mul.ftz.f32 %f78, %f376, %f697; add.ftz.f32 %f697, %f78, %f361; sub.ftz.f32 %f377, %f700, %f59; mul.ftz.f32 %f378, %f377, 0f3FB8AA3B; ex2.approx.ftz.f32 %f379, %f378; mul.ftz.f32 %f80, %f379, %f696; add.ftz.f32 %f696, %f80, %f365; sub.ftz.f32 %f380, %f699, %f60; mul.ftz.f32 %f381, %f380, 0f3FB8AA3B; ex2.approx.ftz.f32 %f382, %f381; mul.ftz.f32 %f82, %f382, %f695; add.ftz.f32 %f695, %f82, %f369; sub.ftz.f32 %f383, %f698, %f61; mul.ftz.f32 %f384, %f383, 0f3FB8AA3B; ex2.approx.ftz.f32 %f385, %f384; mul.ftz.f32 %f84, %f385, %f694; add.ftz.f32 %f694, %f84, %f373; @%p143 bra $L__BB0_53; // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1361, %r1362, %r1363, %r1364}, [%r631]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1365, %r1366, %r1367, %r1368}, [%r636]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1369, %r1370, %r1371, %r1372}, [%r641]; // end inline asm $L__BB0_53: // begin inline asm cvt.rn.f16x2.f32 %r944, %f63, %f62; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r945, %f67, %f66; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r946, %f65, %f64; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r947, %f69, %f68; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r948, %f71, %f70; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r949, %f75, %f74; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r950, %f73, %f72; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r951, %f77, %f76; // end inline asm // begin inline asm mov.u32 %r952, 0; // end inline asm // begin inline asm mov.u32 %r953, 0; // end inline asm // begin inline asm mov.u32 %r954, 0; // end inline asm // begin inline asm mov.u32 %r955, 0; // end inline asm // begin inline asm mov.u32 %r956, 0; // end inline asm // begin inline asm mov.u32 %r957, 0; // end inline asm // begin inline asm mov.u32 %r958, 0; // end inline asm // begin inline asm mov.u32 %r959, 0; // end inline asm // begin inline asm mov.u32 %r960, 0; // end inline asm // begin inline 
asm mov.u32 %r961, 0; // end inline asm // begin inline asm mov.u32 %r962, 0; // end inline asm // begin inline asm mov.u32 %r963, 0; // end inline asm // begin inline asm mov.u32 %r964, 0; // end inline asm // begin inline asm mov.u32 %r965, 0; // end inline asm // begin inline asm mov.u32 %r966, 0; // end inline asm // begin inline asm mov.u32 %r967, 0; // end inline asm // begin inline asm mov.u32 %r968, 0; // end inline asm // begin inline asm mov.u32 %r969, 0; // end inline asm // begin inline asm mov.u32 %r970, 0; // end inline asm // begin inline asm mov.u32 %r971, 0; // end inline asm // begin inline asm mov.u32 %r972, 0; // end inline asm // begin inline asm mov.u32 %r973, 0; // end inline asm // begin inline asm mov.u32 %r974, 0; // end inline asm // begin inline asm mov.u32 %r975, 0; // end inline asm // begin inline asm mov.u32 %r976, 0; // end inline asm // begin inline asm mov.u32 %r977, 0; // end inline asm // begin inline asm mov.u32 %r978, 0; // end inline asm // begin inline asm mov.u32 %r979, 0; // end inline asm // begin inline asm mov.u32 %r980, 0; // end inline asm // begin inline asm mov.u32 %r981, 0; // end inline asm // begin inline asm mov.u32 %r982, 0; // end inline asm // begin inline asm mov.u32 %r983, 0; // end inline asm // begin inline asm mov.u32 %r984, 0; // end inline asm // begin inline asm mov.u32 %r985, 0; // end inline asm // begin inline asm mov.u32 %r986, 0; // end inline asm // begin inline asm mov.u32 %r987, 0; // end inline asm // begin inline asm mov.u32 %r988, 0; // end inline asm // begin inline asm mov.u32 %r989, 0; // end inline asm // begin inline asm mov.u32 %r990, 0; // end inline asm // begin inline asm mov.u32 %r991, 0; // end inline asm // begin inline asm mov.u32 %r992, 0; // end inline asm // begin inline asm mov.u32 %r993, 0; // end inline asm // begin inline asm mov.u32 %r994, 0; // end inline asm // begin inline asm mov.u32 %r995, 0; // end inline asm // begin inline asm mov.u32 %r996, 0; // end inline asm // begin inline asm mov.u32 %r997, 0; // end inline asm // begin inline asm mov.u32 %r998, 0; // end inline asm // begin inline asm mov.u32 %r999, 0; // end inline asm mov.b32 %f402, %r952; mov.b32 %f403, %r953; mov.b32 %f404, %r954; mov.b32 %f405, %r955; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f402, %f403, %f404, %f405}, {%r944, %r945, %r946, %r947}, {%r1384, %r1383}, {%f402, %f403, %f404, %f405}; // end inline asm mov.b32 %f410, %r956; mov.b32 %f411, %r957; mov.b32 %f412, %r958; mov.b32 %f413, %r959; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f410, %f411, %f412, %f413}, {%r944, %r945, %r946, %r947}, {%r1382, %r1381}, {%f410, %f411, %f412, %f413}; // end inline asm mov.b32 %f418, %r960; mov.b32 %f419, %r961; mov.b32 %f420, %r962; mov.b32 %f421, %r963; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f418, %f419, %f420, %f421}, {%r944, %r945, %r946, %r947}, {%r1380, %r1379}, {%f418, %f419, %f420, %f421}; // end inline asm mov.b32 %f426, %r964; mov.b32 %f427, %r965; mov.b32 %f428, %r966; mov.b32 %f429, %r967; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f426, %f427, %f428, %f429}, {%r944, %r945, %r946, %r947}, {%r1378, %r1377}, {%f426, %f427, %f428, %f429}; // end inline asm mov.b32 %f434, %r968; mov.b32 %f435, %r969; mov.b32 %f436, %r970; mov.b32 %f437, %r971; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f434, %f435, %f436, %f437}, {%r944, %r945, %r946, %r947}, {%r1376, %r1375}, {%f434, %f435, 
%f436, %f437}; // end inline asm mov.b32 %f442, %r972; mov.b32 %f443, %r973; mov.b32 %f444, %r974; mov.b32 %f445, %r975; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f442, %f443, %f444, %f445}, {%r944, %r945, %r946, %r947}, {%r1374, %r1373}, {%f442, %f443, %f444, %f445}; // end inline asm mov.b32 %f450, %r976; mov.b32 %f451, %r977; mov.b32 %f452, %r978; mov.b32 %f453, %r979; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f450, %f451, %f452, %f453}, {%r948, %r949, %r950, %r951}, {%r1384, %r1383}, {%f450, %f451, %f452, %f453}; // end inline asm mov.b32 %f458, %r980; mov.b32 %f459, %r981; mov.b32 %f460, %r982; mov.b32 %f461, %r983; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f458, %f459, %f460, %f461}, {%r948, %r949, %r950, %r951}, {%r1382, %r1381}, {%f458, %f459, %f460, %f461}; // end inline asm mov.b32 %f466, %r984; mov.b32 %f467, %r985; mov.b32 %f468, %r986; mov.b32 %f469, %r987; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f466, %f467, %f468, %f469}, {%r948, %r949, %r950, %r951}, {%r1380, %r1379}, {%f466, %f467, %f468, %f469}; // end inline asm mov.b32 %f474, %r988; mov.b32 %f475, %r989; mov.b32 %f476, %r990; mov.b32 %f477, %r991; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f474, %f475, %f476, %f477}, {%r948, %r949, %r950, %r951}, {%r1378, %r1377}, {%f474, %f475, %f476, %f477}; // end inline asm mov.b32 %f482, %r992; mov.b32 %f483, %r993; mov.b32 %f484, %r994; mov.b32 %f485, %r995; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f482, %f483, %f484, %f485}, {%r948, %r949, %r950, %r951}, {%r1376, %r1375}, {%f482, %f483, %f484, %f485}; // end inline asm mov.b32 %f490, %r996; mov.b32 %f491, %r997; mov.b32 %f492, %r998; mov.b32 %f493, %r999; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f490, %f491, %f492, %f493}, {%r948, %r949, %r950, %r951}, {%r1374, %r1373}, {%f490, %f491, %f492, %f493}; // end inline asm setp.equ.ftz.f32 %p118, %f697, 0f00000000; mov.f32 %f719, 0f3F800000; mov.f32 %f718, %f719; @%p118 bra $L__BB0_55; rcp.approx.ftz.f32 %f718, %f697; $L__BB0_55: setp.equ.ftz.f32 %p119, %f696, 0f00000000; @%p119 bra $L__BB0_57; rcp.approx.ftz.f32 %f719, %f696; $L__BB0_57: mov.b32 %f501, %r1432; fma.rn.ftz.f32 %f502, %f78, %f501, %f402; mul.ftz.f32 %f503, %f718, %f502; mov.b32 %r1432, %f503; mov.b32 %f504, %r1431; fma.rn.ftz.f32 %f505, %f78, %f504, %f403; mul.ftz.f32 %f506, %f718, %f505; mov.b32 %r1431, %f506; mov.b32 %f507, %r1430; fma.rn.ftz.f32 %f508, %f80, %f507, %f404; mul.ftz.f32 %f509, %f719, %f508; mov.b32 %r1430, %f509; mov.b32 %f510, %r1429; fma.rn.ftz.f32 %f511, %f80, %f510, %f405; mul.ftz.f32 %f512, %f719, %f511; mov.b32 %r1429, %f512; mov.b32 %f513, %r1428; fma.rn.ftz.f32 %f514, %f78, %f513, %f410; mul.ftz.f32 %f515, %f718, %f514; mov.b32 %r1428, %f515; mov.b32 %f516, %r1427; fma.rn.ftz.f32 %f517, %f78, %f516, %f411; mul.ftz.f32 %f518, %f718, %f517; mov.b32 %r1427, %f518; mov.b32 %f519, %r1426; fma.rn.ftz.f32 %f520, %f80, %f519, %f412; mul.ftz.f32 %f521, %f719, %f520; mov.b32 %r1426, %f521; mov.b32 %f522, %r1425; fma.rn.ftz.f32 %f523, %f80, %f522, %f413; mul.ftz.f32 %f524, %f719, %f523; mov.b32 %r1425, %f524; mov.b32 %f525, %r1424; fma.rn.ftz.f32 %f526, %f78, %f525, %f418; mul.ftz.f32 %f527, %f718, %f526; mov.b32 %r1424, %f527; mov.b32 %f528, %r1423; fma.rn.ftz.f32 %f529, %f78, %f528, %f419; mul.ftz.f32 %f530, %f718, %f529; mov.b32 %r1423, %f530; mov.b32 %f531, %r1422; fma.rn.ftz.f32 %f532, %f80, 
%f531, %f420; mul.ftz.f32 %f533, %f719, %f532; mov.b32 %r1422, %f533; mov.b32 %f534, %r1421; fma.rn.ftz.f32 %f535, %f80, %f534, %f421; mul.ftz.f32 %f536, %f719, %f535; mov.b32 %r1421, %f536; mov.b32 %f537, %r1420; fma.rn.ftz.f32 %f538, %f78, %f537, %f426; mul.ftz.f32 %f539, %f718, %f538; mov.b32 %r1420, %f539; mov.b32 %f540, %r1419; fma.rn.ftz.f32 %f541, %f78, %f540, %f427; mul.ftz.f32 %f542, %f718, %f541; mov.b32 %r1419, %f542; mov.b32 %f543, %r1418; fma.rn.ftz.f32 %f544, %f80, %f543, %f428; mul.ftz.f32 %f545, %f719, %f544; mov.b32 %r1418, %f545; mov.b32 %f546, %r1417; fma.rn.ftz.f32 %f547, %f80, %f546, %f429; mul.ftz.f32 %f548, %f719, %f547; mov.b32 %r1417, %f548; mov.b32 %f549, %r1416; fma.rn.ftz.f32 %f550, %f78, %f549, %f434; mul.ftz.f32 %f551, %f718, %f550; mov.b32 %r1416, %f551; mov.b32 %f552, %r1415; fma.rn.ftz.f32 %f553, %f78, %f552, %f435; mul.ftz.f32 %f554, %f718, %f553; mov.b32 %r1415, %f554; mov.b32 %f555, %r1414; fma.rn.ftz.f32 %f556, %f80, %f555, %f436; mul.ftz.f32 %f557, %f719, %f556; mov.b32 %r1414, %f557; mov.b32 %f558, %r1413; fma.rn.ftz.f32 %f559, %f80, %f558, %f437; mul.ftz.f32 %f560, %f719, %f559; mov.b32 %r1413, %f560; mov.b32 %f561, %r1412; fma.rn.ftz.f32 %f562, %f78, %f561, %f442; mul.ftz.f32 %f563, %f718, %f562; mov.b32 %r1412, %f563; mov.b32 %f564, %r1411; fma.rn.ftz.f32 %f565, %f78, %f564, %f443; mul.ftz.f32 %f566, %f718, %f565; mov.b32 %r1411, %f566; mov.b32 %f567, %r1410; fma.rn.ftz.f32 %f568, %f80, %f567, %f444; mul.ftz.f32 %f569, %f719, %f568; mov.b32 %r1410, %f569; mov.b32 %f570, %r1409; fma.rn.ftz.f32 %f571, %f80, %f570, %f445; mul.ftz.f32 %f572, %f719, %f571; mov.b32 %r1409, %f572; setp.equ.ftz.f32 %p120, %f695, 0f00000000; mov.f32 %f721, 0f3F800000; mov.f32 %f720, %f721; @%p120 bra $L__BB0_59; rcp.approx.ftz.f32 %f720, %f695; $L__BB0_59: setp.equ.ftz.f32 %p121, %f694, 0f00000000; @%p121 bra $L__BB0_61; rcp.approx.ftz.f32 %f721, %f694; $L__BB0_61: add.s32 %r1258, %r1355, 16; setp.ge.s32 %p142, %r1258, %r120; mov.b32 %f574, %r1408; fma.rn.ftz.f32 %f575, %f82, %f574, %f450; mul.ftz.f32 %f576, %f720, %f575; mov.b32 %r1408, %f576; mov.b32 %f577, %r1407; fma.rn.ftz.f32 %f578, %f82, %f577, %f451; mul.ftz.f32 %f579, %f720, %f578; mov.b32 %r1407, %f579; mov.b32 %f580, %r1406; fma.rn.ftz.f32 %f581, %f84, %f580, %f452; mul.ftz.f32 %f582, %f721, %f581; mov.b32 %r1406, %f582; mov.b32 %f583, %r1405; fma.rn.ftz.f32 %f584, %f84, %f583, %f453; mul.ftz.f32 %f585, %f721, %f584; mov.b32 %r1405, %f585; mov.b32 %f586, %r1404; fma.rn.ftz.f32 %f587, %f82, %f586, %f458; mul.ftz.f32 %f588, %f720, %f587; mov.b32 %r1404, %f588; mov.b32 %f589, %r1403; fma.rn.ftz.f32 %f590, %f82, %f589, %f459; mul.ftz.f32 %f591, %f720, %f590; mov.b32 %r1403, %f591; mov.b32 %f592, %r1402; fma.rn.ftz.f32 %f593, %f84, %f592, %f460; mul.ftz.f32 %f594, %f721, %f593; mov.b32 %r1402, %f594; mov.b32 %f595, %r1401; fma.rn.ftz.f32 %f596, %f84, %f595, %f461; mul.ftz.f32 %f597, %f721, %f596; mov.b32 %r1401, %f597; mov.b32 %f598, %r1400; fma.rn.ftz.f32 %f599, %f82, %f598, %f466; mul.ftz.f32 %f600, %f720, %f599; mov.b32 %r1400, %f600; mov.b32 %f601, %r1399; fma.rn.ftz.f32 %f602, %f82, %f601, %f467; mul.ftz.f32 %f603, %f720, %f602; mov.b32 %r1399, %f603; mov.b32 %f604, %r1398; fma.rn.ftz.f32 %f605, %f84, %f604, %f468; mul.ftz.f32 %f606, %f721, %f605; mov.b32 %r1398, %f606; mov.b32 %f607, %r1397; fma.rn.ftz.f32 %f608, %f84, %f607, %f469; mul.ftz.f32 %f609, %f721, %f608; mov.b32 %r1397, %f609; mov.b32 %f610, %r1396; fma.rn.ftz.f32 %f611, %f82, %f610, %f474; mul.ftz.f32 %f612, %f720, %f611; mov.b32 %r1396, %f612; 
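// Annotation: this block finishes the online update of the output accumulator,
// O = rcp(l) * (corr * O_old + P*V), using the zero-guarded reciprocals of the
// running row sums (%f718/%f719 above, %f720/%f721 here) and the rescale
// factors %f78/%f80/%f82/%f84 carried over from the softmax step; the V
// fragments are then re-loaded via ldmatrix for the next K/V tile iteration.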
mov.b32 %f613, %r1395; fma.rn.ftz.f32 %f614, %f82, %f613, %f475; mul.ftz.f32 %f615, %f720, %f614; mov.b32 %r1395, %f615; mov.b32 %f616, %r1394; fma.rn.ftz.f32 %f617, %f84, %f616, %f476; mul.ftz.f32 %f618, %f721, %f617; mov.b32 %r1394, %f618; mov.b32 %f619, %r1393; fma.rn.ftz.f32 %f620, %f84, %f619, %f477; mul.ftz.f32 %f621, %f721, %f620; mov.b32 %r1393, %f621; mov.b32 %f622, %r1392; fma.rn.ftz.f32 %f623, %f82, %f622, %f482; mul.ftz.f32 %f624, %f720, %f623; mov.b32 %r1392, %f624; mov.b32 %f625, %r1391; fma.rn.ftz.f32 %f626, %f82, %f625, %f483; mul.ftz.f32 %f627, %f720, %f626; mov.b32 %r1391, %f627; mov.b32 %f628, %r1390; fma.rn.ftz.f32 %f629, %f84, %f628, %f484; mul.ftz.f32 %f630, %f721, %f629; mov.b32 %r1390, %f630; mov.b32 %f631, %r1389; fma.rn.ftz.f32 %f632, %f84, %f631, %f485; mul.ftz.f32 %f633, %f721, %f632; mov.b32 %r1389, %f633; mov.b32 %f634, %r1388; fma.rn.ftz.f32 %f635, %f82, %f634, %f490; mul.ftz.f32 %f636, %f720, %f635; mov.b32 %r1388, %f636; mov.b32 %f637, %r1387; fma.rn.ftz.f32 %f638, %f82, %f637, %f491; mul.ftz.f32 %f639, %f720, %f638; mov.b32 %r1387, %f639; mov.b32 %f640, %r1386; fma.rn.ftz.f32 %f641, %f84, %f640, %f492; mul.ftz.f32 %f642, %f721, %f641; mov.b32 %r1386, %f642; mov.b32 %f643, %r1385; fma.rn.ftz.f32 %f644, %f84, %f643, %f493; mul.ftz.f32 %f645, %f721, %f644; mov.b32 %r1385, %f645; @%p142 bra $L__BB0_63; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1384, %r1383, %r1382, %r1381}, [%r646]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1380, %r1379, %r1378, %r1377}, [%r651]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1376, %r1375, %r1374, %r1373}, [%r656]; // end inline asm $L__BB0_63: add.s32 %r1355, %r1355, 16; setp.lt.s32 %p123, %r1355, %r120; add.s32 %r1356, %r1356, 16; mov.f32 %f698, %f61; mov.f32 %f699, %f60; mov.f32 %f700, %f59; mov.f32 %f701, %f58; @%p123 bra $L__BB0_9; $L__BB0_64: mov.u32 %r1276, %tid.x; mov.u32 %r1275, %tid.x; and.b32 %r1274, %r1275, 96; shr.u32 %r1273, %r1274, 1; shr.s32 %r1272, %r1275, 31; shr.u32 %r1271, %r1272, 29; add.s32 %r1270, %r1275, %r1271; and.b32 %r1269, %r1270, -8; sub.s32 %r1268, %r1275, %r1269; shl.b32 %r1267, %r1268, 4; cvt.s64.s32 %rd179, %r1267; mov.b64 %rd178, fmha_v2_flash_attention_fp16_128_16_S_40_sm86_kernel_nl_param_0; mov.u64 %rd177, %rd178; ld.param.u32 %r1266, [%rd177+60]; mov.u32 %r1265, %ctaid.y; mov.u32 %r1264, %ctaid.x; ld.param.u32 %r1263, [fmha_v2_flash_attention_fp16_128_16_S_40_sm86_kernel_nl_param_0+52]; mul.lo.s32 %r1262, %r1, %r1265; mad.lo.s32 %r1261, %r1262, %r1263, %r1264; mov.u32 %r1260, _ZN25fused_multihead_attention5smem_E; bar.sync 0; mul.lo.s32 %r1160, %r1261, %r1266; shl.b32 %r1161, %r1160, 1; cvt.s64.s32 %rd136, %r1161; add.s64 %rd35, %rd136, %rd179; mov.b32 %f646, %r1431; mov.b32 %f647, %r1432; // begin inline asm cvt.rn.f16x2.f32 %r1100, %f646, %f647; // end inline asm mov.b32 %f648, %r1429; mov.b32 %f649, %r1430; // begin inline asm cvt.rn.f16x2.f32 %r1101, %f648, %f649; // end inline asm shl.b32 %r1164, %r1275, 2; and.b32 %r1165, %r1164, 124; add.s32 %r1167, %r1165, %r1260; and.b32 %r1170, %r1275, 28; shr.u32 %r1171, %r1170, 2; or.b32 %r1172, %r1273, %r1171; shl.b32 %r1173, %r1172, 7; add.s32 %r1174, %r1167, %r1173; add.s32 %r1102, %r1174, 16384; // begin inline asm st.shared.b32 [%r1102], %r1100; // end inline asm add.s32 %r400, %r1174, 17408; // begin inline asm st.shared.b32 [%r400], %r1101; // end inline asm xor.b32 %r1108, %r1102, 16; mov.b32 %f650, %r1427; mov.b32 %f651, 
%r1428; // begin inline asm cvt.rn.f16x2.f32 %r1106, %f650, %f651; // end inline asm mov.b32 %f652, %r1425; mov.b32 %f653, %r1426; // begin inline asm cvt.rn.f16x2.f32 %r1107, %f652, %f653; // end inline asm // begin inline asm st.shared.b32 [%r1108], %r1106; // end inline asm add.s32 %r1110, %r1108, 1024; // begin inline asm st.shared.b32 [%r1110], %r1107; // end inline asm xor.b32 %r1114, %r1102, 32; mov.b32 %f654, %r1423; mov.b32 %f655, %r1424; // begin inline asm cvt.rn.f16x2.f32 %r1112, %f654, %f655; // end inline asm mov.b32 %f656, %r1421; mov.b32 %f657, %r1422; // begin inline asm cvt.rn.f16x2.f32 %r1113, %f656, %f657; // end inline asm // begin inline asm st.shared.b32 [%r1114], %r1112; // end inline asm add.s32 %r1116, %r1114, 1024; // begin inline asm st.shared.b32 [%r1116], %r1113; // end inline asm xor.b32 %r1120, %r1102, 48; mov.b32 %f658, %r1419; mov.b32 %f659, %r1420; // begin inline asm cvt.rn.f16x2.f32 %r1118, %f658, %f659; // end inline asm mov.b32 %f660, %r1417; mov.b32 %f661, %r1418; // begin inline asm cvt.rn.f16x2.f32 %r1119, %f660, %f661; // end inline asm // begin inline asm st.shared.b32 [%r1120], %r1118; // end inline asm add.s32 %r1122, %r1120, 1024; // begin inline asm st.shared.b32 [%r1122], %r1119; // end inline asm xor.b32 %r1126, %r1102, 64; mov.b32 %f662, %r1415; mov.b32 %f663, %r1416; // begin inline asm cvt.rn.f16x2.f32 %r1124, %f662, %f663; // end inline asm mov.b32 %f664, %r1413; mov.b32 %f665, %r1414; // begin inline asm cvt.rn.f16x2.f32 %r1125, %f664, %f665; // end inline asm // begin inline asm st.shared.b32 [%r1126], %r1124; // end inline asm add.s32 %r1128, %r1126, 1024; // begin inline asm st.shared.b32 [%r1128], %r1125; // end inline asm xor.b32 %r1132, %r1102, 80; mov.b32 %f666, %r1411; mov.b32 %f667, %r1412; // begin inline asm cvt.rn.f16x2.f32 %r1130, %f666, %f667; // end inline asm mov.b32 %f668, %r1409; mov.b32 %f669, %r1410; // begin inline asm cvt.rn.f16x2.f32 %r1131, %f668, %f669; // end inline asm // begin inline asm st.shared.b32 [%r1132], %r1130; // end inline asm add.s32 %r1134, %r1132, 1024; // begin inline asm st.shared.b32 [%r1134], %r1131; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r1136, %r1137, %r1138, %r1139}, [%r19]; // end inline asm add.s32 %r1145, %r19, 2048; // begin inline asm ld.shared.v4.b32 {%r1141, %r1142, %r1143, %r1144}, [%r1145]; // end inline asm add.s32 %r1150, %r19, 4096; // begin inline asm ld.shared.v4.b32 {%r1146, %r1147, %r1148, %r1149}, [%r1150]; // end inline asm add.s32 %r1155, %r19, 6144; // begin inline asm ld.shared.v4.b32 {%r1151, %r1152, %r1153, %r1154}, [%r1155]; // end inline asm bar.sync 0; cvt.u32.u64 %r1175, %rd5; setp.ge.s32 %p124, %r1175, %r1; @%p124 bra $L__BB0_75; setp.ge.s32 %p125, %r6, %r14; @%p125 bra $L__BB0_67; mul.lo.s64 %rd138, %rd10, %rd5; add.s64 %rd139, %rd35, %rd138; cvta.to.global.u64 %rd140, %rd11; add.s64 %rd141, %rd140, %rd139; st.global.v4.u32 [%rd141], {%r1136, %r1137, %r1138, %r1139}; $L__BB0_67: add.s32 %r1177, %r1175, 16; setp.ge.s32 %p126, %r1177, %r1; @%p126 bra $L__BB0_75; @%p125 bra $L__BB0_70; add.s64 %rd142, %rd5, 16; mul.lo.s64 %rd143, %rd142, %rd10; add.s64 %rd144, %rd35, %rd143; cvta.to.global.u64 %rd145, %rd11; add.s64 %rd146, %rd145, %rd144; st.global.v4.u32 [%rd146], {%r1141, %r1142, %r1143, %r1144}; $L__BB0_70: add.s32 %r1179, %r1175, 32; setp.ge.s32 %p128, %r1179, %r1; @%p128 bra $L__BB0_75; @%p125 bra $L__BB0_73; add.s64 %rd147, %rd5, 32; mul.lo.s64 %rd148, %rd147, %rd10; add.s64 %rd149, %rd35, %rd148; cvta.to.global.u64 %rd150, 
%rd11; add.s64 %rd151, %rd150, %rd149; st.global.v4.u32 [%rd151], {%r1146, %r1147, %r1148, %r1149}; $L__BB0_73: add.s32 %r1181, %r1175, 48; setp.ge.s32 %p130, %r1181, %r1; or.pred %p132, %p130, %p125; @%p132 bra $L__BB0_75; add.s64 %rd152, %rd5, 48; mul.lo.s64 %rd153, %rd152, %rd10; add.s64 %rd154, %rd35, %rd153; cvta.to.global.u64 %rd155, %rd11; add.s64 %rd156, %rd155, %rd154; st.global.v4.u32 [%rd156], {%r1151, %r1152, %r1153, %r1154}; $L__BB0_75: mov.b32 %f670, %r1407; mov.b32 %f671, %r1408; // begin inline asm cvt.rn.f16x2.f32 %r1182, %f670, %f671; // end inline asm mov.b32 %f672, %r1405; mov.b32 %f673, %r1406; // begin inline asm cvt.rn.f16x2.f32 %r1183, %f672, %f673; // end inline asm // begin inline asm st.shared.b32 [%r1102], %r1182; // end inline asm // begin inline asm st.shared.b32 [%r400], %r1183; // end inline asm mov.b32 %f674, %r1403; mov.b32 %f675, %r1404; // begin inline asm cvt.rn.f16x2.f32 %r1188, %f674, %f675; // end inline asm mov.b32 %f676, %r1401; mov.b32 %f677, %r1402; // begin inline asm cvt.rn.f16x2.f32 %r1189, %f676, %f677; // end inline asm // begin inline asm st.shared.b32 [%r1108], %r1188; // end inline asm // begin inline asm st.shared.b32 [%r1110], %r1189; // end inline asm mov.b32 %f678, %r1399; mov.b32 %f679, %r1400; // begin inline asm cvt.rn.f16x2.f32 %r1194, %f678, %f679; // end inline asm mov.b32 %f680, %r1397; mov.b32 %f681, %r1398; // begin inline asm cvt.rn.f16x2.f32 %r1195, %f680, %f681; // end inline asm // begin inline asm st.shared.b32 [%r1114], %r1194; // end inline asm // begin inline asm st.shared.b32 [%r1116], %r1195; // end inline asm mov.b32 %f682, %r1395; mov.b32 %f683, %r1396; // begin inline asm cvt.rn.f16x2.f32 %r1200, %f682, %f683; // end inline asm mov.b32 %f684, %r1393; mov.b32 %f685, %r1394; // begin inline asm cvt.rn.f16x2.f32 %r1201, %f684, %f685; // end inline asm // begin inline asm st.shared.b32 [%r1120], %r1200; // end inline asm // begin inline asm st.shared.b32 [%r1122], %r1201; // end inline asm mov.b32 %f686, %r1391; mov.b32 %f687, %r1392; // begin inline asm cvt.rn.f16x2.f32 %r1206, %f686, %f687; // end inline asm mov.b32 %f688, %r1389; mov.b32 %f689, %r1390; // begin inline asm cvt.rn.f16x2.f32 %r1207, %f688, %f689; // end inline asm // begin inline asm st.shared.b32 [%r1126], %r1206; // end inline asm // begin inline asm st.shared.b32 [%r1128], %r1207; // end inline asm mov.b32 %f690, %r1387; mov.b32 %f691, %r1388; // begin inline asm cvt.rn.f16x2.f32 %r1212, %f690, %f691; // end inline asm mov.b32 %f692, %r1385; mov.b32 %f693, %r1386; // begin inline asm cvt.rn.f16x2.f32 %r1213, %f692, %f693; // end inline asm // begin inline asm st.shared.b32 [%r1132], %r1212; // end inline asm // begin inline asm st.shared.b32 [%r1134], %r1213; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r1218, %r1219, %r1220, %r1221}, [%r19]; // end inline asm // begin inline asm ld.shared.v4.b32 {%r1223, %r1224, %r1225, %r1226}, [%r1145]; // end inline asm add.s32 %r1232, %r1145, 2048; // begin inline asm ld.shared.v4.b32 {%r1228, %r1229, %r1230, %r1231}, [%r1232]; // end inline asm add.s32 %r1237, %r1145, 4096; // begin inline asm ld.shared.v4.b32 {%r1233, %r1234, %r1235, %r1236}, [%r1237]; // end inline asm add.s32 %r1251, %r1175, 64; setp.ge.s32 %p133, %r1251, %r1; @%p133 bra $L__BB0_86; setp.ge.s32 %p134, %r6, %r14; @%p134 bra $L__BB0_78; add.s64 %rd157, %rd5, 64; mul.lo.s64 %rd158, %rd157, %rd10; add.s64 %rd159, %rd35, %rd158; cvta.to.global.u64 %rd160, %rd11; add.s64 %rd161, %rd160, %rd159; st.global.v4.u32 [%rd161], 
{%r1218, %r1219, %r1220, %r1221}; $L__BB0_78: add.s32 %r1253, %r1175, 80; setp.ge.s32 %p135, %r1253, %r1; @%p135 bra $L__BB0_86; @%p134 bra $L__BB0_81; add.s64 %rd162, %rd5, 80; mul.lo.s64 %rd163, %rd162, %rd10; add.s64 %rd164, %rd35, %rd163; cvta.to.global.u64 %rd165, %rd11; add.s64 %rd166, %rd165, %rd164; st.global.v4.u32 [%rd166], {%r1223, %r1224, %r1225, %r1226}; $L__BB0_81: add.s32 %r1255, %r1175, 96; setp.ge.s32 %p137, %r1255, %r1; @%p137 bra $L__BB0_86; @%p134 bra $L__BB0_84; add.s64 %rd167, %rd5, 96; mul.lo.s64 %rd168, %rd167, %rd10; add.s64 %rd169, %rd35, %rd168; cvta.to.global.u64 %rd170, %rd11; add.s64 %rd171, %rd170, %rd169; st.global.v4.u32 [%rd171], {%r1228, %r1229, %r1230, %r1231}; $L__BB0_84: add.s32 %r1257, %r1175, 112; setp.ge.s32 %p139, %r1257, %r1; or.pred %p141, %p139, %p134; @%p141 bra $L__BB0_86; add.s64 %rd172, %rd5, 112; mul.lo.s64 %rd173, %rd172, %rd10; add.s64 %rd174, %rd35, %rd173; cvta.to.global.u64 %rd175, %rd11; add.s64 %rd176, %rd175, %rd174; st.global.v4.u32 [%rd176], {%r1233, %r1234, %r1235, %r1236}; $L__BB0_86: ret; }