.reg .b16 %rs<291>;
.reg .f32 %f<770>;
.reg .b32 %r<1381>;
.reg .b64 %rd<136>;
mov.b64 %rd58, fmha_v2_flash_attention_fp16_64_64_S_32_sm86_kernel_nl_param_0;
mov.u64 %rd1, %rd58;
ld.param.u32 %r1, [fmha_v2_flash_attention_fp16_64_64_S_32_sm86_kernel_nl_param_0+56];
cvt.s64.s32 %rd2, %r1;
ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_64_64_S_32_sm86_kernel_nl_param_0+52];
mov.u32 %r436, %ctaid.z;
shl.b32 %r3, %r436, 6;
setp.le.s32 %p1, %r1, %r3;
@%p1 bra $L__BB0_86;
mov.u32 %r555, %tid.x;
mov.u32 %r556, %ctaid.y;
mov.u32 %r557, %ctaid.x;
mul.lo.s32 %r558, %r1, %r556;
mad.lo.s32 %r559, %r558, %r2, %r557;
shr.s32 %r560, %r555, 31; shr.u32 %r561, %r560, 27; add.s32 %r562, %r555, %r561;
and.b32 %r563, %r562, -32; sub.s32 %r564, %r555, %r563;
shr.u32 %r565, %r560, 25; add.s32 %r566, %r555, %r565; shr.s32 %r567, %r566, 7; shl.b32 %r568, %r567, 4;
shr.s32 %r569, %r564, 31; shr.u32 %r570, %r569, 30; add.s32 %r571, %r564, %r570;
and.b32 %r572, %r571, 2147483644; sub.s32 %r573, %r564, %r572; shl.b32 %r574, %r573, 1;
add.s32 %r1298, %r574, %r568;
ld.param.u64 %rd3, [%rd1+16];
ld.param.u64 %rd4, [%rd1+40];
shr.s32 %r575, %r562, 5; shr.s32 %r576, %r562, 31; shr.u32 %r577, %r576, 30; add.s32 %r578, %r575, %r577;
and.b32 %r579, %r578, 268435452; sub.s32 %r580, %r575, %r579; shl.b32 %r581, %r580, 4;
shr.s32 %r582, %r571, 2; add.s32 %r5, %r581, %r582;
shr.u32 %r583, %r560, 30; add.s32 %r584, %r555, %r583; shr.s32 %r6, %r584, 2;
add.s32 %r585, %r6, %r3;
cvt.s64.s32 %rd5, %r585;
ld.param.u64 %rd6, [%rd1+24];
mul.lo.s64 %rd65, %rd6, %rd5;
mul.lo.s32 %r586, %r559, 3;
mul.wide.s32 %rd66, %r586, 64;
and.b32 %r587, %r584, -4; sub.s32 %r7, %r555, %r587; shl.b32 %r588, %r7, 4;
cvt.s64.s32 %rd67, %r588;
add.s64 %rd68, %rd66, %rd67;
add.s64 %rd69, %rd68, %rd65;
ld.param.u64 %rd70, [%rd1];
add.s64 %rd59, %rd70, %rd69;
shr.u32 %r589, %r560, 29; add.s32 %r590, %r555, %r589; shr.s32 %r591, %r590, 3;
shr.s32 %r592, %r590, 31; shr.u32 %r593, %r592, 30; add.s32 %r594, %r591, %r593;
and.b32 %r595, %r594, 268435452; sub.s32 %r596, %r591, %r595;
and.b32 %r597, %r590, 268435448; sub.s32 %r598, %r555, %r597;
xor.b32 %r599, %r596, %r598;
shl.b32 %r600, %r591, 7; shl.b32 %r601, %r599, 4; add.s32 %r602, %r601, %r600;
mov.u32 %r603, 31;
mov.u32 %r1297, 0;
mov.u32 %r605, -1;
shfl.sync.idx.b32 %r606|%p2, %r1297, %r1297, %r603, %r605;
shfl.sync.idx.b32 %r607|%p3, %r1297, %r1297, %r603, %r605;
and.b32 %r608, %r555, 96; shr.u32 %r609, %r608, 2;
and.b32 %r610, %r555, 14; shr.u32 %r611, %r610, 1; or.b32 %r612, %r609, %r611;
and.b32 %r613, %r555, 6; shr.u32 %r614, %r613, 1;
shl.b32 %r615, %r555, 2; and.b32 %r616, %r615, 4; or.b32 %r617, %r614, %r616;
shl.b32 %r618, %r612, 7; shl.b32 %r619, %r617, 4;
and.b32 %r620, %r555, 16; xor.b32 %r621, %r619, %r620; or.b32 %r622, %r621, %r618;
cvt.s64.s32 %rd71, %r6;
mul.lo.s64 %rd72, %rd6, %rd71;
add.s32 %r623, %r586, 1;
mul.wide.s32 %rd73, %r623, 64;
add.s64 %rd74, %rd73, %rd67;
add.s64 %rd75, %rd74, %rd72;
add.s64 %rd135, %rd70, %rd75;
shfl.sync.idx.b32 %r624|%p4, %r1297, %r1297, %r603, %r605;
shfl.sync.idx.b32 %r625|%p5, %r1297, %r1297, %r603, %r605;
shr.u32 %r626, %r620, 2; or.b32 %r627, %r614, %r626;
and.b32 %r628, %r555, 8; shr.u32 %r629, %r628, 3; xor.b32 %r630, %r617, %r629;
shl.b32 %r631, %r627, 7; shl.b32 %r632, %r630, 4; or.b32 %r633, %r632, %r631;
add.s32 %r634, %r586, 2;
mul.wide.s32 %rd76, %r634, 64;
add.s64 %rd77, %rd76, %rd67;
add.s64 %rd78, %rd77, %rd72;
add.s64 %rd134, %rd70, %rd78;
shfl.sync.idx.b32 %r635|%p6, %r1297, %r1297, %r603, %r605;
shfl.sync.idx.b32 %r636|%p7, %r1297, %r1297, %r603, %r605;
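// Prologue (above): load the kernel parameters (the 32-bit value at
// param+56 acts as the sequence length, param+52 feeds the batch/head
// indexing), exit early when this CTA's 64-row tile starts past the end
// of the sequence, and derive per-thread global offsets for the Q/K/V
// tiles plus XOR-swizzled shared-memory offsets. Below, the first tiles
// are staged into shared memory and read back as MMA fragments.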
shl.b32 %r637, %r555, 6; and.b32 %r638, %r637, 896; or.b32 %r639, %r621, %r638;
mov.u32 %r643, _ZN25fused_multihead_attention5smem_E;
add.s32 %r644, %r643, 4096;
add.s32 %r10, %r602, %r644;
sub.s32 %r645, %r1, %r3;
min.s32 %r646, %r645, 64;
setp.lt.s32 %p8, %r6, %r646;
add.s32 %r647, %r6, 32;
setp.lt.s32 %p9, %r647, %r646;
shl.b64 %rd79, %rd6, 5;
add.s64 %rd60, %rd59, %rd79;
add.s32 %r648, %r602, %r643;
add.s32 %r437, %r648, %r607;
add.s32 %r439, %r437, 2048;
selp.b32 %r438, 16, 0, %p8;
cp.async.cg.shared.global [%r437], [%rd59], 16, %r438; // inline asm
selp.b32 %r440, 16, 0, %p9;
cp.async.cg.shared.global [%r439], [%rd60], 16, %r440; // inline asm
min.s32 %r649, %r1, 64;
setp.lt.s32 %p10, %r6, %r649;
setp.lt.s32 %p11, %r647, %r649;
add.s64 %rd62, %rd135, %rd79;
add.s32 %r441, %r10, %r625;
add.s32 %r443, %r441, 2048;
selp.b32 %r446, 16, 0, %p10;
cp.async.cg.shared.global [%r441], [%rd135], 16, %r446; // inline asm
selp.b32 %r448, 16, 0, %p11;
cp.async.cg.shared.global [%r443], [%rd62], 16, %r448; // inline asm
add.s64 %rd64, %rd134, %rd79;
add.s32 %r650, %r643, 8192;
add.s32 %r651, %r602, %r650;
add.s32 %r445, %r651, %r636;
add.s32 %r447, %r445, 2048;
cp.async.cg.shared.global [%r445], [%rd134], 16, %r446; // inline asm
cp.async.cg.shared.global [%r447], [%rd64], 16, %r448; // inline asm
cp.async.commit_group; // inline asm
cp.async.wait_group 0; // inline asm
bar.sync 0;
add.s32 %r652, %r606, %r643;
add.s32 %r453, %r652, %r622;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r449, %r450, %r451, %r452}, [%r453]; // inline asm
xor.b32 %r653, %r622, 32;
add.s32 %r458, %r652, %r653;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r454, %r455, %r456, %r457}, [%r458]; // inline asm
add.s32 %r23, %r624, %r644;
add.s32 %r463, %r23, %r633;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1325, %r1324, %r1323, %r1322}, [%r463]; // inline asm
add.s32 %r468, %r463, 1024;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1321, %r1320, %r1319, %r1318}, [%r468]; // inline asm
add.s32 %r473, %r463, 2048;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1317, %r1316, %r1315, %r1314}, [%r473]; // inline asm
add.s32 %r478, %r463, 3072;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1313, %r1312, %r1311, %r1310}, [%r478]; // inline asm
xor.b32 %r654, %r633, 32;
add.s32 %r483, %r23, %r654;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1309, %r1308, %r1307, %r1306}, [%r483]; // inline asm
add.s32 %r488, %r483, 1024;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1305, %r1304, %r1303, %r1302}, [%r488]; // inline asm
add.s32 %r493, %r483, 2048;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1301, %r1326, %r1327, %r1328}, [%r493]; // inline asm
add.s32 %r498, %r483, 3072;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1329, %r1330, %r1331, %r1332}, [%r498]; // inline asm
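// Above: cp.async stages three 64-wide fp16 tiles (by all appearances Q, K
// and V, at consecutive row offsets %r586, %r586+1, %r586+2 of a packed
// QKV tensor) into shared memory; the selp-computed size operand (16 or 0)
// turns out-of-range copies into no-ops for partial tiles. ldmatrix then
// loads Q fragments (%r449-%r457) and K fragments. The ldmatrix .trans
// loads below fetch the V tile transposed, ready to be the B operand of
// the second GEMM.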
add.s32 %r503, %r639, %r650;
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1364, %r1363, %r1362, %r1361}, [%r503]; // inline asm
xor.b32 %r655, %r639, 32;
add.s32 %r508, %r655, %r650;
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1360, %r1359, %r1358, %r1357}, [%r508]; // inline asm
add.s32 %r656, %r643, 9216;
add.s32 %r513, %r639, %r656;
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1356, %r1355, %r1354, %r1353}, [%r513]; // inline asm
add.s32 %r518, %r655, %r656;
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1352, %r1351, %r1350, %r1349}, [%r518]; // inline asm
add.s32 %r657, %r643, 10240;
add.s32 %r523, %r639, %r657;
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1348, %r1347, %r1346, %r1345}, [%r523]; // inline asm
add.s32 %r528, %r655, %r657;
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1344, %r1343, %r1342, %r1341}, [%r528]; // inline asm
add.s32 %r658, %r643, 11264;
add.s32 %r533, %r639, %r658;
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1340, %r1339, %r1338, %r1337}, [%r533]; // inline asm
add.s32 %r538, %r655, %r658;
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1336, %r1335, %r1334, %r1333}, [%r538]; // inline asm
ld.param.f32 %f1, [%rd1+64];
mov.u32 %r1380, 0; mov.u32 %r1379, 0; mov.u32 %r1378, 0; mov.u32 %r1377, 0; // inline asm
mov.u32 %r1376, 0; mov.u32 %r1375, 0; mov.u32 %r1374, 0; mov.u32 %r1373, 0; // inline asm
mov.u32 %r1372, 0; mov.u32 %r1371, 0; mov.u32 %r1370, 0; mov.u32 %r1369, 0; // inline asm
mov.u32 %r1368, 0; mov.u32 %r1367, 0; mov.u32 %r1366, 0; mov.u32 %r1365, 0; // inline asm
add.s32 %r659, %r1, 63; shr.s32 %r660, %r659, 31; shr.u32 %r661, %r660, 26;
add.s32 %r662, %r659, %r661; and.b32 %r104, %r662, -64;
setp.lt.s32 %p12, %r1, 1;
@%p12 bra $L__BB0_81;
ld.param.u8 %rs1, [%rd1+160];
cvt.s64.s32 %rd12, %r1298;
cvt.s64.s32 %rd13, %r5;
cvt.s64.s32 %rd80, %r3;
add.s64 %rd14, %rd13, %rd80;
add.s32 %r664, %r1298, 1; cvt.s64.s32 %rd15, %r664;
add.s32 %r665, %r1298, 8; cvt.s64.s32 %rd16, %r665;
add.s32 %r666, %r1298, 9; cvt.s64.s32 %rd17, %r666;
add.s32 %r667, %r1298, 16; cvt.s64.s32 %rd18, %r667;
add.s32 %r668, %r1298, 17; cvt.s64.s32 %rd19, %r668;
add.s32 %r669, %r1298, 24; cvt.s64.s32 %rd20, %r669;
add.s32 %r670, %r1298, 25; cvt.s64.s32 %rd21, %r670;
add.s32 %r671, %r1298, 32; cvt.s64.s32 %rd22, %r671;
add.s32 %r672, %r1298, 33; cvt.s64.s32 %rd23, %r672;
add.s32 %r673, %r1298, 40; cvt.s64.s32 %rd24, %r673;
add.s32 %r674, %r1298, 41; cvt.s64.s32 %rd25, %r674;
add.s32 %r675, %r1298, 48; cvt.s64.s32 %rd26, %r675;
add.s32 %r676, %r1298, 49; cvt.s64.s32 %rd27, %r676;
add.s32 %r677, %r1298, 56; cvt.s64.s32 %rd28, %r677;
add.s32 %r678, %r1298, 57; cvt.s64.s32 %rd29, %r678;
add.s32 %r177, %r5, 8;
cvt.s64.s32 %rd81, %r177;
add.s64 %rd30, %rd81, %rd80;
mov.f32 %f734, 0fFF800000;
mov.f32 %f732, 0f00000000;
mov.u32 %r1299, %r1;
mov.f32 %f733, %f732;
mov.f32 %f735, %f734;
mov.u32 %r1300, %r1;
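// Per-row softmax state was just seeded: %f734/%f735 hold the running row
// maxima (-inf = 0fFF800000) and %f732/%f733 the running denominators
// (0.0), flash-attention style; %r104 is the sequence length rounded up
// to a multiple of 64. The main loop over 64-column K/V tiles starts
// below: each iteration prefetches the next tiles with cp.async so the
// copies overlap with this iteration's tensor-core math (software
// pipelining), with %rd134/%rd135 advancing the source pointers.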
$L__BB0_3:
add.s32 %r679, %r1297, 64;
setp.ge.s32 %p13, %r679, %r104;
@%p13 bra $L__BB0_5;
bar.sync 0;
shl.b64 %rd86, %rd6, 6;
add.s64 %rd82, %rd135, %rd86;
add.s32 %r1300, %r1300, -64;
min.s32 %r688, %r1300, 64;
setp.lt.s32 %p14, %r6, %r688;
setp.lt.s32 %p15, %r647, %r688;
mul.lo.s64 %rd87, %rd6, 96;
add.s64 %rd83, %rd135, %rd87;
selp.b32 %r681, 16, 0, %p14;
cp.async.cg.shared.global [%r441], [%rd82], 16, %r681; // inline asm
selp.b32 %r683, 16, 0, %p15;
cp.async.cg.shared.global [%r443], [%rd83], 16, %r683; // inline asm
add.s64 %rd84, %rd134, %rd86;
add.s32 %r1299, %r1299, -64;
min.s32 %r690, %r1299, 64;
setp.lt.s32 %p16, %r6, %r690;
setp.lt.s32 %p17, %r647, %r690;
add.s64 %rd85, %rd134, %rd87;
selp.b32 %r685, 16, 0, %p16;
cp.async.cg.shared.global [%r445], [%rd84], 16, %r685; // inline asm
selp.b32 %r687, 16, 0, %p17;
cp.async.cg.shared.global [%r447], [%rd85], 16, %r687; // inline asm
cp.async.commit_group; // inline asm
mov.u64 %rd134, %rd84;
mov.u64 %rd135, %rd82;
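// $L__BB0_5: when the flag byte at param+160 (%rs1) is nonzero, a tile of
// fp16 values -- evidently an additive attention bias or packed mask -- is
// gathered below, element by element. Every ld.global.u16 is guarded by an
// or.pred of the row and column bounds checks, and each value defaults to
// zero when out of range; the chain of small guarded blocks runs through
// $L__BB0_70.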
$L__BB0_5:
setp.eq.s16 %p18, %rs1, 0;
@%p18 bra $L__BB0_70;
mov.u32 %r1213, %ctaid.x;
mov.u32 %r1212, %ctaid.y;
mov.u32 %r1211, %ctaid.z;
ld.param.u32 %r1210, [fmha_v2_flash_attention_fp16_64_64_S_32_sm86_kernel_nl_param_0+52];
setp.ge.u64 %p19, %rd14, %rd2;
mul.lo.s32 %r692, %r1211, %r1;
shl.b32 %r693, %r692, 6;
cvt.s64.s32 %rd90, %r693;
cvt.u64.u32 %rd37, %r1297;
add.s64 %rd38, %rd90, %rd37;
mul.lo.s64 %rd91, %rd2, %rd13;
add.s64 %rd92, %rd38, %rd91;
add.s64 %rd93, %rd92, %rd12;
add.s64 %rd39, %rd12, %rd37;
setp.ge.u64 %p20, %rd39, %rd2;
shl.b64 %rd94, %rd93, 1;
mad.lo.s32 %r696, %r1210, %r1212, %r1213;
cvt.s64.s32 %rd95, %r696;
mul.lo.s64 %rd96, %rd4, %rd95;
add.s64 %rd97, %rd96, %rd94;
cvta.to.global.u64 %rd98, %rd3;
add.s64 %rd40, %rd98, %rd97;
mov.u16 %rs260, 0;
or.pred %p21, %p20, %p19;
mov.u16 %rs259, %rs260;
@%p21 bra $L__BB0_8;
ld.global.u16 %rs259, [%rd40];
$L__BB0_8:
add.s64 %rd41, %rd15, %rd37; setp.ge.u64 %p22, %rd41, %rd2;
or.pred %p24, %p22, %p19;
@%p24 bra $L__BB0_10;
ld.global.u16 %rs260, [%rd40+2];
$L__BB0_10:
add.s64 %rd42, %rd16, %rd37; setp.ge.u64 %p25, %rd42, %rd2;
mov.u16 %rs262, 0;
or.pred %p27, %p25, %p19;
mov.u16 %rs261, %rs262;
@%p27 bra $L__BB0_12;
ld.global.u16 %rs261, [%rd40+16];
$L__BB0_12:
add.s64 %rd43, %rd17, %rd37; setp.ge.u64 %p28, %rd43, %rd2;
or.pred %p30, %p28, %p19;
@%p30 bra $L__BB0_14;
ld.global.u16 %rs262, [%rd40+18];
$L__BB0_14:
add.s64 %rd44, %rd18, %rd37; setp.ge.u64 %p31, %rd44, %rd2;
mov.u16 %rs264, 0;
or.pred %p33, %p31, %p19;
mov.u16 %rs263, %rs264;
@%p33 bra $L__BB0_16;
ld.global.u16 %rs263, [%rd40+32];
$L__BB0_16:
add.s64 %rd45, %rd19, %rd37; setp.ge.u64 %p34, %rd45, %rd2;
or.pred %p36, %p34, %p19;
@%p36 bra $L__BB0_18;
ld.global.u16 %rs264, [%rd40+34];
$L__BB0_18:
add.s64 %rd46, %rd20, %rd37; setp.ge.u64 %p37, %rd46, %rd2;
mov.u16 %rs266, 0;
or.pred %p39, %p37, %p19;
mov.u16 %rs265, %rs266;
@%p39 bra $L__BB0_20;
ld.global.u16 %rs265, [%rd40+48];
$L__BB0_20:
add.s64 %rd47, %rd21, %rd37; setp.ge.u64 %p40, %rd47, %rd2;
or.pred %p42, %p40, %p19;
@%p42 bra $L__BB0_22;
ld.global.u16 %rs266, [%rd40+50];
$L__BB0_22:
add.s64 %rd48, %rd22, %rd37; setp.ge.u64 %p43, %rd48, %rd2;
mov.u16 %rs268, 0;
or.pred %p45, %p43, %p19;
mov.u16 %rs267, %rs268;
@%p45 bra $L__BB0_24;
ld.global.u16 %rs267, [%rd40+64];
$L__BB0_24:
add.s64 %rd49, %rd23, %rd37; setp.ge.u64 %p46, %rd49, %rd2;
or.pred %p48, %p46, %p19;
@%p48 bra $L__BB0_26;
ld.global.u16 %rs268, [%rd40+66];
$L__BB0_26:
add.s64 %rd50, %rd24, %rd37; setp.ge.u64 %p49, %rd50, %rd2;
mov.u16 %rs270, 0;
or.pred %p51, %p49, %p19;
mov.u16 %rs269, %rs270;
@%p51 bra $L__BB0_28;
ld.global.u16 %rs269, [%rd40+80];
$L__BB0_28:
add.s64 %rd51, %rd25, %rd37; setp.ge.u64 %p52, %rd51, %rd2;
or.pred %p54, %p52, %p19;
@%p54 bra $L__BB0_30;
ld.global.u16 %rs270, [%rd40+82];
$L__BB0_30:
add.s64 %rd52, %rd26, %rd37; setp.ge.u64 %p55, %rd52, %rd2;
mov.u16 %rs272, 0;
or.pred %p57, %p55, %p19;
mov.u16 %rs271, %rs272;
@%p57 bra $L__BB0_32;
ld.global.u16 %rs271, [%rd40+96];
$L__BB0_32:
add.s64 %rd53, %rd27, %rd37; setp.ge.u64 %p58, %rd53, %rd2;
or.pred %p60, %p58, %p19;
@%p60 bra $L__BB0_34;
ld.global.u16 %rs272, [%rd40+98];
$L__BB0_34:
add.s64 %rd54, %rd28, %rd37; setp.ge.u64 %p61, %rd54, %rd2;
mov.u16 %rs274, 0;
or.pred %p63, %p61, %p19;
mov.u16 %rs273, %rs274;
@%p63 bra $L__BB0_36;
ld.global.u16 %rs273, [%rd40+112];
$L__BB0_36:
add.s64 %rd55, %rd29, %rd37; setp.ge.u64 %p64, %rd55, %rd2;
or.pred %p66, %p64, %p19;
@%p66 bra $L__BB0_38;
ld.global.u16 %rs274, [%rd40+114];
$L__BB0_38:
mul.lo.s64 %rd100, %rd2, %rd81;
add.s64 %rd101, %rd38, %rd100;
add.s64 %rd102, %rd101, %rd12;
setp.ge.u64 %p67, %rd30, %rd2;
shl.b64 %rd103, %rd102, 1;
add.s64 %rd106, %rd96, %rd103;
add.s64 %rd56, %rd98, %rd106;
mov.u16 %rs276, 0;
or.pred %p69, %p20, %p67;
mov.u16 %rs275, %rs276;
@%p69 bra $L__BB0_40;
ld.global.u16 %rs275, [%rd56];
$L__BB0_40:
or.pred %p72, %p22, %p67;
@%p72 bra $L__BB0_42;
ld.global.u16 %rs276, [%rd56+2];
$L__BB0_42:
mov.u16 %rs278, 0;
or.pred %p75, %p25, %p67;
mov.u16 %rs277, %rs278;
@%p75 bra $L__BB0_44;
ld.global.u16 %rs277, [%rd56+16];
$L__BB0_44:
or.pred %p78, %p28, %p67;
@%p78 bra $L__BB0_46;
ld.global.u16 %rs278, [%rd56+18];
$L__BB0_46:
mov.u16 %rs280, 0;
or.pred %p81, %p31, %p67;
mov.u16 %rs279, %rs280;
@%p81 bra $L__BB0_48;
ld.global.u16 %rs279, [%rd56+32];
$L__BB0_48:
or.pred %p84, %p34, %p67;
@%p84 bra $L__BB0_50;
ld.global.u16 %rs280, [%rd56+34];
$L__BB0_50:
mov.u16 %rs282, 0;
or.pred %p87, %p37, %p67;
mov.u16 %rs281, %rs282;
@%p87 bra $L__BB0_52;
ld.global.u16 %rs281, [%rd56+48];
$L__BB0_52:
or.pred %p90, %p40, %p67;
@%p90 bra $L__BB0_54;
ld.global.u16 %rs282, [%rd56+50];
$L__BB0_54:
mov.u16 %rs284, 0;
or.pred %p93, %p43, %p67;
mov.u16 %rs283, %rs284;
@%p93 bra $L__BB0_56;
ld.global.u16 %rs283, [%rd56+64];
$L__BB0_56:
or.pred %p96, %p46, %p67;
@%p96 bra $L__BB0_58;
ld.global.u16 %rs284, [%rd56+66];
$L__BB0_58:
mov.u16 %rs286, 0;
or.pred %p99, %p49, %p67;
mov.u16 %rs285, %rs286;
@%p99 bra $L__BB0_60;
ld.global.u16 %rs285, [%rd56+80];
$L__BB0_60:
or.pred %p102, %p52, %p67;
@%p102 bra $L__BB0_62;
ld.global.u16 %rs286, [%rd56+82];
$L__BB0_62:
mov.u16 %rs288, 0;
or.pred %p105, %p55, %p67;
mov.u16 %rs287, %rs288;
@%p105 bra $L__BB0_64;
ld.global.u16 %rs287, [%rd56+96];
$L__BB0_64:
or.pred %p108, %p58, %p67;
@%p108 bra $L__BB0_66;
ld.global.u16 %rs288, [%rd56+98];
$L__BB0_66:
mov.u16 %rs290, 0;
or.pred %p111, %p61, %p67;
mov.u16 %rs289, %rs290;
@%p111 bra $L__BB0_68;
ld.global.u16 %rs289, [%rd56+112];
$L__BB0_68:
or.pred %p114, %p64, %p67;
@%p114 bra $L__BB0_70;
ld.global.u16 %rs290, [%rd56+114];
$L__BB0_70:
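// S = Q * K^T: the zeroed %r700-%r731 below seed fp32 accumulators, and
// each mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 multiplies fp16
// Q fragments by fp16 K fragments, accumulating in fp32 across eight
// 8-column slices of the 64-wide tile, two k=16 steps per slice.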
mov.u32 %r700, 0; mov.u32 %r701, 0; mov.u32 %r702, 0; mov.u32 %r703, 0; // inline asm
mov.u32 %r704, 0; mov.u32 %r705, 0; mov.u32 %r706, 0; mov.u32 %r707, 0; // inline asm
mov.u32 %r708, 0; mov.u32 %r709, 0; mov.u32 %r710, 0; mov.u32 %r711, 0; // inline asm
mov.u32 %r712, 0; mov.u32 %r713, 0; mov.u32 %r714, 0; mov.u32 %r715, 0; // inline asm
mov.u32 %r716, 0; mov.u32 %r717, 0; mov.u32 %r718, 0; mov.u32 %r719, 0; // inline asm
mov.u32 %r720, 0; mov.u32 %r721, 0; mov.u32 %r722, 0; mov.u32 %r723, 0; // inline asm
mov.u32 %r724, 0; mov.u32 %r725, 0; mov.u32 %r726, 0; mov.u32 %r727, 0; // inline asm
mov.u32 %r728, 0; mov.u32 %r729, 0; mov.u32 %r730, 0; mov.u32 %r731, 0; // inline asm
mov.b32 %f228, %r700; mov.b32 %f229, %r701; mov.b32 %f230, %r702; mov.b32 %f231, %r703;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f228, %f229, %f230, %f231}, {%r449, %r450, %r451, %r452}, {%r1325, %r1324}, {%f228, %f229, %f230, %f231}; // inline asm
mov.b32 %f236, %r704; mov.b32 %f237, %r705; mov.b32 %f238, %r706; mov.b32 %f239, %r707;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f236, %f237, %f238, %f239}, {%r449, %r450, %r451, %r452}, {%r1323, %r1322}, {%f236, %f237, %f238, %f239}; // inline asm
mov.b32 %f244, %r708; mov.b32 %f245, %r709; mov.b32 %f246, %r710; mov.b32 %f247, %r711;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f244, %f245, %f246, %f247}, {%r449, %r450, %r451, %r452}, {%r1321, %r1320}, {%f244, %f245, %f246, %f247}; // inline asm
mov.b32 %f252, %r712; mov.b32 %f253, %r713; mov.b32 %f254, %r714; mov.b32 %f255, %r715;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f252, %f253, %f254, %f255}, {%r449, %r450, %r451, %r452}, {%r1319, %r1318}, {%f252, %f253, %f254, %f255}; // inline asm
mov.b32 %f260, %r716; mov.b32 %f261, %r717; mov.b32 %f262, %r718; mov.b32 %f263, %r719;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f260, %f261, %f262, %f263}, {%r449, %r450, %r451, %r452}, {%r1317, %r1316}, {%f260, %f261, %f262, %f263}; // inline asm
mov.b32 %f268, %r720; mov.b32 %f269, %r721; mov.b32 %f270, %r722; mov.b32 %f271, %r723;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f268, %f269, %f270, %f271}, {%r449, %r450, %r451, %r452}, {%r1315, %r1314}, {%f268, %f269, %f270, %f271}; // inline asm
mov.b32 %f276, %r724; mov.b32 %f277, %r725; mov.b32 %f278, %r726; mov.b32 %f279, %r727;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f276, %f277, %f278, %f279}, {%r449, %r450, %r451, %r452}, {%r1313, %r1312}, {%f276, %f277, %f278, %f279}; // inline asm
mov.b32 %f284, %r728; mov.b32 %f285, %r729; mov.b32 %f286, %r730; mov.b32 %f287, %r731;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f284, %f285, %f286, %f287}, {%r449, %r450, %r451, %r452}, {%r1311, %r1310}, {%f284, %f285, %f286, %f287}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f228, %f229, %f230, %f231}, {%r454, %r455, %r456, %r457}, {%r1309, %r1308}, {%f228, %f229, %f230, %f231}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f236, %f237, %f238, %f239}, {%r454, %r455, %r456, %r457}, {%r1307, %r1306}, {%f236, %f237, %f238, %f239}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f244, %f245, %f246, %f247}, {%r454, %r455, %r456, %r457}, {%r1305, %r1304}, {%f244, %f245, %f246, %f247}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f252, %f253, %f254, %f255}, {%r454, %r455, %r456, %r457}, {%r1303, %r1302}, {%f252, %f253, %f254, %f255}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f260, %f261, %f262, %f263}, {%r454, %r455, %r456, %r457}, {%r1301, %r1326}, {%f260, %f261, %f262, %f263}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f268, %f269, %f270, %f271}, {%r454, %r455, %r456, %r457}, {%r1327, %r1328}, {%f268, %f269, %f270, %f271}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f276, %f277, %f278, %f279}, {%r454, %r455, %r456, %r457}, {%r1329, %r1330}, {%f276, %f277, %f278, %f279}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f284, %f285, %f286, %f287}, {%r454, %r455, %r456, %r457}, {%r1331, %r1332}, {%f284, %f285, %f286, %f287}; // inline asm
mul.ftz.f32 %f292, %f1, %f228; mul.ftz.f32 %f293, %f1, %f229;
mul.ftz.f32 %f294, %f1, %f236; mul.ftz.f32 %f295, %f1, %f237;
mul.ftz.f32 %f296, %f1, %f230; mul.ftz.f32 %f297, %f1, %f231;
mul.ftz.f32 %f298, %f1, %f238; mul.ftz.f32 %f299, %f1, %f239;
mul.ftz.f32 %f300, %f1, %f244; mul.ftz.f32 %f301, %f1, %f245;
mul.ftz.f32 %f302, %f1, %f252; mul.ftz.f32 %f303, %f1, %f253;
mul.ftz.f32 %f304, %f1, %f246; mul.ftz.f32 %f305, %f1, %f247;
mul.ftz.f32 %f306, %f1, %f254; mul.ftz.f32 %f307, %f1, %f255;
mul.ftz.f32 %f308, %f1, %f260; mul.ftz.f32 %f309, %f1, %f261;
mul.ftz.f32 %f310, %f1, %f268; mul.ftz.f32 %f311, %f1, %f269;
mul.ftz.f32 %f312, %f1, %f262; mul.ftz.f32 %f313, %f1, %f263;
mul.ftz.f32 %f314, %f1, %f270; mul.ftz.f32 %f315, %f1, %f271;
mul.ftz.f32 %f316, %f1, %f276; mul.ftz.f32 %f317, %f1, %f277;
mul.ftz.f32 %f318, %f1, %f284; mul.ftz.f32 %f319, %f1, %f285;
mul.ftz.f32 %f320, %f1, %f278; mul.ftz.f32 %f321, %f1, %f279;
mul.ftz.f32 %f322, %f1, %f286; mul.ftz.f32 %f323, %f1, %f287;
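// The raw scores were just scaled by %f1 (loaded from param+64,
// presumably the softmax scale 1/sqrt(head_dim)). Below, selp.f32
// replaces every score whose absolute column index (%r1298 plus a
// lane-dependent offset) falls outside the sequence with -inf
// (0fFF800000), so padding columns carry no softmax weight; after that,
// the optional fp16 bias values are widened with cvt.f32.f16 and added in.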
setp.lt.s32 %p115, %r1298, %r1; selp.f32 %f767, %f292, 0fFF800000, %p115;
add.s32 %r828, %r1298, 1; setp.lt.s32 %p116, %r828, %r1; selp.f32 %f766, %f293, 0fFF800000, %p116;
add.s32 %r829, %r1298, 8; setp.lt.s32 %p117, %r829, %r1; selp.f32 %f765, %f294, 0fFF800000, %p117;
add.s32 %r830, %r1298, 9; setp.lt.s32 %p118, %r830, %r1; selp.f32 %f764, %f295, 0fFF800000, %p118;
add.s32 %r831, %r1298, 16; setp.lt.s32 %p119, %r831, %r1; selp.f32 %f763, %f300, 0fFF800000, %p119;
add.s32 %r832, %r1298, 17; setp.lt.s32 %p120, %r832, %r1; selp.f32 %f762, %f301, 0fFF800000, %p120;
add.s32 %r833, %r1298, 24; setp.lt.s32 %p121, %r833, %r1; selp.f32 %f761, %f302, 0fFF800000, %p121;
add.s32 %r834, %r1298, 25; setp.lt.s32 %p122, %r834, %r1; selp.f32 %f760, %f303, 0fFF800000, %p122;
add.s32 %r835, %r1298, 32; setp.lt.s32 %p123, %r835, %r1; selp.f32 %f759, %f308, 0fFF800000, %p123;
add.s32 %r836, %r1298, 33; setp.lt.s32 %p124, %r836, %r1; selp.f32 %f758, %f309, 0fFF800000, %p124;
add.s32 %r837, %r1298, 40; setp.lt.s32 %p125, %r837, %r1; selp.f32 %f757, %f310, 0fFF800000, %p125;
add.s32 %r838, %r1298, 41; setp.lt.s32 %p126, %r838, %r1; selp.f32 %f756, %f311, 0fFF800000, %p126;
add.s32 %r839, %r1298, 48; setp.lt.s32 %p127, %r839, %r1; selp.f32 %f755, %f316, 0fFF800000, %p127;
add.s32 %r840, %r1298, 49; setp.lt.s32 %p128, %r840, %r1; selp.f32 %f754, %f317, 0fFF800000, %p128;
add.s32 %r841, %r1298, 56; setp.lt.s32 %p129, %r841, %r1; selp.f32 %f753, %f318, 0fFF800000, %p129;
add.s32 %r842, %r1298, 57; setp.lt.s32 %p130, %r842, %r1; selp.f32 %f752, %f319, 0fFF800000, %p130;
selp.f32 %f751, %f296, 0fFF800000, %p115; selp.f32 %f750, %f297, 0fFF800000, %p116;
selp.f32 %f749, %f298, 0fFF800000, %p117; selp.f32 %f748, %f299, 0fFF800000, %p118;
selp.f32 %f747, %f304, 0fFF800000, %p119; selp.f32 %f746, %f305, 0fFF800000, %p120;
selp.f32 %f745, %f306, 0fFF800000, %p121; selp.f32 %f744, %f307, 0fFF800000, %p122;
selp.f32 %f743, %f312, 0fFF800000, %p123; selp.f32 %f742, %f313, 0fFF800000, %p124;
selp.f32 %f741, %f314, 0fFF800000, %p125; selp.f32 %f740, %f315, 0fFF800000, %p126;
selp.f32 %f739, %f320, 0fFF800000, %p127; selp.f32 %f738, %f321, 0fFF800000, %p128;
selp.f32 %f737, %f322, 0fFF800000, %p129; selp.f32 %f736, %f323, 0fFF800000, %p130;
@%p18 bra $L__BB0_72;
cvt.f32.f16 %f324, %rs259; // inline asm
add.ftz.f32 %f767, %f324, %f767;
cvt.f32.f16 %f325, %rs260; // inline asm
add.ftz.f32 %f766, %f325, %f766;
cvt.f32.f16 %f326, %rs261; // inline asm
add.ftz.f32 %f765, %f326, %f765;
cvt.f32.f16 %f327, %rs262; // inline asm
add.ftz.f32 %f764, %f327, %f764;
cvt.f32.f16 %f328, %rs263; // inline asm
add.ftz.f32 %f763, %f328, %f763;
cvt.f32.f16 %f329, %rs264; // inline asm
add.ftz.f32 %f762, %f329, %f762;
cvt.f32.f16 %f330, %rs265; // inline asm
add.ftz.f32 %f761, %f330, %f761;
cvt.f32.f16 %f331, %rs266; // inline asm
add.ftz.f32 %f760, %f331, %f760;
cvt.f32.f16 %f332, %rs267; // inline asm
add.ftz.f32 %f759, %f332, %f759;
cvt.f32.f16 %f333, %rs268; // inline asm
add.ftz.f32 %f758, %f333, %f758;
cvt.f32.f16 %f334, %rs269; // inline asm
add.ftz.f32 %f757, %f334, %f757;
cvt.f32.f16 %f335, %rs270; // inline asm
add.ftz.f32 %f756, %f335, %f756;
cvt.f32.f16 %f336, %rs271; // inline asm
add.ftz.f32 %f755, %f336, %f755;
cvt.f32.f16 %f337, %rs272; // inline asm
add.ftz.f32 %f754, %f337, %f754;
cvt.f32.f16 %f338, %rs273; // inline asm
add.ftz.f32 %f753, %f338, %f753;
cvt.f32.f16 %f339, %rs274; // inline asm
add.ftz.f32 %f752, %f339, %f752;
cvt.f32.f16 %f340, %rs275; // inline asm
add.ftz.f32 %f751, %f340, %f751;
cvt.f32.f16 %f341, %rs276; // inline asm
add.ftz.f32 %f750, %f341, %f750;
cvt.f32.f16 %f342, %rs277; // inline asm
add.ftz.f32 %f749, %f342, %f749;
cvt.f32.f16 %f343, %rs278; // inline asm
add.ftz.f32 %f748, %f343, %f748;
cvt.f32.f16 %f344, %rs279; // inline asm
add.ftz.f32 %f747, %f344, %f747;
cvt.f32.f16 %f345, %rs280; // inline asm
add.ftz.f32 %f746, %f345, %f746;
cvt.f32.f16 %f346, %rs281; // inline asm
add.ftz.f32 %f745, %f346, %f745;
cvt.f32.f16 %f347, %rs282; // inline asm
add.ftz.f32 %f744, %f347, %f744;
cvt.f32.f16 %f348, %rs283; // inline asm
add.ftz.f32 %f743, %f348, %f743;
cvt.f32.f16 %f349, %rs284; // inline asm
add.ftz.f32 %f742, %f349, %f742;
cvt.f32.f16 %f350, %rs285; // inline asm
add.ftz.f32 %f741, %f350, %f741;
cvt.f32.f16 %f351, %rs286; // inline asm
add.ftz.f32 %f740, %f351, %f740;
cvt.f32.f16 %f352, %rs287; // inline asm
add.ftz.f32 %f739, %f352, %f739;
cvt.f32.f16 %f353, %rs288; // inline asm
add.ftz.f32 %f738, %f353, %f738;
cvt.f32.f16 %f354, %rs289; // inline asm
add.ftz.f32 %f737, %f354, %f737;
cvt.f32.f16 %f355, %rs290; // inline asm
add.ftz.f32 %f736, %f355, %f736;
$L__BB0_72:
add.s32 %r1214, %r1297, 64;
setp.ge.s32 %p185, %r1214, %r104;
setp.gt.ftz.f32 %p133, %f767, %f766; selp.f32 %f356, %f767, %f766, %p133;
setp.gt.ftz.f32 %p134, %f356, %f765; selp.f32 %f357, %f356, %f765, %p134;
setp.gt.ftz.f32 %p135, %f357, %f764; selp.f32 %f358, %f357, %f764, %p135;
setp.gt.ftz.f32 %p136, %f358, %f763; selp.f32 %f359, %f358, %f763, %p136;
setp.gt.ftz.f32 %p137, %f359, %f762; selp.f32 %f360, %f359, %f762, %p137;
setp.gt.ftz.f32 %p138, %f360, %f761; selp.f32 %f361, %f360, %f761, %p138;
setp.gt.ftz.f32 %p139, %f361, %f760; selp.f32 %f362, %f361, %f760, %p139;
setp.gt.ftz.f32 %p140, %f362, %f759; selp.f32 %f363, %f362, %f759, %p140;
setp.gt.ftz.f32 %p141, %f363, %f758; selp.f32 %f364, %f363, %f758, %p141;
setp.gt.ftz.f32 %p142, %f364, %f757; selp.f32 %f365, %f364, %f757, %p142;
setp.gt.ftz.f32 %p143, %f365, %f756; selp.f32 %f366, %f365, %f756, %p143;
setp.gt.ftz.f32 %p144, %f366, %f755; selp.f32 %f367, %f366, %f755, %p144;
setp.gt.ftz.f32 %p145, %f367, %f754; selp.f32 %f368, %f367, %f754, %p145;
setp.gt.ftz.f32 %p146, %f368, %f753; selp.f32 %f369, %f368, %f753, %p146;
setp.gt.ftz.f32 %p147, %f369, %f752; selp.f32 %f370, %f369, %f752, %p147;
setp.gt.ftz.f32 %p148, %f751, %f750; selp.f32 %f371, %f751, %f750, %p148;
setp.gt.ftz.f32 %p149, %f371, %f749; selp.f32 %f372, %f371, %f749, %p149;
setp.gt.ftz.f32 %p150, %f372, %f748; selp.f32 %f373, %f372, %f748, %p150;
setp.gt.ftz.f32 %p151, %f373, %f747; selp.f32 %f374, %f373, %f747, %p151;
setp.gt.ftz.f32 %p152, %f374, %f746; selp.f32 %f375, %f374, %f746, %p152;
setp.gt.ftz.f32 %p153, %f375, %f745; selp.f32 %f376, %f375, %f745, %p153;
setp.gt.ftz.f32 %p154, %f376, %f744; selp.f32 %f377, %f376, %f744, %p154;
setp.gt.ftz.f32 %p155, %f377, %f743; selp.f32 %f378, %f377, %f743, %p155;
setp.gt.ftz.f32 %p156, %f378, %f742; selp.f32 %f379, %f378, %f742, %p156;
setp.gt.ftz.f32 %p157, %f379, %f741; selp.f32 %f380, %f379, %f741, %p157;
setp.gt.ftz.f32 %p158, %f380, %f740; selp.f32 %f381, %f380, %f740, %p158;
setp.gt.ftz.f32 %p159, %f381, %f739; selp.f32 %f382, %f381, %f739, %p159;
setp.gt.ftz.f32 %p160, %f382, %f738; selp.f32 %f383, %f382, %f738, %p160;
setp.gt.ftz.f32 %p161, %f383, %f737; selp.f32 %f384, %f383, %f737, %p161;
setp.gt.ftz.f32 %p162, %f384, %f736; selp.f32 %f385, %f384, %f736, %p162;
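// Row-max reduction: the setp/selp chains above folded each thread's 16
// masked scores per row group into two per-thread maxima (%f370, %f385);
// the shfl.sync.bfly.b32 rounds below (lane XOR 1, then XOR 2) combine
// the four lanes that share a row -- the running-max step of the online
// softmax.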
mov.b32 %r844, %f370;
mov.u32 %r845, 31;
mov.u32 %r846, 1;
mov.u32 %r847, -1;
shfl.sync.bfly.b32 %r848|%p163, %r844, %r846, %r845, %r847;
mov.b32 %f386, %r848;
setp.gt.ftz.f32 %p164, %f370, %f386; selp.f32 %f387, %f370, %f386, %p164;
mov.b32 %r849, %f387;
mov.u32 %r850, 2;
shfl.sync.bfly.b32 %r851|%p165, %r849, %r850, %r845, %r847;
mov.b32 %f388, %r851;
setp.gt.ftz.f32 %p166, %f387, %f388; selp.f32 %f389, %f387, %f388, %p166;
mov.b32 %r852, %f385;
shfl.sync.bfly.b32 %r853|%p167, %r852, %r846, %r845, %r847;
mov.b32 %f390, %r853;
setp.gt.ftz.f32 %p168, %f385, %f390; selp.f32 %f391, %f385, %f390, %p168;
mov.b32 %r854, %f391;
shfl.sync.bfly.b32 %r855|%p169, %r854, %r850, %r845, %r847;
mov.b32 %f392, %r855;
setp.gt.ftz.f32 %p170, %f391, %f392; selp.f32 %f393, %f391, %f392, %p170;
max.ftz.f32 %f102, %f389, %f735;
max.ftz.f32 %f103, %f393, %f734;
sub.ftz.f32 %f394, %f767, %f102; mul.ftz.f32 %f395, %f394, 0f3FB8AA3B; ex2.approx.ftz.f32 %f104, %f395;
sub.ftz.f32 %f396, %f766, %f102; mul.ftz.f32 %f397, %f396, 0f3FB8AA3B; ex2.approx.ftz.f32 %f105, %f397;
sub.ftz.f32 %f398, %f765, %f102; mul.ftz.f32 %f399, %f398, 0f3FB8AA3B; ex2.approx.ftz.f32 %f106, %f399;
sub.ftz.f32 %f400, %f764, %f102; mul.ftz.f32 %f401, %f400, 0f3FB8AA3B; ex2.approx.ftz.f32 %f107, %f401;
sub.ftz.f32 %f402, %f763, %f102; mul.ftz.f32 %f403, %f402, 0f3FB8AA3B; ex2.approx.ftz.f32 %f108, %f403;
sub.ftz.f32 %f404, %f762, %f102; mul.ftz.f32 %f405, %f404, 0f3FB8AA3B; ex2.approx.ftz.f32 %f109, %f405;
sub.ftz.f32 %f406, %f761, %f102; mul.ftz.f32 %f407, %f406, 0f3FB8AA3B; ex2.approx.ftz.f32 %f110, %f407;
sub.ftz.f32 %f408, %f760, %f102; mul.ftz.f32 %f409, %f408, 0f3FB8AA3B; ex2.approx.ftz.f32 %f111, %f409;
sub.ftz.f32 %f410, %f759, %f102; mul.ftz.f32 %f411, %f410, 0f3FB8AA3B; ex2.approx.ftz.f32 %f112, %f411;
sub.ftz.f32 %f412, %f758, %f102; mul.ftz.f32 %f413, %f412, 0f3FB8AA3B; ex2.approx.ftz.f32 %f113, %f413;
sub.ftz.f32 %f414, %f757, %f102; mul.ftz.f32 %f415, %f414, 0f3FB8AA3B; ex2.approx.ftz.f32 %f114, %f415;
sub.ftz.f32 %f416, %f756, %f102; mul.ftz.f32 %f417, %f416, 0f3FB8AA3B; ex2.approx.ftz.f32 %f115, %f417;
sub.ftz.f32 %f418, %f755, %f102; mul.ftz.f32 %f419, %f418, 0f3FB8AA3B; ex2.approx.ftz.f32 %f116, %f419;
sub.ftz.f32 %f420, %f754, %f102; mul.ftz.f32 %f421, %f420, 0f3FB8AA3B; ex2.approx.ftz.f32 %f117, %f421;
sub.ftz.f32 %f422, %f753, %f102; mul.ftz.f32 %f423, %f422, 0f3FB8AA3B; ex2.approx.ftz.f32 %f118, %f423;
sub.ftz.f32 %f424, %f752, %f102; mul.ftz.f32 %f425, %f424, 0f3FB8AA3B; ex2.approx.ftz.f32 %f119, %f425;
sub.ftz.f32 %f426, %f751, %f103; mul.ftz.f32 %f427, %f426, 0f3FB8AA3B; ex2.approx.ftz.f32 %f120, %f427;
sub.ftz.f32 %f428, %f750, %f103; mul.ftz.f32 %f429, %f428, 0f3FB8AA3B; ex2.approx.ftz.f32 %f121, %f429;
sub.ftz.f32 %f430, %f749, %f103; mul.ftz.f32 %f431, %f430, 0f3FB8AA3B; ex2.approx.ftz.f32 %f122, %f431;
sub.ftz.f32 %f432, %f748, %f103; mul.ftz.f32 %f433, %f432, 0f3FB8AA3B; ex2.approx.ftz.f32 %f123, %f433;
sub.ftz.f32 %f434, %f747, %f103; mul.ftz.f32 %f435, %f434, 0f3FB8AA3B; ex2.approx.ftz.f32 %f124, %f435;
sub.ftz.f32 %f436, %f746, %f103; mul.ftz.f32 %f437, %f436, 0f3FB8AA3B; ex2.approx.ftz.f32 %f125, %f437;
sub.ftz.f32 %f438, %f745, %f103; mul.ftz.f32 %f439, %f438, 0f3FB8AA3B; ex2.approx.ftz.f32 %f126, %f439;
sub.ftz.f32 %f440, %f744, %f103; mul.ftz.f32 %f441, %f440, 0f3FB8AA3B; ex2.approx.ftz.f32 %f127, %f441;
sub.ftz.f32 %f442, %f743, %f103; mul.ftz.f32 %f443, %f442, 0f3FB8AA3B; ex2.approx.ftz.f32 %f128, %f443;
sub.ftz.f32 %f444, %f742, %f103; mul.ftz.f32 %f445, %f444, 0f3FB8AA3B; ex2.approx.ftz.f32 %f129, %f445;
sub.ftz.f32 %f446, %f741, %f103; mul.ftz.f32 %f447, %f446, 0f3FB8AA3B; ex2.approx.ftz.f32 %f130, %f447;
sub.ftz.f32 %f448, %f740, %f103; mul.ftz.f32 %f449, %f448, 0f3FB8AA3B; ex2.approx.ftz.f32 %f131, %f449;
sub.ftz.f32 %f450, %f739, %f103; mul.ftz.f32 %f451, %f450, 0f3FB8AA3B; ex2.approx.ftz.f32 %f132, %f451;
sub.ftz.f32 %f452, %f738, %f103; mul.ftz.f32 %f453, %f452, 0f3FB8AA3B; ex2.approx.ftz.f32 %f133, %f453;
sub.ftz.f32 %f454, %f737, %f103; mul.ftz.f32 %f455, %f454, 0f3FB8AA3B; ex2.approx.ftz.f32 %f134, %f455;
sub.ftz.f32 %f456, %f736, %f103; mul.ftz.f32 %f457, %f456, 0f3FB8AA3B; ex2.approx.ftz.f32 %f135, %f457;
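// Exponentials: each score has the updated running max %f102/%f103
// subtracted, is scaled by log2(e) = 0f3FB8AA3B, and goes through
// ex2.approx.ftz.f32, i.e. exp(x - max) computed as 2^((x - max) * log2 e)
// -- the numerically stable softmax formulation. The adds below
// tree-reduce the 32 exponentials into per-thread partial row sums.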
add.ftz.f32 %f458, %f104, %f105; add.ftz.f32 %f459, %f458, 0f00000000;
add.ftz.f32 %f460, %f106, %f107; add.ftz.f32 %f461, %f460, 0f00000000;
add.ftz.f32 %f462, %f108, %f109; add.ftz.f32 %f463, %f459, %f462;
add.ftz.f32 %f464, %f110, %f111; add.ftz.f32 %f465, %f461, %f464;
add.ftz.f32 %f466, %f112, %f113; add.ftz.f32 %f467, %f463, %f466;
add.ftz.f32 %f468, %f114, %f115; add.ftz.f32 %f469, %f465, %f468;
add.ftz.f32 %f470, %f116, %f117; add.ftz.f32 %f471, %f467, %f470;
add.ftz.f32 %f472, %f118, %f119; add.ftz.f32 %f473, %f469, %f472;
add.ftz.f32 %f474, %f471, %f473;
add.ftz.f32 %f475, %f120, %f121; add.ftz.f32 %f476, %f475, 0f00000000;
add.ftz.f32 %f477, %f122, %f123; add.ftz.f32 %f478, %f477, 0f00000000;
add.ftz.f32 %f479, %f124, %f125; add.ftz.f32 %f480, %f476, %f479;
add.ftz.f32 %f481, %f126, %f127; add.ftz.f32 %f482, %f478, %f481;
add.ftz.f32 %f483, %f128, %f129; add.ftz.f32 %f484, %f480, %f483;
add.ftz.f32 %f485, %f130, %f131; add.ftz.f32 %f486, %f482, %f485;
add.ftz.f32 %f487, %f132, %f133; add.ftz.f32 %f488, %f484, %f487;
add.ftz.f32 %f489, %f134, %f135; add.ftz.f32 %f490, %f486, %f489;
add.ftz.f32 %f491, %f488, %f490;
mov.b32 %r856, %f474;
shfl.sync.bfly.b32 %r857|%p171, %r856, %r846, %r845, %r847;
mov.b32 %f492, %r857;
add.ftz.f32 %f493, %f474, %f492;
mov.b32 %r858, %f493;
shfl.sync.bfly.b32 %r859|%p172, %r858, %r850, %r845, %r847;
mov.b32 %f494, %r859;
add.ftz.f32 %f495, %f493, %f494;
mov.b32 %r860, %f491;
shfl.sync.bfly.b32 %r861|%p173, %r860, %r846, %r845, %r847;
mov.b32 %f496, %r861;
add.ftz.f32 %f497, %f491, %f496;
mov.b32 %r862, %f497;
shfl.sync.bfly.b32 %r863|%p174, %r862, %r850, %r845, %r847;
mov.b32 %f498, %r863;
add.ftz.f32 %f499, %f497, %f498;
sub.ftz.f32 %f500, %f735, %f102; mul.ftz.f32 %f501, %f500, 0f3FB8AA3B; ex2.approx.ftz.f32 %f502, %f501;
mul.ftz.f32 %f136, %f502, %f733;
add.ftz.f32 %f733, %f136, %f495;
sub.ftz.f32 %f503, %f734, %f103; mul.ftz.f32 %f504, %f503, 0f3FB8AA3B; ex2.approx.ftz.f32 %f505, %f504;
mul.ftz.f32 %f138, %f505, %f732;
add.ftz.f32 %f732, %f138, %f499;
@%p185 bra $L__BB0_74;
cp.async.wait_group 0; // inline asm
bar.sync 0;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1325, %r1324, %r1323, %r1322}, [%r463]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1321, %r1320, %r1319, %r1318}, [%r468]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1317, %r1316, %r1315, %r1314}, [%r473]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1313, %r1312, %r1311, %r1310}, [%r478]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1309, %r1308, %r1307, %r1306}, [%r483]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1305, %r1304, %r1303, %r1302}, [%r488]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1301, %r1326, %r1327, %r1328}, [%r493]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1329, %r1330, %r1331, %r1332}, [%r498]; // inline asm
$L__BB0_74:
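// The running denominators %f733/%f732 were just rescaled by
// exp(old_max - new_max) before the new probability mass was added, and
// the K fragments for the next tile were reloaded when another iteration
// remains. Below, the probabilities are repacked into fp16 pairs
// (cvt.rn.f16x2.f32) to serve as the A operand of the P * V product.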
cvt.rn.f16x2.f32 %r920, %f105, %f104; // inline asm
cvt.rn.f16x2.f32 %r921, %f121, %f120; // inline asm
cvt.rn.f16x2.f32 %r922, %f107, %f106; // inline asm
cvt.rn.f16x2.f32 %r923, %f123, %f122; // inline asm
cvt.rn.f16x2.f32 %r924, %f109, %f108; // inline asm
cvt.rn.f16x2.f32 %r925, %f125, %f124; // inline asm
cvt.rn.f16x2.f32 %r926, %f111, %f110; // inline asm
cvt.rn.f16x2.f32 %r927, %f127, %f126; // inline asm
cvt.rn.f16x2.f32 %r928, %f113, %f112; // inline asm
cvt.rn.f16x2.f32 %r929, %f129, %f128; // inline asm
cvt.rn.f16x2.f32 %r930, %f115, %f114; // inline asm
cvt.rn.f16x2.f32 %r931, %f131, %f130; // inline asm
cvt.rn.f16x2.f32 %r932, %f117, %f116; // inline asm
cvt.rn.f16x2.f32 %r933, %f133, %f132; // inline asm
cvt.rn.f16x2.f32 %r934, %f119, %f118; // inline asm
cvt.rn.f16x2.f32 %r935, %f135, %f134; // inline asm
mov.u32 %r936, 0; mov.u32 %r937, 0; mov.u32 %r938, 0; mov.u32 %r939, 0; // inline asm
mov.u32 %r940, 0; mov.u32 %r941, 0; mov.u32 %r942, 0; mov.u32 %r943, 0; // inline asm
mov.u32 %r944, 0; mov.u32 %r945, 0; mov.u32 %r946, 0; mov.u32 %r947, 0; // inline asm
mov.u32 %r948, 0; mov.u32 %r949, 0; mov.u32 %r950, 0; mov.u32 %r951, 0; // inline asm
mov.b32 %f570, %r936; mov.b32 %f571, %r937; mov.b32 %f572, %r938; mov.b32 %f573, %r939;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f570, %f571, %f572, %f573}, {%r920, %r921, %r922, %r923}, {%r1364, %r1363}, {%f570, %f571, %f572, %f573}; // inline asm
mov.b32 %f578, %r940; mov.b32 %f579, %r941; mov.b32 %f580, %r942; mov.b32 %f581, %r943;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f578, %f579, %f580, %f581}, {%r920, %r921, %r922, %r923}, {%r1362, %r1361}, {%f578, %f579, %f580, %f581}; // inline asm
mov.b32 %f586, %r944; mov.b32 %f587, %r945; mov.b32 %f588, %r946; mov.b32 %f589, %r947;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f586, %f587, %f588, %f589}, {%r920, %r921, %r922, %r923}, {%r1360, %r1359}, {%f586, %f587, %f588, %f589}; // inline asm
mov.b32 %f594, %r948; mov.b32 %f595, %r949; mov.b32 %f596, %r950; mov.b32 %f597, %r951;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f594, %f595, %f596, %f597}, {%r920, %r921, %r922, %r923}, {%r1358, %r1357}, {%f594, %f595, %f596, %f597}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f570, %f571, %f572, %f573}, {%r924, %r925, %r926, %r927}, {%r1356, %r1355}, {%f570, %f571, %f572, %f573}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f578, %f579, %f580, %f581}, {%r924, %r925, %r926, %r927}, {%r1354, %r1353}, {%f578, %f579, %f580, %f581}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f586, %f587, %f588, %f589}, {%r924, %r925, %r926, %r927}, {%r1352, %r1351}, {%f586, %f587, %f588, %f589}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f594, %f595, %f596, %f597}, {%r924, %r925, %r926, %r927}, {%r1350, %r1349}, {%f594, %f595, %f596, %f597}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f570, %f571, %f572, %f573}, {%r928, %r929, %r930, %r931}, {%r1348, %r1347}, {%f570, %f571, %f572, %f573}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f578, %f579, %f580, %f581}, {%r928, %r929, %r930, %r931}, {%r1346, %r1345}, {%f578, %f579, %f580, %f581}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f586, %f587, %f588, %f589}, {%r928, %r929, %r930, %r931}, {%r1344, %r1343}, {%f586, %f587, %f588, %f589}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f594, %f595, %f596, %f597}, {%r928, %r929, %r930, %r931}, {%r1342, %r1341}, {%f594, %f595, %f596, %f597}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f570, %f571, %f572, %f573}, {%r932, %r933, %r934, %r935}, {%r1340, %r1339}, {%f570, %f571, %f572, %f573}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f578, %f579, %f580, %f581}, {%r932, %r933, %r934, %r935}, {%r1338, %r1337}, {%f578, %f579, %f580, %f581}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f586, %f587, %f588, %f589}, {%r932, %r933, %r934, %r935}, {%r1336, %r1335}, {%f586, %f587, %f588, %f589}; // inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f594, %f595, %f596, %f597}, {%r932, %r933, %r934, %r935}, {%r1334, %r1333}, {%f594, %f595, %f596, %f597}; // inline asm
setp.equ.ftz.f32 %p175, %f733, 0f00000000;
mov.f32 %f769, 0f3F800000;
mov.f32 %f768, %f769;
@%p175 bra $L__BB0_76;
rcp.approx.ftz.f32 %f768, %f733;
$L__BB0_76:
setp.equ.ftz.f32 %p176, %f732, 0f00000000;
@%p176 bra $L__BB0_78;
rcp.approx.ftz.f32 %f769, %f732;
$L__BB0_78:
add.s32 %r1167, %r1297, 64;
setp.ge.s32 %p184, %r1167, %r104;
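// Output rescale: below, each O accumulator is updated as
// O = (%f136 * O_old + P*V) * (1/denominator), where %f136/%f138 fold the
// previous denominator and the max-shift correction together, and the
// reciprocals %f768/%f769 come from rcp.approx.ftz.f32 (held at 1.0 when
// the denominator is zero, per the guards above).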
mov.b32 %f668, %r1380; fma.rn.ftz.f32 %f669, %f136, %f668, %f570; mul.ftz.f32 %f670, %f768, %f669; mov.b32 %r1380, %f670;
mov.b32 %f671, %r1379; fma.rn.ftz.f32 %f672, %f136, %f671, %f571; mul.ftz.f32 %f673, %f768, %f672; mov.b32 %r1379, %f673;
mov.b32 %f674, %r1378; fma.rn.ftz.f32 %f675, %f138, %f674, %f572; mul.ftz.f32 %f676, %f769, %f675; mov.b32 %r1378, %f676;
mov.b32 %f677, %r1377; fma.rn.ftz.f32 %f678, %f138, %f677, %f573; mul.ftz.f32 %f679, %f769, %f678; mov.b32 %r1377, %f679;
mov.b32 %f680, %r1376; fma.rn.ftz.f32 %f681, %f136, %f680, %f578; mul.ftz.f32 %f682, %f768, %f681; mov.b32 %r1376, %f682;
mov.b32 %f683, %r1375; fma.rn.ftz.f32 %f684, %f136, %f683, %f579; mul.ftz.f32 %f685, %f768, %f684; mov.b32 %r1375, %f685;
mov.b32 %f686, %r1374; fma.rn.ftz.f32 %f687, %f138, %f686, %f580; mul.ftz.f32 %f688, %f769, %f687; mov.b32 %r1374, %f688;
mov.b32 %f689, %r1373; fma.rn.ftz.f32 %f690, %f138, %f689, %f581; mul.ftz.f32 %f691, %f769, %f690; mov.b32 %r1373, %f691;
mov.b32 %f692, %r1372; fma.rn.ftz.f32 %f693, %f136, %f692, %f586; mul.ftz.f32 %f694, %f768, %f693; mov.b32 %r1372, %f694;
mov.b32 %f695, %r1371; fma.rn.ftz.f32 %f696, %f136, %f695, %f587; mul.ftz.f32 %f697, %f768, %f696; mov.b32 %r1371, %f697;
mov.b32 %f698, %r1370; fma.rn.ftz.f32 %f699, %f138, %f698, %f588; mul.ftz.f32 %f700, %f769, %f699; mov.b32 %r1370, %f700;
mov.b32 %f701, %r1369; fma.rn.ftz.f32 %f702, %f138, %f701, %f589; mul.ftz.f32 %f703, %f769, %f702; mov.b32 %r1369, %f703;
mov.b32 %f704, %r1368; fma.rn.ftz.f32 %f705, %f136, %f704, %f594; mul.ftz.f32 %f706, %f768, %f705; mov.b32 %r1368, %f706;
mov.b32 %f707, %r1367; fma.rn.ftz.f32 %f708, %f136, %f707, %f595; mul.ftz.f32 %f709, %f768, %f708; mov.b32 %r1367, %f709;
mov.b32 %f710, %r1366; fma.rn.ftz.f32 %f711, %f138, %f710, %f596; mul.ftz.f32 %f712, %f769, %f711; mov.b32 %r1366, %f712;
mov.b32 %f713, %r1365; fma.rn.ftz.f32 %f714, %f138, %f713, %f597; mul.ftz.f32 %f715, %f769, %f714; mov.b32 %r1365, %f715;
@%p184 bra $L__BB0_80;
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1364, %r1363, %r1362, %r1361}, [%r503]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1360, %r1359, %r1358, %r1357}, [%r508]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1356, %r1355, %r1354, %r1353}, [%r513]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1352, %r1351, %r1350, %r1349}, [%r518]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1348, %r1347, %r1346, %r1345}, [%r523]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1344, %r1343, %r1342, %r1341}, [%r528]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1340, %r1339, %r1338, %r1337}, [%r533]; // inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1336, %r1335, %r1334, %r1333}, [%r538]; // inline asm
$L__BB0_80:
add.s32 %r1297, %r1297, 64;
setp.lt.s32 %p178, %r1297, %r104;
add.s32 %r1298, %r1298, 64;
mov.f32 %f734, %f103;
mov.f32 %f735, %f102;
@%p178 bra $L__BB0_3;
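// Epilogue ($L__BB0_81): the fp32 O accumulators are packed to fp16 pairs,
// staged through shared memory (the XOR/bfi address math undoes the MMA
// fragment swizzle so each thread can read back a contiguous 16-byte chunk
// of its output row), and written out as vectorized st.global.v4.u32
// stores, guarded by the same sequence-length bounds checks used on load.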
$L__BB0_81:
mov.u32 %r1198, %tid.x;
shr.s32 %r1197, %r1198, 31; shr.u32 %r1196, %r1197, 29; add.s32 %r1195, %r1198, %r1196; shr.s32 %r1194, %r1195, 3;
mov.u32 %r1193, _ZN25fused_multihead_attention5smem_E;
shr.u32 %r1192, %r1197, 30; add.s32 %r1191, %r1198, %r1192; and.b32 %r1190, %r1191, -4; sub.s32 %r1189, %r1198, %r1190;
shl.b32 %r1188, %r1189, 4;
cvt.s64.s32 %rd121, %r1188;
mov.b64 %rd120, fmha_v2_flash_attention_fp16_64_64_S_32_sm86_kernel_nl_param_0;
mov.u64 %rd119, %rd120;
ld.param.u32 %r1187, [%rd119+60];
mov.u32 %r1186, %ctaid.y;
mov.u32 %r1185, %ctaid.x;
ld.param.u32 %r1184, [fmha_v2_flash_attention_fp16_64_64_S_32_sm86_kernel_nl_param_0+52];
mul.lo.s32 %r1183, %r1, %r1186;
mad.lo.s32 %r1182, %r1183, %r1184, %r1185;
shr.s32 %r1181, %r1195, 31; shr.u32 %r1180, %r1181, 30; add.s32 %r1179, %r1194, %r1180;
and.b32 %r1178, %r1179, 268435452;
and.b32 %r1177, %r1195, 268435448; sub.s32 %r1176, %r1198, %r1177;
sub.s32 %r1175, %r1194, %r1178;
xor.b32 %r1174, %r1175, %r1176;
shl.b32 %r1173, %r1194, 7; shl.b32 %r1172, %r1174, 4;
add.s32 %r1171, %r1193, 4096;
add.s32 %r1170, %r1172, %r1173;
add.s32 %r1169, %r1170, %r1171;
bar.sync 0;
mov.b32 %f716, %r1379; mov.b32 %f717, %r1380;
cvt.rn.f16x2.f32 %r1107, %f716, %f717; // inline asm
mov.b32 %f718, %r1377; mov.b32 %f719, %r1378;
cvt.rn.f16x2.f32 %r1108, %f718, %f719; // inline asm
and.b32 %r1142, %r1198, 224; shr.u32 %r1143, %r1142, 2;
and.b32 %r1144, %r1198, 24; shr.u32 %r1145, %r1144, 3;
or.b32 %r1146, %r1143, %r1145;
shl.b32 %r1147, %r1146, 7;
add.s32 %r1149, %r1147, %r1193;
and.b32 %r1150, %r1198, 4; and.b32 %r1151, %r1198, 3;
bfi.b32 %r1152, %r1150, %r1151, 2, 30;
shr.u32 %r1153, %r1144, 1;
or.b32 %r1154, %r1152, %r1153;
shl.b32 %r1155, %r1154, 2;
add.s32 %r1156, %r1149, %r1155;
add.s32 %r1109, %r1156, 4096;
st.shared.b32 [%r1109], %r1107; // inline asm
add.s32 %r1111, %r1156, 4608;
st.shared.b32 [%r1111], %r1108; // inline asm
xor.b32 %r1115, %r1109, 16;
mov.b32 %f720, %r1375; mov.b32 %f721, %r1376;
cvt.rn.f16x2.f32 %r1113, %f720, %f721; // inline asm
mov.b32 %f722, %r1373; mov.b32 %f723, %r1374;
cvt.rn.f16x2.f32 %r1114, %f722, %f723; // inline asm
st.shared.b32 [%r1115], %r1113; // inline asm
add.s32 %r1117, %r1115, 512;
st.shared.b32 [%r1117], %r1114; // inline asm
xor.b32 %r1121, %r1109, 32;
mov.b32 %f724, %r1371; mov.b32 %f725, %r1372;
cvt.rn.f16x2.f32 %r1119, %f724, %f725; // inline asm
mov.b32 %f726, %r1369; mov.b32 %f727, %r1370;
cvt.rn.f16x2.f32 %r1120, %f726, %f727; // inline asm
st.shared.b32 [%r1121], %r1119; // inline asm
add.s32 %r1123, %r1121, 512;
st.shared.b32 [%r1123], %r1120; // inline asm
xor.b32 %r1127, %r1109, 48;
mov.b32 %f728, %r1367; mov.b32 %f729, %r1368;
cvt.rn.f16x2.f32 %r1125, %f728, %f729; // inline asm
mov.b32 %f730, %r1365; mov.b32 %f731, %r1366;
cvt.rn.f16x2.f32 %r1126, %f730, %f731; // inline asm
st.shared.b32 [%r1127], %r1125; // inline asm
add.s32 %r1129, %r1127, 512;
st.shared.b32 [%r1129], %r1126; // inline asm
bar.sync 0;
ld.shared.v4.b32 {%r1131, %r1132, %r1133, %r1134}, [%r1169]; // inline asm
add.s32 %r1140, %r1169, 2048;
ld.shared.v4.b32 {%r1136, %r1137, %r1138, %r1139}, [%r1140]; // inline asm
mul.lo.s32 %r1161, %r1182, %r1187;
shl.b32 %r1162, %r1161, 1;
cvt.s64.s32 %rd108, %r1162;
add.s64 %rd57, %rd108, %rd121;
cvt.u32.u64 %r1164, %rd5;
setp.ge.s32 %p179, %r1164, %r1;
@%p179 bra $L__BB0_86;
mov.b64 %rd123, fmha_v2_flash_attention_fp16_64_64_S_32_sm86_kernel_nl_param_0;
mov.u64 %rd122, %rd123;
ld.param.u32 %r1209, [%rd122+60];
mov.u32 %r1208, %tid.x;
shr.s32 %r1207, %r1209, 31; shr.u32 %r1206, %r1207, 29; add.s32 %r1205, %r1209, %r1206; shr.s32 %r1204, %r1205, 3;
shr.s32 %r1203, %r1208, 31; shr.u32 %r1202, %r1203, 30; add.s32 %r1201, %r1208, %r1202;
and.b32 %r1200, %r1201, -4; sub.s32 %r1199, %r1208, %r1200;
setp.ge.s32 %p180, %r1199, %r1204;
@%p180 bra $L__BB0_84;
mov.b64 %rd131, fmha_v2_flash_attention_fp16_64_64_S_32_sm86_kernel_nl_param_0;
mov.u64 %rd130, %rd131;
ld.param.u64 %rd129, [%rd130+8];
ld.param.u64 %rd128, [%rd130+32];
mul.lo.s64 %rd110, %rd128, %rd5;
add.s64 %rd111, %rd57, %rd110;
cvta.to.global.u64 %rd112, %rd129;
add.s64 %rd113, %rd112, %rd111;
st.global.v4.u32 [%rd113], {%r1131, %r1132, %r1133, %r1134};
$L__BB0_84:
add.s32 %r1166, %r1164, 32;
setp.ge.s32 %p181, %r1166, %r1;
or.pred %p183, %p181, %p180;
@%p183 bra $L__BB0_86;
mov.b64 %rd127, fmha_v2_flash_attention_fp16_64_64_S_32_sm86_kernel_nl_param_0;
mov.u64 %rd126, %rd127;
ld.param.u64 %rd125, [%rd126+8];
ld.param.u64 %rd124, [%rd126+32];
add.s64 %rd114, %rd5, 32;
mul.lo.s64 %rd115, %rd114, %rd124;
add.s64 %rd116, %rd57, %rd115;
cvta.to.global.u64 %rd117, %rd125;
add.s64 %rd118, %rd117, %rd116;
st.global.v4.u32 [%rd118], {%r1136, %r1137, %r1138, %r1139};
$L__BB0_86:
ret;
}