//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31678015
// Cuda compilation tools, release 11.8, V11.8.85
// Based on NVVM 7.0.1
//

.version 7.8
.target sm_86
.address_size 64

    // .globl fmha_v2_flash_attention_fp16_64_32_S_40_sm86_kernel_nl
.extern .shared .align 16 .b8 _ZN25fused_multihead_attention5smem_E[];

.visible .entry fmha_v2_flash_attention_fp16_64_32_S_40_sm86_kernel_nl(
    .param .align 8 .b8 fmha_v2_flash_attention_fp16_64_32_S_40_sm86_kernel_nl_param_0[168]
)
{
    .reg .pred %p<129>;
    .reg .b16 %rs<147>;
    .reg .f32 %f<554>;
    .reg .b32 %r<1340>;
    .reg .b64 %rd<159>;

    mov.b64 %rd42, fmha_v2_flash_attention_fp16_64_32_S_40_sm86_kernel_nl_param_0;
    mov.u64 %rd1, %rd42;
    ld.param.u32 %r1, [fmha_v2_flash_attention_fp16_64_32_S_40_sm86_kernel_nl_param_0+56];
    cvt.s64.s32 %rd2, %r1;
    ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_64_32_S_40_sm86_kernel_nl_param_0+52];
    mov.u32 %r418, %ctaid.z;
    shl.b32 %r3, %r418, 6;
    setp.le.s32 %p5, %r1, %r3;
    @%p5 bra $L__BB0_73;
    mov.u32 %r419, %tid.x;
    mov.u32 %r420, %ctaid.y;
    mov.u32 %r421, %ctaid.x;
    mul.lo.s32 %r422, %r1, %r420;
    mad.lo.s32 %r423, %r422, %r2, %r421;
    shr.s32 %r424, %r419, 31;
    shr.u32 %r425, %r424, 27;
    add.s32 %r426, %r419, %r425;
    and.b32 %r427, %r426, -32;
    sub.s32 %r428, %r419, %r427;
    shr.u32 %r429, %r424, 25;
    add.s32 %r430, %r419, %r429;
    shr.s32 %r431, %r430, 7;
    shl.b32 %r432, %r431, 4;
    shr.s32 %r433, %r428, 31;
    shr.u32 %r434, %r433, 30;
    add.s32 %r435, %r428, %r434;
    and.b32 %r436, %r435, 2147483644;
    sub.s32 %r437, %r428, %r436;
    shl.b32 %r438, %r437, 1;
    add.s32 %r1265, %r438, %r432;
    ld.param.u64 %rd3, [%rd1+16];
    ld.param.u64 %rd4, [%rd1+40];
    shr.s32 %r439, %r426, 5;
    shr.s32 %r440, %r426, 31;
    shr.u32 %r441, %r440, 30;
    add.s32 %r442, %r439, %r441;
    and.b32 %r443, %r442, 268435452;
    sub.s32 %r444, %r439, %r443;
    shl.b32 %r445, %r444, 4;
    shr.s32 %r446, %r435, 2;
    add.s32 %r5, %r445, %r446;
    shr.u32 %r447, %r424, 29;
    add.s32 %r448, %r419, %r447;
    and.b32 %r449, %r448, -8;
    sub.s32 %r6, %r419, %r449;
    setp.gt.s32 %p6, %r6, 4;
    shr.s32 %r7, %r448, 3;
    add.s32 %r450, %r7, %r3;
    cvt.s64.s32 %rd5, %r450;
    ld.param.u64 %rd6, [%rd1+24];
    mul.lo.s64 %rd43, %rd6, %rd5;
    mul.lo.s32 %r451, %r423, 3;
    mul.wide.s32 %rd44, %r451, 80;
    shl.b32 %r452, %r6, 4;
    cvt.s64.s32 %rd45, %r452;
    add.s64 %rd46, %rd44, %rd45;
    add.s64 %rd47, %rd46, %rd43;
    ld.param.u64 %rd48, [%rd1];
    add.s64 %rd7, %rd48, %rd47;
    shr.s32 %r453, %r448, 31;
    shr.u32 %r454, %r453, 29;
    add.s32 %r455, %r7, %r454;
    and.b32 %r456, %r455, 268435448;
    sub.s32 %r457, %r7, %r456;
    xor.b32 %r458, %r457, %r6;
    shl.b32 %r459, %r7, 7;
    shl.b32 %r460, %r458, 4;
    mov.u32 %r461, 31;
    mov.u32 %r462, 0;
    mov.u32 %r463, -1;
    shfl.sync.idx.b32 %r8|%p1, %r462, %r462, %r461, %r463;
    shfl.sync.idx.b32 %r21|%p7, %r462, %r462, %r461, %r463;
    and.b32 %r464, %r419, 96;
    shr.u32 %r465, %r464, 1;
    and.b32 %r466, %r419, 15;
    or.b32 %r467, %r465, %r466;
    and.b32 %r468, %r419, 7;
    shl.b32 %r469, %r419, 4;
    and.b32 %r470, %r469, 112;
    and.b32 %r471, %r419, 16;
    xor.b32 %r472, %r470, %r471;
    cvt.s64.s32 %rd49, %r7;
    mul.lo.s64 %rd50, %rd6, %rd49;
    add.s64 %rd51, %rd46, %rd50;
    add.s64 %rd52, %rd48, %rd51;
    add.s64 %rd158, %rd52, 80;
    shfl.sync.idx.b32 %r9|%p2, %r462, %r462, %r461, %r463;
    shfl.sync.idx.b32 %r10|%p3, %r462, %r462, %r461, %r463;
    shr.u32 %r473, %r471, 1;
    or.b32 %r474, %r473, %r468;
    and.b32 %r475, %r419, 8;
    shr.u32 %r476, %r475, 3;
    xor.b32 %r477, %r476, %r468;
    add.s64 %rd157, %rd52, 160;
    shfl.sync.idx.b32 %r478|%p8, %r462, %r462, %r461, %r463;
    shfl.sync.idx.b32 %r11|%p4, %r462, %r462, %r461, %r463;
    ld.param.u64 %rd12, [%rd1+32];
    ld.param.u64 %rd13, [%rd1+8];
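    //
    // [annotation, not compiler output] The pointer arithmetic above appears
    // to address a packed QKV tensor (3 x 80 bytes per token, i.e. fp16 head
    // size 40 padded to 16-byte chunks): %rd7 -> Q tile, %rd158 (+80) -> K
    // tile, %rd157 (+160) -> V tile. The block below stages those tiles into
    // shared memory with predicated 16-byte cp.async copies and zero-fills
    // the rows that fall outside the remaining sequence length.
    //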
    sub.s32 %r479, %r1, %r3;
    min.s32 %r13, %r479, 64;
    shl.b32 %r483, %r419, 7;
    and.b32 %r484, %r483, 1920;
    shl.b32 %r485, %r477, 4;
    shl.b32 %r486, %r474, 7;
    shl.b32 %r487, %r467, 7;
    add.s32 %r15, %r7, 16;
    add.s32 %r16, %r460, %r459;
    or.b32 %r17, %r487, %r472;
    or.b32 %r18, %r486, %r485;
    or.b32 %r19, %r472, %r484;
    mov.u32 %r488, _ZN25fused_multihead_attention5smem_E;
    add.s32 %r489, %r16, %r488;
    add.s32 %r20, %r489, 8192;
    @%p6 bra $L__BB0_3;
    shl.b64 %rd57, %rd6, 4;
    add.s32 %r498, %r7, 48;
    setp.lt.s32 %p9, %r498, %r13;
    add.s32 %r499, %r7, 32;
    setp.lt.s32 %p10, %r499, %r13;
    add.s32 %r490, %r489, %r21;
    add.s32 %r492, %r490, 2048;
    add.s32 %r494, %r490, 4096;
    add.s32 %r496, %r490, 6144;
    setp.lt.s32 %p11, %r7, %r13;
    selp.b32 %r491, 16, 0, %p11;
    // begin inline asm
    cp.async.cg.shared.global [%r490], [%rd7], 16, %r491;
    // end inline asm
    setp.lt.s32 %p12, %r15, %r13;
    selp.b32 %r493, 16, 0, %p12;
    add.s64 %rd54, %rd7, %rd57;
    // begin inline asm
    cp.async.cg.shared.global [%r492], [%rd54], 16, %r493;
    // end inline asm
    selp.b32 %r495, 16, 0, %p10;
    add.s64 %rd55, %rd54, %rd57;
    // begin inline asm
    cp.async.cg.shared.global [%r494], [%rd55], 16, %r495;
    // end inline asm
    selp.b32 %r497, 16, 0, %p9;
    add.s64 %rd56, %rd55, %rd57;
    // begin inline asm
    cp.async.cg.shared.global [%r496], [%rd56], 16, %r497;
    // end inline asm
$L__BB0_3:
    min.s32 %r23, %r1, 32;
    @%p6 bra $L__BB0_5;
    setp.lt.s32 %p14, %r15, %r23;
    add.s32 %r502, %r20, %r10;
    add.s32 %r504, %r502, 2048;
    setp.lt.s32 %p15, %r7, %r23;
    selp.b32 %r503, 16, 0, %p15;
    // begin inline asm
    cp.async.cg.shared.global [%r502], [%rd158], 16, %r503;
    // end inline asm
    selp.b32 %r505, 16, 0, %p14;
    shl.b64 %rd60, %rd6, 4;
    add.s64 %rd59, %rd158, %rd60;
    // begin inline asm
    cp.async.cg.shared.global [%r504], [%rd59], 16, %r505;
    // end inline asm
$L__BB0_5:
    @%p6 bra $L__BB0_7;
    setp.lt.s32 %p17, %r15, %r23;
    add.s32 %r512, %r489, %r11;
    add.s32 %r506, %r512, 12288;
    add.s32 %r508, %r512, 14336;
    setp.lt.s32 %p18, %r7, %r23;
    selp.b32 %r507, 16, 0, %p18;
    // begin inline asm
    cp.async.cg.shared.global [%r506], [%rd157], 16, %r507;
    // end inline asm
    selp.b32 %r509, 16, 0, %p17;
    shl.b64 %rd63, %rd6, 4;
    add.s64 %rd62, %rd157, %rd63;
    // begin inline asm
    cp.async.cg.shared.global [%r508], [%rd62], 16, %r509;
    // end inline asm
$L__BB0_7:
    setp.lt.s32 %p19, %r6, 5;
    // begin inline asm
    cp.async.commit_group;
    // end inline asm
    @%p19 bra $L__BB0_9;
    add.s32 %r513, %r489, %r21;
    add.s32 %r518, %r513, 2048;
    add.s32 %r523, %r513, 4096;
    add.s32 %r528, %r513, 6144;
    mov.u32 %r552, 0;
    // begin inline asm
    st.shared.v4.b32 [%r513], {%r552, %r552, %r552, %r552};
    // end inline asm
    // begin inline asm
    st.shared.v4.b32 [%r518], {%r552, %r552, %r552, %r552};
    // end inline asm
    // begin inline asm
    st.shared.v4.b32 [%r523], {%r552, %r552, %r552, %r552};
    // end inline asm
    // begin inline asm
    st.shared.v4.b32 [%r528], {%r552, %r552, %r552, %r552};
    // end inline asm
    add.s32 %r533, %r20, %r10;
    add.s32 %r538, %r533, 2048;
    // begin inline asm
    st.shared.v4.b32 [%r533], {%r552, %r552, %r552, %r552};
    // end inline asm
    // begin inline asm
    st.shared.v4.b32 [%r538], {%r552, %r552, %r552, %r552};
    // end inline asm
    add.s32 %r555, %r489, %r11;
    add.s32 %r543, %r555, 12288;
    add.s32 %r548, %r555, 14336;
    // begin inline asm
    st.shared.v4.b32 [%r543], {%r552, %r552, %r552, %r552};
    // end inline asm
    // begin inline asm
    st.shared.v4.b32 [%r548], {%r552, %r552, %r552, %r552};
    // end inline asm
$L__BB0_9:
    // begin inline asm
    cp.async.wait_group 0;
    // end inline asm
    bar.sync 0;
    add.s32 %r656, %r8, %r488;
    add.s32 %r560, %r656, %r17;
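    //
    // [annotation] ldmatrix pulls the Q fragments (three x4 loads through
    // %r560/%r565/%r570) and the K fragments (the 8192-byte shared-memory
    // region) into registers for the tensor-core MMAs; the .trans variants
    // further below load the V fragments transposed for the second GEMM.
    //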
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r556, %r557, %r558, %r559}, [%r560];
    // end inline asm
    xor.b32 %r657, %r17, 32;
    add.s32 %r565, %r656, %r657;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r561, %r562, %r563, %r564}, [%r565];
    // end inline asm
    xor.b32 %r658, %r17, 64;
    add.s32 %r570, %r656, %r658;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r566, %r567, %r568, %r569}, [%r570];
    // end inline asm
    add.s32 %r659, %r9, %r488;
    add.s32 %r37, %r659, 8192;
    add.s32 %r575, %r37, %r18;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1288, %r1287, %r1286, %r1285}, [%r575];
    // end inline asm
    add.s32 %r580, %r575, 2048;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1284, %r1283, %r1282, %r1281}, [%r580];
    // end inline asm
    xor.b32 %r660, %r18, 32;
    add.s32 %r585, %r37, %r660;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1280, %r1279, %r1278, %r1277}, [%r585];
    // end inline asm
    add.s32 %r590, %r585, 2048;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1276, %r1275, %r1274, %r1273}, [%r590];
    // end inline asm
    xor.b32 %r661, %r18, 64;
    add.s32 %r595, %r37, %r661;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1272, %r1271, %r1270, %r1269}, [%r595];
    // end inline asm
    add.s32 %r600, %r595, 2048;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1268, %r1289, %r1290, %r1291}, [%r600];
    // end inline asm
    add.s32 %r662, %r488, 12288;
    add.s32 %r605, %r19, %r662;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1315, %r1314, %r1313, %r1312}, [%r605];
    // end inline asm
    xor.b32 %r663, %r19, 32;
    add.s32 %r610, %r663, %r662;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1311, %r1310, %r1309, %r1308}, [%r610];
    // end inline asm
    xor.b32 %r664, %r19, 64;
    add.s32 %r615, %r664, %r662;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1307, %r1306, %r1305, %r1304}, [%r615];
    // end inline asm
    add.s32 %r665, %r488, 14336;
    add.s32 %r620, %r19, %r665;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1303, %r1302, %r1301, %r1300}, [%r620];
    // end inline asm
    add.s32 %r625, %r663, %r665;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1299, %r1298, %r1297, %r1296}, [%r625];
    // end inline asm
    add.s32 %r630, %r664, %r665;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1295, %r1294, %r1293, %r1292}, [%r630];
    // end inline asm
    ld.param.f32 %f1, [%rd1+64];
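    //
    // [annotation] %f1 is the softmax scale read from the parameter block
    // (offset +64); the mov.u32 ..., 0 pairs below zero the 24 b32 registers
    // that carry the running output accumulator (as f32 bit patterns) across
    // main-loop iterations.
    //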
    // begin inline asm
    mov.u32 %r1339, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1338, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1337, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1336, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1335, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1334, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1333, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1332, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1331, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1330, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1329, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1328, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1327, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1326, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1325, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1324, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1323, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1322, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1321, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1320, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1319, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1318, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1317, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r1316, 0;
    // end inline asm
    add.s32 %r666, %r1, 31;
    shr.s32 %r667, %r666, 31;
    shr.u32 %r668, %r667, 27;
    add.s32 %r669, %r666, %r668;
    and.b32 %r110, %r669, -32;
    setp.lt.s32 %p20, %r1, 1;
    @%p20 bra $L__BB0_62;
    ld.param.u8 %rs1, [%rd1+160];
    add.s32 %r673, %r489, %r11;
    add.s32 %r171, %r673, 12288;
    add.s32 %r172, %r20, %r10;
    add.s32 %r173, %r172, 2048;
    cvt.s64.s32 %rd14, %r5;
    cvt.s64.s32 %rd15, %r1265;
    add.s32 %r174, %r673, 14336;
    add.s32 %r674, %r1265, 1;
    cvt.s64.s32 %rd16, %r674;
    add.s32 %r675, %r1265, 8;
    cvt.s64.s32 %rd17, %r675;
    add.s32 %r676, %r1265, 9;
    cvt.s64.s32 %rd18, %r676;
    add.s32 %r677, %r1265, 16;
    cvt.s64.s32 %rd19, %r677;
    add.s32 %r678, %r1265, 17;
    cvt.s64.s32 %rd20, %r678;
    add.s32 %r679, %r1265, 24;
    cvt.s64.s32 %rd21, %r679;
    add.s32 %r680, %r1265, 25;
    cvt.s64.s32 %rd22, %r680;
    add.s32 %r175, %r5, 8;
    mov.u32 %r1264, 0;
    mov.f32 %f534, 0fFF800000;
    mov.f32 %f532, 0f00000000;
    mov.u32 %r1266, %r1;
    mov.f32 %f533, %f532;
    mov.f32 %f535, %f534;
    mov.u32 %r1267, %r1;
$L__BB0_11:
    add.s32 %r681, %r1264, 32;
    setp.ge.s32 %p21, %r681, %r110;
    @%p21 bra $L__BB0_18;
    bar.sync 0;
    shl.b64 %rd64, %rd6, 5;
    add.s64 %rd25, %rd158, %rd64;
    add.s32 %r1267, %r1267, -32;
    @%p6 bra $L__BB0_14;
    min.s32 %r686, %r1267, 32;
    setp.lt.s32 %p23, %r15, %r686;
    setp.lt.s32 %p24, %r7, %r686;
    selp.b32 %r683, 16, 0, %p24;
    // begin inline asm
    cp.async.cg.shared.global [%r172], [%rd25], 16, %r683;
    // end inline asm
    selp.b32 %r685, 16, 0, %p23;
    mul.lo.s64 %rd67, %rd6, 48;
    add.s64 %rd66, %rd158, %rd67;
    // begin inline asm
    cp.async.cg.shared.global [%r173], [%rd66], 16, %r685;
    // end inline asm
$L__BB0_14:
    add.s64 %rd26, %rd157, %rd64;
    add.s32 %r1266, %r1266, -32;
    @%p6 bra $L__BB0_16;
    min.s32 %r692, %r1266, 32;
    setp.lt.s32 %p26, %r15, %r692;
    setp.lt.s32 %p27, %r7, %r692;
    selp.b32 %r689, 16, 0, %p27;
    // begin inline asm
    cp.async.cg.shared.global [%r171], [%rd26], 16, %r689;
    // end inline asm
    selp.b32 %r691, 16, 0, %p26;
    mul.lo.s64 %rd71, %rd6, 48;
    add.s64 %rd70, %rd157, %rd71;
    // begin inline asm
    cp.async.cg.shared.global [%r174], [%rd70], 16, %r691;
    // end inline asm
$L__BB0_16:
    // begin inline asm
    cp.async.commit_group;
    // end inline asm
    mov.u64 %rd157, %rd26;
    mov.u64 %rd158, %rd25;
    @%p19 bra $L__BB0_18;
    mov.u32 %r713, 0;
    // begin inline asm
    st.shared.v4.b32 [%r172], {%r713, %r713, %r713, %r713};
    // end inline asm
    // begin inline asm
    st.shared.v4.b32 [%r173], {%r713, %r713, %r713, %r713};
    // end inline asm
    // begin inline asm
    st.shared.v4.b32 [%r171], {%r713, %r713, %r713, %r713};
    // end inline asm
    // begin inline asm
    st.shared.v4.b32 [%r174], {%r713, %r713, %r713, %r713};
    // end inline asm
    mov.u64 %rd157, %rd26;
    mov.u64 %rd158, %rd25;
$L__BB0_18:
    setp.eq.s16 %p29, %rs1, 0;
    @%p29 bra $L__BB0_51;
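    //
    // [annotation] The flag byte at parameter offset +160 (%rs1) gates the
    // block below, which gathers sixteen fp16 values per thread with full
    // bounds predication; by appearance this is an additive attention bias
    // (or packed mask) tensor addressed through the pointer at offset +16.
    //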
    mov.u32 %r1188, %ctaid.x;
    mov.u32 %r1187, %ctaid.y;
    mov.u32 %r1186, %ctaid.z;
    shl.b32 %r1185, %r1186, 6;
    ld.param.u32 %r1184, [fmha_v2_flash_attention_fp16_64_32_S_40_sm86_kernel_nl_param_0+52];
    cvt.s64.s32 %rd72, %r1185;
    add.s64 %rd73, %rd14, %rd72;
    setp.ge.u64 %p30, %rd73, %rd2;
    mul.lo.s32 %r716, %r1, %r1185;
    cvt.s64.s32 %rd74, %r716;
    cvt.u64.u32 %rd29, %r1264;
    add.s64 %rd30, %rd74, %rd29;
    mul.lo.s64 %rd75, %rd2, %rd14;
    add.s64 %rd76, %rd30, %rd75;
    add.s64 %rd77, %rd76, %rd15;
    add.s64 %rd31, %rd15, %rd29;
    setp.ge.u64 %p31, %rd31, %rd2;
    shl.b64 %rd78, %rd77, 1;
    mad.lo.s32 %r719, %r1184, %r1187, %r1188;
    cvt.s64.s32 %rd79, %r719;
    mul.lo.s64 %rd80, %rd4, %rd79;
    add.s64 %rd81, %rd80, %rd78;
    cvta.to.global.u64 %rd82, %rd3;
    add.s64 %rd32, %rd82, %rd81;
    mov.u16 %rs132, 0;
    or.pred %p32, %p31, %p30;
    mov.u16 %rs131, %rs132;
    @%p32 bra $L__BB0_21;
    ld.global.u16 %rs131, [%rd32];
$L__BB0_21:
    add.s64 %rd33, %rd16, %rd29;
    setp.ge.u64 %p34, %rd33, %rd2;
    or.pred %p35, %p34, %p30;
    @%p35 bra $L__BB0_23;
    ld.global.u16 %rs132, [%rd32+2];
$L__BB0_23:
    add.s64 %rd34, %rd17, %rd29;
    setp.ge.u64 %p37, %rd34, %rd2;
    mov.u16 %rs134, 0;
    or.pred %p38, %p37, %p30;
    mov.u16 %rs133, %rs134;
    @%p38 bra $L__BB0_25;
    ld.global.u16 %rs133, [%rd32+16];
$L__BB0_25:
    add.s64 %rd35, %rd18, %rd29;
    setp.ge.u64 %p40, %rd35, %rd2;
    or.pred %p41, %p40, %p30;
    @%p41 bra $L__BB0_27;
    ld.global.u16 %rs134, [%rd32+18];
$L__BB0_27:
    add.s64 %rd36, %rd19, %rd29;
    setp.ge.u64 %p43, %rd36, %rd2;
    mov.u16 %rs136, 0;
    or.pred %p44, %p43, %p30;
    mov.u16 %rs135, %rs136;
    @%p44 bra $L__BB0_29;
    ld.global.u16 %rs135, [%rd32+32];
$L__BB0_29:
    add.s64 %rd37, %rd20, %rd29;
    setp.ge.u64 %p46, %rd37, %rd2;
    or.pred %p47, %p46, %p30;
    @%p47 bra $L__BB0_31;
    ld.global.u16 %rs136, [%rd32+34];
$L__BB0_31:
    add.s64 %rd38, %rd21, %rd29;
    setp.ge.u64 %p49, %rd38, %rd2;
    mov.u16 %rs138, 0;
    or.pred %p50, %p49, %p30;
    mov.u16 %rs137, %rs138;
    @%p50 bra $L__BB0_33;
    ld.global.u16 %rs137, [%rd32+48];
$L__BB0_33:
    add.s64 %rd39, %rd22, %rd29;
    setp.ge.u64 %p52, %rd39, %rd2;
    or.pred %p53, %p52, %p30;
    @%p53 bra $L__BB0_35;
    ld.global.u16 %rs138, [%rd32+50];
$L__BB0_35:
    cvt.s64.s32 %rd98, %r175;
    add.s64 %rd99, %rd98, %rd72;
    setp.ge.u64 %p54, %rd99, %rd2;
    mul.lo.s64 %rd100, %rd2, %rd98;
    add.s64 %rd101, %rd30, %rd100;
    add.s64 %rd102, %rd101, %rd15;
    shl.b64 %rd103, %rd102, 1;
    add.s64 %rd106, %rd80, %rd103;
    add.s64 %rd40, %rd82, %rd106;
    mov.u16 %rs140, 0;
    or.pred %p56, %p31, %p54;
    mov.u16 %rs139, %rs140;
    @%p56 bra $L__BB0_37;
    ld.global.u16 %rs139, [%rd40];
$L__BB0_37:
    or.pred %p59, %p34, %p54;
    @%p59 bra $L__BB0_39;
    ld.global.u16 %rs140, [%rd40+2];
$L__BB0_39:
    mov.u16 %rs142, 0;
    or.pred %p62, %p37, %p54;
    mov.u16 %rs141, %rs142;
    @%p62 bra $L__BB0_41;
    ld.global.u16 %rs141, [%rd40+16];
$L__BB0_41:
    or.pred %p65, %p40, %p54;
    @%p65 bra $L__BB0_43;
    ld.global.u16 %rs142, [%rd40+18];
$L__BB0_43:
    mov.u16 %rs144, 0;
    or.pred %p68, %p43, %p54;
    mov.u16 %rs143, %rs144;
    @%p68 bra $L__BB0_45;
    ld.global.u16 %rs143, [%rd40+32];
$L__BB0_45:
    or.pred %p71, %p46, %p54;
    @%p71 bra $L__BB0_47;
    ld.global.u16 %rs144, [%rd40+34];
$L__BB0_47:
    mov.u16 %rs146, 0;
    or.pred %p74, %p49, %p54;
    mov.u16 %rs145, %rs146;
    @%p74 bra $L__BB0_49;
    ld.global.u16 %rs145, [%rd40+48];
$L__BB0_49:
    or.pred %p77, %p52, %p54;
    @%p77 bra $L__BB0_51;
    ld.global.u16 %rs146, [%rd40+50];
$L__BB0_51:
    // begin inline asm
    mov.u32 %r753, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r754, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r755, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r756, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r757, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r758, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r759, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r760, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r761, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r762, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r763, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r764, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r765, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r766, 0;
    // end inline asm
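    //
    // [annotation] First GEMM of the iteration: the m16n8k16 HMMA ops below
    // compute an fp32 S = Q*K^T tile from the fp16 fragments, scale it by
    // %f1, and force columns beyond the sequence length to -inf (0fFF800000)
    // so they vanish under softmax; the optional fp16 bias values gathered
    // above are then added in.
    //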
    // begin inline asm
    mov.u32 %r767, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r768, 0;
    // end inline asm
    mov.b32 %f140, %r753;
    mov.b32 %f141, %r754;
    mov.b32 %f142, %r755;
    mov.b32 %f143, %r756;
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f140, %f141, %f142, %f143}, {%r556, %r557, %r558, %r559}, {%r1288, %r1287}, {%f140, %f141, %f142, %f143};
    // end inline asm
    mov.b32 %f148, %r757;
    mov.b32 %f149, %r758;
    mov.b32 %f150, %r759;
    mov.b32 %f151, %r760;
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r556, %r557, %r558, %r559}, {%r1286, %r1285}, {%f148, %f149, %f150, %f151};
    // end inline asm
    mov.b32 %f156, %r761;
    mov.b32 %f157, %r762;
    mov.b32 %f158, %r763;
    mov.b32 %f159, %r764;
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r556, %r557, %r558, %r559}, {%r1284, %r1283}, {%f156, %f157, %f158, %f159};
    // end inline asm
    mov.b32 %f164, %r765;
    mov.b32 %f165, %r766;
    mov.b32 %f166, %r767;
    mov.b32 %f167, %r768;
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r556, %r557, %r558, %r559}, {%r1282, %r1281}, {%f164, %f165, %f166, %f167};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f140, %f141, %f142, %f143}, {%r561, %r562, %r563, %r564}, {%r1280, %r1279}, {%f140, %f141, %f142, %f143};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r561, %r562, %r563, %r564}, {%r1278, %r1277}, {%f148, %f149, %f150, %f151};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r561, %r562, %r563, %r564}, {%r1276, %r1275}, {%f156, %f157, %f158, %f159};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r561, %r562, %r563, %r564}, {%r1274, %r1273}, {%f164, %f165, %f166, %f167};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f140, %f141, %f142, %f143}, {%r566, %r567, %r568, %r569}, {%r1272, %r1271}, {%f140, %f141, %f142, %f143};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r566, %r567, %r568, %r569}, {%r1270, %r1269}, {%f148, %f149, %f150, %f151};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r566, %r567, %r568, %r569}, {%r1268, %r1289}, {%f156, %f157, %f158, %f159};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r566, %r567, %r568, %r569}, {%r1290, %r1291}, {%f164, %f165, %f166, %f167};
    // end inline asm
    mul.ftz.f32 %f204, %f1, %f140;
    mul.ftz.f32 %f205, %f1, %f141;
    mul.ftz.f32 %f206, %f1, %f148;
    mul.ftz.f32 %f207, %f1, %f149;
    mul.ftz.f32 %f208, %f1, %f142;
    mul.ftz.f32 %f209, %f1, %f143;
    mul.ftz.f32 %f210, %f1, %f150;
    mul.ftz.f32 %f211, %f1, %f151;
    mul.ftz.f32 %f212, %f1, %f156;
    mul.ftz.f32 %f213, %f1, %f157;
    mul.ftz.f32 %f214, %f1, %f164;
    mul.ftz.f32 %f215, %f1, %f165;
    mul.ftz.f32 %f216, %f1, %f158;
    mul.ftz.f32 %f217, %f1, %f159;
    mul.ftz.f32 %f218, %f1, %f166;
    mul.ftz.f32 %f219, %f1, %f167;
    setp.lt.s32 %p78, %r1265, %r1;
    selp.f32 %f551, %f204, 0fFF800000, %p78;
    add.s32 %r841, %r1265, 1;
    setp.lt.s32 %p79, %r841, %r1;
    selp.f32 %f550, %f205, 0fFF800000, %p79;
    add.s32 %r842, %r1265, 8;
    setp.lt.s32 %p80, %r842, %r1;
    selp.f32 %f549, %f206, 0fFF800000, %p80;
    add.s32 %r843, %r1265, 9;
    setp.lt.s32 %p81, %r843, %r1;
    selp.f32 %f548, %f207, 0fFF800000, %p81;
    add.s32 %r844, %r1265, 16;
    setp.lt.s32 %p82, %r844, %r1;
    selp.f32 %f547, %f212, 0fFF800000, %p82;
    add.s32 %r845, %r1265, 17;
    setp.lt.s32 %p83, %r845, %r1;
    selp.f32 %f546, %f213, 0fFF800000, %p83;
    add.s32 %r846, %r1265, 24;
    setp.lt.s32 %p84, %r846, %r1;
    selp.f32 %f545, %f214, 0fFF800000, %p84;
    add.s32 %r847, %r1265, 25;
    setp.lt.s32 %p85, %r847, %r1;
    selp.f32 %f544, %f215, 0fFF800000, %p85;
    selp.f32 %f543, %f208, 0fFF800000, %p78;
    selp.f32 %f542, %f209, 0fFF800000, %p79;
    selp.f32 %f541, %f210, 0fFF800000, %p80;
    selp.f32 %f540, %f211, 0fFF800000, %p81;
    selp.f32 %f539, %f216, 0fFF800000, %p82;
    selp.f32 %f538, %f217, 0fFF800000, %p83;
    selp.f32 %f537, %f218, 0fFF800000, %p84;
    selp.f32 %f536, %f219, 0fFF800000, %p85;
    @%p29 bra $L__BB0_53;
    // begin inline asm
    cvt.f32.f16 %f220, %rs131;
    // end inline asm
    add.ftz.f32 %f551, %f220, %f551;
    // begin inline asm
    cvt.f32.f16 %f221, %rs132;
    // end inline asm
    add.ftz.f32 %f550, %f221, %f550;
    // begin inline asm
    cvt.f32.f16 %f222, %rs133;
    // end inline asm
    add.ftz.f32 %f549, %f222, %f549;
    // begin inline asm
    cvt.f32.f16 %f223, %rs134;
    // end inline asm
    add.ftz.f32 %f548, %f223, %f548;
    // begin inline asm
    cvt.f32.f16 %f224, %rs135;
    // end inline asm
    add.ftz.f32 %f547, %f224, %f547;
    // begin inline asm
    cvt.f32.f16 %f225, %rs136;
    // end inline asm
    add.ftz.f32 %f546, %f225, %f546;
    // begin inline asm
    cvt.f32.f16 %f226, %rs137;
    // end inline asm
    add.ftz.f32 %f545, %f226, %f545;
    // begin inline asm
    cvt.f32.f16 %f227, %rs138;
    // end inline asm
    add.ftz.f32 %f544, %f227, %f544;
    // begin inline asm
    cvt.f32.f16 %f228, %rs139;
    // end inline asm
    add.ftz.f32 %f543, %f228, %f543;
    // begin inline asm
    cvt.f32.f16 %f229, %rs140;
    // end inline asm
    add.ftz.f32 %f542, %f229, %f542;
    // begin inline asm
    cvt.f32.f16 %f230, %rs141;
    // end inline asm
    add.ftz.f32 %f541, %f230, %f541;
    // begin inline asm
    cvt.f32.f16 %f231, %rs142;
    // end inline asm
    add.ftz.f32 %f540, %f231, %f540;
    // begin inline asm
    cvt.f32.f16 %f232, %rs143;
    // end inline asm
    add.ftz.f32 %f539, %f232, %f539;
    // begin inline asm
    cvt.f32.f16 %f233, %rs144;
    // end inline asm
    add.ftz.f32 %f538, %f233, %f538;
    // begin inline asm
    cvt.f32.f16 %f234, %rs145;
    // end inline asm
    add.ftz.f32 %f537, %f234, %f537;
    // begin inline asm
    cvt.f32.f16 %f235, %rs146;
    // end inline asm
    add.ftz.f32 %f536, %f235, %f536;
$L__BB0_53:
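    //
    // [annotation] Online softmax: a selp max-tree plus two butterfly
    // shuffles (shfl.sync.bfly over lanes 1 and 2) reduce the per-row
    // maximum across the quad; exponentials are then computed as
    // ex2(x * log2(e)), where 0f3FB8AA3B = log2(e) ~= 1.442695, and the
    // partial row sums are reduced the same way.
    //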
    add.s32 %r1189, %r1264, 32;
    setp.ge.s32 %p128, %r1189, %r110;
    setp.gt.ftz.f32 %p88, %f551, %f550;
    selp.f32 %f236, %f551, %f550, %p88;
    setp.gt.ftz.f32 %p89, %f236, %f549;
    selp.f32 %f237, %f236, %f549, %p89;
    setp.gt.ftz.f32 %p90, %f237, %f548;
    selp.f32 %f238, %f237, %f548, %p90;
    setp.gt.ftz.f32 %p91, %f238, %f547;
    selp.f32 %f239, %f238, %f547, %p91;
    setp.gt.ftz.f32 %p92, %f239, %f546;
    selp.f32 %f240, %f239, %f546, %p92;
    setp.gt.ftz.f32 %p93, %f240, %f545;
    selp.f32 %f241, %f240, %f545, %p93;
    setp.gt.ftz.f32 %p94, %f241, %f544;
    selp.f32 %f242, %f241, %f544, %p94;
    setp.gt.ftz.f32 %p95, %f543, %f542;
    selp.f32 %f243, %f543, %f542, %p95;
    setp.gt.ftz.f32 %p96, %f243, %f541;
    selp.f32 %f244, %f243, %f541, %p96;
    setp.gt.ftz.f32 %p97, %f244, %f540;
    selp.f32 %f245, %f244, %f540, %p97;
    setp.gt.ftz.f32 %p98, %f245, %f539;
    selp.f32 %f246, %f245, %f539, %p98;
    setp.gt.ftz.f32 %p99, %f246, %f538;
    selp.f32 %f247, %f246, %f538, %p99;
    setp.gt.ftz.f32 %p100, %f247, %f537;
    selp.f32 %f248, %f247, %f537, %p100;
    setp.gt.ftz.f32 %p101, %f248, %f536;
    selp.f32 %f249, %f248, %f536, %p101;
    mov.b32 %r849, %f242;
    mov.u32 %r850, 31;
    mov.u32 %r851, 1;
    mov.u32 %r852, -1;
    shfl.sync.bfly.b32 %r853|%p102, %r849, %r851, %r850, %r852;
    mov.b32 %f250, %r853;
    setp.gt.ftz.f32 %p103, %f242, %f250;
    selp.f32 %f251, %f242, %f250, %p103;
    mov.b32 %r854, %f251;
    mov.u32 %r855, 2;
    shfl.sync.bfly.b32 %r856|%p104, %r854, %r855, %r850, %r852;
    mov.b32 %f252, %r856;
    setp.gt.ftz.f32 %p105, %f251, %f252;
    selp.f32 %f253, %f251, %f252, %p105;
    mov.b32 %r857, %f249;
    shfl.sync.bfly.b32 %r858|%p106, %r857, %r851, %r850, %r852;
    mov.b32 %f254, %r858;
    setp.gt.ftz.f32 %p107, %f249, %f254;
    selp.f32 %f255, %f249, %f254, %p107;
    mov.b32 %r859, %f255;
    shfl.sync.bfly.b32 %r860|%p108, %r859, %r855, %r850, %r852;
    mov.b32 %f256, %r860;
    setp.gt.ftz.f32 %p109, %f255, %f256;
    selp.f32 %f257, %f255, %f256, %p109;
    max.ftz.f32 %f54, %f253, %f535;
    max.ftz.f32 %f55, %f257, %f534;
    sub.ftz.f32 %f258, %f551, %f54;
    mul.ftz.f32 %f259, %f258, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f56, %f259;
    sub.ftz.f32 %f260, %f550, %f54;
    mul.ftz.f32 %f261, %f260, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f57, %f261;
    sub.ftz.f32 %f262, %f549, %f54;
    mul.ftz.f32 %f263, %f262, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f58, %f263;
    sub.ftz.f32 %f264, %f548, %f54;
    mul.ftz.f32 %f265, %f264, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f59, %f265;
    sub.ftz.f32 %f266, %f547, %f54;
    mul.ftz.f32 %f267, %f266, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f60, %f267;
    sub.ftz.f32 %f268, %f546, %f54;
    mul.ftz.f32 %f269, %f268, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f61, %f269;
    sub.ftz.f32 %f270, %f545, %f54;
    mul.ftz.f32 %f271, %f270, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f62, %f271;
    sub.ftz.f32 %f272, %f544, %f54;
    mul.ftz.f32 %f273, %f272, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f63, %f273;
    sub.ftz.f32 %f274, %f543, %f55;
    mul.ftz.f32 %f275, %f274, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f64, %f275;
    sub.ftz.f32 %f276, %f542, %f55;
    mul.ftz.f32 %f277, %f276, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f65, %f277;
    sub.ftz.f32 %f278, %f541, %f55;
    mul.ftz.f32 %f279, %f278, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f66, %f279;
    sub.ftz.f32 %f280, %f540, %f55;
    mul.ftz.f32 %f281, %f280, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f67, %f281;
    sub.ftz.f32 %f282, %f539, %f55;
    mul.ftz.f32 %f283, %f282, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f68, %f283;
    sub.ftz.f32 %f284, %f538, %f55;
    mul.ftz.f32 %f285, %f284, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f69, %f285;
    sub.ftz.f32 %f286, %f537, %f55;
    mul.ftz.f32 %f287, %f286, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f70, %f287;
    sub.ftz.f32 %f288, %f536, %f55;
    mul.ftz.f32 %f289, %f288, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f71, %f289;
    add.ftz.f32 %f290, %f56, %f57;
    add.ftz.f32 %f291, %f290, 0f00000000;
    add.ftz.f32 %f292, %f58, %f59;
    add.ftz.f32 %f293, %f292, 0f00000000;
    add.ftz.f32 %f294, %f60, %f61;
    add.ftz.f32 %f295, %f291, %f294;
    add.ftz.f32 %f296, %f62, %f63;
    add.ftz.f32 %f297, %f293, %f296;
    add.ftz.f32 %f298, %f295, %f297;
    add.ftz.f32 %f299, %f64, %f65;
    add.ftz.f32 %f300, %f299, 0f00000000;
    add.ftz.f32 %f301, %f66, %f67;
    add.ftz.f32 %f302, %f301, 0f00000000;
    add.ftz.f32 %f303, %f68, %f69;
    add.ftz.f32 %f304, %f300, %f303;
    add.ftz.f32 %f305, %f70, %f71;
    add.ftz.f32 %f306, %f302, %f305;
    add.ftz.f32 %f307, %f304, %f306;
    mov.b32 %r861, %f298;
    shfl.sync.bfly.b32 %r862|%p110, %r861, %r851, %r850, %r852;
    mov.b32 %f308, %r862;
    add.ftz.f32 %f309, %f298, %f308;
    mov.b32 %r863, %f309;
    shfl.sync.bfly.b32 %r864|%p111, %r863, %r855, %r850, %r852;
    mov.b32 %f310, %r864;
    add.ftz.f32 %f311, %f309, %f310;
    mov.b32 %r865, %f307;
    shfl.sync.bfly.b32 %r866|%p112, %r865, %r851, %r850, %r852;
    mov.b32 %f312, %r866;
    add.ftz.f32 %f313, %f307, %f312;
    mov.b32 %r867, %f313;
    shfl.sync.bfly.b32 %r868|%p113, %r867, %r855, %r850, %r852;
    mov.b32 %f314, %r868;
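    //
    // [annotation] Running-state update: the previous row sums (%f533/%f532)
    // are rescaled by exp(m_old - m_new) before the new partial sums are
    // added, the standard online-softmax correction; the ldmatrix block
    // below then prefetches the next K-tile fragments for the next
    // iteration.
    //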
    add.ftz.f32 %f315, %f313, %f314;
    sub.ftz.f32 %f316, %f535, %f54;
    mul.ftz.f32 %f317, %f316, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f318, %f317;
    mul.ftz.f32 %f72, %f318, %f533;
    add.ftz.f32 %f533, %f72, %f311;
    sub.ftz.f32 %f319, %f534, %f55;
    mul.ftz.f32 %f320, %f319, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f321, %f320;
    mul.ftz.f32 %f74, %f321, %f532;
    add.ftz.f32 %f532, %f74, %f315;
    @%p128 bra $L__BB0_55;
    // begin inline asm
    cp.async.wait_group 0;
    // end inline asm
    bar.sync 0;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1288, %r1287, %r1286, %r1285}, [%r575];
    // end inline asm
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1284, %r1283, %r1282, %r1281}, [%r580];
    // end inline asm
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1280, %r1279, %r1278, %r1277}, [%r585];
    // end inline asm
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1276, %r1275, %r1274, %r1273}, [%r590];
    // end inline asm
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1272, %r1271, %r1270, %r1269}, [%r595];
    // end inline asm
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1268, %r1289, %r1290, %r1291}, [%r600];
    // end inline asm
$L__BB0_55:
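    //
    // [annotation] Second GEMM: the probabilities are packed to fp16 pairs
    // (cvt.rn.f16x2.f32) and multiplied against the transposed V fragments,
    // accumulating into fresh fp32 registers; afterwards rcp.approx.ftz.f32
    // produces 1/row-sum (guarded against zero), and the running output
    // accumulator is folded in with fma using the same rescale factors.
    //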
    // begin inline asm
    cvt.rn.f16x2.f32 %r912, %f57, %f56;
    // end inline asm
    // begin inline asm
    cvt.rn.f16x2.f32 %r913, %f65, %f64;
    // end inline asm
    // begin inline asm
    cvt.rn.f16x2.f32 %r914, %f59, %f58;
    // end inline asm
    // begin inline asm
    cvt.rn.f16x2.f32 %r915, %f67, %f66;
    // end inline asm
    // begin inline asm
    cvt.rn.f16x2.f32 %r916, %f61, %f60;
    // end inline asm
    // begin inline asm
    cvt.rn.f16x2.f32 %r917, %f69, %f68;
    // end inline asm
    // begin inline asm
    cvt.rn.f16x2.f32 %r918, %f63, %f62;
    // end inline asm
    // begin inline asm
    cvt.rn.f16x2.f32 %r919, %f71, %f70;
    // end inline asm
    // begin inline asm
    mov.u32 %r920, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r921, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r922, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r923, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r924, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r925, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r926, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r927, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r928, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r929, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r930, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r931, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r932, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r933, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r934, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r935, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r936, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r937, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r938, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r939, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r940, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r941, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r942, 0;
    // end inline asm
    // begin inline asm
    mov.u32 %r943, 0;
    // end inline asm
    mov.b32 %f386, %r920;
    mov.b32 %f387, %r921;
    mov.b32 %f388, %r922;
    mov.b32 %f389, %r923;
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f386, %f387, %f388, %f389}, {%r912, %r913, %r914, %r915}, {%r1315, %r1314}, {%f386, %f387, %f388, %f389};
    // end inline asm
    mov.b32 %f394, %r924;
    mov.b32 %f395, %r925;
    mov.b32 %f396, %r926;
    mov.b32 %f397, %r927;
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f394, %f395, %f396, %f397}, {%r912, %r913, %r914, %r915}, {%r1313, %r1312}, {%f394, %f395, %f396, %f397};
    // end inline asm
    mov.b32 %f402, %r928;
    mov.b32 %f403, %r929;
    mov.b32 %f404, %r930;
    mov.b32 %f405, %r931;
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f402, %f403, %f404, %f405}, {%r912, %r913, %r914, %r915}, {%r1311, %r1310}, {%f402, %f403, %f404, %f405};
    // end inline asm
    mov.b32 %f410, %r932;
    mov.b32 %f411, %r933;
    mov.b32 %f412, %r934;
    mov.b32 %f413, %r935;
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f410, %f411, %f412, %f413}, {%r912, %r913, %r914, %r915}, {%r1309, %r1308}, {%f410, %f411, %f412, %f413};
    // end inline asm
    mov.b32 %f418, %r936;
    mov.b32 %f419, %r937;
    mov.b32 %f420, %r938;
    mov.b32 %f421, %r939;
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f418, %f419, %f420, %f421}, {%r912, %r913, %r914, %r915}, {%r1307, %r1306}, {%f418, %f419, %f420, %f421};
    // end inline asm
    mov.b32 %f426, %r940;
    mov.b32 %f427, %r941;
    mov.b32 %f428, %r942;
    mov.b32 %f429, %r943;
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f426, %f427, %f428, %f429}, {%r912, %r913, %r914, %r915}, {%r1305, %r1304}, {%f426, %f427, %f428, %f429};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f386, %f387, %f388, %f389}, {%r916, %r917, %r918, %r919}, {%r1303, %r1302}, {%f386, %f387, %f388, %f389};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f394, %f395, %f396, %f397}, {%r916, %r917, %r918, %r919}, {%r1301, %r1300}, {%f394, %f395, %f396, %f397};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f402, %f403, %f404, %f405}, {%r916, %r917, %r918, %r919}, {%r1299, %r1298}, {%f402, %f403, %f404, %f405};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f410, %f411, %f412, %f413}, {%r916, %r917, %r918, %r919}, {%r1297, %r1296}, {%f410, %f411, %f412, %f413};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f418, %f419, %f420, %f421}, {%r916, %r917, %r918, %r919}, {%r1295, %r1294}, {%f418, %f419, %f420, %f421};
    // end inline asm
    // begin inline asm
    mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f426, %f427, %f428, %f429}, {%r916, %r917, %r918, %r919}, {%r1293, %r1292}, {%f426, %f427, %f428, %f429};
    // end inline asm
    setp.equ.ftz.f32 %p114, %f533, 0f00000000;
    mov.f32 %f553, 0f3F800000;
    mov.f32 %f552, %f553;
    @%p114 bra $L__BB0_57;
    rcp.approx.ftz.f32 %f552, %f533;
$L__BB0_57:
    setp.equ.ftz.f32 %p115, %f532, 0f00000000;
    @%p115 bra $L__BB0_59;
    rcp.approx.ftz.f32 %f553, %f532;
$L__BB0_59:
    add.s32 %r1142, %r1264, 32;
    setp.ge.s32 %p127, %r1142, %r110;
    mov.b32 %f436, %r1339;
    fma.rn.ftz.f32 %f437, %f72, %f436, %f386;
    mul.ftz.f32 %f438, %f552, %f437;
    mov.b32 %r1339, %f438;
    mov.b32 %f439, %r1338;
    fma.rn.ftz.f32 %f440, %f72, %f439, %f387;
    mul.ftz.f32 %f441, %f552, %f440;
    mov.b32 %r1338, %f441;
    mov.b32 %f442, %r1337;
    fma.rn.ftz.f32 %f443, %f74, %f442, %f388;
    mul.ftz.f32 %f444, %f553, %f443;
    mov.b32 %r1337, %f444;
    mov.b32 %f445, %r1336;
    fma.rn.ftz.f32 %f446, %f74, %f445, %f389;
    mul.ftz.f32 %f447, %f553, %f446;
    mov.b32 %r1336, %f447;
    mov.b32 %f448, %r1335;
    fma.rn.ftz.f32 %f449, %f72, %f448, %f394;
    mul.ftz.f32 %f450, %f552, %f449;
    mov.b32 %r1335, %f450;
    mov.b32 %f451, %r1334;
    fma.rn.ftz.f32 %f452, %f72, %f451, %f395;
    mul.ftz.f32 %f453, %f552, %f452;
    mov.b32 %r1334, %f453;
    mov.b32 %f454, %r1333;
    fma.rn.ftz.f32 %f455, %f74, %f454, %f396;
    mul.ftz.f32 %f456, %f553, %f455;
    mov.b32 %r1333, %f456;
    mov.b32 %f457, %r1332;
    fma.rn.ftz.f32 %f458, %f74, %f457, %f397;
    mul.ftz.f32 %f459, %f553, %f458;
    mov.b32 %r1332, %f459;
    mov.b32 %f460, %r1331;
    fma.rn.ftz.f32 %f461, %f72, %f460, %f402;
    mul.ftz.f32 %f462, %f552, %f461;
    mov.b32 %r1331, %f462;
    mov.b32 %f463, %r1330;
    fma.rn.ftz.f32 %f464, %f72, %f463, %f403;
    mul.ftz.f32 %f465, %f552, %f464;
    mov.b32 %r1330, %f465;
    mov.b32 %f466, %r1329;
    fma.rn.ftz.f32 %f467, %f74, %f466, %f404;
    mul.ftz.f32 %f468, %f553, %f467;
    mov.b32 %r1329, %f468;
    mov.b32 %f469, %r1328;
    fma.rn.ftz.f32 %f470, %f74, %f469, %f405;
    mul.ftz.f32 %f471, %f553, %f470;
    mov.b32 %r1328, %f471;
    mov.b32 %f472, %r1327;
    fma.rn.ftz.f32 %f473, %f72, %f472, %f410;
    mul.ftz.f32 %f474, %f552, %f473;
    mov.b32 %r1327, %f474;
    mov.b32 %f475, %r1326;
    fma.rn.ftz.f32 %f476, %f72, %f475, %f411;
    mul.ftz.f32 %f477, %f552, %f476;
    mov.b32 %r1326, %f477;
    mov.b32 %f478, %r1325;
    fma.rn.ftz.f32 %f479, %f74, %f478, %f412;
    mul.ftz.f32 %f480, %f553, %f479;
    mov.b32 %r1325, %f480;
    mov.b32 %f481, %r1324;
    fma.rn.ftz.f32 %f482, %f74, %f481, %f413;
    mul.ftz.f32 %f483, %f553, %f482;
    mov.b32 %r1324, %f483;
    mov.b32 %f484, %r1323;
    fma.rn.ftz.f32 %f485, %f72, %f484, %f418;
    mul.ftz.f32 %f486, %f552, %f485;
    mov.b32 %r1323, %f486;
    mov.b32 %f487, %r1322;
    fma.rn.ftz.f32 %f488, %f72, %f487, %f419;
    mul.ftz.f32 %f489, %f552, %f488;
    mov.b32 %r1322, %f489;
    mov.b32 %f490, %r1321;
    fma.rn.ftz.f32 %f491, %f74, %f490, %f420;
    mul.ftz.f32 %f492, %f553, %f491;
    mov.b32 %r1321, %f492;
    mov.b32 %f493, %r1320;
    fma.rn.ftz.f32 %f494, %f74, %f493, %f421;
    mul.ftz.f32 %f495, %f553, %f494;
    mov.b32 %r1320, %f495;
    mov.b32 %f496, %r1319;
    fma.rn.ftz.f32 %f497, %f72, %f496, %f426;
    mul.ftz.f32 %f498, %f552, %f497;
    mov.b32 %r1319, %f498;
    mov.b32 %f499, %r1318;
    fma.rn.ftz.f32 %f500, %f72, %f499, %f427;
    mul.ftz.f32 %f501, %f552, %f500;
    mov.b32 %r1318, %f501;
    mov.b32 %f502, %r1317;
    fma.rn.ftz.f32 %f503, %f74, %f502, %f428;
    mul.ftz.f32 %f504, %f553, %f503;
    mov.b32 %r1317, %f504;
    mov.b32 %f505, %r1316;
    fma.rn.ftz.f32 %f506, %f74, %f505, %f429;
    mul.ftz.f32 %f507, %f553, %f506;
    mov.b32 %r1316, %f507;
    @%p127 bra $L__BB0_61;
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1315, %r1314, %r1313, %r1312}, [%r605];
    // end inline asm
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1311, %r1310, %r1309, %r1308}, [%r610];
    // end inline asm
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1307, %r1306, %r1305, %r1304}, [%r615];
    // end inline asm
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1303, %r1302, %r1301, %r1300}, [%r620];
    // end inline asm
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1299, %r1298, %r1297, %r1296}, [%r625];
    // end inline asm
    // begin inline asm
    ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1295, %r1294, %r1293, %r1292}, [%r630];
    // end inline asm
$L__BB0_61:
    add.s32 %r1264, %r1264, 32;
    setp.lt.s32 %p117, %r1264, %r110;
    add.s32 %r1265, %r1265, 32;
    mov.f32 %f534, %f55;
    mov.f32 %f535, %f54;
    @%p117 bra $L__BB0_11;
$L__BB0_62:
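    //
    // [annotation] Epilogue: the normalized accumulator is packed to fp16
    // (cvt.rn.f16x2.f32) and exchanged through shared memory via the
    // xor-swizzled st.shared.b32 stores below, so that each thread can
    // afterwards read back contiguous 16-byte rows of the output tile.
    //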
    mov.u32 %r1172, %tid.x;
    mov.u32 %r1171, %tid.x;
    shr.s32 %r1170, %r1171, 31;
    shr.u32 %r1169, %r1170, 29;
    add.s32 %r1168, %r1171, %r1169;
    and.b32 %r1167, %r1168, -8;
    sub.s32 %r1166, %r1171, %r1167;
    shl.b32 %r1165, %r1166, 4;
    cvt.s64.s32 %rd152, %r1165;
    mov.b64 %rd151, fmha_v2_flash_attention_fp16_64_32_S_40_sm86_kernel_nl_param_0;
    mov.u64 %rd150, %rd151;
    ld.param.u32 %r1164, [%rd150+60];
    mov.u32 %r1163, %ctaid.y;
    mov.u32 %r1162, %ctaid.x;
    ld.param.u32 %r1161, [fmha_v2_flash_attention_fp16_64_32_S_40_sm86_kernel_nl_param_0+52];
    mul.lo.s32 %r1160, %r1, %r1163;
    mad.lo.s32 %r1159, %r1160, %r1161, %r1162;
    and.b32 %r1158, %r1171, 96;
    shr.u32 %r1157, %r1158, 1;
    mov.u32 %r1156, _ZN25fused_multihead_attention5smem_E;
    shr.s32 %r1155, %r1168, 31;
    shr.u32 %r1154, %r1155, 29;
    add.s32 %r1153, %r7, %r1154;
    and.b32 %r1152, %r1153, 268435448;
    sub.s32 %r1151, %r7, %r1152;
    xor.b32 %r1150, %r1151, %r1166;
    shl.b32 %r1149, %r7, 7;
    shl.b32 %r1148, %r1150, 4;
    mov.u32 %r1147, _ZN25fused_multihead_attention5smem_E;
    add.s32 %r1146, %r1148, %r1149;
    add.s32 %r1145, %r1146, %r1147;
    add.s32 %r1144, %r1145, 8192;
    bar.sync 0;
    mov.b32 %f508, %r1338;
    mov.b32 %f509, %r1339;
    // begin inline asm
    cvt.rn.f16x2.f32 %r1060, %f508, %f509;
    // end inline asm
    mov.b32 %f510, %r1336;
    mov.b32 %f511, %r1337;
    // begin inline asm
    cvt.rn.f16x2.f32 %r1061, %f510, %f511;
    // end inline asm
    shl.b32 %r1117, %r1171, 2;
    and.b32 %r1118, %r1117, 124;
    add.s32 %r1120, %r1118, %r1147;
    and.b32 %r1123, %r1171, 28;
    shr.u32 %r1124, %r1123, 2;
    or.b32 %r1125, %r1157, %r1124;
    shl.b32 %r1126, %r1125, 7;
    add.s32 %r1127, %r1120, %r1126;
    add.s32 %r1062, %r1127, 8192;
    // begin inline asm
    st.shared.b32 [%r1062], %r1060;
    // end inline asm
    add.s32 %r1064, %r1127, 9216;
    // begin inline asm
    st.shared.b32 [%r1064], %r1061;
    // end inline asm
    xor.b32 %r1068, %r1062, 16;
    mov.b32 %f512, %r1334;
    mov.b32 %f513, %r1335;
    // begin inline asm
    cvt.rn.f16x2.f32 %r1066, %f512, %f513;
    // end inline asm
    mov.b32 %f514, %r1332;
    mov.b32 %f515, %r1333;
    // begin inline asm
    cvt.rn.f16x2.f32 %r1067, %f514, %f515;
    // end inline asm
    // begin inline asm
    st.shared.b32 [%r1068], %r1066;
    // end inline asm
    add.s32 %r1070, %r1068, 1024;
    // begin inline asm
    st.shared.b32 [%r1070], %r1067;
    // end inline asm
    xor.b32 %r1074, %r1062, 32;
    mov.b32 %f516, %r1330;
    mov.b32 %f517, %r1331;
    // begin inline asm
    cvt.rn.f16x2.f32 %r1072, %f516, %f517;
    // end inline asm
    mov.b32 %f518, %r1328;
    mov.b32 %f519, %r1329;
    // begin inline asm
    cvt.rn.f16x2.f32 %r1073, %f518, %f519;
    // end inline asm
    // begin inline asm
    st.shared.b32 [%r1074], %r1072;
    // end inline asm
    add.s32 %r1076, %r1074, 1024;
    // begin inline asm
    st.shared.b32 [%r1076], %r1073;
    // end inline asm
    xor.b32 %r1080, %r1062, 48;
    mov.b32 %f520, %r1326;
    mov.b32 %f521, %r1327;
    // begin inline asm
    cvt.rn.f16x2.f32 %r1078, %f520, %f521;
    // end inline asm
    mov.b32 %f522, %r1324;
    mov.b32 %f523, %r1325;
    // begin inline asm
    cvt.rn.f16x2.f32 %r1079, %f522, %f523;
    // end inline asm
    // begin inline asm
    st.shared.b32 [%r1080], %r1078;
    // end inline asm
    add.s32 %r1082, %r1080, 1024;
    // begin inline asm
    st.shared.b32 [%r1082], %r1079;
    // end inline asm
    xor.b32 %r1086, %r1062, 64;
    mov.b32 %f524, %r1322;
    mov.b32 %f525, %r1323;
    // begin inline asm
    cvt.rn.f16x2.f32 %r1084, %f524, %f525;
    // end inline asm
    mov.b32 %f526, %r1320;
    mov.b32 %f527, %r1321;
    // begin inline asm
    cvt.rn.f16x2.f32 %r1085, %f526, %f527;
    // end inline asm
    // begin inline asm
    st.shared.b32 [%r1086], %r1084;
    // end inline asm
    add.s32 %r1088, %r1086, 1024;
    // begin inline asm
    st.shared.b32 [%r1088], %r1085;
    // end inline asm
    xor.b32 %r1092, %r1062, 80;
    mov.b32 %f528, %r1318;
    mov.b32 %f529, %r1319;
    // begin inline asm
    cvt.rn.f16x2.f32 %r1090, %f528, %f529;
    // end inline asm
    mov.b32 %f530, %r1316;
    mov.b32 %f531, %r1317;
    // begin inline asm
    cvt.rn.f16x2.f32 %r1091, %f530, %f531;
    // end inline asm
    // begin inline asm
    st.shared.b32 [%r1092], %r1090;
    // end inline asm
    add.s32 %r1094, %r1092, 1024;
    // begin inline asm
    st.shared.b32 [%r1094], %r1091;
    // end inline asm
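    //
    // [annotation] Each thread reloads four 16-byte rows of the fp16 output
    // tile from shared memory and writes them to global memory, checking
    // each row offset (+0, +16, +32, +48) against the sequence length before
    // storing.
    //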
    bar.sync 0;
    // begin inline asm
    ld.shared.v4.b32 {%r1096, %r1097, %r1098, %r1099}, [%r1144];
    // end inline asm
    add.s32 %r1105, %r1144, 2048;
    // begin inline asm
    ld.shared.v4.b32 {%r1101, %r1102, %r1103, %r1104}, [%r1105];
    // end inline asm
    add.s32 %r1110, %r1144, 4096;
    // begin inline asm
    ld.shared.v4.b32 {%r1106, %r1107, %r1108, %r1109}, [%r1110];
    // end inline asm
    add.s32 %r1115, %r1144, 6144;
    // begin inline asm
    ld.shared.v4.b32 {%r1111, %r1112, %r1113, %r1114}, [%r1115];
    // end inline asm
    mul.lo.s32 %r1132, %r1159, %r1164;
    shl.b32 %r1133, %r1132, 1;
    cvt.s64.s32 %rd129, %r1133;
    add.s64 %rd41, %rd129, %rd152;
    cvt.u32.u64 %r1135, %rd5;
    setp.ge.s32 %p118, %r1135, %r1;
    @%p118 bra $L__BB0_73;
    mov.b64 %rd154, fmha_v2_flash_attention_fp16_64_32_S_40_sm86_kernel_nl_param_0;
    mov.u64 %rd153, %rd154;
    ld.param.u32 %r1183, [%rd153+60];
    mov.u32 %r1182, %tid.x;
    shr.s32 %r1181, %r1183, 31;
    shr.u32 %r1180, %r1181, 29;
    add.s32 %r1179, %r1183, %r1180;
    shr.s32 %r1178, %r1179, 3;
    shr.s32 %r1177, %r1182, 31;
    shr.u32 %r1176, %r1177, 29;
    add.s32 %r1175, %r1182, %r1176;
    and.b32 %r1174, %r1175, -8;
    sub.s32 %r1173, %r1182, %r1174;
    setp.ge.s32 %p119, %r1173, %r1178;
    @%p119 bra $L__BB0_65;
    mul.lo.s64 %rd131, %rd12, %rd5;
    add.s64 %rd132, %rd41, %rd131;
    cvta.to.global.u64 %rd133, %rd13;
    add.s64 %rd134, %rd133, %rd132;
    st.global.v4.u32 [%rd134], {%r1096, %r1097, %r1098, %r1099};
$L__BB0_65:
    add.s32 %r1137, %r1135, 16;
    setp.ge.s32 %p120, %r1137, %r1;
    @%p120 bra $L__BB0_73;
    @%p119 bra $L__BB0_68;
    add.s64 %rd135, %rd5, 16;
    mul.lo.s64 %rd136, %rd135, %rd12;
    add.s64 %rd137, %rd41, %rd136;
    cvta.to.global.u64 %rd138, %rd13;
    add.s64 %rd139, %rd138, %rd137;
    st.global.v4.u32 [%rd139], {%r1101, %r1102, %r1103, %r1104};
$L__BB0_68:
    add.s32 %r1139, %r1135, 32;
    setp.ge.s32 %p122, %r1139, %r1;
    @%p122 bra $L__BB0_73;
    @%p119 bra $L__BB0_71;
    add.s64 %rd140, %rd5, 32;
    mul.lo.s64 %rd141, %rd140, %rd12;
    add.s64 %rd142, %rd41, %rd141;
    cvta.to.global.u64 %rd143, %rd13;
    add.s64 %rd144, %rd143, %rd142;
    st.global.v4.u32 [%rd144], {%r1106, %r1107, %r1108, %r1109};
$L__BB0_71:
    add.s32 %r1141, %r1135, 48;
    setp.ge.s32 %p124, %r1141, %r1;
    or.pred %p126, %p124, %p119;
    @%p126 bra $L__BB0_73;
    add.s64 %rd145, %rd5, 48;
    mul.lo.s64 %rd146, %rd145, %rd12;
    add.s64 %rd147, %rd41, %rd146;
    cvta.to.global.u64 %rd148, %rd13;
    add.s64 %rd149, %rd148, %rd147;
    st.global.v4.u32 [%rd149], {%r1111, %r1112, %r1113, %r1114};
$L__BB0_73:
    ret;
}