ase 11.8, V11.8.85 // Based on NVVM 7.0.1 // .version 7.8 .target sm_86 .address_size 64 // .globl fmha_v2_flash_attention_fp16_64_64_S_16_sm86_kernel_nl .extern .shared .align 16 .b8 _ZN25fused_multihead_attention5smem_E[]; .visible .entry fmha_v2_flash_attention_fp16_64_64_S_16_sm86_kernel_nl( .param .align 8 .b8 fmha_v2_flash_attention_fp16_64_64_S_16_sm86_kernel_nl_param_0[168] ) { .reg .pred %p<179>; .reg .b16 %rs<291>; .reg .f32 %f<602>; .reg .b32 %r<847>; .reg .b64 %rd<121>; mov.b64 %rd56, fmha_v2_flash_attention_fp16_64_64_S_16_sm86_kernel_nl_param_0; mov.u64 %rd1, %rd56; ld.param.u32 %r1, [fmha_v2_flash_attention_fp16_64_64_S_16_sm86_kernel_nl_param_0+56]; cvt.s64.s32 %rd2, %r1; ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_64_64_S_16_sm86_kernel_nl_param_0+52]; mov.u32 %r230, %ctaid.z; shl.b32 %r3, %r230, 6; setp.le.s32 %p1, %r1, %r3; @%p1 bra $L__BB0_84; mov.u32 %r290, %tid.x; mov.u32 %r291, %ctaid.y; mov.u32 %r292, %ctaid.x; mul.lo.s32 %r293, %r1, %r291; mad.lo.s32 %r294, %r293, %r2, %r292; shr.s32 %r295, %r290, 31; shr.u32 %r296, %r295, 27; add.s32 %r297, %r290, %r296; and.b32 %r298, %r297, -32; sub.s32 %r299, %r290, %r298; shr.u32 %r300, %r295, 25; add.s32 %r301, %r290, %r300; shr.s32 %r302, %r301, 7; shl.b32 %r303, %r302, 4; shr.s32 %r304, %r299, 31; shr.u32 %r305, %r304, 30; add.s32 %r306, %r299, %r305; and.b32 %r307, %r306, 2147483644; sub.s32 %r308, %r299, %r307; shl.b32 %r309, %r308, 1; add.s32 %r802, %r309, %r303; ld.param.u64 %rd3, [%rd1+16]; ld.param.u64 %rd4, [%rd1+40]; shr.s32 %r310, %r297, 5; shr.s32 %r311, %r297, 31; shr.u32 %r312, %r311, 30; add.s32 %r313, %r310, %r312; and.b32 %r314, %r313, 268435452; sub.s32 %r315, %r310, %r314; shl.b32 %r316, %r315, 4; shr.s32 %r317, %r306, 2; add.s32 %r5, %r316, %r317; shr.u32 %r318, %r290, 31; add.s32 %r319, %r290, %r318; shr.s32 %r6, %r319, 1; add.s32 %r7, %r6, %r3; cvt.s64.s32 %rd60, %r7; ld.param.u64 %rd5, [%rd1+24]; mul.lo.s64 %rd61, %rd5, %rd60; mul.lo.s32 %r320, %r294, 3; mul.wide.s32 %rd62, %r320, 32; and.b32 %r321, %r319, -2; sub.s32 %r8, %r290, %r321; shl.b32 %r322, %r8, 4; cvt.s64.s32 %rd63, %r322; add.s64 %rd64, %rd62, %rd63; add.s64 %rd65, %rd64, %rd61; ld.param.u64 %rd66, [%rd1]; add.s64 %rd57, %rd66, %rd65; shr.u32 %r323, %r295, 29; add.s32 %r324, %r290, %r323; shr.u32 %r325, %r324, 31; shr.s32 %r326, %r324, 3; add.s32 %r327, %r326, %r325; and.b32 %r328, %r327, 268435454; sub.s32 %r329, %r326, %r328; and.b32 %r330, %r324, 268435448; sub.s32 %r331, %r290, %r330; xor.b32 %r332, %r329, %r331; shl.b32 %r333, %r326, 7; shl.b32 %r334, %r332, 4; add.s32 %r335, %r334, %r333; mov.u32 %r336, 31; mov.u32 %r801, 0; mov.u32 %r338, -1; shfl.sync.idx.b32 %r339|%p2, %r801, %r801, %r336, %r338; shfl.sync.idx.b32 %r340|%p3, %r801, %r801, %r336, %r338; and.b32 %r341, %r290, 96; shr.u32 %r342, %r341, 3; and.b32 %r343, %r290, 12; shr.u32 %r344, %r343, 2; or.b32 %r345, %r342, %r344; and.b32 %r346, %r290, 4; shr.u32 %r347, %r346, 2; shl.b32 %r348, %r290, 1; and.b32 %r349, %r348, 6; or.b32 %r350, %r347, %r349; shl.b32 %r351, %r345, 7; shl.b32 %r352, %r350, 4; and.b32 %r353, %r290, 16; xor.b32 %r354, %r352, %r353; cvt.s64.s32 %rd67, %r6; mul.lo.s64 %rd68, %rd5, %rd67; add.s32 %r355, %r320, 1; mul.wide.s32 %rd69, %r355, 32; add.s64 %rd70, %rd69, %rd63; add.s64 %rd71, %rd70, %rd68; add.s64 %rd119, %rd66, %rd71; shfl.sync.idx.b32 %r356|%p4, %r801, %r801, %r336, %r338; shfl.sync.idx.b32 %r357|%p5, %r801, %r801, %r336, %r338; shr.u32 %r358, %r353, 3; or.b32 %r359, %r347, %r358; and.b32 %r360, %r290, 8; shr.u32 %r361, %r360, 3; xor.b32 %r362, %r350, %r361; shl.b32 %r363, %r359, 7; shl.b32 %r364, %r362, 4; or.b32 %r365, %r364, %r363; add.s32 %r366, %r320, 2; mul.wide.s32 %rd72, %r366, 32; add.s64 %rd73, %rd72, %rd63; add.s64 %rd74, %rd73, %rd68; add.s64 %rd120, %rd66, %rd74; shfl.sync.idx.b32 %r367|%p6, %r801, %r801, %r336, %r338; shfl.sync.idx.b32 %r368|%p7, %r801, %r801, %r336, %r338; shl.b32 %r369, %r290, 5; and.b32 %r370, %r369, 384; or.b32 %r371, %r354, %r370; mov.u32 %r378, _ZN25fused_multihead_attention5smem_E; add.s32 %r379, %r378, 2048; add.s32 %r10, %r335, %r379; sub.s32 %r380, %r1, %r3; min.s32 %r381, %r380, 64; setp.lt.s32 %p8, %r6, %r381; add.s32 %r382, %r335, %r378; add.s32 %r231, %r382, %r340; selp.b32 %r232, 16, 0, %p8; // begin inline asm cp.async.cg.shared.global [%r231], [%rd57], 16, %r232; // end inline asm min.s32 %r383, %r1, 64; setp.lt.s32 %p9, %r6, %r383; add.s32 %r233, %r10, %r357; selp.b32 %r236, 16, 0, %p9; // begin inline asm cp.async.cg.shared.global [%r233], [%rd119], 16, %r236; // end inline asm add.s32 %r384, %r378, 4096; add.s32 %r385, %r335, %r384; add.s32 %r235, %r385, %r368; // begin inline asm cp.async.cg.shared.global [%r235], [%rd120], 16, %r236; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s32 %r386, %r351, %r378; add.s32 %r387, %r386, %r354; add.s32 %r241, %r387, %r339; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r237, %r238, %r239, %r240}, [%r241]; // end inline asm add.s32 %r388, %r365, %r379; add.s32 %r246, %r388, %r356; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r807, %r808, %r809, %r810}, [%r246]; // end inline asm add.s32 %r251, %r246, 512; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r811, %r812, %r813, %r814}, [%r251]; // end inline asm add.s32 %r256, %r246, 1024; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r815, %r816, %r817, %r818}, [%r256]; // end inline asm add.s32 %r261, %r246, 1536; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r819, %r820, %r821, %r822}, [%r261]; // end inline asm add.s32 %r266, %r371, %r384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r825, %r824, %r823, %r826}, [%r266]; // end inline asm add.s32 %r271, %r266, 512; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r827, %r828, %r829, %r830}, [%r271]; // end inline asm add.s32 %r276, %r266, 1024; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r831, %r832, %r833, %r834}, [%r276]; // end inline asm add.s32 %r281, %r266, 1536; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r835, %r836, %r837, %r838}, [%r281]; // end inline asm ld.param.f32 %f1, [%rd1+64]; // begin inline asm mov.u32 %r846, 0; // end inline asm // begin inline asm mov.u32 %r845, 0; // end inline asm // begin inline asm mov.u32 %r844, 0; // end inline asm // begin inline asm mov.u32 %r843, 0; // end inline asm // begin inline asm mov.u32 %r842, 0; // end inline asm // begin inline asm mov.u32 %r841, 0; // end inline asm // begin inline asm mov.u32 %r840, 0; // end inline asm // begin inline asm mov.u32 %r839, 0; // end inline asm add.s32 %r389, %r1, 63; shr.s32 %r390, %r389, 31; shr.u32 %r391, %r390, 26; add.s32 %r392, %r389, %r391; and.b32 %r58, %r392, -64; setp.lt.s32 %p10, %r1, 1; @%p10 bra $L__BB0_81; ld.param.u8 %rs1, [%rd1+160]; cvt.s64.s32 %rd11, %r802; cvt.s64.s32 %rd12, %r5; cvt.s64.s32 %rd79, %r3; add.s64 %rd13, %rd12, %rd79; add.s32 %r394, %r802, 1; cvt.s64.s32 %rd14, %r394; add.s32 %r395, %r802, 8; cvt.s64.s32 %rd15, %r395; add.s32 %r396, %r802, 9; cvt.s64.s32 %rd16, %r396; add.s32 %r397, %r802, 16; cvt.s64.s32 %rd17, %r397; add.s32 %r398, %r802, 17; cvt.s64.s32 %rd18, %r398; add.s32 %r399, %r802, 24; cvt.s64.s32 %rd19, %r399; add.s32 %r400, %r802, 25; cvt.s64.s32 %rd20, %r400; add.s32 %r401, %r802, 32; cvt.s64.s32 %rd21, %r401; add.s32 %r402, %r802, 33; cvt.s64.s32 %rd22, %r402; add.s32 %r403, %r802, 40; cvt.s64.s32 %rd23, %r403; add.s32 %r404, %r802, 41; cvt.s64.s32 %rd24, %r404; add.s32 %r405, %r802, 48; cvt.s64.s32 %rd25, %r405; add.s32 %r406, %r802, 49; cvt.s64.s32 %rd26, %r406; add.s32 %r407, %r802, 56; cvt.s64.s32 %rd27, %r407; add.s32 %r408, %r802, 57; cvt.s64.s32 %rd28, %r408; add.s32 %r95, %r5, 8; cvt.s64.s32 %rd80, %r95; add.s64 %rd29, %rd80, %rd79; mov.f32 %f566, 0fFF800000; mov.f32 %f564, 0f00000000; mov.f32 %f565, %f564; mov.f32 %f567, %f566; mov.u32 %r806, %r1; mov.u32 %r805, %r1; $L__BB0_3: add.s32 %r409, %r801, 64; setp.ge.s32 %p11, %r409, %r58; @%p11 bra $L__BB0_5; bar.sync 0; shl.b64 %rd83, %rd5, 6; add.s64 %rd119, %rd119, %rd83; add.s32 %r806, %r806, -64; min.s32 %r414, %r806, 64; setp.lt.s32 %p12, %r6, %r414; selp.b32 %r411, 16, 0, %p12; // begin inline asm cp.async.cg.shared.global [%r233], [%rd119], 16, %r411; // end inline asm add.s64 %rd120, %rd120, %rd83; add.s32 %r805, %r805, -64; min.s32 %r415, %r805, 64; setp.lt.s32 %p13, %r6, %r415; selp.b32 %r413, 16, 0, %p13; // begin inline asm cp.async.cg.shared.global [%r235], [%rd120], 16, %r413; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm $L__BB0_5: setp.eq.s16 %p14, %rs1, 0; @%p14 bra $L__BB0_70; mov.u32 %r759, %ctaid.x; mov.u32 %r758, %ctaid.y; mov.u32 %r757, %ctaid.z; ld.param.u32 %r756, [fmha_v2_flash_attention_fp16_64_64_S_16_sm86_kernel_nl_param_0+52]; setp.ge.u64 %p15, %rd13, %rd2; mul.lo.s32 %r417, %r757, %r1; shl.b32 %r418, %r417, 6; cvt.s64.s32 %rd85, %r418; cvt.u64.u32 %rd36, %r801; add.s64 %rd37, %rd85, %rd36; mul.lo.s64 %rd86, %rd2, %rd12; add.s64 %rd87, %rd37, %rd86; add.s64 %rd88, %rd87, %rd11; add.s64 %rd38, %rd11, %rd36; setp.ge.u64 %p16, %rd38, %rd2; shl.b64 %rd89, %rd88, 1; mad.lo.s32 %r421, %r756, %r758, %r759; cvt.s64.s32 %rd90, %r421; mul.lo.s64 %rd91, %rd4, %rd90; add.s64 %rd92, %rd91, %rd89; cvta.to.global.u64 %rd93, %rd3; add.s64 %rd39, %rd93, %rd92; mov.u16 %rs260, 0; or.pred %p17, %p16, %p15; mov.u16 %rs259, %rs260; @%p17 bra $L__BB0_8; ld.global.u16 %rs259, [%rd39]; $L__BB0_8: add.s64 %rd40, %rd14, %rd36; setp.ge.u64 %p18, %rd40, %rd2; or.pred %p20, %p18, %p15; @%p20 bra $L__BB0_10; ld.global.u16 %rs260, [%rd39+2]; $L__BB0_10: add.s64 %rd41, %rd15, %rd36; setp.ge.u64 %p21, %rd41, %rd2; mov.u16 %rs262, 0; or.pred %p23, %p21, %p15; mov.u16 %rs261, %rs262; @%p23 bra $L__BB0_12; ld.global.u16 %rs261, [%rd39+16]; $L__BB0_12: add.s64 %rd42, %rd16, %rd36; setp.ge.u64 %p24, %rd42, %rd2; or.pred %p26, %p24, %p15; @%p26 bra $L__BB0_14; ld.global.u16 %rs262, [%rd39+18]; $L__BB0_14: add.s64 %rd43, %rd17, %rd36; setp.ge.u64 %p27, %rd43, %rd2; mov.u16 %rs264, 0; or.pred %p29, %p27, %p15; mov.u16 %rs263, %rs264; @%p29 bra $L__BB0_16; ld.global.u16 %rs263, [%rd39+32]; $L__BB0_16: add.s64 %rd44, %rd18, %rd36; setp.ge.u64 %p30, %rd44, %rd2; or.pred %p32, %p30, %p15; @%p32 bra $L__BB0_18; ld.global.u16 %rs264, [%rd39+34]; $L__BB0_18: add.s64 %rd45, %rd19, %rd36; setp.ge.u64 %p33, %rd45, %rd2; mov.u16 %rs266, 0; or.pred %p35, %p33, %p15; mov.u16 %rs265, %rs266; @%p35 bra $L__BB0_20; ld.global.u16 %rs265, [%rd39+48]; $L__BB0_20: add.s64 %rd46, %rd20, %rd36; setp.ge.u64 %p36, %rd46, %rd2; or.pred %p38, %p36, %p15; @%p38 bra $L__BB0_22; ld.global.u16 %rs266, [%rd39+50]; $L__BB0_22: add.s64 %rd47, %rd21, %rd36; setp.ge.u64 %p39, %rd47, %rd2; mov.u16 %rs268, 0; or.pred %p41, %p39, %p15; mov.u16 %rs267, %rs268; @%p41 bra $L__BB0_24; ld.global.u16 %rs267, [%rd39+64]; $L__BB0_24: add.s64 %rd48, %rd22, %rd36; setp.ge.u64 %p42, %rd48, %rd2; or.pred %p44, %p42, %p15; @%p44 bra $L__BB0_26; ld.global.u16 %rs268, [%rd39+66]; $L__BB0_26: add.s64 %rd49, %rd23, %rd36; setp.ge.u64 %p45, %rd49, %rd2; mov.u16 %rs270, 0; or.pred %p47, %p45, %p15; mov.u16 %rs269, %rs270; @%p47 bra $L__BB0_28; ld.global.u16 %rs269, [%rd39+80]; $L__BB0_28: add.s64 %rd50, %rd24, %rd36; setp.ge.u64 %p48, %rd50, %rd2; or.pred %p50, %p48, %p15; @%p50 bra $L__BB0_30; ld.global.u16 %rs270, [%rd39+82]; $L__BB0_30: add.s64 %rd51, %rd25, %rd36; setp.ge.u64 %p51, %rd51, %rd2; mov.u16 %rs272, 0; or.pred %p53, %p51, %p15; mov.u16 %rs271, %rs272; @%p53 bra $L__BB0_32; ld.global.u16 %rs271, [%rd39+96]; $L__BB0_32: add.s64 %rd52, %rd26, %rd36; setp.ge.u64 %p54, %rd52, %rd2; or.pred %p56, %p54, %p15; @%p56 bra $L__BB0_34; ld.global.u16 %rs272, [%rd39+98]; $L__BB0_34: add.s64 %rd53, %rd27, %rd36; setp.ge.u64 %p57, %rd53, %rd2; mov.u16 %rs274, 0; or.pred %p59, %p57, %p15; mov.u16 %rs273, %rs274; @%p59 bra $L__BB0_36; ld.global.u16 %rs273, [%rd39+112]; $L__BB0_36: add.s64 %rd54, %rd28, %rd36; setp.ge.u64 %p60, %rd54, %rd2; or.pred %p62, %p60, %p15; @%p62 bra $L__BB0_38; ld.global.u16 %rs274, [%rd39+114]; $L__BB0_38: mul.lo.s64 %rd95, %rd2, %rd80; add.s64 %rd96, %rd37, %rd95; add.s64 %rd97, %rd96, %rd11; setp.ge.u64 %p63, %rd29, %rd2; shl.b64 %rd98, %rd97, 1; add.s64 %rd101, %rd91, %rd98; add.s64 %rd55, %rd93, %rd101; mov.u16 %rs276, 0; or.pred %p65, %p16, %p63; mov.u16 %rs275, %rs276; @%p65 bra $L__BB0_40; ld.global.u16 %rs275, [%rd55]; $L__BB0_40: or.pred %p68, %p18, %p63; @%p68 bra $L__BB0_42; ld.global.u16 %rs276, [%rd55+2]; $L__BB0_42: mov.u16 %rs278, 0; or.pred %p71, %p21, %p63; mov.u16 %rs277, %rs278; @%p71 bra $L__BB0_44; ld.global.u16 %rs277, [%rd55+16]; $L__BB0_44: or.pred %p74, %p24, %p63; @%p74 bra $L__BB0_46; ld.global.u16 %rs278, [%rd55+18]; $L__BB0_46: mov.u16 %rs280, 0; or.pred %p77, %p27, %p63; mov.u16 %rs279, %rs280; @%p77 bra $L__BB0_48; ld.global.u16 %rs279, [%rd55+32]; $L__BB0_48: or.pred %p80, %p30, %p63; @%p80 bra $L__BB0_50; ld.global.u16 %rs280, [%rd55+34]; $L__BB0_50: mov.u16 %rs282, 0; or.pred %p83, %p33, %p63; mov.u16 %rs281, %rs282; @%p83 bra $L__BB0_52; ld.global.u16 %rs281, [%rd55+48]; $L__BB0_52: or.pred %p86, %p36, %p63; @%p86 bra $L__BB0_54; ld.global.u16 %rs282, [%rd55+50]; $L__BB0_54: mov.u16 %rs284, 0; or.pred %p89, %p39, %p63; mov.u16 %rs283, %rs284; @%p89 bra $L__BB0_56; ld.global.u16 %rs283, [%rd55+64]; $L__BB0_56: or.pred %p92, %p42, %p63; @%p92 bra $L__BB0_58; ld.global.u16 %rs284, [%rd55+66]; $L__BB0_58: mov.u16 %rs286, 0; or.pred %p95, %p45, %p63; mov.u16 %rs285, %rs286; @%p95 bra $L__BB0_60; ld.global.u16 %rs285, [%rd55+80]; $L__BB0_60: or.pred %p98, %p48, %p63; @%p98 bra $L__BB0_62; ld.global.u16 %rs286, [%rd55+82]; $L__BB0_62: mov.u16 %rs288, 0; or.pred %p101, %p51, %p63; mov.u16 %rs287, %rs288; @%p101 bra $L__BB0_64; ld.global.u16 %rs287, [%rd55+96]; $L__BB0_64: or.pred %p104, %p54, %p63; @%p104 bra $L__BB0_66; ld.global.u16 %rs288, [%rd55+98]; $L__BB0_66: mov.u16 %rs290, 0; or.pred %p107, %p57, %p63; mov.u16 %rs289, %rs290; @%p107 bra $L__BB0_68; ld.global.u16 %rs289, [%rd55+112]; $L__BB0_68: or.pred %p110, %p60, %p63; @%p110 bra $L__BB0_70; ld.global.u16 %rs290, [%rd55+114]; $L__BB0_70: // begin inline asm mov.u32 %r425, 0; // end inline asm // begin inline asm mov.u32 %r426, 0; // end inline asm // begin inline asm mov.u32 %r427, 0; // end inline asm // begin inline asm mov.u32 %r428, 0; // end inline asm // begin inline asm mov.u32 %r429, 0; // end inline asm // begin inline asm mov.u32 %r430, 0; // end inline asm // begin inline asm mov.u32 %r431, 0; // end inline asm // begin inline asm mov.u32 %r432, 0; // end inline asm // begin inline asm mov.u32 %r433, 0; // end inline asm // begin inline asm mov.u32 %r434, 0; // end inline asm // begin inline asm mov.u32 %r435, 0; // end inline asm // begin inline asm mov.u32 %r436, 0; // end inline asm // begin inline asm mov.u32 %r437, 0; // end inline asm // begin inline asm mov.u32 %r438, 0; // end inline asm // begin inline asm mov.u32 %r439, 0; // end inline asm // begin inline asm mov.u32 %r440, 0; // end inline asm // begin inline asm mov.u32 %r441, 0; // end inline asm // begin inline asm mov.u32 %r442, 0; // end inline asm // begin inline asm mov.u32 %r443, 0; // end inline asm // begin inline asm mov.u32 %r444, 0; // end inline asm // begin inline asm mov.u32 %r445, 0; // end inline asm // begin inline asm mov.u32 %r446, 0; // end inline asm // begin inline asm mov.u32 %r447, 0; // end inline asm // begin inline asm mov.u32 %r448, 0; // end inline asm // begin inline asm mov.u32 %r449, 0; // end inline asm // begin inline asm mov.u32 %r450, 0; // end inline asm // begin inline asm mov.u32 %r451, 0; // end inline asm // begin inline asm mov.u32 %r452, 0; // end inline asm // begin inline asm mov.u32 %r453, 0; // end inline asm // begin inline asm mov.u32 %r454, 0; // end inline asm // begin inline asm mov.u32 %r455, 0; // end inline asm // begin inline asm mov.u32 %r456, 0; // end inline asm mov.b32 %f156, %r425; mov.b32 %f157, %r426; mov.b32 %f158, %r427; mov.b32 %f159, %r428; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r237, %r238, %r239, %r240}, {%r807, %r808}, {%f156, %f157, %f158, %f159}; // end inline asm mov.b32 %f164, %r429; mov.b32 %f165, %r430; mov.b32 %f166, %r431; mov.b32 %f167, %r432; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r237, %r238, %r239, %r240}, {%r809, %r810}, {%f164, %f165, %f166, %f167}; // end inline asm mov.b32 %f172, %r433; mov.b32 %f173, %r434; mov.b32 %f174, %r435; mov.b32 %f175, %r436; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f172, %f173, %f174, %f175}, {%r237, %r238, %r239, %r240}, {%r811, %r812}, {%f172, %f173, %f174, %f175}; // end inline asm mov.b32 %f180, %r437; mov.b32 %f181, %r438; mov.b32 %f182, %r439; mov.b32 %f183, %r440; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r237, %r238, %r239, %r240}, {%r813, %r814}, {%f180, %f181, %f182, %f183}; // end inline asm mov.b32 %f188, %r441; mov.b32 %f189, %r442; mov.b32 %f190, %r443; mov.b32 %f191, %r444; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f188, %f189, %f190, %f191}, {%r237, %r238, %r239, %r240}, {%r815, %r816}, {%f188, %f189, %f190, %f191}; // end inline asm mov.b32 %f196, %r445; mov.b32 %f197, %r446; mov.b32 %f198, %r447; mov.b32 %f199, %r448; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f196, %f197, %f198, %f199}, {%r237, %r238, %r239, %r240}, {%r817, %r818}, {%f196, %f197, %f198, %f199}; // end inline asm mov.b32 %f204, %r449; mov.b32 %f205, %r450; mov.b32 %f206, %r451; mov.b32 %f207, %r452; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f204, %f205, %f206, %f207}, {%r237, %r238, %r239, %r240}, {%r819, %r820}, {%f204, %f205, %f206, %f207}; // end inline asm mov.b32 %f212, %r453; mov.b32 %f213, %r454; mov.b32 %f214, %r455; mov.b32 %f215, %r456; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f212, %f213, %f214, %f215}, {%r237, %r238, %r239, %r240}, {%r821, %r822}, {%f212, %f213, %f214, %f215}; // end inline asm mul.ftz.f32 %f220, %f1, %f156; mul.ftz.f32 %f221, %f1, %f157; mul.ftz.f32 %f222, %f1, %f164; mul.ftz.f32 %f223, %f1, %f165; mul.ftz.f32 %f224, %f1, %f158; mul.ftz.f32 %f225, %f1, %f159; mul.ftz.f32 %f226, %f1, %f166; mul.ftz.f32 %f227, %f1, %f167; mul.ftz.f32 %f228, %f1, %f172; mul.ftz.f32 %f229, %f1, %f173; mul.ftz.f32 %f230, %f1, %f180; mul.ftz.f32 %f231, %f1, %f181; mul.ftz.f32 %f232, %f1, %f174; mul.ftz.f32 %f233, %f1, %f175; mul.ftz.f32 %f234, %f1, %f182; mul.ftz.f32 %f235, %f1, %f183; mul.ftz.f32 %f236, %f1, %f188; mul.ftz.f32 %f237, %f1, %f189; mul.ftz.f32 %f238, %f1, %f196; mul.ftz.f32 %f239, %f1, %f197; mul.ftz.f32 %f240, %f1, %f190; mul.ftz.f32 %f241, %f1, %f191; mul.ftz.f32 %f242, %f1, %f198; mul.ftz.f32 %f243, %f1, %f199; mul.ftz.f32 %f244, %f1, %f204; mul.ftz.f32 %f245, %f1, %f205; mul.ftz.f32 %f246, %f1, %f212; mul.ftz.f32 %f247, %f1, %f213; mul.ftz.f32 %f248, %f1, %f206; mul.ftz.f32 %f249, %f1, %f207; mul.ftz.f32 %f250, %f1, %f214; mul.ftz.f32 %f251, %f1, %f215; setp.lt.s32 %p111, %r802, %r1; selp.f32 %f599, %f220, 0fFF800000, %p111; add.s32 %r505, %r802, 1; setp.lt.s32 %p112, %r505, %r1; selp.f32 %f598, %f221, 0fFF800000, %p112; add.s32 %r506, %r802, 8; setp.lt.s32 %p113, %r506, %r1; selp.f32 %f597, %f222, 0fFF800000, %p113; add.s32 %r507, %r802, 9; setp.lt.s32 %p114, %r507, %r1; selp.f32 %f596, %f223, 0fFF800000, %p114; add.s32 %r508, %r802, 16; setp.lt.s32 %p115, %r508, %r1; selp.f32 %f595, %f228, 0fFF800000, %p115; add.s32 %r509, %r802, 17; setp.lt.s32 %p116, %r509, %r1; selp.f32 %f594, %f229, 0fFF800000, %p116; add.s32 %r510, %r802, 24; setp.lt.s32 %p117, %r510, %r1; selp.f32 %f593, %f230, 0fFF800000, %p117; add.s32 %r511, %r802, 25; setp.lt.s32 %p118, %r511, %r1; selp.f32 %f592, %f231, 0fFF800000, %p118; add.s32 %r512, %r802, 32; setp.lt.s32 %p119, %r512, %r1; selp.f32 %f591, %f236, 0fFF800000, %p119; add.s32 %r513, %r802, 33; setp.lt.s32 %p120, %r513, %r1; selp.f32 %f590, %f237, 0fFF800000, %p120; add.s32 %r514, %r802, 40; setp.lt.s32 %p121, %r514, %r1; selp.f32 %f589, %f238, 0fFF800000, %p121; add.s32 %r515, %r802, 41; setp.lt.s32 %p122, %r515, %r1; selp.f32 %f588, %f239, 0fFF800000, %p122; add.s32 %r516, %r802, 48; setp.lt.s32 %p123, %r516, %r1; selp.f32 %f587, %f244, 0fFF800000, %p123; add.s32 %r517, %r802, 49; setp.lt.s32 %p124, %r517, %r1; selp.f32 %f586, %f245, 0fFF800000, %p124; add.s32 %r518, %r802, 56; setp.lt.s32 %p125, %r518, %r1; selp.f32 %f585, %f246, 0fFF800000, %p125; add.s32 %r519, %r802, 57; setp.lt.s32 %p126, %r519, %r1; selp.f32 %f584, %f247, 0fFF800000, %p126; selp.f32 %f583, %f224, 0fFF800000, %p111; selp.f32 %f582, %f225, 0fFF800000, %p112; selp.f32 %f581, %f226, 0fFF800000, %p113; selp.f32 %f580, %f227, 0fFF800000, %p114; selp.f32 %f579, %f232, 0fFF800000, %p115; selp.f32 %f578, %f233, 0fFF800000, %p116; selp.f32 %f577, %f234, 0fFF800000, %p117; selp.f32 %f576, %f235, 0fFF800000, %p118; selp.f32 %f575, %f240, 0fFF800000, %p119; selp.f32 %f574, %f241, 0fFF800000, %p120; selp.f32 %f573, %f242, 0fFF800000, %p121; selp.f32 %f572, %f243, 0fFF800000, %p122; selp.f32 %f571, %f248, 0fFF800000, %p123; selp.f32 %f570, %f249, 0fFF800000, %p124; selp.f32 %f569, %f250, 0fFF800000, %p125; selp.f32 %f568, %f251, 0fFF800000, %p126; @%p14 bra $L__BB0_72; // begin inline asm cvt.f32.f16 %f252, %rs259; // end inline asm add.ftz.f32 %f599, %f252, %f599; // begin inline asm cvt.f32.f16 %f253, %rs260; // end inline asm add.ftz.f32 %f598, %f253, %f598; // begin inline asm cvt.f32.f16 %f254, %rs261; // end inline asm add.ftz.f32 %f597, %f254, %f597; // begin inline asm cvt.f32.f16 %f255, %rs262; // end inline asm add.ftz.f32 %f596, %f255, %f596; // begin inline asm cvt.f32.f16 %f256, %rs263; // end inline asm add.ftz.f32 %f595, %f256, %f595; // begin inline asm cvt.f32.f16 %f257, %rs264; // end inline asm add.ftz.f32 %f594, %f257, %f594; // begin inline asm cvt.f32.f16 %f258, %rs265; // end inline asm add.ftz.f32 %f593, %f258, %f593; // begin inline asm cvt.f32.f16 %f259, %rs266; // end inline asm add.ftz.f32 %f592, %f259, %f592; // begin inline asm cvt.f32.f16 %f260, %rs267; // end inline asm add.ftz.f32 %f591, %f260, %f591; // begin inline asm cvt.f32.f16 %f261, %rs268; // end inline asm add.ftz.f32 %f590, %f261, %f590; // begin inline asm cvt.f32.f16 %f262, %rs269; // end inline asm add.ftz.f32 %f589, %f262, %f589; // begin inline asm cvt.f32.f16 %f263, %rs270; // end inline asm add.ftz.f32 %f588, %f263, %f588; // begin inline asm cvt.f32.f16 %f264, %rs271; // end inline asm add.ftz.f32 %f587, %f264, %f587; // begin inline asm cvt.f32.f16 %f265, %rs272; // end inline asm add.ftz.f32 %f586, %f265, %f586; // begin inline asm cvt.f32.f16 %f266, %rs273; // end inline asm add.ftz.f32 %f585, %f266, %f585; // begin inline asm cvt.f32.f16 %f267, %rs274; // end inline asm add.ftz.f32 %f584, %f267, %f584; // begin inline asm cvt.f32.f16 %f268, %rs275; // end inline asm add.ftz.f32 %f583, %f268, %f583; // begin inline asm cvt.f32.f16 %f269, %rs276; // end inline asm add.ftz.f32 %f582, %f269, %f582; // begin inline asm cvt.f32.f16 %f270, %rs277; // end inline asm add.ftz.f32 %f581, %f270, %f581; // begin inline asm cvt.f32.f16 %f271, %rs278; // end inline asm add.ftz.f32 %f580, %f271, %f580; // begin inline asm cvt.f32.f16 %f272, %rs279; // end inline asm add.ftz.f32 %f579, %f272, %f579; // begin inline asm cvt.f32.f16 %f273, %rs280; // end inline asm add.ftz.f32 %f578, %f273, %f578; // begin inline asm cvt.f32.f16 %f274, %rs281; // end inline asm add.ftz.f32 %f577, %f274, %f577; // begin inline asm cvt.f32.f16 %f275, %rs282; // end inline asm add.ftz.f32 %f576, %f275, %f576; // begin inline asm cvt.f32.f16 %f276, %rs283; // end inline asm add.ftz.f32 %f575, %f276, %f575; // begin inline asm cvt.f32.f16 %f277, %rs284; // end inline asm add.ftz.f32 %f574, %f277, %f574; // begin inline asm cvt.f32.f16 %f278, %rs285; // end inline asm add.ftz.f32 %f573, %f278, %f573; // begin inline asm cvt.f32.f16 %f279, %rs286; // end inline asm add.ftz.f32 %f572, %f279, %f572; // begin inline asm cvt.f32.f16 %f280, %rs287; // end inline asm add.ftz.f32 %f571, %f280, %f571; // begin inline asm cvt.f32.f16 %f281, %rs288; // end inline asm add.ftz.f32 %f570, %f281, %f570; // begin inline asm cvt.f32.f16 %f282, %rs289; // end inline asm add.ftz.f32 %f569, %f282, %f569; // begin inline asm cvt.f32.f16 %f283, %rs290; // end inline asm add.ftz.f32 %f568, %f283, %f568; $L__BB0_72: add.s32 %r760, %r801, 64; setp.ge.s32 %p178, %r760, %r58; setp.gt.ftz.f32 %p129, %f599, %f598; selp.f32 %f284, %f599, %f598, %p129; setp.gt.ftz.f32 %p130, %f284, %f597; selp.f32 %f285, %f284, %f597, %p130; setp.gt.ftz.f32 %p131, %f285, %f596; selp.f32 %f286, %f285, %f596, %p131; setp.gt.ftz.f32 %p132, %f286, %f595; selp.f32 %f287, %f286, %f595, %p132; setp.gt.ftz.f32 %p133, %f287, %f594; selp.f32 %f288, %f287, %f594, %p133; setp.gt.ftz.f32 %p134, %f288, %f593; selp.f32 %f289, %f288, %f593, %p134; setp.gt.ftz.f32 %p135, %f289, %f592; selp.f32 %f290, %f289, %f592, %p135; setp.gt.ftz.f32 %p136, %f290, %f591; selp.f32 %f291, %f290, %f591, %p136; setp.gt.ftz.f32 %p137, %f291, %f590; selp.f32 %f292, %f291, %f590, %p137; setp.gt.ftz.f32 %p138, %f292, %f589; selp.f32 %f293, %f292, %f589, %p138; setp.gt.ftz.f32 %p139, %f293, %f588; selp.f32 %f294, %f293, %f588, %p139; setp.gt.ftz.f32 %p140, %f294, %f587; selp.f32 %f295, %f294, %f587, %p140; setp.gt.ftz.f32 %p141, %f295, %f586; selp.f32 %f296, %f295, %f586, %p141; setp.gt.ftz.f32 %p142, %f296, %f585; selp.f32 %f297, %f296, %f585, %p142; setp.gt.ftz.f32 %p143, %f297, %f584; selp.f32 %f298, %f297, %f584, %p143; setp.gt.ftz.f32 %p144, %f583, %f582; selp.f32 %f299, %f583, %f582, %p144; setp.gt.ftz.f32 %p145, %f299, %f581; selp.f32 %f300, %f299, %f581, %p145; setp.gt.ftz.f32 %p146, %f300, %f580; selp.f32 %f301, %f300, %f580, %p146; setp.gt.ftz.f32 %p147, %f301, %f579; selp.f32 %f302, %f301, %f579, %p147; setp.gt.ftz.f32 %p148, %f302, %f578; selp.f32 %f303, %f302, %f578, %p148; setp.gt.ftz.f32 %p149, %f303, %f577; selp.f32 %f304, %f303, %f577, %p149; setp.gt.ftz.f32 %p150, %f304, %f576; selp.f32 %f305, %f304, %f576, %p150; setp.gt.ftz.f32 %p151, %f305, %f575; selp.f32 %f306, %f305, %f575, %p151; setp.gt.ftz.f32 %p152, %f306, %f574; selp.f32 %f307, %f306, %f574, %p152; setp.gt.ftz.f32 %p153, %f307, %f573; selp.f32 %f308, %f307, %f573, %p153; setp.gt.ftz.f32 %p154, %f308, %f572; selp.f32 %f309, %f308, %f572, %p154; setp.gt.ftz.f32 %p155, %f309, %f571; selp.f32 %f310, %f309, %f571, %p155; setp.gt.ftz.f32 %p156, %f310, %f570; selp.f32 %f311, %f310, %f570, %p156; setp.gt.ftz.f32 %p157, %f311, %f569; selp.f32 %f312, %f311, %f569, %p157; setp.gt.ftz.f32 %p158, %f312, %f568; selp.f32 %f313, %f312, %f568, %p158; mov.b32 %r521, %f298; mov.u32 %r522, 31; mov.u32 %r523, 1; mov.u32 %r524, -1; shfl.sync.bfly.b32 %r525|%p159, %r521, %r523, %r522, %r524; mov.b32 %f314, %r525; setp.gt.ftz.f32 %p160, %f298, %f314; selp.f32 %f315, %f298, %f314, %p160; mov.b32 %r526, %f315; mov.u32 %r527, 2; shfl.sync.bfly.b32 %r528|%p161, %r526, %r527, %r522, %r524; mov.b32 %f316, %r528; setp.gt.ftz.f32 %p162, %f315, %f316; selp.f32 %f317, %f315, %f316, %p162; mov.b32 %r529, %f313; shfl.sync.bfly.b32 %r530|%p163, %r529, %r523, %r522, %r524; mov.b32 %f318, %r530; setp.gt.ftz.f32 %p164, %f313, %f318; selp.f32 %f319, %f313, %f318, %p164; mov.b32 %r531, %f319; shfl.sync.bfly.b32 %r532|%p165, %r531, %r527, %r522, %r524; mov.b32 %f320, %r532; setp.gt.ftz.f32 %p166, %f319, %f320; selp.f32 %f321, %f319, %f320, %p166; max.ftz.f32 %f102, %f317, %f567; max.ftz.f32 %f103, %f321, %f566; sub.ftz.f32 %f322, %f599, %f102; mul.ftz.f32 %f323, %f322, 0f3FB8AA3B; ex2.approx.ftz.f32 %f104, %f323; sub.ftz.f32 %f324, %f598, %f102; mul.ftz.f32 %f325, %f324, 0f3FB8AA3B; ex2.approx.ftz.f32 %f105, %f325; sub.ftz.f32 %f326, %f597, %f102; mul.ftz.f32 %f327, %f326, 0f3FB8AA3B; ex2.approx.ftz.f32 %f106, %f327; sub.ftz.f32 %f328, %f596, %f102; mul.ftz.f32 %f329, %f328, 0f3FB8AA3B; ex2.approx.ftz.f32 %f107, %f329; sub.ftz.f32 %f330, %f595, %f102; mul.ftz.f32 %f331, %f330, 0f3FB8AA3B; ex2.approx.ftz.f32 %f108, %f331; sub.ftz.f32 %f332, %f594, %f102; mul.ftz.f32 %f333, %f332, 0f3FB8AA3B; ex2.approx.ftz.f32 %f109, %f333; sub.ftz.f32 %f334, %f593, %f102; mul.ftz.f32 %f335, %f334, 0f3FB8AA3B; ex2.approx.ftz.f32 %f110, %f335; sub.ftz.f32 %f336, %f592, %f102; mul.ftz.f32 %f337, %f336, 0f3FB8AA3B; ex2.approx.ftz.f32 %f111, %f337; sub.ftz.f32 %f338, %f591, %f102; mul.ftz.f32 %f339, %f338, 0f3FB8AA3B; ex2.approx.ftz.f32 %f112, %f339; sub.ftz.f32 %f340, %f590, %f102; mul.ftz.f32 %f341, %f340, 0f3FB8AA3B; ex2.approx.ftz.f32 %f113, %f341; sub.ftz.f32 %f342, %f589, %f102; mul.ftz.f32 %f343, %f342, 0f3FB8AA3B; ex2.approx.ftz.f32 %f114, %f343; sub.ftz.f32 %f344, %f588, %f102; mul.ftz.f32 %f345, %f344, 0f3FB8AA3B; ex2.approx.ftz.f32 %f115, %f345; sub.ftz.f32 %f346, %f587, %f102; mul.ftz.f32 %f347, %f346, 0f3FB8AA3B; ex2.approx.ftz.f32 %f116, %f347; sub.ftz.f32 %f348, %f586, %f102; mul.ftz.f32 %f349, %f348, 0f3FB8AA3B; ex2.approx.ftz.f32 %f117, %f349; sub.ftz.f32 %f350, %f585, %f102; mul.ftz.f32 %f351, %f350, 0f3FB8AA3B; ex2.approx.ftz.f32 %f118, %f351; sub.ftz.f32 %f352, %f584, %f102; mul.ftz.f32 %f353, %f352, 0f3FB8AA3B; ex2.approx.ftz.f32 %f119, %f353; sub.ftz.f32 %f354, %f583, %f103; mul.ftz.f32 %f355, %f354, 0f3FB8AA3B; ex2.approx.ftz.f32 %f120, %f355; sub.ftz.f32 %f356, %f582, %f103; mul.ftz.f32 %f357, %f356, 0f3FB8AA3B; ex2.approx.ftz.f32 %f121, %f357; sub.ftz.f32 %f358, %f581, %f103; mul.ftz.f32 %f359, %f358, 0f3FB8AA3B; ex2.approx.ftz.f32 %f122, %f359; sub.ftz.f32 %f360, %f580, %f103; mul.ftz.f32 %f361, %f360, 0f3FB8AA3B; ex2.approx.ftz.f32 %f123, %f361; sub.ftz.f32 %f362, %f579, %f103; mul.ftz.f32 %f363, %f362, 0f3FB8AA3B; ex2.approx.ftz.f32 %f124, %f363; sub.ftz.f32 %f364, %f578, %f103; mul.ftz.f32 %f365, %f364, 0f3FB8AA3B; ex2.approx.ftz.f32 %f125, %f365; sub.ftz.f32 %f366, %f577, %f103; mul.ftz.f32 %f367, %f366, 0f3FB8AA3B; ex2.approx.ftz.f32 %f126, %f367; sub.ftz.f32 %f368, %f576, %f103; mul.ftz.f32 %f369, %f368, 0f3FB8AA3B; ex2.approx.ftz.f32 %f127, %f369; sub.ftz.f32 %f370, %f575, %f103; mul.ftz.f32 %f371, %f370, 0f3FB8AA3B; ex2.approx.ftz.f32 %f128, %f371; sub.ftz.f32 %f372, %f574, %f103; mul.ftz.f32 %f373, %f372, 0f3FB8AA3B; ex2.approx.ftz.f32 %f129, %f373; sub.ftz.f32 %f374, %f573, %f103; mul.ftz.f32 %f375, %f374, 0f3FB8AA3B; ex2.approx.ftz.f32 %f130, %f375; sub.ftz.f32 %f376, %f572, %f103; mul.ftz.f32 %f377, %f376, 0f3FB8AA3B; ex2.approx.ftz.f32 %f131, %f377; sub.ftz.f32 %f378, %f571, %f103; mul.ftz.f32 %f379, %f378, 0f3FB8AA3B; ex2.approx.ftz.f32 %f132, %f379; sub.ftz.f32 %f380, %f570, %f103; mul.ftz.f32 %f381, %f380, 0f3FB8AA3B; ex2.approx.ftz.f32 %f133, %f381; sub.ftz.f32 %f382, %f569, %f103; mul.ftz.f32 %f383, %f382, 0f3FB8AA3B; ex2.approx.ftz.f32 %f134, %f383; sub.ftz.f32 %f384, %f568, %f103; mul.ftz.f32 %f385, %f384, 0f3FB8AA3B; ex2.approx.ftz.f32 %f135, %f385; add.ftz.f32 %f386, %f104, %f105; add.ftz.f32 %f387, %f386, 0f00000000; add.ftz.f32 %f388, %f106, %f107; add.ftz.f32 %f389, %f388, 0f00000000; add.ftz.f32 %f390, %f108, %f109; add.ftz.f32 %f391, %f387, %f390; add.ftz.f32 %f392, %f110, %f111; add.ftz.f32 %f393, %f389, %f392; add.ftz.f32 %f394, %f112, %f113; add.ftz.f32 %f395, %f391, %f394; add.ftz.f32 %f396, %f114, %f115; add.ftz.f32 %f397, %f393, %f396; add.ftz.f32 %f398, %f116, %f117; add.ftz.f32 %f399, %f395, %f398; add.ftz.f32 %f400, %f118, %f119; add.ftz.f32 %f401, %f397, %f400; add.ftz.f32 %f402, %f399, %f401; add.ftz.f32 %f403, %f120, %f121; add.ftz.f32 %f404, %f403, 0f00000000; add.ftz.f32 %f405, %f122, %f123; add.ftz.f32 %f406, %f405, 0f00000000; add.ftz.f32 %f407, %f124, %f125; add.ftz.f32 %f408, %f404, %f407; add.ftz.f32 %f409, %f126, %f127; add.ftz.f32 %f410, %f406, %f409; add.ftz.f32 %f411, %f128, %f129; add.ftz.f32 %f412, %f408, %f411; add.ftz.f32 %f413, %f130, %f131; add.ftz.f32 %f414, %f410, %f413; add.ftz.f32 %f415, %f132, %f133; add.ftz.f32 %f416, %f412, %f415; add.ftz.f32 %f417, %f134, %f135; add.ftz.f32 %f418, %f414, %f417; add.ftz.f32 %f419, %f416, %f418; mov.b32 %r533, %f402; shfl.sync.bfly.b32 %r534|%p167, %r533, %r523, %r522, %r524; mov.b32 %f420, %r534; add.ftz.f32 %f421, %f402, %f420; mov.b32 %r535, %f421; shfl.sync.bfly.b32 %r536|%p168, %r535, %r527, %r522, %r524; mov.b32 %f422, %r536; add.ftz.f32 %f423, %f421, %f422; mov.b32 %r537, %f419; shfl.sync.bfly.b32 %r538|%p169, %r537, %r523, %r522, %r524; mov.b32 %f424, %r538; add.ftz.f32 %f425, %f419, %f424; mov.b32 %r539, %f425; shfl.sync.bfly.b32 %r540|%p170, %r539, %r527, %r522, %r524; mov.b32 %f426, %r540; add.ftz.f32 %f427, %f425, %f426; sub.ftz.f32 %f428, %f567, %f102; mul.ftz.f32 %f429, %f428, 0f3FB8AA3B; ex2.approx.ftz.f32 %f430, %f429; mul.ftz.f32 %f136, %f430, %f565; add.ftz.f32 %f565, %f136, %f423; sub.ftz.f32 %f431, %f566, %f103; mul.ftz.f32 %f432, %f431, 0f3FB8AA3B; ex2.approx.ftz.f32 %f433, %f432; mul.ftz.f32 %f138, %f433, %f564; add.ftz.f32 %f564, %f138, %f427; @%p178 bra $L__BB0_74; // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r807, %r808, %r809, %r810}, [%r246]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r811, %r812, %r813, %r814}, [%r251]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r815, %r816, %r817, %r818}, [%r256]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r819, %r820, %r821, %r822}, [%r261]; // end inline asm $L__BB0_74: // begin inline asm cvt.rn.f16x2.f32 %r561, %f105, %f104; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r562, %f121, %f120; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r563, %f107, %f106; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r564, %f123, %f122; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r565, %f109, %f108; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r566, %f125, %f124; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r567, %f111, %f110; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r568, %f127, %f126; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r569, %f113, %f112; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r570, %f129, %f128; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r571, %f115, %f114; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r572, %f131, %f130; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r573, %f117, %f116; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r574, %f133, %f132; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r575, %f119, %f118; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r576, %f135, %f134; // end inline asm // begin inline asm mov.u32 %r577, 0; // end inline asm // begin inline asm mov.u32 %r578, 0; // end inline asm // begin inline asm mov.u32 %r579, 0; // end inline asm // begin inline asm mov.u32 %r580, 0; // end inline asm // begin inline asm mov.u32 %r581, 0; // end inline asm // begin inline asm mov.u32 %r582, 0; // end inline asm // begin inline asm mov.u32 %r583, 0; // end inline asm // begin inline asm mov.u32 %r584, 0; // end inline asm mov.b32 %f493, %r584; mov.b32 %f492, %r583; mov.b32 %f491, %r582; mov.b32 %f490, %r581; mov.b32 %f485, %r580; mov.b32 %f484, %r579; mov.b32 %f483, %r578; mov.b32 %f482, %r577; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f482, %f483, %f484, %f485}, {%r561, %r562, %r563, %r564}, {%r825, %r824}, {%f482, %f483, %f484, %f485}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f490, %f491, %f492, %f493}, {%r561, %r562, %r563, %r564}, {%r823, %r826}, {%f490, %f491, %f492, %f493}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f482, %f483, %f484, %f485}, {%r565, %r566, %r567, %r568}, {%r827, %r828}, {%f482, %f483, %f484, %f485}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f490, %f491, %f492, %f493}, {%r565, %r566, %r567, %r568}, {%r829, %r830}, {%f490, %f491, %f492, %f493}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f482, %f483, %f484, %f485}, {%r569, %r570, %r571, %r572}, {%r831, %r832}, {%f482, %f483, %f484, %f485}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f490, %f491, %f492, %f493}, {%r569, %r570, %r571, %r572}, {%r833, %r834}, {%f490, %f491, %f492, %f493}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f482, %f483, %f484, %f485}, {%r573, %r574, %r575, %r576}, {%r835, %r836}, {%f482, %f483, %f484, %f485}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f490, %f491, %f492, %f493}, {%r573, %r574, %r575, %r576}, {%r837, %r838}, {%f490, %f491, %f492, %f493}; // end inline asm setp.equ.ftz.f32 %p171, %f565, 0f00000000; mov.f32 %f601, 0f3F800000; mov.f32 %f600, %f601; @%p171 bra $L__BB0_76; rcp.approx.ftz.f32 %f600, %f565; $L__BB0_76: setp.equ.ftz.f32 %p172, %f564, 0f00000000; @%p172 bra $L__BB0_78; rcp.approx.ftz.f32 %f601, %f564; $L__BB0_78: add.s32 %r701, %r801, 64; setp.ge.s32 %p177, %r701, %r58; mov.b32 %f532, %r846; fma.rn.ftz.f32 %f533, %f136, %f532, %f482; mul.ftz.f32 %f534, %f600, %f533; mov.b32 %r846, %f534; mov.b32 %f535, %r845; fma.rn.ftz.f32 %f536, %f136, %f535, %f483; mul.ftz.f32 %f537, %f600, %f536; mov.b32 %r845, %f537; mov.b32 %f538, %r844; fma.rn.ftz.f32 %f539, %f138, %f538, %f484; mul.ftz.f32 %f540, %f601, %f539; mov.b32 %r844, %f540; mov.b32 %f541, %r843; fma.rn.ftz.f32 %f542, %f138, %f541, %f485; mul.ftz.f32 %f543, %f601, %f542; mov.b32 %r843, %f543; mov.b32 %f544, %r842; fma.rn.ftz.f32 %f545, %f136, %f544, %f490; mul.ftz.f32 %f546, %f600, %f545; mov.b32 %r842, %f546; mov.b32 %f547, %r841; fma.rn.ftz.f32 %f548, %f136, %f547, %f491; mul.ftz.f32 %f549, %f600, %f548; mov.b32 %r841, %f549; mov.b32 %f550, %r840; fma.rn.ftz.f32 %f551, %f138, %f550, %f492; mul.ftz.f32 %f552, %f601, %f551; mov.b32 %r840, %f552; mov.b32 %f553, %r839; fma.rn.ftz.f32 %f554, %f138, %f553, %f493; mul.ftz.f32 %f555, %f601, %f554; mov.b32 %r839, %f555; @%p177 bra $L__BB0_80; add.s32 %r667, %r371, %r378; add.s32 %r638, %r667, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r825, %r824, %r823, %r826}, [%r638]; // end inline asm add.s32 %r643, %r667, 4608; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r827, %r828, %r829, %r830}, [%r643]; // end inline asm add.s32 %r648, %r667, 5120; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r831, %r832, %r833, %r834}, [%r648]; // end inline asm add.s32 %r653, %r667, 5632; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r835, %r836, %r837, %r838}, [%r653]; // end inline asm $L__BB0_80: add.s32 %r801, %r801, 64; setp.lt.s32 %p174, %r801, %r58; add.s32 %r802, %r802, 64; mov.f32 %f566, %f103; mov.f32 %f567, %f102; @%p174 bra $L__BB0_3; $L__BB0_81: mov.u32 %r727, %tid.x; shr.s32 %r726, %r727, 31; shr.u32 %r725, %r726, 29; add.s32 %r724, %r727, %r725; shr.s32 %r723, %r724, 3; shr.u32 %r722, %r727, 31; add.s32 %r721, %r727, %r722; mov.u32 %r720, %ctaid.z; shl.b32 %r719, %r720, 6; shr.s32 %r718, %r721, 1; add.s32 %r717, %r718, %r719; shr.u32 %r716, %r724, 31; add.s32 %r715, %r723, %r716; and.b32 %r714, %r715, 268435454; and.b32 %r713, %r724, 268435448; sub.s32 %r712, %r727, %r713; sub.s32 %r711, %r723, %r714; xor.b32 %r710, %r711, %r712; shl.b32 %r709, %r723, 7; shl.b32 %r708, %r710, 4; add.s32 %r707, %r378, 2048; add.s32 %r706, %r708, %r709; add.s32 %r705, %r706, %r707; shl.b32 %r704, %r727, 1; and.b32 %r703, %r727, 16; bar.sync 0; mov.b32 %f556, %r845; mov.b32 %f557, %r846; // begin inline asm cvt.rn.f16x2.f32 %r668, %f556, %f557; // end inline asm mov.b32 %f558, %r843; mov.b32 %f559, %r844; // begin inline asm cvt.rn.f16x2.f32 %r669, %f558, %f559; // end inline asm and.b32 %r686, %r727, 224; shr.u32 %r687, %r686, 3; shr.u32 %r689, %r703, 4; or.b32 %r690, %r687, %r689; shl.b32 %r691, %r690, 7; add.s32 %r693, %r691, %r378; and.b32 %r695, %r704, 24; and.b32 %r696, %r727, 3; or.b32 %r697, %r695, %r696; shl.b32 %r698, %r697, 2; or.b32 %r699, %r698, %r703; add.s32 %r700, %r693, %r699; add.s32 %r670, %r700, 2048; // begin inline asm st.shared.b32 [%r670], %r668; // end inline asm add.s32 %r672, %r700, 2304; // begin inline asm st.shared.b32 [%r672], %r669; // end inline asm xor.b32 %r676, %r670, 16; mov.b32 %f560, %r841; mov.b32 %f561, %r842; // begin inline asm cvt.rn.f16x2.f32 %r674, %f560, %f561; // end inline asm mov.b32 %f562, %r839; mov.b32 %f563, %r840; // begin inline asm cvt.rn.f16x2.f32 %r675, %f562, %f563; // end inline asm // begin inline asm st.shared.b32 [%r676], %r674; // end inline asm add.s32 %r678, %r676, 256; // begin inline asm st.shared.b32 [%r678], %r675; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r680, %r681, %r682, %r683}, [%r705]; // end inline asm setp.ge.s32 %p175, %r717, %r1; @%p175 bra $L__BB0_84; mov.b64 %rd106, fmha_v2_flash_attention_fp16_64_64_S_16_sm86_kernel_nl_param_0; mov.u64 %rd105, %rd106; ld.param.u32 %r737, [%rd105+60]; mov.u32 %r736, %tid.x; shr.s32 %r735, %r737, 31; shr.u32 %r734, %r735, 29; add.s32 %r733, %r737, %r734; shr.s32 %r732, %r733, 3; shr.u32 %r731, %r736, 31; add.s32 %r730, %r736, %r731; and.b32 %r729, %r730, -2; sub.s32 %r728, %r736, %r729; setp.ge.s32 %p176, %r728, %r732; @%p176 bra $L__BB0_84; mov.u32 %r755, %tid.x; mov.b64 %rd116, fmha_v2_flash_attention_fp16_64_64_S_16_sm86_kernel_nl_param_0; mov.u64 %rd115, %rd116; shr.u32 %r754, %r755, 31; add.s32 %r753, %r755, %r754; mov.u32 %r752, %ctaid.z; shl.b32 %r751, %r752, 6; shr.s32 %r750, %r753, 1; add.s32 %r749, %r750, %r751; cvt.s64.s32 %rd114, %r749; ld.param.u64 %rd113, [%rd115+32]; and.b32 %r748, %r753, -2; sub.s32 %r747, %r755, %r748; shl.b32 %r746, %r747, 4; cvt.s64.s32 %rd112, %r746; mul.lo.s64 %rd111, %rd113, %rd114; mov.u32 %r745, %ctaid.y; mov.u32 %r744, %ctaid.x; ld.param.u32 %r743, [fmha_v2_flash_attention_fp16_64_64_S_16_sm86_kernel_nl_param_0+52]; mul.lo.s32 %r742, %r1, %r745; ld.param.u32 %r741, [%rd115+60]; mad.lo.s32 %r740, %r742, %r743, %r744; mul.lo.s32 %r739, %r740, %r741; shl.b32 %r738, %r739, 1; cvt.s64.s32 %rd110, %r738; add.s64 %rd109, %rd111, %rd112; add.s64 %rd108, %rd109, %rd110; ld.param.u64 %rd107, [%rd115+8]; cvta.to.global.u64 %rd103, %rd107; add.s64 %rd104, %rd103, %rd108; st.global.v4.u32 [%rd104], {%r680, %r681, %r682, %r683}; $L__BB0_84: ret; }