// Cuda compilation tools, release 11.8, V11.8.85 // Based on NVVM 7.0.1 // .version 7.8 .target sm_86 .address_size 64 // .globl fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled .extern .shared .align 16 .b8 _ZN25fused_multihead_attention5smem_E[]; .visible .entry fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled( .param .align 8 .b8 fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0[208] ) { .reg .pred %p<361>; .reg .b16 %rs<4>; .reg .f32 %f<3142>; .reg .b32 %r<3168>; .reg .b64 %rd<180>; mov.b64 %rd36, fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0; mov.u64 %rd1, %rd36; ld.param.u32 %r1, [fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0+40]; ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0+36]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; shl.b32 %r5, %r4, 6; setp.le.s32 %p66, %r1, %r5; @%p66 bra $L__BB0_47; mov.u32 %r475, %tid.x; mov.u32 %r476, %ctaid.z; mul.lo.s32 %r477, %r1, %r476; mad.lo.s32 %r478, %r477, %r2, %r3; shr.s32 %r479, %r475, 31; shr.u32 %r480, %r479, 27; add.s32 %r481, %r475, %r480; and.b32 %r482, %r481, -32; sub.s32 %r483, %r475, %r482; shr.u32 %r484, %r479, 25; add.s32 %r485, %r475, %r484; shr.s32 %r486, %r485, 7; shl.b32 %r487, %r486, 4; shr.s32 %r488, %r483, 31; shr.u32 %r489, %r488, 30; add.s32 %r490, %r483, %r489; and.b32 %r491, %r490, 2147483644; sub.s32 %r492, %r483, %r491; shl.b32 %r493, %r492, 1; add.s32 %r6, %r493, %r487; shr.s32 %r494, %r481, 5; shr.s32 %r495, %r481, 31; shr.u32 %r496, %r495, 30; add.s32 %r497, %r494, %r496; and.b32 %r498, %r497, 268435452; sub.s32 %r499, %r494, %r498; shl.b32 %r500, %r499, 4; shr.s32 %r501, %r490, 2; add.s32 %r7, %r500, %r501; ld.param.u32 %r8, [%rd1+200]; shr.u32 %r502, %r479, 29; add.s32 %r503, %r475, %r502; 
and.b32 %r504, %r503, -8; sub.s32 %r505, %r475, %r504; shl.b32 %r506, %r505, 4; cvt.s64.s32 %rd172, %r506; shr.s32 %r9, %r503, 3; add.s32 %r507, %r9, %r5; cvt.s64.s32 %rd37, %r507; ld.param.u64 %rd3, [%rd1+168]; mul.lo.s64 %rd38, %rd3, %rd37; mul.wide.s32 %rd39, %r478, 192; add.s64 %rd40, %rd38, %rd172; add.s64 %rd41, %rd40, %rd39; ld.param.u64 %rd42, [%rd1+144]; add.s64 %rd176, %rd42, %rd41; sub.s32 %r10, %r1, %r5; shr.s32 %r508, %r503, 31; shr.u32 %r509, %r508, 29; add.s32 %r510, %r9, %r509; and.b32 %r511, %r510, 268435448; sub.s32 %r512, %r9, %r511; xor.b32 %r513, %r512, %r505; shl.b32 %r514, %r9, 7; shl.b32 %r515, %r513, 4; add.s32 %r11, %r515, %r514; mov.u32 %r516, 31; mov.u32 %r3009, 0; mov.u32 %r517, -1; shfl.sync.idx.b32 %r3062|%p67, %r3009, %r3009, %r516, %r517; shfl.sync.idx.b32 %r3067|%p68, %r3009, %r3009, %r516, %r517; ld.param.u32 %r518, [%rd1+196]; div.s32 %r519, %r3, %r518; ld.param.u64 %rd5, [%rd1+152]; ld.param.u32 %r520, [%rd1+192]; mad.lo.s32 %r521, %r520, %r477, %r519; cvt.s64.s32 %rd43, %r9; ld.param.u64 %rd6, [%rd1+176]; mul.lo.s64 %rd44, %rd6, %rd43; mul.wide.s32 %rd45, %r521, 192; add.s64 %rd46, %rd45, %rd172; add.s64 %rd7, %rd46, %rd44; shfl.sync.idx.b32 %r3064|%p69, %r3009, %r3009, %r516, %r517; shfl.sync.idx.b32 %r3063|%p70, %r3009, %r3009, %r516, %r517; ld.param.u64 %rd8, [%rd1+160]; shr.u32 %r522, %r479, 28; add.s32 %r523, %r475, %r522; and.b32 %r524, %r523, -16; sub.s32 %r16, %r475, %r524; shl.b32 %r525, %r16, 4; cvt.s64.s32 %rd9, %r525; shr.s32 %r17, %r523, 4; cvt.s64.s32 %rd47, %r17; ld.param.u64 %rd10, [%rd1+184]; mul.lo.s64 %rd48, %rd10, %rd47; add.s64 %rd49, %rd45, %rd9; add.s64 %rd11, %rd49, %rd48; shr.s32 %r526, %r523, 31; shr.u32 %r527, %r526, 29; add.s32 %r528, %r17, %r527; and.b32 %r529, %r528, 268435448; sub.s32 %r530, %r17, %r529; xor.b32 %r531, %r530, %r16; shl.b32 %r532, %r17, 8; shl.b32 %r533, %r531, 4; add.s32 %r18, %r533, %r532; shfl.sync.idx.b32 %r3066|%p71, %r3009, %r3009, %r516, %r517; shfl.sync.idx.b32 %r3069|%p72, 
%r3009, %r3009, %r516, %r517; ld.param.u64 %rd12, [%rd1+24]; ld.param.u64 %rd13, [%rd1+8]; add.s32 %r534, %r17, %r5; cvt.s64.s32 %rd14, %r534; setp.le.s32 %p73, %r1, %r8; setp.gt.s32 %p74, %r1, %r8; add.s32 %r535, %r5, 64; min.s32 %r536, %r535, %r1; add.s32 %r537, %r536, 127; shr.s32 %r538, %r537, 31; shr.u32 %r539, %r538, 25; add.s32 %r540, %r537, %r539; and.b32 %r23, %r540, -128; sub.s32 %r541, %r5, %r8; max.s32 %r542, %r541, 0; and.b32 %r543, %r542, 2147483520; selp.b32 %r24, %r543, 0, %p74; @%p73 bra $L__BB0_3; add.s32 %r544, %r5, 63; sub.s32 %r545, %r544, %r8; max.s32 %r546, %r545, 0; and.b32 %r3009, %r546, 2147483520; $L__BB0_3: mov.u32 %r619, _ZN25fused_multihead_attention5smem_E; add.s32 %r27, %r18, %r619; cvt.u64.u32 %rd62, %r24; mul.lo.s64 %rd63, %rd6, %rd62; add.s64 %rd64, %rd7, %rd63; add.s64 %rd171, %rd5, %rd64; mul.lo.s64 %rd65, %rd10, %rd62; add.s64 %rd66, %rd11, %rd65; add.s64 %rd177, %rd8, %rd66; min.s32 %r620, %r10, 64; setp.lt.s32 %p75, %r9, %r620; add.s32 %r621, %r9, 16; setp.lt.s32 %p76, %r621, %r620; add.s32 %r622, %r9, 32; setp.lt.s32 %p77, %r622, %r620; add.s32 %r623, %r9, 48; setp.lt.s32 %p78, %r623, %r620; add.s32 %r28, %r11, %r619; add.s32 %r547, %r28, %r3067; add.s32 %r549, %r547, 2048; add.s32 %r551, %r547, 4096; add.s32 %r553, %r547, 6144; selp.b32 %r548, 16, 0, %p75; // begin inline asm cp.async.cg.shared.global [%r547], [%rd176], 16, %r548; // end inline asm selp.b32 %r550, 16, 0, %p76; shl.b64 %rd67, %rd3, 4; add.s64 %rd51, %rd176, %rd67; // begin inline asm cp.async.cg.shared.global [%r549], [%rd51], 16, %r550; // end inline asm selp.b32 %r552, 16, 0, %p77; add.s64 %rd52, %rd51, %rd67; // begin inline asm cp.async.cg.shared.global [%r551], [%rd52], 16, %r552; // end inline asm selp.b32 %r554, 16, 0, %p78; add.s64 %rd53, %rd52, %rd67; // begin inline asm cp.async.cg.shared.global [%r553], [%rd53], 16, %r554; // end inline asm sub.s32 %r3068, %r1, %r24; min.s32 %r624, %r3068, 128; setp.lt.s32 %p79, %r9, %r624; setp.lt.s32 %p80, 
%r621, %r624; setp.lt.s32 %p81, %r622, %r624; setp.lt.s32 %p82, %r623, %r624; add.s32 %r625, %r9, 64; setp.lt.s32 %p83, %r625, %r624; add.s32 %r626, %r9, 80; setp.lt.s32 %p84, %r626, %r624; add.s32 %r627, %r9, 96; setp.lt.s32 %p85, %r627, %r624; add.s32 %r628, %r9, 112; setp.lt.s32 %p86, %r628, %r624; selp.b32 %r566, 16, 0, %p84; add.s32 %r30, %r28, 16384; add.s32 %r555, %r30, %r3063; add.s32 %r557, %r555, 2048; add.s32 %r559, %r555, 4096; add.s32 %r561, %r555, 6144; add.s32 %r563, %r555, 8192; add.s32 %r565, %r555, 10240; add.s32 %r567, %r555, 12288; add.s32 %r569, %r555, 14336; selp.b32 %r556, 16, 0, %p79; // begin inline asm cp.async.cg.shared.global [%r555], [%rd171], 16, %r556; // end inline asm selp.b32 %r558, 16, 0, %p80; shl.b64 %rd68, %rd6, 4; add.s64 %rd55, %rd171, %rd68; // begin inline asm cp.async.cg.shared.global [%r557], [%rd55], 16, %r558; // end inline asm selp.b32 %r560, 16, 0, %p81; add.s64 %rd56, %rd55, %rd68; // begin inline asm cp.async.cg.shared.global [%r559], [%rd56], 16, %r560; // end inline asm selp.b32 %r562, 16, 0, %p82; add.s64 %rd57, %rd56, %rd68; // begin inline asm cp.async.cg.shared.global [%r561], [%rd57], 16, %r562; // end inline asm selp.b32 %r564, 16, 0, %p83; add.s64 %rd58, %rd57, %rd68; // begin inline asm cp.async.cg.shared.global [%r563], [%rd58], 16, %r564; // end inline asm add.s64 %rd59, %rd58, %rd68; // begin inline asm cp.async.cg.shared.global [%r565], [%rd59], 16, %r566; // end inline asm selp.b32 %r568, 16, 0, %p85; add.s64 %rd60, %rd59, %rd68; // begin inline asm cp.async.cg.shared.global [%r567], [%rd60], 16, %r568; // end inline asm selp.b32 %r570, 16, 0, %p86; add.s64 %rd61, %rd60, %rd68; // begin inline asm cp.async.cg.shared.global [%r569], [%rd61], 16, %r570; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm ld.param.f32 %f1, [%rd1+48]; // begin inline asm mov.u32 %r3057, 0; // end inline asm // begin inline asm mov.u32 %r3056, 0; // end inline asm // begin inline asm mov.u32 
%r3055, 0; // end inline asm // begin inline asm mov.u32 %r3054, 0; // end inline asm // begin inline asm mov.u32 %r3053, 0; // end inline asm // begin inline asm mov.u32 %r3052, 0; // end inline asm // begin inline asm mov.u32 %r3051, 0; // end inline asm // begin inline asm mov.u32 %r3050, 0; // end inline asm // begin inline asm mov.u32 %r3049, 0; // end inline asm // begin inline asm mov.u32 %r3048, 0; // end inline asm // begin inline asm mov.u32 %r3047, 0; // end inline asm // begin inline asm mov.u32 %r3046, 0; // end inline asm // begin inline asm mov.u32 %r3045, 0; // end inline asm // begin inline asm mov.u32 %r3044, 0; // end inline asm // begin inline asm mov.u32 %r3043, 0; // end inline asm // begin inline asm mov.u32 %r3042, 0; // end inline asm // begin inline asm mov.u32 %r3041, 0; // end inline asm // begin inline asm mov.u32 %r3040, 0; // end inline asm // begin inline asm mov.u32 %r3039, 0; // end inline asm // begin inline asm mov.u32 %r3038, 0; // end inline asm // begin inline asm mov.u32 %r3037, 0; // end inline asm // begin inline asm mov.u32 %r3036, 0; // end inline asm // begin inline asm mov.u32 %r3035, 0; // end inline asm // begin inline asm mov.u32 %r3034, 0; // end inline asm // begin inline asm mov.u32 %r3033, 0; // end inline asm // begin inline asm mov.u32 %r3032, 0; // end inline asm // begin inline asm mov.u32 %r3031, 0; // end inline asm // begin inline asm mov.u32 %r3030, 0; // end inline asm // begin inline asm mov.u32 %r3029, 0; // end inline asm // begin inline asm mov.u32 %r3028, 0; // end inline asm // begin inline asm mov.u32 %r3027, 0; // end inline asm // begin inline asm mov.u32 %r3026, 0; // end inline asm // begin inline asm mov.u32 %r3025, 0; // end inline asm // begin inline asm mov.u32 %r3024, 0; // end inline asm // begin inline asm mov.u32 %r3023, 0; // end inline asm // begin inline asm mov.u32 %r3022, 0; // end inline asm // begin inline asm mov.u32 %r3021, 0; // end inline asm // begin inline asm mov.u32 
%r3020, 0; // end inline asm // begin inline asm mov.u32 %r3019, 0; // end inline asm // begin inline asm mov.u32 %r3018, 0; // end inline asm // begin inline asm mov.u32 %r3017, 0; // end inline asm // begin inline asm mov.u32 %r3016, 0; // end inline asm // begin inline asm mov.u32 %r3015, 0; // end inline asm // begin inline asm mov.u32 %r3014, 0; // end inline asm // begin inline asm mov.u32 %r3013, 0; // end inline asm // begin inline asm mov.u32 %r3012, 0; // end inline asm // begin inline asm mov.u32 %r3011, 0; // end inline asm // begin inline asm mov.u32 %r3010, 0; // end inline asm setp.ge.s32 %p87, %r24, %r23; @%p87 bra $L__BB0_20; ld.param.u8 %rs1, [%rd1+62]; add.s32 %r79, %r27, 49152; ld.param.v2.u32 {%r631, %r632}, [%rd1+72]; add.s32 %r633, %r632, %r3; ld.param.v2.u32 {%r634, %r635}, [%rd1+64]; mov.b32 %f589, %r635; setp.lt.s32 %p88, %r633, %r634; selp.b32 %r638, 2, 1, %p88; selp.b32 %r639, 0, %r634, %p88; sub.s32 %r640, %r633, %r639; shl.b32 %r641, %r640, 1; add.s32 %r642, %r641, %r638; cvt.rn.f32.s32 %f590, %r642; mul.ftz.f32 %f2, %f589, %f590; ld.param.u32 %r82, [%rd1+80]; add.s32 %r83, %r7, %r5; shr.u32 %r643, %r4, 31; add.s32 %r644, %r4, %r643; shl.b32 %r645, %r644, 6; and.b32 %r84, %r645, -128; ex2.approx.ftz.f32 %f1359, %f2; mov.u32 %r3059, %r3068; mov.u32 %r3060, %r24; mov.u64 %rd175, %rd172; $L__BB0_5: setp.le.u32 %p89, %r3060, %r3009; and.pred %p91, %p74, %p89; setp.ge.s32 %p92, %r3060, %r84; setp.ne.s16 %p93, %rs1, 0; or.pred %p94, %p92, %p93; // begin inline asm mov.u32 %r646, 0; // end inline asm // begin inline asm mov.u32 %r647, 0; // end inline asm // begin inline asm mov.u32 %r648, 0; // end inline asm // begin inline asm mov.u32 %r649, 0; // end inline asm // begin inline asm mov.u32 %r650, 0; // end inline asm // begin inline asm mov.u32 %r651, 0; // end inline asm // begin inline asm mov.u32 %r652, 0; // end inline asm // begin inline asm mov.u32 %r653, 0; // end inline asm // begin inline asm mov.u32 %r654, 0; // end inline asm // 
begin inline asm mov.u32 %r655, 0; // end inline asm // begin inline asm mov.u32 %r656, 0; // end inline asm // begin inline asm mov.u32 %r657, 0; // end inline asm // begin inline asm mov.u32 %r658, 0; // end inline asm // begin inline asm mov.u32 %r659, 0; // end inline asm // begin inline asm mov.u32 %r660, 0; // end inline asm // begin inline asm mov.u32 %r661, 0; // end inline asm // begin inline asm mov.u32 %r662, 0; // end inline asm // begin inline asm mov.u32 %r663, 0; // end inline asm // begin inline asm mov.u32 %r664, 0; // end inline asm // begin inline asm mov.u32 %r665, 0; // end inline asm // begin inline asm mov.u32 %r666, 0; // end inline asm // begin inline asm mov.u32 %r667, 0; // end inline asm // begin inline asm mov.u32 %r668, 0; // end inline asm // begin inline asm mov.u32 %r669, 0; // end inline asm // begin inline asm mov.u32 %r670, 0; // end inline asm // begin inline asm mov.u32 %r671, 0; // end inline asm // begin inline asm mov.u32 %r672, 0; // end inline asm // begin inline asm mov.u32 %r673, 0; // end inline asm // begin inline asm mov.u32 %r674, 0; // end inline asm // begin inline asm mov.u32 %r675, 0; // end inline asm // begin inline asm mov.u32 %r676, 0; // end inline asm // begin inline asm mov.u32 %r677, 0; // end inline asm // begin inline asm mov.u32 %r678, 0; // end inline asm // begin inline asm mov.u32 %r679, 0; // end inline asm // begin inline asm mov.u32 %r680, 0; // end inline asm // begin inline asm mov.u32 %r681, 0; // end inline asm // begin inline asm mov.u32 %r682, 0; // end inline asm // begin inline asm mov.u32 %r683, 0; // end inline asm // begin inline asm mov.u32 %r684, 0; // end inline asm // begin inline asm mov.u32 %r685, 0; // end inline asm // begin inline asm mov.u32 %r686, 0; // end inline asm // begin inline asm mov.u32 %r687, 0; // end inline asm // begin inline asm mov.u32 %r688, 0; // end inline asm // begin inline asm mov.u32 %r689, 0; // end inline asm // begin inline asm mov.u32 %r690, 0; // 
end inline asm // begin inline asm mov.u32 %r691, 0; // end inline asm // begin inline asm mov.u32 %r692, 0; // end inline asm // begin inline asm mov.u32 %r693, 0; // end inline asm // begin inline asm mov.u32 %r694, 0; // end inline asm // begin inline asm mov.u32 %r695, 0; // end inline asm // begin inline asm mov.u32 %r696, 0; // end inline asm // begin inline asm mov.u32 %r697, 0; // end inline asm // begin inline asm mov.u32 %r698, 0; // end inline asm // begin inline asm mov.u32 %r699, 0; // end inline asm // begin inline asm mov.u32 %r700, 0; // end inline asm // begin inline asm mov.u32 %r701, 0; // end inline asm // begin inline asm mov.u32 %r702, 0; // end inline asm // begin inline asm mov.u32 %r703, 0; // end inline asm // begin inline asm mov.u32 %r704, 0; // end inline asm // begin inline asm mov.u32 %r705, 0; // end inline asm // begin inline asm mov.u32 %r706, 0; // end inline asm // begin inline asm mov.u32 %r707, 0; // end inline asm // begin inline asm mov.u32 %r708, 0; // end inline asm // begin inline asm mov.u32 %r709, 0; // end inline asm setp.ne.s32 %p95, %r3060, %r24; or.pred %p1, %p91, %p94; @%p95 bra $L__BB0_7; setp.gt.s32 %p100, %r3067, 8191; selp.b32 %r731, -8192, 8192, %p100; setp.lt.s64 %p101, %rd175, 64; and.pred %p102, %p101, %p75; and.pred %p103, %p101, %p76; and.pred %p104, %p101, %p77; and.pred %p105, %p101, %p78; add.s32 %r3067, %r731, %r3067; add.s64 %rd176, %rd176, 128; add.s64 %rd70, %rd176, %rd67; add.s32 %r716, %r28, %r3067; add.s32 %r718, %r716, 2048; add.s32 %r720, %r716, 4096; add.s32 %r722, %r716, 6144; selp.b32 %r717, 16, 0, %p102; // begin inline asm cp.async.cg.shared.global [%r716], [%rd176], 16, %r717; // end inline asm selp.b32 %r719, 16, 0, %p103; // begin inline asm cp.async.cg.shared.global [%r718], [%rd70], 16, %r719; // end inline asm selp.b32 %r721, 16, 0, %p104; add.s64 %rd71, %rd70, %rd67; // begin inline asm cp.async.cg.shared.global [%r720], [%rd71], 16, %r721; // end inline asm selp.b32 %r723, 16, 0, 
%p105; add.s64 %rd72, %rd71, %rd67; // begin inline asm cp.async.cg.shared.global [%r722], [%rd72], 16, %r723; // end inline asm add.s64 %rd175, %rd175, 128; $L__BB0_7: setp.gt.s32 %p106, %r3063, 16383; selp.b32 %r1312, -16384, 16384, %p106; min.s32 %r1313, %r3059, 128; setp.lt.s32 %p107, %r9, %r1313; setp.lt.s64 %p108, %rd172, 64; and.pred %p109, %p107, %p108; setp.lt.s32 %p110, %r621, %r1313; and.pred %p111, %p110, %p108; setp.lt.s32 %p112, %r622, %r1313; and.pred %p113, %p112, %p108; setp.lt.s32 %p114, %r623, %r1313; and.pred %p115, %p114, %p108; setp.lt.s32 %p116, %r625, %r1313; and.pred %p117, %p116, %p108; setp.lt.s32 %p118, %r626, %r1313; and.pred %p119, %p118, %p108; setp.lt.s32 %p120, %r627, %r1313; and.pred %p121, %p120, %p108; setp.lt.s32 %p122, %r628, %r1313; and.pred %p123, %p122, %p108; shl.b64 %rd82, %rd6, 7; mul.lo.s64 %rd83, %rd6, -112; add.s64 %rd84, %rd82, %rd83; add.s64 %rd85, %rd171, %rd84; add.s64 %rd75, %rd85, 128; add.s32 %r3063, %r1312, %r3063; selp.b32 %r743, 16, 0, %p119; add.s32 %r732, %r30, %r3063; add.s32 %r734, %r732, 2048; add.s32 %r736, %r732, 4096; add.s32 %r738, %r732, 6144; add.s32 %r740, %r732, 8192; add.s32 %r742, %r732, 10240; add.s32 %r744, %r732, 12288; add.s32 %r746, %r732, 14336; selp.b32 %r733, 16, 0, %p109; add.s64 %rd171, %rd171, 128; // begin inline asm cp.async.cg.shared.global [%r732], [%rd171], 16, %r733; // end inline asm selp.b32 %r735, 16, 0, %p111; // begin inline asm cp.async.cg.shared.global [%r734], [%rd75], 16, %r735; // end inline asm selp.b32 %r737, 16, 0, %p113; add.s64 %rd76, %rd75, %rd68; // begin inline asm cp.async.cg.shared.global [%r736], [%rd76], 16, %r737; // end inline asm selp.b32 %r739, 16, 0, %p115; add.s64 %rd77, %rd76, %rd68; // begin inline asm cp.async.cg.shared.global [%r738], [%rd77], 16, %r739; // end inline asm selp.b32 %r741, 16, 0, %p117; add.s64 %rd78, %rd77, %rd68; // begin inline asm cp.async.cg.shared.global [%r740], [%rd78], 16, %r741; // end inline asm add.s64 %rd79, %rd78, 
%rd68; // begin inline asm cp.async.cg.shared.global [%r742], [%rd79], 16, %r743; // end inline asm selp.b32 %r745, 16, 0, %p121; add.s64 %rd80, %rd79, %rd68; // begin inline asm cp.async.cg.shared.global [%r744], [%rd80], 16, %r745; // end inline asm selp.b32 %r747, 16, 0, %p123; add.s64 %rd81, %rd80, %rd68; // begin inline asm cp.async.cg.shared.global [%r746], [%rd81], 16, %r747; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin inline asm cp.async.wait_group 1; // end inline asm bar.sync 0; and.b32 %r1322, %r475, 96; shr.u32 %r1323, %r1322, 1; and.b32 %r1324, %r475, 15; or.b32 %r1325, %r1323, %r1324; shl.b32 %r1326, %r1325, 7; and.b32 %r1327, %r475, 7; shl.b32 %r1328, %r475, 4; and.b32 %r1329, %r1328, 112; and.b32 %r1330, %r475, 16; xor.b32 %r1331, %r1329, %r1330; or.b32 %r1332, %r1326, %r1331; add.s32 %r1334, %r3062, %r619; add.s32 %r752, %r1334, %r1332; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r748, %r749, %r750, %r751}, [%r752]; // end inline asm shr.u32 %r1335, %r1330, 1; or.b32 %r1336, %r1335, %r1327; shl.b32 %r1337, %r1336, 7; and.b32 %r1338, %r475, 8; shr.u32 %r1339, %r1338, 3; xor.b32 %r1340, %r1339, %r1327; shl.b32 %r1341, %r1340, 4; or.b32 %r1342, %r1337, %r1341; add.s32 %r1343, %r3064, %r619; add.s32 %r1344, %r1343, 16384; add.s32 %r757, %r1344, %r1342; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r753, %r754, %r755, %r756}, [%r757]; // end inline asm add.s32 %r762, %r757, 2048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r758, %r759, %r760, %r761}, [%r762]; // end inline asm add.s32 %r767, %r757, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r763, %r764, %r765, %r766}, [%r767]; // end inline asm add.s32 %r772, %r757, 6144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r768, %r769, %r770, %r771}, [%r772]; // end inline asm add.s32 %r777, %r757, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 
{%r773, %r774, %r775, %r776}, [%r777]; // end inline asm add.s32 %r782, %r757, 10240; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r778, %r779, %r780, %r781}, [%r782]; // end inline asm add.s32 %r787, %r757, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r783, %r784, %r785, %r786}, [%r787]; // end inline asm add.s32 %r792, %r757, 14336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r788, %r789, %r790, %r791}, [%r792]; // end inline asm mov.b32 %f722, %r649; mov.b32 %f721, %r648; mov.b32 %f720, %r647; mov.b32 %f719, %r646; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f719, %f720, %f721, %f722}, {%r748, %r749, %r750, %r751}, {%r753, %r754}, {%f719, %f720, %f721, %f722}; // end inline asm mov.b32 %f730, %r653; mov.b32 %f729, %r652; mov.b32 %f728, %r651; mov.b32 %f727, %r650; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f727, %f728, %f729, %f730}, {%r748, %r749, %r750, %r751}, {%r755, %r756}, {%f727, %f728, %f729, %f730}; // end inline asm mov.b32 %f738, %r657; mov.b32 %f737, %r656; mov.b32 %f736, %r655; mov.b32 %f735, %r654; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f735, %f736, %f737, %f738}, {%r748, %r749, %r750, %r751}, {%r758, %r759}, {%f735, %f736, %f737, %f738}; // end inline asm mov.b32 %f746, %r661; mov.b32 %f745, %r660; mov.b32 %f744, %r659; mov.b32 %f743, %r658; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f743, %f744, %f745, %f746}, {%r748, %r749, %r750, %r751}, {%r760, %r761}, {%f743, %f744, %f745, %f746}; // end inline asm mov.b32 %f754, %r665; mov.b32 %f753, %r664; mov.b32 %f752, %r663; mov.b32 %f751, %r662; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f751, %f752, %f753, %f754}, {%r748, %r749, %r750, %r751}, {%r763, %r764}, {%f751, %f752, %f753, %f754}; // end inline asm mov.b32 %f762, %r669; mov.b32 %f761, %r668; mov.b32 %f760, %r667; mov.b32 %f759, %r666; // 
begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f759, %f760, %f761, %f762}, {%r748, %r749, %r750, %r751}, {%r765, %r766}, {%f759, %f760, %f761, %f762}; // end inline asm mov.b32 %f770, %r673; mov.b32 %f769, %r672; mov.b32 %f768, %r671; mov.b32 %f767, %r670; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f767, %f768, %f769, %f770}, {%r748, %r749, %r750, %r751}, {%r768, %r769}, {%f767, %f768, %f769, %f770}; // end inline asm mov.b32 %f778, %r677; mov.b32 %f777, %r676; mov.b32 %f776, %r675; mov.b32 %f775, %r674; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f775, %f776, %f777, %f778}, {%r748, %r749, %r750, %r751}, {%r770, %r771}, {%f775, %f776, %f777, %f778}; // end inline asm mov.b32 %f786, %r681; mov.b32 %f785, %r680; mov.b32 %f784, %r679; mov.b32 %f783, %r678; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f783, %f784, %f785, %f786}, {%r748, %r749, %r750, %r751}, {%r773, %r774}, {%f783, %f784, %f785, %f786}; // end inline asm mov.b32 %f794, %r685; mov.b32 %f793, %r684; mov.b32 %f792, %r683; mov.b32 %f791, %r682; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f791, %f792, %f793, %f794}, {%r748, %r749, %r750, %r751}, {%r775, %r776}, {%f791, %f792, %f793, %f794}; // end inline asm mov.b32 %f802, %r689; mov.b32 %f801, %r688; mov.b32 %f800, %r687; mov.b32 %f799, %r686; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f799, %f800, %f801, %f802}, {%r748, %r749, %r750, %r751}, {%r778, %r779}, {%f799, %f800, %f801, %f802}; // end inline asm mov.b32 %f810, %r693; mov.b32 %f809, %r692; mov.b32 %f808, %r691; mov.b32 %f807, %r690; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f807, %f808, %f809, %f810}, {%r748, %r749, %r750, %r751}, {%r780, %r781}, {%f807, %f808, %f809, %f810}; // end inline asm mov.b32 %f818, %r697; mov.b32 %f817, %r696; mov.b32 %f816, %r695; mov.b32 %f815, %r694; // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f815, %f816, %f817, %f818}, {%r748, %r749, %r750, %r751}, {%r783, %r784}, {%f815, %f816, %f817, %f818}; // end inline asm mov.b32 %f826, %r701; mov.b32 %f825, %r700; mov.b32 %f824, %r699; mov.b32 %f823, %r698; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f823, %f824, %f825, %f826}, {%r748, %r749, %r750, %r751}, {%r785, %r786}, {%f823, %f824, %f825, %f826}; // end inline asm mov.b32 %f834, %r705; mov.b32 %f833, %r704; mov.b32 %f832, %r703; mov.b32 %f831, %r702; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f831, %f832, %f833, %f834}, {%r748, %r749, %r750, %r751}, {%r788, %r789}, {%f831, %f832, %f833, %f834}; // end inline asm mov.b32 %f842, %r709; mov.b32 %f841, %r708; mov.b32 %f840, %r707; mov.b32 %f839, %r706; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f839, %f840, %f841, %f842}, {%r748, %r749, %r750, %r751}, {%r790, %r791}, {%f839, %f840, %f841, %f842}; // end inline asm xor.b32 %r1345, %r1332, 32; add.s32 %r893, %r1334, %r1345; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r889, %r890, %r891, %r892}, [%r893]; // end inline asm xor.b32 %r1346, %r1342, 32; add.s32 %r898, %r1344, %r1346; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r894, %r895, %r896, %r897}, [%r898]; // end inline asm add.s32 %r903, %r898, 2048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r899, %r900, %r901, %r902}, [%r903]; // end inline asm add.s32 %r908, %r898, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r904, %r905, %r906, %r907}, [%r908]; // end inline asm add.s32 %r913, %r898, 6144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r909, %r910, %r911, %r912}, [%r913]; // end inline asm add.s32 %r918, %r898, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r914, %r915, %r916, %r917}, [%r918]; // end inline asm add.s32 %r923, %r898, 10240; // begin 
inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r919, %r920, %r921, %r922}, [%r923]; // end inline asm add.s32 %r928, %r898, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r924, %r925, %r926, %r927}, [%r928]; // end inline asm add.s32 %r933, %r898, 14336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r929, %r930, %r931, %r932}, [%r933]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f719, %f720, %f721, %f722}, {%r889, %r890, %r891, %r892}, {%r894, %r895}, {%f719, %f720, %f721, %f722}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f727, %f728, %f729, %f730}, {%r889, %r890, %r891, %r892}, {%r896, %r897}, {%f727, %f728, %f729, %f730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f735, %f736, %f737, %f738}, {%r889, %r890, %r891, %r892}, {%r899, %r900}, {%f735, %f736, %f737, %f738}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f743, %f744, %f745, %f746}, {%r889, %r890, %r891, %r892}, {%r901, %r902}, {%f743, %f744, %f745, %f746}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f751, %f752, %f753, %f754}, {%r889, %r890, %r891, %r892}, {%r904, %r905}, {%f751, %f752, %f753, %f754}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f759, %f760, %f761, %f762}, {%r889, %r890, %r891, %r892}, {%r906, %r907}, {%f759, %f760, %f761, %f762}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f767, %f768, %f769, %f770}, {%r889, %r890, %r891, %r892}, {%r909, %r910}, {%f767, %f768, %f769, %f770}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f775, %f776, %f777, %f778}, {%r889, %r890, %r891, %r892}, {%r911, %r912}, {%f775, %f776, %f777, %f778}; // end inline asm // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f783, %f784, %f785, %f786}, {%r889, %r890, %r891, %r892}, {%r914, %r915}, {%f783, %f784, %f785, %f786}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f791, %f792, %f793, %f794}, {%r889, %r890, %r891, %r892}, {%r916, %r917}, {%f791, %f792, %f793, %f794}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f799, %f800, %f801, %f802}, {%r889, %r890, %r891, %r892}, {%r919, %r920}, {%f799, %f800, %f801, %f802}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f807, %f808, %f809, %f810}, {%r889, %r890, %r891, %r892}, {%r921, %r922}, {%f807, %f808, %f809, %f810}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f815, %f816, %f817, %f818}, {%r889, %r890, %r891, %r892}, {%r924, %r925}, {%f815, %f816, %f817, %f818}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f823, %f824, %f825, %f826}, {%r889, %r890, %r891, %r892}, {%r926, %r927}, {%f823, %f824, %f825, %f826}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f831, %f832, %f833, %f834}, {%r889, %r890, %r891, %r892}, {%r929, %r930}, {%f831, %f832, %f833, %f834}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f839, %f840, %f841, %f842}, {%r889, %r890, %r891, %r892}, {%r931, %r932}, {%f839, %f840, %f841, %f842}; // end inline asm xor.b32 %r1347, %r1332, 64; add.s32 %r1034, %r1334, %r1347; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1030, %r1031, %r1032, %r1033}, [%r1034]; // end inline asm xor.b32 %r1348, %r1342, 64; add.s32 %r1039, %r1344, %r1348; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1035, %r1036, %r1037, %r1038}, [%r1039]; // end inline asm add.s32 %r1044, %r1039, 2048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1040, 
%r1041, %r1042, %r1043}, [%r1044]; // end inline asm add.s32 %r1049, %r1039, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1045, %r1046, %r1047, %r1048}, [%r1049]; // end inline asm add.s32 %r1054, %r1039, 6144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1050, %r1051, %r1052, %r1053}, [%r1054]; // end inline asm add.s32 %r1059, %r1039, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1055, %r1056, %r1057, %r1058}, [%r1059]; // end inline asm add.s32 %r1064, %r1039, 10240; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1060, %r1061, %r1062, %r1063}, [%r1064]; // end inline asm add.s32 %r1069, %r1039, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1065, %r1066, %r1067, %r1068}, [%r1069]; // end inline asm add.s32 %r1074, %r1039, 14336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1070, %r1071, %r1072, %r1073}, [%r1074]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f719, %f720, %f721, %f722}, {%r1030, %r1031, %r1032, %r1033}, {%r1035, %r1036}, {%f719, %f720, %f721, %f722}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f727, %f728, %f729, %f730}, {%r1030, %r1031, %r1032, %r1033}, {%r1037, %r1038}, {%f727, %f728, %f729, %f730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f735, %f736, %f737, %f738}, {%r1030, %r1031, %r1032, %r1033}, {%r1040, %r1041}, {%f735, %f736, %f737, %f738}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f743, %f744, %f745, %f746}, {%r1030, %r1031, %r1032, %r1033}, {%r1042, %r1043}, {%f743, %f744, %f745, %f746}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f751, %f752, %f753, %f754}, {%r1030, %r1031, %r1032, %r1033}, {%r1045, %r1046}, {%f751, %f752, %f753, %f754}; // end inline asm // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f759, %f760, %f761, %f762}, {%r1030, %r1031, %r1032, %r1033}, {%r1047, %r1048}, {%f759, %f760, %f761, %f762}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f767, %f768, %f769, %f770}, {%r1030, %r1031, %r1032, %r1033}, {%r1050, %r1051}, {%f767, %f768, %f769, %f770}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f775, %f776, %f777, %f778}, {%r1030, %r1031, %r1032, %r1033}, {%r1052, %r1053}, {%f775, %f776, %f777, %f778}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f783, %f784, %f785, %f786}, {%r1030, %r1031, %r1032, %r1033}, {%r1055, %r1056}, {%f783, %f784, %f785, %f786}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f791, %f792, %f793, %f794}, {%r1030, %r1031, %r1032, %r1033}, {%r1057, %r1058}, {%f791, %f792, %f793, %f794}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f799, %f800, %f801, %f802}, {%r1030, %r1031, %r1032, %r1033}, {%r1060, %r1061}, {%f799, %f800, %f801, %f802}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f807, %f808, %f809, %f810}, {%r1030, %r1031, %r1032, %r1033}, {%r1062, %r1063}, {%f807, %f808, %f809, %f810}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f815, %f816, %f817, %f818}, {%r1030, %r1031, %r1032, %r1033}, {%r1065, %r1066}, {%f815, %f816, %f817, %f818}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f823, %f824, %f825, %f826}, {%r1030, %r1031, %r1032, %r1033}, {%r1067, %r1068}, {%f823, %f824, %f825, %f826}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f831, %f832, %f833, %f834}, {%r1030, %r1031, %r1032, %r1033}, {%r1070, %r1071}, {%f831, %f832, %f833, %f834}; // end inline asm // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f839, %f840, %f841, %f842}, {%r1030, %r1031, %r1032, %r1033}, {%r1072, %r1073}, {%f839, %f840, %f841, %f842}; // end inline asm xor.b32 %r1349, %r1332, 96; add.s32 %r1175, %r1334, %r1349; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1171, %r1172, %r1173, %r1174}, [%r1175]; // end inline asm xor.b32 %r1350, %r1342, 96; add.s32 %r1180, %r1344, %r1350; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1176, %r1177, %r1178, %r1179}, [%r1180]; // end inline asm add.s32 %r1185, %r1180, 2048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1181, %r1182, %r1183, %r1184}, [%r1185]; // end inline asm add.s32 %r1190, %r1180, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1186, %r1187, %r1188, %r1189}, [%r1190]; // end inline asm add.s32 %r1195, %r1180, 6144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1191, %r1192, %r1193, %r1194}, [%r1195]; // end inline asm add.s32 %r1200, %r1180, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1196, %r1197, %r1198, %r1199}, [%r1200]; // end inline asm add.s32 %r1205, %r1180, 10240; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1201, %r1202, %r1203, %r1204}, [%r1205]; // end inline asm add.s32 %r1210, %r1180, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1206, %r1207, %r1208, %r1209}, [%r1210]; // end inline asm add.s32 %r1215, %r1180, 14336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1211, %r1212, %r1213, %r1214}, [%r1215]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f719, %f720, %f721, %f722}, {%r1171, %r1172, %r1173, %r1174}, {%r1176, %r1177}, {%f719, %f720, %f721, %f722}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f727, %f728, %f729, %f730}, {%r1171, %r1172, %r1173, %r1174}, {%r1178, %r1179}, {%f727, %f728, %f729, 
%f730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f735, %f736, %f737, %f738}, {%r1171, %r1172, %r1173, %r1174}, {%r1181, %r1182}, {%f735, %f736, %f737, %f738}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f743, %f744, %f745, %f746}, {%r1171, %r1172, %r1173, %r1174}, {%r1183, %r1184}, {%f743, %f744, %f745, %f746}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f751, %f752, %f753, %f754}, {%r1171, %r1172, %r1173, %r1174}, {%r1186, %r1187}, {%f751, %f752, %f753, %f754}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f759, %f760, %f761, %f762}, {%r1171, %r1172, %r1173, %r1174}, {%r1188, %r1189}, {%f759, %f760, %f761, %f762}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f767, %f768, %f769, %f770}, {%r1171, %r1172, %r1173, %r1174}, {%r1191, %r1192}, {%f767, %f768, %f769, %f770}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f775, %f776, %f777, %f778}, {%r1171, %r1172, %r1173, %r1174}, {%r1193, %r1194}, {%f775, %f776, %f777, %f778}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f783, %f784, %f785, %f786}, {%r1171, %r1172, %r1173, %r1174}, {%r1196, %r1197}, {%f783, %f784, %f785, %f786}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f791, %f792, %f793, %f794}, {%r1171, %r1172, %r1173, %r1174}, {%r1198, %r1199}, {%f791, %f792, %f793, %f794}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f799, %f800, %f801, %f802}, {%r1171, %r1172, %r1173, %r1174}, {%r1201, %r1202}, {%f799, %f800, %f801, %f802}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f807, %f808, %f809, %f810}, {%r1171, %r1172, %r1173, %r1174}, {%r1203, %r1204}, {%f807, %f808, %f809, 
%f810}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f815, %f816, %f817, %f818}, {%r1171, %r1172, %r1173, %r1174}, {%r1206, %r1207}, {%f815, %f816, %f817, %f818}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f823, %f824, %f825, %f826}, {%r1171, %r1172, %r1173, %r1174}, {%r1208, %r1209}, {%f823, %f824, %f825, %f826}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f831, %f832, %f833, %f834}, {%r1171, %r1172, %r1173, %r1174}, {%r1211, %r1212}, {%f831, %f832, %f833, %f834}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f839, %f840, %f841, %f842}, {%r1171, %r1172, %r1173, %r1174}, {%r1213, %r1214}, {%f839, %f840, %f841, %f842}; // end inline asm bar.sync 0; selp.b32 %r1356, %r543, 0, %p74; setp.le.u32 %p125, %r3060, %r1356; @%p125 bra $L__BB0_9; shl.b64 %rd87, %rd10, 6; add.s64 %rd177, %rd177, %rd87; add.s32 %r3068, %r3068, -64; setp.gt.s32 %p126, %r3069, 16383; selp.b32 %r1357, -16384, 16384, %p126; add.s32 %r3069, %r1357, %r3069; $L__BB0_9: setp.gt.s32 %p127, %r3062, 8191; selp.b32 %r1746, -8192, 8192, %p127; add.s32 %r213, %r1746, %r3062; setp.gt.s32 %p128, %r3064, 16383; selp.b32 %r1747, -16384, 16384, %p128; add.s32 %r214, %r1747, %r3064; min.s32 %r1748, %r3068, 64; setp.lt.s32 %p129, %r17, %r1748; setp.lt.s32 %p130, %r16, 12; and.pred %p131, %p129, %p130; add.s32 %r1749, %r17, 8; setp.lt.s32 %p132, %r1749, %r1748; and.pred %p133, %p132, %p130; add.s32 %r1750, %r17, 16; setp.lt.s32 %p134, %r1750, %r1748; and.pred %p135, %p134, %p130; add.s32 %r1751, %r17, 24; setp.lt.s32 %p136, %r1751, %r1748; and.pred %p137, %p136, %p130; add.s32 %r1752, %r17, 32; setp.lt.s32 %p138, %r1752, %r1748; and.pred %p139, %p138, %p130; add.s32 %r1753, %r17, 40; setp.lt.s32 %p140, %r1753, %r1748; and.pred %p141, %p140, %p130; add.s32 %r1754, %r17, 48; setp.lt.s32 %p142, %r1754, %r1748; and.pred %p143, %p142, %p130; 
add.s32 %r1755, %r17, 56; setp.lt.s32 %p144, %r1755, %r1748; and.pred %p145, %p144, %p130; shl.b64 %rd96, %rd10, 3; add.s64 %rd89, %rd177, %rd96; selp.b32 %r1369, 16, 0, %p141; add.s32 %r1358, %r79, %r3069; add.s32 %r1360, %r1358, 2048; add.s32 %r1362, %r1358, 4096; add.s32 %r1364, %r1358, 6144; add.s32 %r1366, %r1358, 8192; add.s32 %r1368, %r1358, 10240; add.s32 %r1370, %r1358, 12288; add.s32 %r1372, %r1358, 14336; selp.b32 %r1359, 16, 0, %p131; // begin inline asm cp.async.cg.shared.global [%r1358], [%rd177], 16, %r1359; // end inline asm selp.b32 %r1361, 16, 0, %p133; // begin inline asm cp.async.cg.shared.global [%r1360], [%rd89], 16, %r1361; // end inline asm selp.b32 %r1363, 16, 0, %p135; add.s64 %rd90, %rd89, %rd96; // begin inline asm cp.async.cg.shared.global [%r1362], [%rd90], 16, %r1363; // end inline asm selp.b32 %r1365, 16, 0, %p137; add.s64 %rd91, %rd90, %rd96; // begin inline asm cp.async.cg.shared.global [%r1364], [%rd91], 16, %r1365; // end inline asm selp.b32 %r1367, 16, 0, %p139; add.s64 %rd92, %rd91, %rd96; // begin inline asm cp.async.cg.shared.global [%r1366], [%rd92], 16, %r1367; // end inline asm add.s64 %rd93, %rd92, %rd96; // begin inline asm cp.async.cg.shared.global [%r1368], [%rd93], 16, %r1369; // end inline asm selp.b32 %r1371, 16, 0, %p143; add.s64 %rd94, %rd93, %rd96; // begin inline asm cp.async.cg.shared.global [%r1370], [%rd94], 16, %r1371; // end inline asm selp.b32 %r1373, 16, 0, %p145; add.s64 %rd95, %rd94, %rd96; // begin inline asm cp.async.cg.shared.global [%r1372], [%rd95], 16, %r1373; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin inline asm cp.async.wait_group 1; // end inline asm bar.sync 0; add.s32 %r1769, %r213, %r619; add.s32 %r1378, %r1769, %r1332; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1374, %r1375, %r1376, %r1377}, [%r1378]; // end inline asm add.s32 %r1778, %r214, %r619; add.s32 %r1779, %r1778, 16384; add.s32 %r1383, %r1779, %r1342; // begin inline 
asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1379, %r1380, %r1381, %r1382}, [%r1383]; // end inline asm add.s32 %r1388, %r1383, 2048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1384, %r1385, %r1386, %r1387}, [%r1388]; // end inline asm add.s32 %r1393, %r1383, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1389, %r1390, %r1391, %r1392}, [%r1393]; // end inline asm add.s32 %r1398, %r1383, 6144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1394, %r1395, %r1396, %r1397}, [%r1398]; // end inline asm add.s32 %r1403, %r1383, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1399, %r1400, %r1401, %r1402}, [%r1403]; // end inline asm add.s32 %r1408, %r1383, 10240; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1404, %r1405, %r1406, %r1407}, [%r1408]; // end inline asm add.s32 %r1413, %r1383, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1409, %r1410, %r1411, %r1412}, [%r1413]; // end inline asm add.s32 %r1418, %r1383, 14336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1414, %r1415, %r1416, %r1417}, [%r1418]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f719, %f720, %f721, %f722}, {%r1374, %r1375, %r1376, %r1377}, {%r1379, %r1380}, {%f719, %f720, %f721, %f722}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f727, %f728, %f729, %f730}, {%r1374, %r1375, %r1376, %r1377}, {%r1381, %r1382}, {%f727, %f728, %f729, %f730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f735, %f736, %f737, %f738}, {%r1374, %r1375, %r1376, %r1377}, {%r1384, %r1385}, {%f735, %f736, %f737, %f738}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f743, %f744, %f745, %f746}, {%r1374, %r1375, %r1376, %r1377}, {%r1386, %r1387}, {%f743, %f744, %f745, %f746}; // end inline asm // begin inline 
asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f751, %f752, %f753, %f754}, {%r1374, %r1375, %r1376, %r1377}, {%r1389, %r1390}, {%f751, %f752, %f753, %f754}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f759, %f760, %f761, %f762}, {%r1374, %r1375, %r1376, %r1377}, {%r1391, %r1392}, {%f759, %f760, %f761, %f762}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f767, %f768, %f769, %f770}, {%r1374, %r1375, %r1376, %r1377}, {%r1394, %r1395}, {%f767, %f768, %f769, %f770}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f775, %f776, %f777, %f778}, {%r1374, %r1375, %r1376, %r1377}, {%r1396, %r1397}, {%f775, %f776, %f777, %f778}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f783, %f784, %f785, %f786}, {%r1374, %r1375, %r1376, %r1377}, {%r1399, %r1400}, {%f783, %f784, %f785, %f786}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f791, %f792, %f793, %f794}, {%r1374, %r1375, %r1376, %r1377}, {%r1401, %r1402}, {%f791, %f792, %f793, %f794}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f799, %f800, %f801, %f802}, {%r1374, %r1375, %r1376, %r1377}, {%r1404, %r1405}, {%f799, %f800, %f801, %f802}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f807, %f808, %f809, %f810}, {%r1374, %r1375, %r1376, %r1377}, {%r1406, %r1407}, {%f807, %f808, %f809, %f810}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f815, %f816, %f817, %f818}, {%r1374, %r1375, %r1376, %r1377}, {%r1409, %r1410}, {%f815, %f816, %f817, %f818}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f823, %f824, %f825, %f826}, {%r1374, %r1375, %r1376, %r1377}, {%r1411, %r1412}, {%f823, %f824, %f825, %f826}; // end inline asm // begin inline 
asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f831, %f832, %f833, %f834}, {%r1374, %r1375, %r1376, %r1377}, {%r1414, %r1415}, {%f831, %f832, %f833, %f834}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f839, %f840, %f841, %f842}, {%r1374, %r1375, %r1376, %r1377}, {%r1416, %r1417}, {%f839, %f840, %f841, %f842}; // end inline asm add.s32 %r1519, %r1769, %r1345; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1515, %r1516, %r1517, %r1518}, [%r1519]; // end inline asm add.s32 %r1524, %r1779, %r1346; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1520, %r1521, %r1522, %r1523}, [%r1524]; // end inline asm add.s32 %r1529, %r1524, 2048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1525, %r1526, %r1527, %r1528}, [%r1529]; // end inline asm add.s32 %r1534, %r1524, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1530, %r1531, %r1532, %r1533}, [%r1534]; // end inline asm add.s32 %r1539, %r1524, 6144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1535, %r1536, %r1537, %r1538}, [%r1539]; // end inline asm add.s32 %r1544, %r1524, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1540, %r1541, %r1542, %r1543}, [%r1544]; // end inline asm add.s32 %r1549, %r1524, 10240; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1545, %r1546, %r1547, %r1548}, [%r1549]; // end inline asm add.s32 %r1554, %r1524, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1550, %r1551, %r1552, %r1553}, [%r1554]; // end inline asm add.s32 %r1559, %r1524, 14336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1555, %r1556, %r1557, %r1558}, [%r1559]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f719, %f720, %f721, %f722}, {%r1515, %r1516, %r1517, %r1518}, {%r1520, %r1521}, {%f719, %f720, %f721, %f722}; // end inline asm // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f727, %f728, %f729, %f730}, {%r1515, %r1516, %r1517, %r1518}, {%r1522, %r1523}, {%f727, %f728, %f729, %f730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f735, %f736, %f737, %f738}, {%r1515, %r1516, %r1517, %r1518}, {%r1525, %r1526}, {%f735, %f736, %f737, %f738}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f743, %f744, %f745, %f746}, {%r1515, %r1516, %r1517, %r1518}, {%r1527, %r1528}, {%f743, %f744, %f745, %f746}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f751, %f752, %f753, %f754}, {%r1515, %r1516, %r1517, %r1518}, {%r1530, %r1531}, {%f751, %f752, %f753, %f754}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f759, %f760, %f761, %f762}, {%r1515, %r1516, %r1517, %r1518}, {%r1532, %r1533}, {%f759, %f760, %f761, %f762}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f767, %f768, %f769, %f770}, {%r1515, %r1516, %r1517, %r1518}, {%r1535, %r1536}, {%f767, %f768, %f769, %f770}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f775, %f776, %f777, %f778}, {%r1515, %r1516, %r1517, %r1518}, {%r1537, %r1538}, {%f775, %f776, %f777, %f778}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f783, %f784, %f785, %f786}, {%r1515, %r1516, %r1517, %r1518}, {%r1540, %r1541}, {%f783, %f784, %f785, %f786}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f791, %f792, %f793, %f794}, {%r1515, %r1516, %r1517, %r1518}, {%r1542, %r1543}, {%f791, %f792, %f793, %f794}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f799, %f800, %f801, %f802}, {%r1515, %r1516, %r1517, %r1518}, {%r1545, %r1546}, {%f799, %f800, %f801, %f802}; // end inline asm // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f807, %f808, %f809, %f810}, {%r1515, %r1516, %r1517, %r1518}, {%r1547, %r1548}, {%f807, %f808, %f809, %f810}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f815, %f816, %f817, %f818}, {%r1515, %r1516, %r1517, %r1518}, {%r1550, %r1551}, {%f815, %f816, %f817, %f818}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f823, %f824, %f825, %f826}, {%r1515, %r1516, %r1517, %r1518}, {%r1552, %r1553}, {%f823, %f824, %f825, %f826}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f831, %f832, %f833, %f834}, {%r1515, %r1516, %r1517, %r1518}, {%r1555, %r1556}, {%f831, %f832, %f833, %f834}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f839, %f840, %f841, %f842}, {%r1515, %r1516, %r1517, %r1518}, {%r1557, %r1558}, {%f839, %f840, %f841, %f842}; // end inline asm add.s32 %r1660, %r1769, %r1347; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1656, %r1657, %r1658, %r1659}, [%r1660]; // end inline asm add.s32 %r1665, %r1779, %r1348; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1661, %r1662, %r1663, %r1664}, [%r1665]; // end inline asm add.s32 %r1670, %r1665, 2048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1666, %r1667, %r1668, %r1669}, [%r1670]; // end inline asm add.s32 %r1675, %r1665, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1671, %r1672, %r1673, %r1674}, [%r1675]; // end inline asm add.s32 %r1680, %r1665, 6144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1676, %r1677, %r1678, %r1679}, [%r1680]; // end inline asm add.s32 %r1685, %r1665, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1681, %r1682, %r1683, %r1684}, [%r1685]; // end inline asm add.s32 %r1690, %r1665, 10240; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 
{%r1686, %r1687, %r1688, %r1689}, [%r1690]; // end inline asm add.s32 %r1695, %r1665, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1691, %r1692, %r1693, %r1694}, [%r1695]; // end inline asm add.s32 %r1700, %r1665, 14336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1696, %r1697, %r1698, %r1699}, [%r1700]; // end inline asm add.s32 %r1705, %r1769, %r1349; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1701, %r1702, %r1703, %r1704}, [%r1705]; // end inline asm add.s32 %r1710, %r1779, %r1350; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1706, %r1707, %r1708, %r1709}, [%r1710]; // end inline asm add.s32 %r1715, %r1710, 2048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1711, %r1712, %r1713, %r1714}, [%r1715]; // end inline asm add.s32 %r1720, %r1710, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1716, %r1717, %r1718, %r1719}, [%r1720]; // end inline asm add.s32 %r1725, %r1710, 6144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1721, %r1722, %r1723, %r1724}, [%r1725]; // end inline asm add.s32 %r1730, %r1710, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1726, %r1727, %r1728, %r1729}, [%r1730]; // end inline asm add.s32 %r1735, %r1710, 10240; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1731, %r1732, %r1733, %r1734}, [%r1735]; // end inline asm add.s32 %r1740, %r1710, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1736, %r1737, %r1738, %r1739}, [%r1740]; // end inline asm add.s32 %r1745, %r1710, 14336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1741, %r1742, %r1743, %r1744}, [%r1745]; // end inline asm mul.ftz.f32 %f3069, %f1, %f719; mul.ftz.f32 %f3068, %f1, %f720; mul.ftz.f32 %f3067, %f1, %f727; mul.ftz.f32 %f3066, %f1, %f728; mul.ftz.f32 %f3037, %f1, %f721; mul.ftz.f32 %f3036, %f1, %f722; mul.ftz.f32 %f3035, %f1, %f729; mul.ftz.f32 %f3034, 
%f1, %f730; mul.ftz.f32 %f3065, %f1, %f735; mul.ftz.f32 %f3064, %f1, %f736; mul.ftz.f32 %f3063, %f1, %f743; mul.ftz.f32 %f3062, %f1, %f744; mul.ftz.f32 %f3033, %f1, %f737; mul.ftz.f32 %f3032, %f1, %f738; mul.ftz.f32 %f3031, %f1, %f745; mul.ftz.f32 %f3030, %f1, %f746; mul.ftz.f32 %f3061, %f1, %f751; mul.ftz.f32 %f3060, %f1, %f752; mul.ftz.f32 %f3059, %f1, %f759; mul.ftz.f32 %f3058, %f1, %f760; mul.ftz.f32 %f3029, %f1, %f753; mul.ftz.f32 %f3028, %f1, %f754; mul.ftz.f32 %f3027, %f1, %f761; mul.ftz.f32 %f3026, %f1, %f762; mul.ftz.f32 %f3057, %f1, %f767; mul.ftz.f32 %f3056, %f1, %f768; mul.ftz.f32 %f3055, %f1, %f775; mul.ftz.f32 %f3054, %f1, %f776; mul.ftz.f32 %f3025, %f1, %f769; mul.ftz.f32 %f3024, %f1, %f770; mul.ftz.f32 %f3023, %f1, %f777; mul.ftz.f32 %f3022, %f1, %f778; mul.ftz.f32 %f3053, %f1, %f783; mul.ftz.f32 %f3052, %f1, %f784; mul.ftz.f32 %f3051, %f1, %f791; mul.ftz.f32 %f3050, %f1, %f792; mul.ftz.f32 %f3021, %f1, %f785; mul.ftz.f32 %f3020, %f1, %f786; mul.ftz.f32 %f3019, %f1, %f793; mul.ftz.f32 %f3018, %f1, %f794; mul.ftz.f32 %f3049, %f1, %f799; mul.ftz.f32 %f3048, %f1, %f800; mul.ftz.f32 %f3047, %f1, %f807; mul.ftz.f32 %f3046, %f1, %f808; mul.ftz.f32 %f3017, %f1, %f801; mul.ftz.f32 %f3016, %f1, %f802; mul.ftz.f32 %f3015, %f1, %f809; mul.ftz.f32 %f3014, %f1, %f810; mul.ftz.f32 %f3045, %f1, %f815; mul.ftz.f32 %f3044, %f1, %f816; mul.ftz.f32 %f3043, %f1, %f823; mul.ftz.f32 %f3042, %f1, %f824; mul.ftz.f32 %f3013, %f1, %f817; mul.ftz.f32 %f3012, %f1, %f818; mul.ftz.f32 %f3011, %f1, %f825; mul.ftz.f32 %f3010, %f1, %f826; mul.ftz.f32 %f3041, %f1, %f831; mul.ftz.f32 %f3040, %f1, %f832; mul.ftz.f32 %f3039, %f1, %f839; mul.ftz.f32 %f3038, %f1, %f840; mul.ftz.f32 %f3009, %f1, %f833; mul.ftz.f32 %f3008, %f1, %f834; mul.ftz.f32 %f3007, %f1, %f841; mul.ftz.f32 %f3006, %f1, %f842; not.pred %p146, %p1; @%p146 bra $L__BB0_13; setp.eq.s16 %p147, %rs1, 0; add.s32 %r215, %r6, %r3060; setp.lt.s32 %p148, %r83, %r215; sub.s32 %r1786, %r83, %r8; max.s32 %r1787, %r1786, 0; 
setp.gt.s32 %p149, %r1787, %r215; or.pred %p2, %p148, %p149; setp.le.s32 %p150, %r83, %r215; add.s32 %r1788, %r215, 1; setp.gt.s32 %p151, %r1787, %r1788; or.pred %p3, %p150, %p151; add.s32 %r1789, %r215, 8; setp.lt.s32 %p152, %r83, %r1789; setp.gt.s32 %p153, %r1787, %r1789; or.pred %p4, %p152, %p153; add.s32 %r1790, %r215, 9; setp.lt.s32 %p154, %r83, %r1790; setp.gt.s32 %p155, %r1787, %r1790; or.pred %p5, %p154, %p155; add.s32 %r1791, %r215, 16; setp.lt.s32 %p156, %r83, %r1791; setp.gt.s32 %p157, %r1787, %r1791; or.pred %p6, %p156, %p157; add.s32 %r1792, %r215, 17; setp.lt.s32 %p158, %r83, %r1792; setp.gt.s32 %p159, %r1787, %r1792; or.pred %p7, %p158, %p159; add.s32 %r1793, %r215, 24; setp.lt.s32 %p160, %r83, %r1793; setp.gt.s32 %p161, %r1787, %r1793; or.pred %p8, %p160, %p161; add.s32 %r1794, %r215, 25; setp.lt.s32 %p162, %r83, %r1794; setp.gt.s32 %p163, %r1787, %r1794; or.pred %p9, %p162, %p163; add.s32 %r1795, %r215, 32; setp.lt.s32 %p164, %r83, %r1795; setp.gt.s32 %p165, %r1787, %r1795; or.pred %p10, %p164, %p165; add.s32 %r1796, %r215, 33; setp.lt.s32 %p166, %r83, %r1796; setp.gt.s32 %p167, %r1787, %r1796; or.pred %p11, %p166, %p167; add.s32 %r1797, %r215, 40; setp.lt.s32 %p168, %r83, %r1797; setp.gt.s32 %p169, %r1787, %r1797; or.pred %p12, %p168, %p169; add.s32 %r1798, %r215, 41; setp.lt.s32 %p170, %r83, %r1798; setp.gt.s32 %p171, %r1787, %r1798; or.pred %p13, %p170, %p171; add.s32 %r1799, %r215, 48; setp.lt.s32 %p172, %r83, %r1799; setp.gt.s32 %p173, %r1787, %r1799; or.pred %p14, %p172, %p173; add.s32 %r1800, %r215, 49; setp.lt.s32 %p174, %r83, %r1800; setp.gt.s32 %p175, %r1787, %r1800; or.pred %p15, %p174, %p175; add.s32 %r1801, %r215, 56; setp.lt.s32 %p176, %r83, %r1801; setp.gt.s32 %p177, %r1787, %r1801; or.pred %p16, %p176, %p177; add.s32 %r1802, %r215, 57; setp.lt.s32 %p178, %r83, %r1802; setp.gt.s32 %p179, %r1787, %r1802; or.pred %p17, %p178, %p179; add.s32 %r1803, %r215, 64; setp.lt.s32 %p180, %r83, %r1803; setp.gt.s32 %p181, %r1787, %r1803; or.pred 
%p18, %p180, %p181; add.s32 %r1804, %r215, 65; setp.lt.s32 %p182, %r83, %r1804; setp.gt.s32 %p183, %r1787, %r1804; or.pred %p19, %p182, %p183; add.s32 %r1805, %r215, 72; setp.lt.s32 %p184, %r83, %r1805; setp.gt.s32 %p185, %r1787, %r1805; or.pred %p20, %p184, %p185; add.s32 %r1806, %r215, 73; setp.lt.s32 %p186, %r83, %r1806; setp.gt.s32 %p187, %r1787, %r1806; or.pred %p21, %p186, %p187; add.s32 %r1807, %r215, 80; setp.lt.s32 %p188, %r83, %r1807; setp.gt.s32 %p189, %r1787, %r1807; or.pred %p22, %p188, %p189; add.s32 %r1808, %r215, 81; setp.lt.s32 %p190, %r83, %r1808; setp.gt.s32 %p191, %r1787, %r1808; or.pred %p23, %p190, %p191; add.s32 %r1809, %r215, 88; setp.lt.s32 %p192, %r83, %r1809; setp.gt.s32 %p193, %r1787, %r1809; or.pred %p24, %p192, %p193; add.s32 %r1810, %r215, 89; setp.lt.s32 %p194, %r83, %r1810; setp.gt.s32 %p195, %r1787, %r1810; or.pred %p25, %p194, %p195; add.s32 %r1811, %r215, 96; setp.lt.s32 %p196, %r83, %r1811; setp.gt.s32 %p197, %r1787, %r1811; or.pred %p26, %p196, %p197; add.s32 %r1812, %r215, 97; setp.lt.s32 %p198, %r83, %r1812; setp.gt.s32 %p199, %r1787, %r1812; or.pred %p27, %p198, %p199; add.s32 %r1813, %r215, 104; setp.lt.s32 %p200, %r83, %r1813; setp.gt.s32 %p201, %r1787, %r1813; or.pred %p28, %p200, %p201; add.s32 %r1814, %r215, 105; setp.lt.s32 %p202, %r83, %r1814; setp.gt.s32 %p203, %r1787, %r1814; or.pred %p29, %p202, %p203; add.s32 %r1815, %r215, 112; setp.lt.s32 %p204, %r83, %r1815; setp.gt.s32 %p205, %r1787, %r1815; or.pred %p30, %p204, %p205; add.s32 %r1816, %r215, 113; setp.lt.s32 %p206, %r83, %r1816; setp.gt.s32 %p207, %r1787, %r1816; or.pred %p31, %p206, %p207; add.s32 %r1817, %r215, 120; setp.lt.s32 %p208, %r83, %r1817; setp.gt.s32 %p209, %r1787, %r1817; or.pred %p32, %p208, %p209; add.s32 %r1818, %r215, 121; setp.lt.s32 %p210, %r83, %r1818; setp.gt.s32 %p211, %r1787, %r1818; or.pred %p33, %p210, %p211; add.s32 %r1819, %r83, 8; setp.lt.s32 %p212, %r1819, %r215; sub.s32 %r1820, %r1819, %r8; max.s32 %r1821, %r1820, 0; setp.gt.s32 
%p213, %r1821, %r215; or.pred %p34, %p212, %p213; setp.le.s32 %p214, %r1819, %r215; setp.gt.s32 %p215, %r1821, %r1788; or.pred %p35, %p214, %p215; setp.lt.s32 %p216, %r1819, %r1789; setp.gt.s32 %p217, %r1821, %r1789; or.pred %p36, %p216, %p217; setp.lt.s32 %p218, %r1819, %r1790; setp.gt.s32 %p219, %r1821, %r1790; or.pred %p37, %p218, %p219; setp.lt.s32 %p220, %r1819, %r1791; setp.gt.s32 %p221, %r1821, %r1791; or.pred %p38, %p220, %p221; setp.lt.s32 %p222, %r1819, %r1792; setp.gt.s32 %p223, %r1821, %r1792; or.pred %p39, %p222, %p223; setp.lt.s32 %p224, %r1819, %r1793; setp.gt.s32 %p225, %r1821, %r1793; or.pred %p40, %p224, %p225; setp.lt.s32 %p226, %r1819, %r1794; setp.gt.s32 %p227, %r1821, %r1794; or.pred %p41, %p226, %p227; setp.lt.s32 %p228, %r1819, %r1795; setp.gt.s32 %p229, %r1821, %r1795; or.pred %p42, %p228, %p229; setp.lt.s32 %p230, %r1819, %r1796; setp.gt.s32 %p231, %r1821, %r1796; or.pred %p43, %p230, %p231; setp.lt.s32 %p232, %r1819, %r1797; setp.gt.s32 %p233, %r1821, %r1797; or.pred %p44, %p232, %p233; setp.lt.s32 %p234, %r1819, %r1798; setp.gt.s32 %p235, %r1821, %r1798; or.pred %p45, %p234, %p235; setp.lt.s32 %p236, %r1819, %r1799; setp.gt.s32 %p237, %r1821, %r1799; or.pred %p46, %p236, %p237; setp.lt.s32 %p238, %r1819, %r1800; setp.gt.s32 %p239, %r1821, %r1800; or.pred %p47, %p238, %p239; setp.lt.s32 %p240, %r1819, %r1801; setp.gt.s32 %p241, %r1821, %r1801; or.pred %p48, %p240, %p241; setp.lt.s32 %p242, %r1819, %r1802; setp.gt.s32 %p243, %r1821, %r1802; or.pred %p49, %p242, %p243; setp.lt.s32 %p244, %r1819, %r1803; setp.gt.s32 %p245, %r1821, %r1803; or.pred %p50, %p244, %p245; setp.lt.s32 %p246, %r1819, %r1804; setp.gt.s32 %p247, %r1821, %r1804; or.pred %p51, %p246, %p247; setp.lt.s32 %p248, %r1819, %r1805; setp.gt.s32 %p249, %r1821, %r1805; or.pred %p52, %p248, %p249; setp.lt.s32 %p250, %r1819, %r1806; setp.gt.s32 %p251, %r1821, %r1806; or.pred %p53, %p250, %p251; setp.lt.s32 %p252, %r1819, %r1807; setp.gt.s32 %p253, %r1821, %r1807; or.pred %p54, 
%p252, %p253; setp.lt.s32 %p254, %r1819, %r1808; setp.gt.s32 %p255, %r1821, %r1808; or.pred %p55, %p254, %p255; setp.lt.s32 %p256, %r1819, %r1809; setp.gt.s32 %p257, %r1821, %r1809; or.pred %p56, %p256, %p257; setp.lt.s32 %p258, %r1819, %r1810; setp.gt.s32 %p259, %r1821, %r1810; or.pred %p57, %p258, %p259; setp.lt.s32 %p260, %r1819, %r1811; setp.gt.s32 %p261, %r1821, %r1811; or.pred %p58, %p260, %p261; setp.lt.s32 %p262, %r1819, %r1812; setp.gt.s32 %p263, %r1821, %r1812; or.pred %p59, %p262, %p263; setp.lt.s32 %p264, %r1819, %r1813; setp.gt.s32 %p265, %r1821, %r1813; or.pred %p60, %p264, %p265; setp.lt.s32 %p266, %r1819, %r1814; setp.gt.s32 %p267, %r1821, %r1814; or.pred %p61, %p266, %p267; setp.lt.s32 %p268, %r1819, %r1815; setp.gt.s32 %p269, %r1821, %r1815; or.pred %p62, %p268, %p269; setp.lt.s32 %p270, %r1819, %r1816; setp.gt.s32 %p271, %r1821, %r1816; or.pred %p63, %p270, %p271; setp.lt.s32 %p272, %r1819, %r1817; setp.gt.s32 %p273, %r1821, %r1817; or.pred %p64, %p272, %p273; setp.lt.s32 %p274, %r1819, %r1818; setp.gt.s32 %p275, %r1821, %r1818; or.pred %p65, %p274, %p275; @%p147 bra $L__BB0_12; mov.b32 %f1360, %r631; mul.ftz.f32 %f1361, %f1359, %f1360; add.s32 %r1822, %r82, %r215; cvt.rn.f32.s32 %f1362, %r1822; mul.ftz.f32 %f1363, %f1361, %f1362; fma.rn.ftz.f32 %f1364, %f3069, %f1360, %f1363; selp.f32 %f3069, 0fFF7FFFFF, %f1364, %p2; add.s32 %r1823, %r1822, 1; cvt.rn.f32.s32 %f1365, %r1823; mul.ftz.f32 %f1366, %f1361, %f1365; fma.rn.ftz.f32 %f1367, %f3068, %f1360, %f1366; selp.f32 %f3068, 0fFF7FFFFF, %f1367, %p3; add.s32 %r1824, %r1822, 8; cvt.rn.f32.s32 %f1368, %r1824; mul.ftz.f32 %f1369, %f1361, %f1368; fma.rn.ftz.f32 %f1370, %f3067, %f1360, %f1369; selp.f32 %f3067, 0fFF7FFFFF, %f1370, %p4; add.s32 %r1825, %r1822, 9; cvt.rn.f32.s32 %f1371, %r1825; mul.ftz.f32 %f1372, %f1361, %f1371; fma.rn.ftz.f32 %f1373, %f3066, %f1360, %f1372; selp.f32 %f3066, 0fFF7FFFFF, %f1373, %p5; add.s32 %r1826, %r1822, 16; cvt.rn.f32.s32 %f1374, %r1826; mul.ftz.f32 %f1375, %f1361, 
%f1374; fma.rn.ftz.f32 %f1376, %f3065, %f1360, %f1375; selp.f32 %f3065, 0fFF7FFFFF, %f1376, %p6; add.s32 %r1827, %r1822, 17; cvt.rn.f32.s32 %f1377, %r1827; mul.ftz.f32 %f1378, %f1361, %f1377; fma.rn.ftz.f32 %f1379, %f3064, %f1360, %f1378; selp.f32 %f3064, 0fFF7FFFFF, %f1379, %p7; add.s32 %r1828, %r1822, 24; cvt.rn.f32.s32 %f1380, %r1828; mul.ftz.f32 %f1381, %f1361, %f1380; fma.rn.ftz.f32 %f1382, %f3063, %f1360, %f1381; selp.f32 %f3063, 0fFF7FFFFF, %f1382, %p8; add.s32 %r1829, %r1822, 25; cvt.rn.f32.s32 %f1383, %r1829; mul.ftz.f32 %f1384, %f1361, %f1383; fma.rn.ftz.f32 %f1385, %f3062, %f1360, %f1384; selp.f32 %f3062, 0fFF7FFFFF, %f1385, %p9; add.s32 %r1830, %r1822, 32; cvt.rn.f32.s32 %f1386, %r1830; mul.ftz.f32 %f1387, %f1361, %f1386; fma.rn.ftz.f32 %f1388, %f3061, %f1360, %f1387; selp.f32 %f3061, 0fFF7FFFFF, %f1388, %p10; add.s32 %r1831, %r1822, 33; cvt.rn.f32.s32 %f1389, %r1831; mul.ftz.f32 %f1390, %f1361, %f1389; fma.rn.ftz.f32 %f1391, %f3060, %f1360, %f1390; selp.f32 %f3060, 0fFF7FFFFF, %f1391, %p11; add.s32 %r1832, %r1822, 40; cvt.rn.f32.s32 %f1392, %r1832; mul.ftz.f32 %f1393, %f1361, %f1392; fma.rn.ftz.f32 %f1394, %f3059, %f1360, %f1393; selp.f32 %f3059, 0fFF7FFFFF, %f1394, %p12; add.s32 %r1833, %r1822, 41; cvt.rn.f32.s32 %f1395, %r1833; mul.ftz.f32 %f1396, %f1361, %f1395; fma.rn.ftz.f32 %f1397, %f3058, %f1360, %f1396; selp.f32 %f3058, 0fFF7FFFFF, %f1397, %p13; add.s32 %r1834, %r1822, 48; cvt.rn.f32.s32 %f1398, %r1834; mul.ftz.f32 %f1399, %f1361, %f1398; fma.rn.ftz.f32 %f1400, %f3057, %f1360, %f1399; selp.f32 %f3057, 0fFF7FFFFF, %f1400, %p14; add.s32 %r1835, %r1822, 49; cvt.rn.f32.s32 %f1401, %r1835; mul.ftz.f32 %f1402, %f1361, %f1401; fma.rn.ftz.f32 %f1403, %f3056, %f1360, %f1402; selp.f32 %f3056, 0fFF7FFFFF, %f1403, %p15; add.s32 %r1836, %r1822, 56; cvt.rn.f32.s32 %f1404, %r1836; mul.ftz.f32 %f1405, %f1361, %f1404; fma.rn.ftz.f32 %f1406, %f3055, %f1360, %f1405; selp.f32 %f3055, 0fFF7FFFFF, %f1406, %p16; add.s32 %r1837, %r1822, 57; cvt.rn.f32.s32 %f1407, 
%r1837; mul.ftz.f32 %f1408, %f1361, %f1407; fma.rn.ftz.f32 %f1409, %f3054, %f1360, %f1408; selp.f32 %f3054, 0fFF7FFFFF, %f1409, %p17; add.s32 %r1838, %r1822, 64; cvt.rn.f32.s32 %f1410, %r1838; mul.ftz.f32 %f1411, %f1361, %f1410; fma.rn.ftz.f32 %f1412, %f3053, %f1360, %f1411; selp.f32 %f3053, 0fFF7FFFFF, %f1412, %p18; add.s32 %r1839, %r1822, 65; cvt.rn.f32.s32 %f1413, %r1839; mul.ftz.f32 %f1414, %f1361, %f1413; fma.rn.ftz.f32 %f1415, %f3052, %f1360, %f1414; selp.f32 %f3052, 0fFF7FFFFF, %f1415, %p19; add.s32 %r1840, %r1822, 72; cvt.rn.f32.s32 %f1416, %r1840; mul.ftz.f32 %f1417, %f1361, %f1416; fma.rn.ftz.f32 %f1418, %f3051, %f1360, %f1417; selp.f32 %f3051, 0fFF7FFFFF, %f1418, %p20; add.s32 %r1841, %r1822, 73; cvt.rn.f32.s32 %f1419, %r1841; mul.ftz.f32 %f1420, %f1361, %f1419; fma.rn.ftz.f32 %f1421, %f3050, %f1360, %f1420; selp.f32 %f3050, 0fFF7FFFFF, %f1421, %p21; add.s32 %r1842, %r1822, 80; cvt.rn.f32.s32 %f1422, %r1842; mul.ftz.f32 %f1423, %f1361, %f1422; fma.rn.ftz.f32 %f1424, %f3049, %f1360, %f1423; selp.f32 %f3049, 0fFF7FFFFF, %f1424, %p22; add.s32 %r1843, %r1822, 81; cvt.rn.f32.s32 %f1425, %r1843; mul.ftz.f32 %f1426, %f1361, %f1425; fma.rn.ftz.f32 %f1427, %f3048, %f1360, %f1426; selp.f32 %f3048, 0fFF7FFFFF, %f1427, %p23; add.s32 %r1844, %r1822, 88; cvt.rn.f32.s32 %f1428, %r1844; mul.ftz.f32 %f1429, %f1361, %f1428; fma.rn.ftz.f32 %f1430, %f3047, %f1360, %f1429; selp.f32 %f3047, 0fFF7FFFFF, %f1430, %p24; add.s32 %r1845, %r1822, 89; cvt.rn.f32.s32 %f1431, %r1845; mul.ftz.f32 %f1432, %f1361, %f1431; fma.rn.ftz.f32 %f1433, %f3046, %f1360, %f1432; selp.f32 %f3046, 0fFF7FFFFF, %f1433, %p25; add.s32 %r1846, %r1822, 96; cvt.rn.f32.s32 %f1434, %r1846; mul.ftz.f32 %f1435, %f1361, %f1434; fma.rn.ftz.f32 %f1436, %f3045, %f1360, %f1435; selp.f32 %f3045, 0fFF7FFFFF, %f1436, %p26; add.s32 %r1847, %r1822, 97; cvt.rn.f32.s32 %f1437, %r1847; mul.ftz.f32 %f1438, %f1361, %f1437; fma.rn.ftz.f32 %f1439, %f3044, %f1360, %f1438; selp.f32 %f3044, 0fFF7FFFFF, %f1439, %p27; add.s32 %r1848, 
%r1822, 104; cvt.rn.f32.s32 %f1440, %r1848; mul.ftz.f32 %f1441, %f1361, %f1440; fma.rn.ftz.f32 %f1442, %f3043, %f1360, %f1441; selp.f32 %f3043, 0fFF7FFFFF, %f1442, %p28; add.s32 %r1849, %r1822, 105; cvt.rn.f32.s32 %f1443, %r1849; mul.ftz.f32 %f1444, %f1361, %f1443; fma.rn.ftz.f32 %f1445, %f3042, %f1360, %f1444; selp.f32 %f3042, 0fFF7FFFFF, %f1445, %p29; add.s32 %r1850, %r1822, 112; cvt.rn.f32.s32 %f1446, %r1850; mul.ftz.f32 %f1447, %f1361, %f1446; fma.rn.ftz.f32 %f1448, %f3041, %f1360, %f1447; selp.f32 %f3041, 0fFF7FFFFF, %f1448, %p30; add.s32 %r1851, %r1822, 113; cvt.rn.f32.s32 %f1449, %r1851; mul.ftz.f32 %f1450, %f1361, %f1449; fma.rn.ftz.f32 %f1451, %f3040, %f1360, %f1450; selp.f32 %f3040, 0fFF7FFFFF, %f1451, %p31; add.s32 %r1852, %r1822, 120; cvt.rn.f32.s32 %f1452, %r1852; mul.ftz.f32 %f1453, %f1361, %f1452; fma.rn.ftz.f32 %f1454, %f3039, %f1360, %f1453; selp.f32 %f3039, 0fFF7FFFFF, %f1454, %p32; add.s32 %r1853, %r1822, 121; cvt.rn.f32.s32 %f1455, %r1853; mul.ftz.f32 %f1456, %f1361, %f1455; fma.rn.ftz.f32 %f1457, %f3038, %f1360, %f1456; selp.f32 %f3038, 0fFF7FFFFF, %f1457, %p33; fma.rn.ftz.f32 %f1458, %f3037, %f1360, %f1363; selp.f32 %f3037, 0fFF7FFFFF, %f1458, %p34; fma.rn.ftz.f32 %f1459, %f3036, %f1360, %f1366; selp.f32 %f3036, 0fFF7FFFFF, %f1459, %p35; fma.rn.ftz.f32 %f1460, %f3035, %f1360, %f1369; selp.f32 %f3035, 0fFF7FFFFF, %f1460, %p36; fma.rn.ftz.f32 %f1461, %f3034, %f1360, %f1372; selp.f32 %f3034, 0fFF7FFFFF, %f1461, %p37; fma.rn.ftz.f32 %f1462, %f3033, %f1360, %f1375; selp.f32 %f3033, 0fFF7FFFFF, %f1462, %p38; fma.rn.ftz.f32 %f1463, %f3032, %f1360, %f1378; selp.f32 %f3032, 0fFF7FFFFF, %f1463, %p39; fma.rn.ftz.f32 %f1464, %f3031, %f1360, %f1381; selp.f32 %f3031, 0fFF7FFFFF, %f1464, %p40; fma.rn.ftz.f32 %f1465, %f3030, %f1360, %f1384; selp.f32 %f3030, 0fFF7FFFFF, %f1465, %p41; fma.rn.ftz.f32 %f1466, %f3029, %f1360, %f1387; selp.f32 %f3029, 0fFF7FFFFF, %f1466, %p42; fma.rn.ftz.f32 %f1467, %f3028, %f1360, %f1390; selp.f32 %f3028, 0fFF7FFFFF, %f1467, %p43; 
fma.rn.ftz.f32 %f1468, %f3027, %f1360, %f1393; selp.f32 %f3027, 0fFF7FFFFF, %f1468, %p44; fma.rn.ftz.f32 %f1469, %f3026, %f1360, %f1396; selp.f32 %f3026, 0fFF7FFFFF, %f1469, %p45; fma.rn.ftz.f32 %f1470, %f3025, %f1360, %f1399; selp.f32 %f3025, 0fFF7FFFFF, %f1470, %p46; fma.rn.ftz.f32 %f1471, %f3024, %f1360, %f1402; selp.f32 %f3024, 0fFF7FFFFF, %f1471, %p47; fma.rn.ftz.f32 %f1472, %f3023, %f1360, %f1405; selp.f32 %f3023, 0fFF7FFFFF, %f1472, %p48; fma.rn.ftz.f32 %f1473, %f3022, %f1360, %f1408; selp.f32 %f3022, 0fFF7FFFFF, %f1473, %p49; fma.rn.ftz.f32 %f1474, %f3021, %f1360, %f1411; selp.f32 %f3021, 0fFF7FFFFF, %f1474, %p50; fma.rn.ftz.f32 %f1475, %f3020, %f1360, %f1414; selp.f32 %f3020, 0fFF7FFFFF, %f1475, %p51; fma.rn.ftz.f32 %f1476, %f3019, %f1360, %f1417; selp.f32 %f3019, 0fFF7FFFFF, %f1476, %p52; fma.rn.ftz.f32 %f1477, %f3018, %f1360, %f1420; selp.f32 %f3018, 0fFF7FFFFF, %f1477, %p53; fma.rn.ftz.f32 %f1478, %f3017, %f1360, %f1423; selp.f32 %f3017, 0fFF7FFFFF, %f1478, %p54; fma.rn.ftz.f32 %f1479, %f3016, %f1360, %f1426; selp.f32 %f3016, 0fFF7FFFFF, %f1479, %p55; fma.rn.ftz.f32 %f1480, %f3015, %f1360, %f1429; selp.f32 %f3015, 0fFF7FFFFF, %f1480, %p56; fma.rn.ftz.f32 %f1481, %f3014, %f1360, %f1432; selp.f32 %f3014, 0fFF7FFFFF, %f1481, %p57; fma.rn.ftz.f32 %f1482, %f3013, %f1360, %f1435; selp.f32 %f3013, 0fFF7FFFFF, %f1482, %p58; fma.rn.ftz.f32 %f1483, %f3012, %f1360, %f1438; selp.f32 %f3012, 0fFF7FFFFF, %f1483, %p59; fma.rn.ftz.f32 %f1484, %f3011, %f1360, %f1441; selp.f32 %f3011, 0fFF7FFFFF, %f1484, %p60; fma.rn.ftz.f32 %f1485, %f3010, %f1360, %f1444; selp.f32 %f3010, 0fFF7FFFFF, %f1485, %p61; fma.rn.ftz.f32 %f1486, %f3009, %f1360, %f1447; selp.f32 %f3009, 0fFF7FFFFF, %f1486, %p62; fma.rn.ftz.f32 %f1487, %f3008, %f1360, %f1450; selp.f32 %f3008, 0fFF7FFFFF, %f1487, %p63; fma.rn.ftz.f32 %f1488, %f3007, %f1360, %f1453; selp.f32 %f3007, 0fFF7FFFFF, %f1488, %p64; fma.rn.ftz.f32 %f1489, %f3006, %f1360, %f1456; selp.f32 %f3006, 0fFF7FFFFF, %f1489, %p65; bra.uni $L__BB0_13; 
// $L__BB0_12: masking-only path.
// Each of the 64 f32 registers %f3069 down to %f3006 is overwritten with
// 0fFF7FFFFF (the bit pattern of -FLT_MAX, the most negative finite f32) when
// its paired predicate is true, and keeps its current value otherwise.
// Pairing is %f3069 with %p2, %f3068 with %p3, ... , %f3006 with %p65.
// The predicates %p2..%p65 are set outside this span.
// NOTE(review): presumably these registers hold per-thread attention scores and
// the predicates encode the sliding-window / causal mask - confirm against the
// code that defines %p2..%p65.
// There is no branch at the end of this block, so control falls through into
// $L__BB0_13.
$L__BB0_12: selp.f32 %f3069, 0fFF7FFFFF, %f3069, %p2; selp.f32 %f3068, 0fFF7FFFFF, %f3068, %p3; selp.f32 %f3067, 0fFF7FFFFF, %f3067, %p4; selp.f32 %f3066, 0fFF7FFFFF, %f3066, %p5; selp.f32 %f3065, 0fFF7FFFFF, %f3065, %p6; selp.f32 %f3064, 0fFF7FFFFF, %f3064, %p7; selp.f32 %f3063, 0fFF7FFFFF, %f3063, %p8; selp.f32 %f3062, 0fFF7FFFFF, %f3062, %p9; selp.f32 %f3061, 0fFF7FFFFF, %f3061, %p10; selp.f32 %f3060, 0fFF7FFFFF, %f3060, %p11; selp.f32 %f3059, 0fFF7FFFFF, %f3059, %p12; selp.f32 %f3058, 0fFF7FFFFF, %f3058, %p13; selp.f32 %f3057, 0fFF7FFFFF, %f3057, %p14; selp.f32 %f3056, 0fFF7FFFFF, %f3056, %p15; selp.f32 %f3055, 0fFF7FFFFF, %f3055, %p16; selp.f32 %f3054, 0fFF7FFFFF, %f3054, %p17; selp.f32 %f3053, 0fFF7FFFFF, %f3053, %p18; selp.f32 %f3052, 0fFF7FFFFF, %f3052, %p19; selp.f32 %f3051, 0fFF7FFFFF, %f3051, %p20; selp.f32 %f3050, 0fFF7FFFFF, %f3050, %p21; selp.f32 %f3049, 0fFF7FFFFF, %f3049, %p22; selp.f32 %f3048, 0fFF7FFFFF, %f3048, %p23; selp.f32 %f3047, 0fFF7FFFFF, %f3047, %p24; selp.f32 %f3046, 0fFF7FFFFF, %f3046, %p25; selp.f32 %f3045, 0fFF7FFFFF, %f3045, %p26; selp.f32 %f3044, 0fFF7FFFFF, %f3044, %p27; selp.f32 %f3043, 0fFF7FFFFF, %f3043, %p28; selp.f32 %f3042, 0fFF7FFFFF, %f3042, %p29; selp.f32 %f3041, 0fFF7FFFFF, %f3041, %p30; selp.f32 %f3040, 0fFF7FFFFF, %f3040, %p31; selp.f32 %f3039, 0fFF7FFFFF, %f3039, %p32; selp.f32 %f3038, 0fFF7FFFFF, %f3038, %p33; selp.f32 %f3037, 0fFF7FFFFF, %f3037, %p34; selp.f32 %f3036, 0fFF7FFFFF, %f3036, %p35; selp.f32 %f3035, 0fFF7FFFFF, %f3035, %p36; selp.f32 %f3034, 0fFF7FFFFF, %f3034, %p37; selp.f32 %f3033, 0fFF7FFFFF, %f3033, %p38; selp.f32 %f3032, 0fFF7FFFFF, %f3032, %p39; selp.f32 %f3031, 0fFF7FFFFF, %f3031, %p40; selp.f32 %f3030, 0fFF7FFFFF, %f3030, %p41; selp.f32 %f3029, 0fFF7FFFFF, %f3029, %p42; selp.f32 %f3028, 0fFF7FFFFF, %f3028, %p43; selp.f32 %f3027, 0fFF7FFFFF, %f3027, %p44; selp.f32 %f3026, 0fFF7FFFFF, %f3026, %p45; selp.f32 %f3025, 0fFF7FFFFF, %f3025, %p46; selp.f32 %f3024, 0fFF7FFFFF, %f3024, %p47; selp.f32 %f3023, 
0fFF7FFFFF, %f3023, %p48; selp.f32 %f3022, 0fFF7FFFFF, %f3022, %p49; selp.f32 %f3021, 0fFF7FFFFF, %f3021, %p50; selp.f32 %f3020, 0fFF7FFFFF, %f3020, %p51; selp.f32 %f3019, 0fFF7FFFFF, %f3019, %p52; selp.f32 %f3018, 0fFF7FFFFF, %f3018, %p53; selp.f32 %f3017, 0fFF7FFFFF, %f3017, %p54; selp.f32 %f3016, 0fFF7FFFFF, %f3016, %p55; selp.f32 %f3015, 0fFF7FFFFF, %f3015, %p56; selp.f32 %f3014, 0fFF7FFFFF, %f3014, %p57; selp.f32 %f3013, 0fFF7FFFFF, %f3013, %p58; selp.f32 %f3012, 0fFF7FFFFF, %f3012, %p59; selp.f32 %f3011, 0fFF7FFFFF, %f3011, %p60; selp.f32 %f3010, 0fFF7FFFFF, %f3010, %p61; selp.f32 %f3009, 0fFF7FFFFF, %f3009, %p62; selp.f32 %f3008, 0fFF7FFFFF, %f3008, %p63; selp.f32 %f3007, 0fFF7FFFFF, %f3007, %p64; selp.f32 %f3006, 0fFF7FFFFF, %f3006, %p65;
// $L__BB0_13: join point, reached by fall-through from $L__BB0_12 and by the
// "bra.uni $L__BB0_13" that ends the preceding block.
//  - %r2985 = %p74 ? %r543 : 0, then %p277 = (%r3060 == %r2985). %p277 selects
//    between $L__BB0_15 and $L__BB0_14 at the end of this block.
//  - First max.ftz.f32 chain folds the 32 registers %f3069..%f3038 into %f327.
//  - Second chain starts with %f3037 and %f3036 and, in the statements that
//    follow, folds %f3037..%f3006 into %f328.
// NOTE(review): %f327 and %f328 look like per-thread partial row maxima for two
// rows of the score tile - confirm the row layout against the MMA fragment code.
$L__BB0_13: selp.b32 %r2985, %r543, 0, %p74; setp.eq.s32 %p277, %r3060, %r2985; max.ftz.f32 %f1490, %f3069, %f3068; max.ftz.f32 %f1491, %f1490, %f3067; max.ftz.f32 %f1492, %f1491, %f3066; max.ftz.f32 %f1493, %f1492, %f3065; max.ftz.f32 %f1494, %f1493, %f3064; max.ftz.f32 %f1495, %f1494, %f3063; max.ftz.f32 %f1496, %f1495, %f3062; max.ftz.f32 %f1497, %f1496, %f3061; max.ftz.f32 %f1498, %f1497, %f3060; max.ftz.f32 %f1499, %f1498, %f3059; max.ftz.f32 %f1500, %f1499, %f3058; max.ftz.f32 %f1501, %f1500, %f3057; max.ftz.f32 %f1502, %f1501, %f3056; max.ftz.f32 %f1503, %f1502, %f3055; max.ftz.f32 %f1504, %f1503, %f3054; max.ftz.f32 %f1505, %f1504, %f3053; max.ftz.f32 %f1506, %f1505, %f3052; max.ftz.f32 %f1507, %f1506, %f3051; max.ftz.f32 %f1508, %f1507, %f3050; max.ftz.f32 %f1509, %f1508, %f3049; max.ftz.f32 %f1510, %f1509, %f3048; max.ftz.f32 %f1511, %f1510, %f3047; max.ftz.f32 %f1512, %f1511, %f3046; max.ftz.f32 %f1513, %f1512, %f3045; max.ftz.f32 %f1514, %f1513, %f3044; max.ftz.f32 %f1515, %f1514, %f3043; max.ftz.f32 %f1516, %f1515, %f3042; max.ftz.f32 %f1517, %f1516, %f3041; max.ftz.f32 %f1518, %f1517, %f3040; max.ftz.f32 %f1519, %f1518, %f3039; max.ftz.f32 %f327, %f1519, %f3038; max.ftz.f32 %f1520, %f3037, %f3036; max.ftz.f32 
%f1521, %f1520, %f3035; max.ftz.f32 %f1522, %f1521, %f3034; max.ftz.f32 %f1523, %f1522, %f3033; max.ftz.f32 %f1524, %f1523, %f3032; max.ftz.f32 %f1525, %f1524, %f3031; max.ftz.f32 %f1526, %f1525, %f3030; max.ftz.f32 %f1527, %f1526, %f3029; max.ftz.f32 %f1528, %f1527, %f3028; max.ftz.f32 %f1529, %f1528, %f3027; max.ftz.f32 %f1530, %f1529, %f3026; max.ftz.f32 %f1531, %f1530, %f3025; max.ftz.f32 %f1532, %f1531, %f3024; max.ftz.f32 %f1533, %f1532, %f3023; max.ftz.f32 %f1534, %f1533, %f3022; max.ftz.f32 %f1535, %f1534, %f3021; max.ftz.f32 %f1536, %f1535, %f3020; max.ftz.f32 %f1537, %f1536, %f3019; max.ftz.f32 %f1538, %f1537, %f3018; max.ftz.f32 %f1539, %f1538, %f3017; max.ftz.f32 %f1540, %f1539, %f3016; max.ftz.f32 %f1541, %f1540, %f3015; max.ftz.f32 %f1542, %f1541, %f3014; max.ftz.f32 %f1543, %f1542, %f3013; max.ftz.f32 %f1544, %f1543, %f3012; max.ftz.f32 %f1545, %f1544, %f3011; max.ftz.f32 %f1546, %f1545, %f3010; max.ftz.f32 %f1547, %f1546, %f3009; max.ftz.f32 %f1548, %f1547, %f3008; max.ftz.f32 %f1549, %f1548, %f3007; max.ftz.f32 %f328, %f1549, %f3006; mov.b32 %r216, %f327; mov.b32 %r217, %f328; @%p277 bra $L__BB0_15; bra.uni $L__BB0_14; $L__BB0_15: mov.u32 %r1878, 31; mov.u32 %r1879, 1; mov.u32 %r1880, -1; shfl.sync.bfly.b32 %r1881|%p288, %r216, %r1879, %r1878, %r1880; mov.b32 %f1864, %r1881; max.ftz.f32 %f1865, %f327, %f1864; mov.b32 %r1882, %f1865; mov.u32 %r1883, 2; shfl.sync.bfly.b32 %r1884|%p289, %r1882, %r1883, %r1878, %r1880; mov.b32 %f1866, %r1884; max.ftz.f32 %f3003, %f1865, %f1866; shfl.sync.bfly.b32 %r1885|%p290, %r217, %r1879, %r1878, %r1880; mov.b32 %f1867, %r1885; max.ftz.f32 %f1868, %f328, %f1867; mov.b32 %r1886, %f1868; shfl.sync.bfly.b32 %r1887|%p291, %r1886, %r1883, %r1878, %r1880; mov.b32 %f1869, %r1887; max.ftz.f32 %f3002, %f1868, %f1869; setp.eq.ftz.f32 %p292, %f3003, 0fFF7FFFFF; selp.f32 %f1870, 0f00000000, %f3003, %p292; sub.ftz.f32 %f1871, %f3069, %f1870; mul.ftz.f32 %f1872, %f1871, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3133, %f1872; sub.ftz.f32 
%f1873, %f3068, %f1870; mul.ftz.f32 %f1874, %f1873, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3132, %f1874; sub.ftz.f32 %f1875, %f3067, %f1870; mul.ftz.f32 %f1876, %f1875, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3131, %f1876; sub.ftz.f32 %f1877, %f3066, %f1870; mul.ftz.f32 %f1878, %f1877, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3130, %f1878; sub.ftz.f32 %f1879, %f3065, %f1870; mul.ftz.f32 %f1880, %f1879, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3129, %f1880; sub.ftz.f32 %f1881, %f3064, %f1870; mul.ftz.f32 %f1882, %f1881, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3128, %f1882; sub.ftz.f32 %f1883, %f3063, %f1870; mul.ftz.f32 %f1884, %f1883, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3127, %f1884; sub.ftz.f32 %f1885, %f3062, %f1870; mul.ftz.f32 %f1886, %f1885, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3126, %f1886; sub.ftz.f32 %f1887, %f3061, %f1870; mul.ftz.f32 %f1888, %f1887, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3125, %f1888; sub.ftz.f32 %f1889, %f3060, %f1870; mul.ftz.f32 %f1890, %f1889, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3124, %f1890; sub.ftz.f32 %f1891, %f3059, %f1870; mul.ftz.f32 %f1892, %f1891, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3123, %f1892; sub.ftz.f32 %f1893, %f3058, %f1870; mul.ftz.f32 %f1894, %f1893, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3122, %f1894; sub.ftz.f32 %f1895, %f3057, %f1870; mul.ftz.f32 %f1896, %f1895, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3121, %f1896; sub.ftz.f32 %f1897, %f3056, %f1870; mul.ftz.f32 %f1898, %f1897, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3120, %f1898; sub.ftz.f32 %f1899, %f3055, %f1870; mul.ftz.f32 %f1900, %f1899, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3119, %f1900; sub.ftz.f32 %f1901, %f3054, %f1870; mul.ftz.f32 %f1902, %f1901, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3118, %f1902; sub.ftz.f32 %f1903, %f3053, %f1870; mul.ftz.f32 %f1904, %f1903, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3117, %f1904; sub.ftz.f32 %f1905, %f3052, %f1870; mul.ftz.f32 %f1906, %f1905, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3116, %f1906; sub.ftz.f32 %f1907, %f3051, %f1870; mul.ftz.f32 %f1908, %f1907, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3115, %f1908; sub.ftz.f32 
%f1909, %f3050, %f1870; mul.ftz.f32 %f1910, %f1909, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3114, %f1910; sub.ftz.f32 %f1911, %f3049, %f1870; mul.ftz.f32 %f1912, %f1911, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3113, %f1912; sub.ftz.f32 %f1913, %f3048, %f1870; mul.ftz.f32 %f1914, %f1913, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3112, %f1914; sub.ftz.f32 %f1915, %f3047, %f1870; mul.ftz.f32 %f1916, %f1915, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3111, %f1916; sub.ftz.f32 %f1917, %f3046, %f1870; mul.ftz.f32 %f1918, %f1917, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3110, %f1918; sub.ftz.f32 %f1919, %f3045, %f1870; mul.ftz.f32 %f1920, %f1919, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3109, %f1920; sub.ftz.f32 %f1921, %f3044, %f1870; mul.ftz.f32 %f1922, %f1921, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3108, %f1922; sub.ftz.f32 %f1923, %f3043, %f1870; mul.ftz.f32 %f1924, %f1923, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3107, %f1924; sub.ftz.f32 %f1925, %f3042, %f1870; mul.ftz.f32 %f1926, %f1925, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3106, %f1926; sub.ftz.f32 %f1927, %f3041, %f1870; mul.ftz.f32 %f1928, %f1927, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3105, %f1928; sub.ftz.f32 %f1929, %f3040, %f1870; mul.ftz.f32 %f1930, %f1929, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3104, %f1930; sub.ftz.f32 %f1931, %f3039, %f1870; mul.ftz.f32 %f1932, %f1931, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3103, %f1932; sub.ftz.f32 %f1933, %f3038, %f1870; mul.ftz.f32 %f1934, %f1933, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3102, %f1934; setp.eq.ftz.f32 %p293, %f3002, 0fFF7FFFFF; selp.f32 %f1935, 0f00000000, %f3002, %p293; sub.ftz.f32 %f1936, %f3037, %f1935; mul.ftz.f32 %f1937, %f1936, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3101, %f1937; sub.ftz.f32 %f1938, %f3036, %f1935; mul.ftz.f32 %f1939, %f1938, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3100, %f1939; sub.ftz.f32 %f1940, %f3035, %f1935; mul.ftz.f32 %f1941, %f1940, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3099, %f1941; sub.ftz.f32 %f1942, %f3034, %f1935; mul.ftz.f32 %f1943, %f1942, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3098, %f1943; sub.ftz.f32 %f1944, %f3033, %f1935; 
mul.ftz.f32 %f1945, %f1944, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3097, %f1945; sub.ftz.f32 %f1946, %f3032, %f1935; mul.ftz.f32 %f1947, %f1946, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3096, %f1947; sub.ftz.f32 %f1948, %f3031, %f1935; mul.ftz.f32 %f1949, %f1948, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3095, %f1949; sub.ftz.f32 %f1950, %f3030, %f1935; mul.ftz.f32 %f1951, %f1950, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3094, %f1951; sub.ftz.f32 %f1952, %f3029, %f1935; mul.ftz.f32 %f1953, %f1952, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3093, %f1953; sub.ftz.f32 %f1954, %f3028, %f1935; mul.ftz.f32 %f1955, %f1954, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3092, %f1955; sub.ftz.f32 %f1956, %f3027, %f1935; mul.ftz.f32 %f1957, %f1956, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3091, %f1957; sub.ftz.f32 %f1958, %f3026, %f1935; mul.ftz.f32 %f1959, %f1958, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3090, %f1959; sub.ftz.f32 %f1960, %f3025, %f1935; mul.ftz.f32 %f1961, %f1960, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3089, %f1961; sub.ftz.f32 %f1962, %f3024, %f1935; mul.ftz.f32 %f1963, %f1962, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3088, %f1963; sub.ftz.f32 %f1964, %f3023, %f1935; mul.ftz.f32 %f1965, %f1964, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3087, %f1965; sub.ftz.f32 %f1966, %f3022, %f1935; mul.ftz.f32 %f1967, %f1966, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3086, %f1967; sub.ftz.f32 %f1968, %f3021, %f1935; mul.ftz.f32 %f1969, %f1968, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3085, %f1969; sub.ftz.f32 %f1970, %f3020, %f1935; mul.ftz.f32 %f1971, %f1970, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3084, %f1971; sub.ftz.f32 %f1972, %f3019, %f1935; mul.ftz.f32 %f1973, %f1972, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3083, %f1973; sub.ftz.f32 %f1974, %f3018, %f1935; mul.ftz.f32 %f1975, %f1974, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3082, %f1975; sub.ftz.f32 %f1976, %f3017, %f1935; mul.ftz.f32 %f1977, %f1976, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3081, %f1977; sub.ftz.f32 %f1978, %f3016, %f1935; mul.ftz.f32 %f1979, %f1978, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3080, %f1979; sub.ftz.f32 %f1980, %f3015, %f1935; 
mul.ftz.f32 %f1981, %f1980, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3079, %f1981; sub.ftz.f32 %f1982, %f3014, %f1935; mul.ftz.f32 %f1983, %f1982, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3078, %f1983; sub.ftz.f32 %f1984, %f3013, %f1935; mul.ftz.f32 %f1985, %f1984, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3077, %f1985; sub.ftz.f32 %f1986, %f3012, %f1935; mul.ftz.f32 %f1987, %f1986, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3076, %f1987; sub.ftz.f32 %f1988, %f3011, %f1935; mul.ftz.f32 %f1989, %f1988, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3075, %f1989; sub.ftz.f32 %f1990, %f3010, %f1935; mul.ftz.f32 %f1991, %f1990, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3074, %f1991; sub.ftz.f32 %f1992, %f3009, %f1935; mul.ftz.f32 %f1993, %f1992, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3073, %f1993; sub.ftz.f32 %f1994, %f3008, %f1935; mul.ftz.f32 %f1995, %f1994, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3072, %f1995; sub.ftz.f32 %f1996, %f3007, %f1935; mul.ftz.f32 %f1997, %f1996, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3071, %f1997; sub.ftz.f32 %f1998, %f3006, %f1935; mul.ftz.f32 %f1999, %f1998, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3070, %f1999; add.ftz.f32 %f2000, %f3133, %f3132; add.ftz.f32 %f2001, %f2000, 0f00000000; add.ftz.f32 %f2002, %f3131, %f3130; add.ftz.f32 %f2003, %f2002, 0f00000000; add.ftz.f32 %f2004, %f3129, %f3128; add.ftz.f32 %f2005, %f2001, %f2004; add.ftz.f32 %f2006, %f3127, %f3126; add.ftz.f32 %f2007, %f2003, %f2006; add.ftz.f32 %f2008, %f3125, %f3124; add.ftz.f32 %f2009, %f2005, %f2008; add.ftz.f32 %f2010, %f3123, %f3122; add.ftz.f32 %f2011, %f2007, %f2010; add.ftz.f32 %f2012, %f3121, %f3120; add.ftz.f32 %f2013, %f2009, %f2012; add.ftz.f32 %f2014, %f3119, %f3118; add.ftz.f32 %f2015, %f2011, %f2014; add.ftz.f32 %f2016, %f3117, %f3116; add.ftz.f32 %f2017, %f2013, %f2016; add.ftz.f32 %f2018, %f3115, %f3114; add.ftz.f32 %f2019, %f2015, %f2018; add.ftz.f32 %f2020, %f3113, %f3112; add.ftz.f32 %f2021, %f2017, %f2020; add.ftz.f32 %f2022, %f3111, %f3110; add.ftz.f32 %f2023, %f2019, %f2022; add.ftz.f32 %f2024, %f3109, %f3108; add.ftz.f32 
%f2025, %f2021, %f2024; add.ftz.f32 %f2026, %f3107, %f3106; add.ftz.f32 %f2027, %f2023, %f2026; add.ftz.f32 %f2028, %f3105, %f3104; add.ftz.f32 %f2029, %f2025, %f2028; add.ftz.f32 %f2030, %f3103, %f3102; add.ftz.f32 %f2031, %f2027, %f2030; add.ftz.f32 %f2032, %f2029, %f2031; add.ftz.f32 %f2033, %f3101, %f3100; add.ftz.f32 %f2034, %f2033, 0f00000000; add.ftz.f32 %f2035, %f3099, %f3098; add.ftz.f32 %f2036, %f2035, 0f00000000; add.ftz.f32 %f2037, %f3097, %f3096; add.ftz.f32 %f2038, %f2034, %f2037; add.ftz.f32 %f2039, %f3095, %f3094; add.ftz.f32 %f2040, %f2036, %f2039; add.ftz.f32 %f2041, %f3093, %f3092; add.ftz.f32 %f2042, %f2038, %f2041; add.ftz.f32 %f2043, %f3091, %f3090; add.ftz.f32 %f2044, %f2040, %f2043; add.ftz.f32 %f2045, %f3089, %f3088; add.ftz.f32 %f2046, %f2042, %f2045; add.ftz.f32 %f2047, %f3087, %f3086; add.ftz.f32 %f2048, %f2044, %f2047; add.ftz.f32 %f2049, %f3085, %f3084; add.ftz.f32 %f2050, %f2046, %f2049; add.ftz.f32 %f2051, %f3083, %f3082; add.ftz.f32 %f2052, %f2048, %f2051; add.ftz.f32 %f2053, %f3081, %f3080; add.ftz.f32 %f2054, %f2050, %f2053; add.ftz.f32 %f2055, %f3079, %f3078; add.ftz.f32 %f2056, %f2052, %f2055; add.ftz.f32 %f2057, %f3077, %f3076; add.ftz.f32 %f2058, %f2054, %f2057; add.ftz.f32 %f2059, %f3075, %f3074; add.ftz.f32 %f2060, %f2056, %f2059; add.ftz.f32 %f2061, %f3073, %f3072; add.ftz.f32 %f2062, %f2058, %f2061; add.ftz.f32 %f2063, %f3071, %f3070; add.ftz.f32 %f2064, %f2060, %f2063; add.ftz.f32 %f2065, %f2062, %f2064; mov.b32 %r1888, %f2032; shfl.sync.bfly.b32 %r1889|%p294, %r1888, %r1879, %r1878, %r1880; mov.b32 %f2066, %r1889; add.ftz.f32 %f2067, %f2032, %f2066; mov.b32 %r1890, %f2067; shfl.sync.bfly.b32 %r1891|%p295, %r1890, %r1883, %r1878, %r1880; mov.b32 %f2068, %r1891; add.ftz.f32 %f3005, %f2067, %f2068; mov.b32 %r1892, %f2065; shfl.sync.bfly.b32 %r1893|%p296, %r1892, %r1879, %r1878, %r1880; mov.b32 %f2069, %r1893; add.ftz.f32 %f2070, %f2065, %f2069; mov.b32 %r1894, %f2070; shfl.sync.bfly.b32 %r1895|%p297, %r1894, %r1883, %r1878, 
%r1880; mov.b32 %f2071, %r1895; add.ftz.f32 %f3004, %f2070, %f2071; bra.uni $L__BB0_16; $L__BB0_14: mov.u32 %r1860, 31; mov.u32 %r1861, 1; mov.u32 %r1862, -1; shfl.sync.bfly.b32 %r1863|%p278, %r216, %r1861, %r1860, %r1862; mov.b32 %f1550, %r1863; max.ftz.f32 %f1551, %f327, %f1550; mov.b32 %r1864, %f1551; mov.u32 %r1865, 2; shfl.sync.bfly.b32 %r1866|%p279, %r1864, %r1865, %r1860, %r1862; mov.b32 %f1552, %r1866; max.ftz.f32 %f1553, %f1551, %f1552; shfl.sync.bfly.b32 %r1867|%p280, %r217, %r1861, %r1860, %r1862; mov.b32 %f1554, %r1867; max.ftz.f32 %f1555, %f328, %f1554; mov.b32 %r1868, %f1555; shfl.sync.bfly.b32 %r1869|%p281, %r1868, %r1865, %r1860, %r1862; mov.b32 %f1556, %r1869; max.ftz.f32 %f1557, %f1555, %f1556; max.ftz.f32 %f329, %f3003, %f1553; sub.ftz.f32 %f1558, %f3003, %f329; mul.ftz.f32 %f1559, %f1558, 0f3FB8AA3B; ex2.approx.ftz.f32 %f1560, %f1559; max.ftz.f32 %f330, %f3002, %f1557; sub.ftz.f32 %f1561, %f3002, %f330; mul.ftz.f32 %f1562, %f1561, 0f3FB8AA3B; ex2.approx.ftz.f32 %f1563, %f1562; mov.b32 %f1564, %r3057; mul.ftz.f32 %f1565, %f1560, %f1564; mov.b32 %r3057, %f1565; mov.b32 %f1566, %r3056; mul.ftz.f32 %f1567, %f1560, %f1566; mov.b32 %r3056, %f1567; mov.b32 %f1568, %r3055; mul.ftz.f32 %f1569, %f1563, %f1568; mov.b32 %r3055, %f1569; mov.b32 %f1570, %r3054; mul.ftz.f32 %f1571, %f1563, %f1570; mov.b32 %r3054, %f1571; mov.b32 %f1572, %r3053; mul.ftz.f32 %f1573, %f1560, %f1572; mov.b32 %r3053, %f1573; mov.b32 %f1574, %r3052; mul.ftz.f32 %f1575, %f1560, %f1574; mov.b32 %r3052, %f1575; mov.b32 %f1576, %r3051; mul.ftz.f32 %f1577, %f1563, %f1576; mov.b32 %r3051, %f1577; mov.b32 %f1578, %r3050; mul.ftz.f32 %f1579, %f1563, %f1578; mov.b32 %r3050, %f1579; mov.b32 %f1580, %r3049; mul.ftz.f32 %f1581, %f1560, %f1580; mov.b32 %r3049, %f1581; mov.b32 %f1582, %r3048; mul.ftz.f32 %f1583, %f1560, %f1582; mov.b32 %r3048, %f1583; mov.b32 %f1584, %r3047; mul.ftz.f32 %f1585, %f1563, %f1584; mov.b32 %r3047, %f1585; mov.b32 %f1586, %r3046; mul.ftz.f32 %f1587, %f1563, %f1586; 
mov.b32 %r3046, %f1587; mov.b32 %f1588, %r3045; mul.ftz.f32 %f1589, %f1560, %f1588; mov.b32 %r3045, %f1589; mov.b32 %f1590, %r3044; mul.ftz.f32 %f1591, %f1560, %f1590; mov.b32 %r3044, %f1591; mov.b32 %f1592, %r3043; mul.ftz.f32 %f1593, %f1563, %f1592; mov.b32 %r3043, %f1593; mov.b32 %f1594, %r3042; mul.ftz.f32 %f1595, %f1563, %f1594; mov.b32 %r3042, %f1595; mov.b32 %f1596, %r3041; mul.ftz.f32 %f1597, %f1560, %f1596; mov.b32 %r3041, %f1597; mov.b32 %f1598, %r3040; mul.ftz.f32 %f1599, %f1560, %f1598; mov.b32 %r3040, %f1599; mov.b32 %f1600, %r3039; mul.ftz.f32 %f1601, %f1563, %f1600; mov.b32 %r3039, %f1601; mov.b32 %f1602, %r3038; mul.ftz.f32 %f1603, %f1563, %f1602; mov.b32 %r3038, %f1603; mov.b32 %f1604, %r3037; mul.ftz.f32 %f1605, %f1560, %f1604; mov.b32 %r3037, %f1605; mov.b32 %f1606, %r3036; mul.ftz.f32 %f1607, %f1560, %f1606; mov.b32 %r3036, %f1607; mov.b32 %f1608, %r3035; mul.ftz.f32 %f1609, %f1563, %f1608; mov.b32 %r3035, %f1609; mov.b32 %f1610, %r3034; mul.ftz.f32 %f1611, %f1563, %f1610; mov.b32 %r3034, %f1611; mov.b32 %f1612, %r3033; mul.ftz.f32 %f1613, %f1560, %f1612; mov.b32 %r3033, %f1613; mov.b32 %f1614, %r3032; mul.ftz.f32 %f1615, %f1560, %f1614; mov.b32 %r3032, %f1615; mov.b32 %f1616, %r3031; mul.ftz.f32 %f1617, %f1563, %f1616; mov.b32 %r3031, %f1617; mov.b32 %f1618, %r3030; mul.ftz.f32 %f1619, %f1563, %f1618; mov.b32 %r3030, %f1619; mov.b32 %f1620, %r3029; mul.ftz.f32 %f1621, %f1560, %f1620; mov.b32 %r3029, %f1621; mov.b32 %f1622, %r3028; mul.ftz.f32 %f1623, %f1560, %f1622; mov.b32 %r3028, %f1623; mov.b32 %f1624, %r3027; mul.ftz.f32 %f1625, %f1563, %f1624; mov.b32 %r3027, %f1625; mov.b32 %f1626, %r3026; mul.ftz.f32 %f1627, %f1563, %f1626; mov.b32 %r3026, %f1627; mov.b32 %f1628, %r3025; mul.ftz.f32 %f1629, %f1560, %f1628; mov.b32 %r3025, %f1629; mov.b32 %f1630, %r3024; mul.ftz.f32 %f1631, %f1560, %f1630; mov.b32 %r3024, %f1631; mov.b32 %f1632, %r3023; mul.ftz.f32 %f1633, %f1563, %f1632; mov.b32 %r3023, %f1633; mov.b32 %f1634, %r3022; mul.ftz.f32 %f1635, 
%f1563, %f1634; mov.b32 %r3022, %f1635; mov.b32 %f1636, %r3021; mul.ftz.f32 %f1637, %f1560, %f1636; mov.b32 %r3021, %f1637; mov.b32 %f1638, %r3020; mul.ftz.f32 %f1639, %f1560, %f1638; mov.b32 %r3020, %f1639; mov.b32 %f1640, %r3019; mul.ftz.f32 %f1641, %f1563, %f1640; mov.b32 %r3019, %f1641; mov.b32 %f1642, %r3018; mul.ftz.f32 %f1643, %f1563, %f1642; mov.b32 %r3018, %f1643; mov.b32 %f1644, %r3017; mul.ftz.f32 %f1645, %f1560, %f1644; mov.b32 %r3017, %f1645; mov.b32 %f1646, %r3016; mul.ftz.f32 %f1647, %f1560, %f1646; mov.b32 %r3016, %f1647; mov.b32 %f1648, %r3015; mul.ftz.f32 %f1649, %f1563, %f1648; mov.b32 %r3015, %f1649; mov.b32 %f1650, %r3014; mul.ftz.f32 %f1651, %f1563, %f1650; mov.b32 %r3014, %f1651; mov.b32 %f1652, %r3013; mul.ftz.f32 %f1653, %f1560, %f1652; mov.b32 %r3013, %f1653; mov.b32 %f1654, %r3012; mul.ftz.f32 %f1655, %f1560, %f1654; mov.b32 %r3012, %f1655; mov.b32 %f1656, %r3011; mul.ftz.f32 %f1657, %f1563, %f1656; mov.b32 %r3011, %f1657; mov.b32 %f1658, %r3010; mul.ftz.f32 %f1659, %f1563, %f1658; mov.b32 %r3010, %f1659; setp.eq.ftz.f32 %p282, %f329, 0fFF7FFFFF; selp.f32 %f1660, 0f00000000, %f329, %p282; sub.ftz.f32 %f1661, %f3069, %f1660; mul.ftz.f32 %f1662, %f1661, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3133, %f1662; sub.ftz.f32 %f1663, %f3068, %f1660; mul.ftz.f32 %f1664, %f1663, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3132, %f1664; sub.ftz.f32 %f1665, %f3067, %f1660; mul.ftz.f32 %f1666, %f1665, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3131, %f1666; sub.ftz.f32 %f1667, %f3066, %f1660; mul.ftz.f32 %f1668, %f1667, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3130, %f1668; sub.ftz.f32 %f1669, %f3065, %f1660; mul.ftz.f32 %f1670, %f1669, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3129, %f1670; sub.ftz.f32 %f1671, %f3064, %f1660; mul.ftz.f32 %f1672, %f1671, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3128, %f1672; sub.ftz.f32 %f1673, %f3063, %f1660; mul.ftz.f32 %f1674, %f1673, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3127, %f1674; sub.ftz.f32 %f1675, %f3062, %f1660; mul.ftz.f32 %f1676, %f1675, 0f3FB8AA3B; 
ex2.approx.ftz.f32 %f3126, %f1676; sub.ftz.f32 %f1677, %f3061, %f1660; mul.ftz.f32 %f1678, %f1677, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3125, %f1678; sub.ftz.f32 %f1679, %f3060, %f1660; mul.ftz.f32 %f1680, %f1679, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3124, %f1680; sub.ftz.f32 %f1681, %f3059, %f1660; mul.ftz.f32 %f1682, %f1681, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3123, %f1682; sub.ftz.f32 %f1683, %f3058, %f1660; mul.ftz.f32 %f1684, %f1683, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3122, %f1684; sub.ftz.f32 %f1685, %f3057, %f1660; mul.ftz.f32 %f1686, %f1685, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3121, %f1686; sub.ftz.f32 %f1687, %f3056, %f1660; mul.ftz.f32 %f1688, %f1687, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3120, %f1688; sub.ftz.f32 %f1689, %f3055, %f1660; mul.ftz.f32 %f1690, %f1689, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3119, %f1690; sub.ftz.f32 %f1691, %f3054, %f1660; mul.ftz.f32 %f1692, %f1691, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3118, %f1692; sub.ftz.f32 %f1693, %f3053, %f1660; mul.ftz.f32 %f1694, %f1693, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3117, %f1694; sub.ftz.f32 %f1695, %f3052, %f1660; mul.ftz.f32 %f1696, %f1695, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3116, %f1696; sub.ftz.f32 %f1697, %f3051, %f1660; mul.ftz.f32 %f1698, %f1697, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3115, %f1698; sub.ftz.f32 %f1699, %f3050, %f1660; mul.ftz.f32 %f1700, %f1699, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3114, %f1700; sub.ftz.f32 %f1701, %f3049, %f1660; mul.ftz.f32 %f1702, %f1701, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3113, %f1702; sub.ftz.f32 %f1703, %f3048, %f1660; mul.ftz.f32 %f1704, %f1703, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3112, %f1704; sub.ftz.f32 %f1705, %f3047, %f1660; mul.ftz.f32 %f1706, %f1705, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3111, %f1706; sub.ftz.f32 %f1707, %f3046, %f1660; mul.ftz.f32 %f1708, %f1707, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3110, %f1708; sub.ftz.f32 %f1709, %f3045, %f1660; mul.ftz.f32 %f1710, %f1709, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3109, %f1710; sub.ftz.f32 %f1711, %f3044, %f1660; mul.ftz.f32 %f1712, %f1711, 0f3FB8AA3B; 
ex2.approx.ftz.f32 %f3108, %f1712; sub.ftz.f32 %f1713, %f3043, %f1660; mul.ftz.f32 %f1714, %f1713, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3107, %f1714; sub.ftz.f32 %f1715, %f3042, %f1660; mul.ftz.f32 %f1716, %f1715, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3106, %f1716; sub.ftz.f32 %f1717, %f3041, %f1660; mul.ftz.f32 %f1718, %f1717, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3105, %f1718; sub.ftz.f32 %f1719, %f3040, %f1660; mul.ftz.f32 %f1720, %f1719, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3104, %f1720; sub.ftz.f32 %f1721, %f3039, %f1660; mul.ftz.f32 %f1722, %f1721, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3103, %f1722; sub.ftz.f32 %f1723, %f3038, %f1660; mul.ftz.f32 %f1724, %f1723, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3102, %f1724; setp.eq.ftz.f32 %p283, %f330, 0fFF7FFFFF; selp.f32 %f1725, 0f00000000, %f330, %p283; sub.ftz.f32 %f1726, %f3037, %f1725; mul.ftz.f32 %f1727, %f1726, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3101, %f1727; sub.ftz.f32 %f1728, %f3036, %f1725; mul.ftz.f32 %f1729, %f1728, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3100, %f1729; sub.ftz.f32 %f1730, %f3035, %f1725; mul.ftz.f32 %f1731, %f1730, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3099, %f1731; sub.ftz.f32 %f1732, %f3034, %f1725; mul.ftz.f32 %f1733, %f1732, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3098, %f1733; sub.ftz.f32 %f1734, %f3033, %f1725; mul.ftz.f32 %f1735, %f1734, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3097, %f1735; sub.ftz.f32 %f1736, %f3032, %f1725; mul.ftz.f32 %f1737, %f1736, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3096, %f1737; sub.ftz.f32 %f1738, %f3031, %f1725; mul.ftz.f32 %f1739, %f1738, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3095, %f1739; sub.ftz.f32 %f1740, %f3030, %f1725; mul.ftz.f32 %f1741, %f1740, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3094, %f1741; sub.ftz.f32 %f1742, %f3029, %f1725; mul.ftz.f32 %f1743, %f1742, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3093, %f1743; sub.ftz.f32 %f1744, %f3028, %f1725; mul.ftz.f32 %f1745, %f1744, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3092, %f1745; sub.ftz.f32 %f1746, %f3027, %f1725; mul.ftz.f32 %f1747, %f1746, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3091, 
%f1747; sub.ftz.f32 %f1748, %f3026, %f1725; mul.ftz.f32 %f1749, %f1748, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3090, %f1749; sub.ftz.f32 %f1750, %f3025, %f1725; mul.ftz.f32 %f1751, %f1750, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3089, %f1751; sub.ftz.f32 %f1752, %f3024, %f1725; mul.ftz.f32 %f1753, %f1752, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3088, %f1753; sub.ftz.f32 %f1754, %f3023, %f1725; mul.ftz.f32 %f1755, %f1754, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3087, %f1755; sub.ftz.f32 %f1756, %f3022, %f1725; mul.ftz.f32 %f1757, %f1756, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3086, %f1757; sub.ftz.f32 %f1758, %f3021, %f1725; mul.ftz.f32 %f1759, %f1758, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3085, %f1759; sub.ftz.f32 %f1760, %f3020, %f1725; mul.ftz.f32 %f1761, %f1760, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3084, %f1761; sub.ftz.f32 %f1762, %f3019, %f1725; mul.ftz.f32 %f1763, %f1762, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3083, %f1763; sub.ftz.f32 %f1764, %f3018, %f1725; mul.ftz.f32 %f1765, %f1764, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3082, %f1765; sub.ftz.f32 %f1766, %f3017, %f1725; mul.ftz.f32 %f1767, %f1766, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3081, %f1767; sub.ftz.f32 %f1768, %f3016, %f1725; mul.ftz.f32 %f1769, %f1768, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3080, %f1769; sub.ftz.f32 %f1770, %f3015, %f1725; mul.ftz.f32 %f1771, %f1770, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3079, %f1771; sub.ftz.f32 %f1772, %f3014, %f1725; mul.ftz.f32 %f1773, %f1772, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3078, %f1773; sub.ftz.f32 %f1774, %f3013, %f1725; mul.ftz.f32 %f1775, %f1774, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3077, %f1775; sub.ftz.f32 %f1776, %f3012, %f1725; mul.ftz.f32 %f1777, %f1776, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3076, %f1777; sub.ftz.f32 %f1778, %f3011, %f1725; mul.ftz.f32 %f1779, %f1778, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3075, %f1779; sub.ftz.f32 %f1780, %f3010, %f1725; mul.ftz.f32 %f1781, %f1780, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3074, %f1781; sub.ftz.f32 %f1782, %f3009, %f1725; mul.ftz.f32 %f1783, %f1782, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3073, 
%f1783; sub.ftz.f32 %f1784, %f3008, %f1725; mul.ftz.f32 %f1785, %f1784, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3072, %f1785; sub.ftz.f32 %f1786, %f3007, %f1725; mul.ftz.f32 %f1787, %f1786, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3071, %f1787; sub.ftz.f32 %f1788, %f3006, %f1725; mul.ftz.f32 %f1789, %f1788, 0f3FB8AA3B; ex2.approx.ftz.f32 %f3070, %f1789; add.ftz.f32 %f1790, %f3133, %f3132; add.ftz.f32 %f1791, %f1790, 0f00000000; add.ftz.f32 %f1792, %f3131, %f3130; add.ftz.f32 %f1793, %f1792, 0f00000000; add.ftz.f32 %f1794, %f3129, %f3128; add.ftz.f32 %f1795, %f1791, %f1794; add.ftz.f32 %f1796, %f3127, %f3126; add.ftz.f32 %f1797, %f1793, %f1796; add.ftz.f32 %f1798, %f3125, %f3124; add.ftz.f32 %f1799, %f1795, %f1798; add.ftz.f32 %f1800, %f3123, %f3122; add.ftz.f32 %f1801, %f1797, %f1800; add.ftz.f32 %f1802, %f3121, %f3120; add.ftz.f32 %f1803, %f1799, %f1802; add.ftz.f32 %f1804, %f3119, %f3118; add.ftz.f32 %f1805, %f1801, %f1804; add.ftz.f32 %f1806, %f3117, %f3116; add.ftz.f32 %f1807, %f1803, %f1806; add.ftz.f32 %f1808, %f3115, %f3114; add.ftz.f32 %f1809, %f1805, %f1808; add.ftz.f32 %f1810, %f3113, %f3112; add.ftz.f32 %f1811, %f1807, %f1810; add.ftz.f32 %f1812, %f3111, %f3110; add.ftz.f32 %f1813, %f1809, %f1812; add.ftz.f32 %f1814, %f3109, %f3108; add.ftz.f32 %f1815, %f1811, %f1814; add.ftz.f32 %f1816, %f3107, %f3106; add.ftz.f32 %f1817, %f1813, %f1816; add.ftz.f32 %f1818, %f3105, %f3104; add.ftz.f32 %f1819, %f1815, %f1818; add.ftz.f32 %f1820, %f3103, %f3102; add.ftz.f32 %f1821, %f1817, %f1820; add.ftz.f32 %f1822, %f1819, %f1821; add.ftz.f32 %f1823, %f3101, %f3100; add.ftz.f32 %f1824, %f1823, 0f00000000; add.ftz.f32 %f1825, %f3099, %f3098; add.ftz.f32 %f1826, %f1825, 0f00000000; add.ftz.f32 %f1827, %f3097, %f3096; add.ftz.f32 %f1828, %f1824, %f1827; add.ftz.f32 %f1829, %f3095, %f3094; add.ftz.f32 %f1830, %f1826, %f1829; add.ftz.f32 %f1831, %f3093, %f3092; add.ftz.f32 %f1832, %f1828, %f1831; add.ftz.f32 %f1833, %f3091, %f3090; add.ftz.f32 %f1834, %f1830, %f1833; add.ftz.f32 %f1835, 
%f3089, %f3088; add.ftz.f32 %f1836, %f1832, %f1835; add.ftz.f32 %f1837, %f3087, %f3086; add.ftz.f32 %f1838, %f1834, %f1837; add.ftz.f32 %f1839, %f3085, %f3084; add.ftz.f32 %f1840, %f1836, %f1839; add.ftz.f32 %f1841, %f3083, %f3082; add.ftz.f32 %f1842, %f1838, %f1841; add.ftz.f32 %f1843, %f3081, %f3080; add.ftz.f32 %f1844, %f1840, %f1843; add.ftz.f32 %f1845, %f3079, %f3078; add.ftz.f32 %f1846, %f1842, %f1845; add.ftz.f32 %f1847, %f3077, %f3076; add.ftz.f32 %f1848, %f1844, %f1847; add.ftz.f32 %f1849, %f3075, %f3074; add.ftz.f32 %f1850, %f1846, %f1849; add.ftz.f32 %f1851, %f3073, %f3072; add.ftz.f32 %f1852, %f1848, %f1851; add.ftz.f32 %f1853, %f3071, %f3070; add.ftz.f32 %f1854, %f1850, %f1853; add.ftz.f32 %f1855, %f1852, %f1854; mov.b32 %r1870, %f1822; shfl.sync.bfly.b32 %r1871|%p284, %r1870, %r1861, %r1860, %r1862; mov.b32 %f1856, %r1871; add.ftz.f32 %f1857, %f1822, %f1856; mov.b32 %r1872, %f1857; shfl.sync.bfly.b32 %r1873|%p285, %r1872, %r1865, %r1860, %r1862; mov.b32 %f1858, %r1873; add.ftz.f32 %f1859, %f1857, %f1858; mov.b32 %r1874, %f1855; shfl.sync.bfly.b32 %r1875|%p286, %r1874, %r1861, %r1860, %r1862; mov.b32 %f1860, %r1875; add.ftz.f32 %f1861, %f1855, %f1860; mov.b32 %r1876, %f1861; shfl.sync.bfly.b32 %r1877|%p287, %r1876, %r1865, %r1860, %r1862; mov.b32 %f1862, %r1877; add.ftz.f32 %f1863, %f1861, %f1862; fma.rn.ftz.f32 %f3005, %f1560, %f3005, %f1859; fma.rn.ftz.f32 %f3004, %f1563, %f3004, %f1863; mov.f32 %f3002, %f330; mov.f32 %f3003, %f329; $L__BB0_16: shl.b32 %r2996, %r475, 4; and.b32 %r2995, %r475, 16; and.b32 %r2994, %r2996, 112; xor.b32 %r2993, %r2994, %r2995; shl.b64 %rd162, %rd10, 3; add.s32 %r2992, %r17, 56; add.s32 %r2991, %r17, 48; add.s32 %r2990, %r17, 40; add.s32 %r2989, %r17, 32; add.s32 %r2988, %r17, 24; add.s32 %r2987, %r17, 16; add.s32 %r2986, %r17, 8; setp.lt.s32 %p360, %r16, 12; // begin inline asm cvt.rn.f16x2.f32 %r1896, %f3132, %f3133; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1897, %f3100, %f3101; // end inline asm // 
begin inline asm cvt.rn.f16x2.f32 %r1898, %f3130, %f3131; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1899, %f3098, %f3099; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1900, %f3128, %f3129; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1901, %f3096, %f3097; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1902, %f3126, %f3127; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1903, %f3094, %f3095; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1904, %f3124, %f3125; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1905, %f3092, %f3093; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1906, %f3122, %f3123; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1907, %f3090, %f3091; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1908, %f3120, %f3121; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1909, %f3088, %f3089; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1910, %f3118, %f3119; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1911, %f3086, %f3087; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1912, %f3116, %f3117; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1913, %f3084, %f3085; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1914, %f3114, %f3115; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1915, %f3082, %f3083; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1916, %f3112, %f3113; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1917, %f3080, %f3081; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1918, %f3110, %f3111; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1919, %f3078, %f3079; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1920, %f3108, %f3109; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1921, %f3076, %f3077; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1922, %f3106, %f3107; // end inline asm // begin inline asm 
cvt.rn.f16x2.f32 %r1923, %f3074, %f3075; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1924, %f3104, %f3105; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1925, %f3072, %f3073; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1926, %f3102, %f3103; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1927, %f3070, %f3071; // end inline asm shl.b64 %rd105, %rd10, 6; add.s64 %rd177, %rd177, %rd105; setp.gt.s32 %p298, %r3069, 16383; selp.b32 %r2352, -16384, 16384, %p298; add.s32 %r3068, %r3068, -64; min.s32 %r2353, %r3068, 64; setp.lt.s32 %p299, %r17, %r2353; and.pred %p301, %p299, %p360; setp.lt.s32 %p302, %r2986, %r2353; and.pred %p303, %p302, %p360; setp.lt.s32 %p304, %r2987, %r2353; and.pred %p305, %p304, %p360; setp.lt.s32 %p306, %r2988, %r2353; and.pred %p307, %p306, %p360; setp.lt.s32 %p308, %r2989, %r2353; and.pred %p309, %p308, %p360; setp.lt.s32 %p310, %r2990, %r2353; and.pred %p311, %p310, %p360; setp.lt.s32 %p312, %r2991, %r2353; and.pred %p313, %p312, %p360; setp.lt.s32 %p314, %r2992, %r2353; and.pred %p315, %p314, %p360; add.s32 %r3069, %r2352, %r3069; selp.b32 %r1939, 16, 0, %p311; add.s32 %r1928, %r79, %r3069; add.s32 %r1930, %r1928, 2048; add.s32 %r1932, %r1928, 4096; add.s32 %r1934, %r1928, 6144; add.s32 %r1936, %r1928, 8192; add.s32 %r1938, %r1928, 10240; add.s32 %r1940, %r1928, 12288; add.s32 %r1942, %r1928, 14336; selp.b32 %r1929, 16, 0, %p301; // begin inline asm cp.async.cg.shared.global [%r1928], [%rd177], 16, %r1929; // end inline asm selp.b32 %r1931, 16, 0, %p303; add.s64 %rd98, %rd177, %rd162; // begin inline asm cp.async.cg.shared.global [%r1930], [%rd98], 16, %r1931; // end inline asm selp.b32 %r1933, 16, 0, %p305; add.s64 %rd99, %rd98, %rd162; // begin inline asm cp.async.cg.shared.global [%r1932], [%rd99], 16, %r1933; // end inline asm selp.b32 %r1935, 16, 0, %p307; add.s64 %rd100, %rd99, %rd162; // begin inline asm cp.async.cg.shared.global [%r1934], [%rd100], 16, %r1935; // end inline asm selp.b32 
%r1937, 16, 0, %p309; add.s64 %rd101, %rd100, %rd162; // begin inline asm cp.async.cg.shared.global [%r1936], [%rd101], 16, %r1937; // end inline asm add.s64 %rd102, %rd101, %rd162; // begin inline asm cp.async.cg.shared.global [%r1938], [%rd102], 16, %r1939; // end inline asm selp.b32 %r1941, 16, 0, %p313; add.s64 %rd103, %rd102, %rd162; // begin inline asm cp.async.cg.shared.global [%r1940], [%rd103], 16, %r1941; // end inline asm selp.b32 %r1943, 16, 0, %p315; add.s64 %rd104, %rd103, %rd162; // begin inline asm cp.async.cg.shared.global [%r1942], [%rd104], 16, %r1943; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin inline asm cp.async.wait_group 1; // end inline asm bar.sync 0; shl.b32 %r2366, %r475, 8; and.b32 %r2367, %r2366, 3840; or.b32 %r332, %r2993, %r2367; add.s32 %r2369, %r3066, %r619; add.s32 %r2370, %r2369, 49152; add.s32 %r1948, %r2370, %r332; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1944, %r1945, %r1946, %r1947}, [%r1948]; // end inline asm xor.b32 %r333, %r332, 32; add.s32 %r1953, %r2370, %r333; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1949, %r1950, %r1951, %r1952}, [%r1953]; // end inline asm xor.b32 %r334, %r332, 64; add.s32 %r1958, %r2370, %r334; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1954, %r1955, %r1956, %r1957}, [%r1958]; // end inline asm xor.b32 %r335, %r332, 96; add.s32 %r1963, %r2370, %r335; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1959, %r1960, %r1961, %r1962}, [%r1963]; // end inline asm or.b32 %r336, %r332, 128; add.s32 %r1968, %r2370, %r336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1964, %r1965, %r1966, %r1967}, [%r1968]; // end inline asm xor.b32 %r337, %r332, 160; add.s32 %r1973, %r2370, %r337; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1969, %r1970, %r1971, %r1972}, [%r1973]; // end inline asm mov.b32 %f2235, %r3054; 
mov.b32 %f2234, %r3055; mov.b32 %f2233, %r3056; mov.b32 %f2232, %r3057; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2232, %f2233, %f2234, %f2235}, {%r1896, %r1897, %r1898, %r1899}, {%r1944, %r1945}, {%f2232, %f2233, %f2234, %f2235}; // end inline asm mov.b32 %f2243, %r3050; mov.b32 %f2242, %r3051; mov.b32 %f2241, %r3052; mov.b32 %f2240, %r3053; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2240, %f2241, %f2242, %f2243}, {%r1896, %r1897, %r1898, %r1899}, {%r1946, %r1947}, {%f2240, %f2241, %f2242, %f2243}; // end inline asm mov.b32 %f2251, %r3046; mov.b32 %f2250, %r3047; mov.b32 %f2249, %r3048; mov.b32 %f2248, %r3049; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2248, %f2249, %f2250, %f2251}, {%r1896, %r1897, %r1898, %r1899}, {%r1949, %r1950}, {%f2248, %f2249, %f2250, %f2251}; // end inline asm mov.b32 %f2259, %r3042; mov.b32 %f2258, %r3043; mov.b32 %f2257, %r3044; mov.b32 %f2256, %r3045; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2256, %f2257, %f2258, %f2259}, {%r1896, %r1897, %r1898, %r1899}, {%r1951, %r1952}, {%f2256, %f2257, %f2258, %f2259}; // end inline asm mov.b32 %f2267, %r3038; mov.b32 %f2266, %r3039; mov.b32 %f2265, %r3040; mov.b32 %f2264, %r3041; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2264, %f2265, %f2266, %f2267}, {%r1896, %r1897, %r1898, %r1899}, {%r1954, %r1955}, {%f2264, %f2265, %f2266, %f2267}; // end inline asm mov.b32 %f2275, %r3034; mov.b32 %f2274, %r3035; mov.b32 %f2273, %r3036; mov.b32 %f2272, %r3037; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2272, %f2273, %f2274, %f2275}, {%r1896, %r1897, %r1898, %r1899}, {%r1956, %r1957}, {%f2272, %f2273, %f2274, %f2275}; // end inline asm mov.b32 %f2283, %r3030; mov.b32 %f2282, %r3031; mov.b32 %f2281, %r3032; mov.b32 %f2280, %r3033; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2280, %f2281, %f2282, %f2283}, 
{%r1896, %r1897, %r1898, %r1899}, {%r1959, %r1960}, {%f2280, %f2281, %f2282, %f2283}; // end inline asm mov.b32 %f2291, %r3026; mov.b32 %f2290, %r3027; mov.b32 %f2289, %r3028; mov.b32 %f2288, %r3029; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2288, %f2289, %f2290, %f2291}, {%r1896, %r1897, %r1898, %r1899}, {%r1961, %r1962}, {%f2288, %f2289, %f2290, %f2291}; // end inline asm mov.b32 %f2299, %r3022; mov.b32 %f2298, %r3023; mov.b32 %f2297, %r3024; mov.b32 %f2296, %r3025; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2296, %f2297, %f2298, %f2299}, {%r1896, %r1897, %r1898, %r1899}, {%r1964, %r1965}, {%f2296, %f2297, %f2298, %f2299}; // end inline asm mov.b32 %f2307, %r3018; mov.b32 %f2306, %r3019; mov.b32 %f2305, %r3020; mov.b32 %f2304, %r3021; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2304, %f2305, %f2306, %f2307}, {%r1896, %r1897, %r1898, %r1899}, {%r1966, %r1967}, {%f2304, %f2305, %f2306, %f2307}; // end inline asm mov.b32 %f2315, %r3014; mov.b32 %f2314, %r3015; mov.b32 %f2313, %r3016; mov.b32 %f2312, %r3017; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2312, %f2313, %f2314, %f2315}, {%r1896, %r1897, %r1898, %r1899}, {%r1969, %r1970}, {%f2312, %f2313, %f2314, %f2315}; // end inline asm mov.b32 %f2323, %r3010; mov.b32 %f2322, %r3011; mov.b32 %f2321, %r3012; mov.b32 %f2320, %r3013; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2320, %f2321, %f2322, %f2323}, {%r1896, %r1897, %r1898, %r1899}, {%r1971, %r1972}, {%f2320, %f2321, %f2322, %f2323}; // end inline asm add.s32 %r2371, %r2369, 53248; add.s32 %r2050, %r2371, %r332; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2046, %r2047, %r2048, %r2049}, [%r2050]; // end inline asm add.s32 %r2055, %r2371, %r333; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2051, %r2052, %r2053, %r2054}, [%r2055]; // end inline asm add.s32 %r2060, %r2371, 
%r334; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2056, %r2057, %r2058, %r2059}, [%r2060]; // end inline asm add.s32 %r2065, %r2371, %r335; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2061, %r2062, %r2063, %r2064}, [%r2065]; // end inline asm add.s32 %r2070, %r2371, %r336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2066, %r2067, %r2068, %r2069}, [%r2070]; // end inline asm add.s32 %r2075, %r2371, %r337; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2071, %r2072, %r2073, %r2074}, [%r2075]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2232, %f2233, %f2234, %f2235}, {%r1900, %r1901, %r1902, %r1903}, {%r2046, %r2047}, {%f2232, %f2233, %f2234, %f2235}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2240, %f2241, %f2242, %f2243}, {%r1900, %r1901, %r1902, %r1903}, {%r2048, %r2049}, {%f2240, %f2241, %f2242, %f2243}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2248, %f2249, %f2250, %f2251}, {%r1900, %r1901, %r1902, %r1903}, {%r2051, %r2052}, {%f2248, %f2249, %f2250, %f2251}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2256, %f2257, %f2258, %f2259}, {%r1900, %r1901, %r1902, %r1903}, {%r2053, %r2054}, {%f2256, %f2257, %f2258, %f2259}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2264, %f2265, %f2266, %f2267}, {%r1900, %r1901, %r1902, %r1903}, {%r2056, %r2057}, {%f2264, %f2265, %f2266, %f2267}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2272, %f2273, %f2274, %f2275}, {%r1900, %r1901, %r1902, %r1903}, {%r2058, %r2059}, {%f2272, %f2273, %f2274, %f2275}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2280, %f2281, %f2282, %f2283}, {%r1900, %r1901, %r1902, 
%r1903}, {%r2061, %r2062}, {%f2280, %f2281, %f2282, %f2283}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2288, %f2289, %f2290, %f2291}, {%r1900, %r1901, %r1902, %r1903}, {%r2063, %r2064}, {%f2288, %f2289, %f2290, %f2291}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2296, %f2297, %f2298, %f2299}, {%r1900, %r1901, %r1902, %r1903}, {%r2066, %r2067}, {%f2296, %f2297, %f2298, %f2299}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2304, %f2305, %f2306, %f2307}, {%r1900, %r1901, %r1902, %r1903}, {%r2068, %r2069}, {%f2304, %f2305, %f2306, %f2307}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2312, %f2313, %f2314, %f2315}, {%r1900, %r1901, %r1902, %r1903}, {%r2071, %r2072}, {%f2312, %f2313, %f2314, %f2315}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2320, %f2321, %f2322, %f2323}, {%r1900, %r1901, %r1902, %r1903}, {%r2073, %r2074}, {%f2320, %f2321, %f2322, %f2323}; // end inline asm add.s32 %r2372, %r2369, 57344; add.s32 %r2152, %r2372, %r332; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2148, %r2149, %r2150, %r2151}, [%r2152]; // end inline asm add.s32 %r2157, %r2372, %r333; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2153, %r2154, %r2155, %r2156}, [%r2157]; // end inline asm add.s32 %r2162, %r2372, %r334; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2158, %r2159, %r2160, %r2161}, [%r2162]; // end inline asm add.s32 %r2167, %r2372, %r335; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2163, %r2164, %r2165, %r2166}, [%r2167]; // end inline asm add.s32 %r2172, %r2372, %r336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2168, %r2169, %r2170, %r2171}, [%r2172]; // end inline asm add.s32 %r2177, %r2372, %r337; // begin inline 
asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2173, %r2174, %r2175, %r2176}, [%r2177]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2232, %f2233, %f2234, %f2235}, {%r1904, %r1905, %r1906, %r1907}, {%r2148, %r2149}, {%f2232, %f2233, %f2234, %f2235}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2240, %f2241, %f2242, %f2243}, {%r1904, %r1905, %r1906, %r1907}, {%r2150, %r2151}, {%f2240, %f2241, %f2242, %f2243}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2248, %f2249, %f2250, %f2251}, {%r1904, %r1905, %r1906, %r1907}, {%r2153, %r2154}, {%f2248, %f2249, %f2250, %f2251}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2256, %f2257, %f2258, %f2259}, {%r1904, %r1905, %r1906, %r1907}, {%r2155, %r2156}, {%f2256, %f2257, %f2258, %f2259}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2264, %f2265, %f2266, %f2267}, {%r1904, %r1905, %r1906, %r1907}, {%r2158, %r2159}, {%f2264, %f2265, %f2266, %f2267}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2272, %f2273, %f2274, %f2275}, {%r1904, %r1905, %r1906, %r1907}, {%r2160, %r2161}, {%f2272, %f2273, %f2274, %f2275}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2280, %f2281, %f2282, %f2283}, {%r1904, %r1905, %r1906, %r1907}, {%r2163, %r2164}, {%f2280, %f2281, %f2282, %f2283}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2288, %f2289, %f2290, %f2291}, {%r1904, %r1905, %r1906, %r1907}, {%r2165, %r2166}, {%f2288, %f2289, %f2290, %f2291}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2296, %f2297, %f2298, %f2299}, {%r1904, %r1905, %r1906, %r1907}, {%r2168, %r2169}, {%f2296, %f2297, %f2298, %f2299}; // end inline asm // begin 
inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2304, %f2305, %f2306, %f2307}, {%r1904, %r1905, %r1906, %r1907}, {%r2170, %r2171}, {%f2304, %f2305, %f2306, %f2307}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2312, %f2313, %f2314, %f2315}, {%r1904, %r1905, %r1906, %r1907}, {%r2173, %r2174}, {%f2312, %f2313, %f2314, %f2315}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2320, %f2321, %f2322, %f2323}, {%r1904, %r1905, %r1906, %r1907}, {%r2175, %r2176}, {%f2320, %f2321, %f2322, %f2323}; // end inline asm add.s32 %r2373, %r2369, 61440; add.s32 %r2254, %r2373, %r332; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2250, %r2251, %r2252, %r2253}, [%r2254]; // end inline asm add.s32 %r2259, %r2373, %r333; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2255, %r2256, %r2257, %r2258}, [%r2259]; // end inline asm add.s32 %r2264, %r2373, %r334; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2260, %r2261, %r2262, %r2263}, [%r2264]; // end inline asm add.s32 %r2269, %r2373, %r335; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2265, %r2266, %r2267, %r2268}, [%r2269]; // end inline asm add.s32 %r2274, %r2373, %r336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2270, %r2271, %r2272, %r2273}, [%r2274]; // end inline asm add.s32 %r2279, %r2373, %r337; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2275, %r2276, %r2277, %r2278}, [%r2279]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2232, %f2233, %f2234, %f2235}, {%r1908, %r1909, %r1910, %r1911}, {%r2250, %r2251}, {%f2232, %f2233, %f2234, %f2235}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2240, %f2241, %f2242, %f2243}, {%r1908, %r1909, %r1910, %r1911}, {%r2252, %r2253}, {%f2240, %f2241, %f2242, 
%f2243}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2248, %f2249, %f2250, %f2251}, {%r1908, %r1909, %r1910, %r1911}, {%r2255, %r2256}, {%f2248, %f2249, %f2250, %f2251}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2256, %f2257, %f2258, %f2259}, {%r1908, %r1909, %r1910, %r1911}, {%r2257, %r2258}, {%f2256, %f2257, %f2258, %f2259}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2264, %f2265, %f2266, %f2267}, {%r1908, %r1909, %r1910, %r1911}, {%r2260, %r2261}, {%f2264, %f2265, %f2266, %f2267}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2272, %f2273, %f2274, %f2275}, {%r1908, %r1909, %r1910, %r1911}, {%r2262, %r2263}, {%f2272, %f2273, %f2274, %f2275}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2280, %f2281, %f2282, %f2283}, {%r1908, %r1909, %r1910, %r1911}, {%r2265, %r2266}, {%f2280, %f2281, %f2282, %f2283}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2288, %f2289, %f2290, %f2291}, {%r1908, %r1909, %r1910, %r1911}, {%r2267, %r2268}, {%f2288, %f2289, %f2290, %f2291}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2296, %f2297, %f2298, %f2299}, {%r1908, %r1909, %r1910, %r1911}, {%r2270, %r2271}, {%f2296, %f2297, %f2298, %f2299}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2304, %f2305, %f2306, %f2307}, {%r1908, %r1909, %r1910, %r1911}, {%r2272, %r2273}, {%f2304, %f2305, %f2306, %f2307}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2312, %f2313, %f2314, %f2315}, {%r1908, %r1909, %r1910, %r1911}, {%r2275, %r2276}, {%f2312, %f2313, %f2314, %f2315}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2320, %f2321, %f2322, 
%f2323}, {%r1908, %r1909, %r1910, %r1911}, {%r2277, %r2278}, {%f2320, %f2321, %f2322, %f2323}; // end inline asm bar.sync 0; add.s32 %r3060, %r3060, 128; setp.lt.s32 %p316, %r3060, %r23; @%p316 bra $L__BB0_18; bra.uni $L__BB0_17; $L__BB0_18: shl.b64 %rd169, %rd6, 7; mov.u32 %r2390, 31; mov.u32 %r2391, 0; mov.u32 %r2392, 2; mov.u32 %r2393, -1; shfl.sync.idx.b32 %r2394|%p317, %r2392, %r2391, %r2390, %r2393; shl.b32 %r2395, %r2394, 7; neg.s32 %r2396, %r2395; cvt.s64.s32 %rd115, %r2396; add.s64 %rd117, %rd169, %rd115; add.s64 %rd118, %rd171, %rd117; add.s64 %rd171, %rd118, 128; cvt.s64.s32 %rd119, %r2395; add.s64 %rd120, %rd172, 256; sub.s64 %rd172, %rd120, %rd119; setp.gt.s32 %p318, %r3063, 16383; selp.b32 %r2397, -16384, 16384, %p318; add.s32 %r3059, %r3059, -128; min.s32 %r2398, %r3059, 128; setp.lt.s64 %p319, %rd172, 192; setp.lt.s32 %p320, %r9, %r2398; and.pred %p321, %p320, %p319; setp.lt.s32 %p322, %r621, %r2398; and.pred %p323, %p322, %p319; setp.lt.s32 %p324, %r622, %r2398; and.pred %p325, %p324, %p319; setp.lt.s32 %p326, %r623, %r2398; and.pred %p327, %p326, %p319; setp.lt.s32 %p328, %r625, %r2398; and.pred %p329, %p328, %p319; setp.lt.s32 %p330, %r626, %r2398; and.pred %p331, %p330, %p319; setp.lt.s32 %p332, %r627, %r2398; and.pred %p333, %p332, %p319; setp.lt.s32 %p334, %r628, %r2398; and.pred %p335, %p334, %p319; add.s32 %r3063, %r2397, %r3063; selp.b32 %r2385, 16, 0, %p331; add.s32 %r2374, %r30, %r3063; add.s32 %r2376, %r2374, 2048; add.s32 %r2378, %r2374, 4096; add.s32 %r2380, %r2374, 6144; add.s32 %r2382, %r2374, 8192; add.s32 %r2384, %r2374, 10240; add.s32 %r2386, %r2374, 12288; add.s32 %r2388, %r2374, 14336; selp.b32 %r2375, 16, 0, %p321; // begin inline asm cp.async.cg.shared.global [%r2374], [%rd171], 16, %r2375; // end inline asm selp.b32 %r2377, 16, 0, %p323; add.s64 %rd108, %rd171, %rd68; // begin inline asm cp.async.cg.shared.global [%r2376], [%rd108], 16, %r2377; // end inline asm selp.b32 %r2379, 16, 0, %p325; add.s64 %rd109, %rd108, %rd68; // 
begin inline asm cp.async.cg.shared.global [%r2378], [%rd109], 16, %r2379; // end inline asm selp.b32 %r2381, 16, 0, %p327; add.s64 %rd110, %rd109, %rd68; // begin inline asm cp.async.cg.shared.global [%r2380], [%rd110], 16, %r2381; // end inline asm selp.b32 %r2383, 16, 0, %p329; add.s64 %rd111, %rd110, %rd68; // begin inline asm cp.async.cg.shared.global [%r2382], [%rd111], 16, %r2383; // end inline asm add.s64 %rd112, %rd111, %rd68; // begin inline asm cp.async.cg.shared.global [%r2384], [%rd112], 16, %r2385; // end inline asm selp.b32 %r2387, 16, 0, %p333; add.s64 %rd113, %rd112, %rd68; // begin inline asm cp.async.cg.shared.global [%r2386], [%rd113], 16, %r2387; // end inline asm selp.b32 %r2389, 16, 0, %p335; add.s64 %rd114, %rd113, %rd68; // begin inline asm cp.async.cg.shared.global [%r2388], [%rd114], 16, %r2389; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin inline asm cp.async.wait_group 1; // end inline asm bar.sync 0; bra.uni $L__BB0_19; $L__BB0_17: // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s64 %rd172, %rd172, 128; $L__BB0_19: setp.gt.s32 %p336, %r3066, 16383; selp.b32 %r2814, -16384, 16384, %p336; add.s32 %r2815, %r2814, %r3066; add.s32 %r2817, %r2815, %r619; add.s32 %r2818, %r2817, 49152; add.s32 %r2410, %r2818, %r332; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2406, %r2407, %r2408, %r2409}, [%r2410]; // end inline asm add.s32 %r2415, %r2818, %r333; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2411, %r2412, %r2413, %r2414}, [%r2415]; // end inline asm add.s32 %r2420, %r2818, %r334; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2416, %r2417, %r2418, %r2419}, [%r2420]; // end inline asm add.s32 %r2425, %r2818, %r335; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2421, %r2422, %r2423, %r2424}, [%r2425]; // end inline asm add.s32 %r2430, %r2818, %r336; // begin inline asm 
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2426, %r2427, %r2428, %r2429}, [%r2430]; // end inline asm add.s32 %r2435, %r2818, %r337; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2431, %r2432, %r2433, %r2434}, [%r2435]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2232, %f2233, %f2234, %f2235}, {%r1912, %r1913, %r1914, %r1915}, {%r2406, %r2407}, {%f2232, %f2233, %f2234, %f2235}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2240, %f2241, %f2242, %f2243}, {%r1912, %r1913, %r1914, %r1915}, {%r2408, %r2409}, {%f2240, %f2241, %f2242, %f2243}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2248, %f2249, %f2250, %f2251}, {%r1912, %r1913, %r1914, %r1915}, {%r2411, %r2412}, {%f2248, %f2249, %f2250, %f2251}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2256, %f2257, %f2258, %f2259}, {%r1912, %r1913, %r1914, %r1915}, {%r2413, %r2414}, {%f2256, %f2257, %f2258, %f2259}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2264, %f2265, %f2266, %f2267}, {%r1912, %r1913, %r1914, %r1915}, {%r2416, %r2417}, {%f2264, %f2265, %f2266, %f2267}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2272, %f2273, %f2274, %f2275}, {%r1912, %r1913, %r1914, %r1915}, {%r2418, %r2419}, {%f2272, %f2273, %f2274, %f2275}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2280, %f2281, %f2282, %f2283}, {%r1912, %r1913, %r1914, %r1915}, {%r2421, %r2422}, {%f2280, %f2281, %f2282, %f2283}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2288, %f2289, %f2290, %f2291}, {%r1912, %r1913, %r1914, %r1915}, {%r2423, %r2424}, {%f2288, %f2289, %f2290, %f2291}; // end inline asm // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2296, %f2297, %f2298, %f2299}, {%r1912, %r1913, %r1914, %r1915}, {%r2426, %r2427}, {%f2296, %f2297, %f2298, %f2299}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2304, %f2305, %f2306, %f2307}, {%r1912, %r1913, %r1914, %r1915}, {%r2428, %r2429}, {%f2304, %f2305, %f2306, %f2307}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2312, %f2313, %f2314, %f2315}, {%r1912, %r1913, %r1914, %r1915}, {%r2431, %r2432}, {%f2312, %f2313, %f2314, %f2315}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2320, %f2321, %f2322, %f2323}, {%r1912, %r1913, %r1914, %r1915}, {%r2433, %r2434}, {%f2320, %f2321, %f2322, %f2323}; // end inline asm add.s32 %r2819, %r2817, 53248; add.s32 %r2512, %r2819, %r332; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2508, %r2509, %r2510, %r2511}, [%r2512]; // end inline asm add.s32 %r2517, %r2819, %r333; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2513, %r2514, %r2515, %r2516}, [%r2517]; // end inline asm add.s32 %r2522, %r2819, %r334; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2518, %r2519, %r2520, %r2521}, [%r2522]; // end inline asm add.s32 %r2527, %r2819, %r335; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2523, %r2524, %r2525, %r2526}, [%r2527]; // end inline asm add.s32 %r2532, %r2819, %r336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2528, %r2529, %r2530, %r2531}, [%r2532]; // end inline asm add.s32 %r2537, %r2819, %r337; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2533, %r2534, %r2535, %r2536}, [%r2537]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2232, %f2233, %f2234, %f2235}, {%r1916, %r1917, %r1918, %r1919}, {%r2508, %r2509}, {%f2232, %f2233, %f2234, %f2235}; // end 
inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2240, %f2241, %f2242, %f2243}, {%r1916, %r1917, %r1918, %r1919}, {%r2510, %r2511}, {%f2240, %f2241, %f2242, %f2243}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2248, %f2249, %f2250, %f2251}, {%r1916, %r1917, %r1918, %r1919}, {%r2513, %r2514}, {%f2248, %f2249, %f2250, %f2251}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2256, %f2257, %f2258, %f2259}, {%r1916, %r1917, %r1918, %r1919}, {%r2515, %r2516}, {%f2256, %f2257, %f2258, %f2259}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2264, %f2265, %f2266, %f2267}, {%r1916, %r1917, %r1918, %r1919}, {%r2518, %r2519}, {%f2264, %f2265, %f2266, %f2267}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2272, %f2273, %f2274, %f2275}, {%r1916, %r1917, %r1918, %r1919}, {%r2520, %r2521}, {%f2272, %f2273, %f2274, %f2275}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2280, %f2281, %f2282, %f2283}, {%r1916, %r1917, %r1918, %r1919}, {%r2523, %r2524}, {%f2280, %f2281, %f2282, %f2283}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2288, %f2289, %f2290, %f2291}, {%r1916, %r1917, %r1918, %r1919}, {%r2525, %r2526}, {%f2288, %f2289, %f2290, %f2291}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2296, %f2297, %f2298, %f2299}, {%r1916, %r1917, %r1918, %r1919}, {%r2528, %r2529}, {%f2296, %f2297, %f2298, %f2299}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2304, %f2305, %f2306, %f2307}, {%r1916, %r1917, %r1918, %r1919}, {%r2530, %r2531}, {%f2304, %f2305, %f2306, %f2307}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2312, %f2313, %f2314, %f2315}, {%r1916, 
%r1917, %r1918, %r1919}, {%r2533, %r2534}, {%f2312, %f2313, %f2314, %f2315}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2320, %f2321, %f2322, %f2323}, {%r1916, %r1917, %r1918, %r1919}, {%r2535, %r2536}, {%f2320, %f2321, %f2322, %f2323}; // end inline asm add.s32 %r2820, %r2817, 57344; add.s32 %r2614, %r2820, %r332; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2610, %r2611, %r2612, %r2613}, [%r2614]; // end inline asm add.s32 %r2619, %r2820, %r333; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2615, %r2616, %r2617, %r2618}, [%r2619]; // end inline asm add.s32 %r2624, %r2820, %r334; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2620, %r2621, %r2622, %r2623}, [%r2624]; // end inline asm add.s32 %r2629, %r2820, %r335; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2625, %r2626, %r2627, %r2628}, [%r2629]; // end inline asm add.s32 %r2634, %r2820, %r336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2630, %r2631, %r2632, %r2633}, [%r2634]; // end inline asm add.s32 %r2639, %r2820, %r337; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2635, %r2636, %r2637, %r2638}, [%r2639]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2232, %f2233, %f2234, %f2235}, {%r1920, %r1921, %r1922, %r1923}, {%r2610, %r2611}, {%f2232, %f2233, %f2234, %f2235}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2240, %f2241, %f2242, %f2243}, {%r1920, %r1921, %r1922, %r1923}, {%r2612, %r2613}, {%f2240, %f2241, %f2242, %f2243}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2248, %f2249, %f2250, %f2251}, {%r1920, %r1921, %r1922, %r1923}, {%r2615, %r2616}, {%f2248, %f2249, %f2250, %f2251}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2256, 
%f2257, %f2258, %f2259}, {%r1920, %r1921, %r1922, %r1923}, {%r2617, %r2618}, {%f2256, %f2257, %f2258, %f2259}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2264, %f2265, %f2266, %f2267}, {%r1920, %r1921, %r1922, %r1923}, {%r2620, %r2621}, {%f2264, %f2265, %f2266, %f2267}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2272, %f2273, %f2274, %f2275}, {%r1920, %r1921, %r1922, %r1923}, {%r2622, %r2623}, {%f2272, %f2273, %f2274, %f2275}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2280, %f2281, %f2282, %f2283}, {%r1920, %r1921, %r1922, %r1923}, {%r2625, %r2626}, {%f2280, %f2281, %f2282, %f2283}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2288, %f2289, %f2290, %f2291}, {%r1920, %r1921, %r1922, %r1923}, {%r2627, %r2628}, {%f2288, %f2289, %f2290, %f2291}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2296, %f2297, %f2298, %f2299}, {%r1920, %r1921, %r1922, %r1923}, {%r2630, %r2631}, {%f2296, %f2297, %f2298, %f2299}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2304, %f2305, %f2306, %f2307}, {%r1920, %r1921, %r1922, %r1923}, {%r2632, %r2633}, {%f2304, %f2305, %f2306, %f2307}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2312, %f2313, %f2314, %f2315}, {%r1920, %r1921, %r1922, %r1923}, {%r2635, %r2636}, {%f2312, %f2313, %f2314, %f2315}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2320, %f2321, %f2322, %f2323}, {%r1920, %r1921, %r1922, %r1923}, {%r2637, %r2638}, {%f2320, %f2321, %f2322, %f2323}; // end inline asm add.s32 %r2821, %r2817, 61440; add.s32 %r2716, %r2821, %r332; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2712, %r2713, %r2714, %r2715}, [%r2716]; // end inline asm add.s32 %r2721, 
%r2821, %r333; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2717, %r2718, %r2719, %r2720}, [%r2721]; // end inline asm add.s32 %r2726, %r2821, %r334; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2722, %r2723, %r2724, %r2725}, [%r2726]; // end inline asm add.s32 %r2731, %r2821, %r335; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2727, %r2728, %r2729, %r2730}, [%r2731]; // end inline asm add.s32 %r2736, %r2821, %r336; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2732, %r2733, %r2734, %r2735}, [%r2736]; // end inline asm add.s32 %r2741, %r2821, %r337; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2737, %r2738, %r2739, %r2740}, [%r2741]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2232, %f2233, %f2234, %f2235}, {%r1924, %r1925, %r1926, %r1927}, {%r2712, %r2713}, {%f2232, %f2233, %f2234, %f2235}; // end inline asm mov.b32 %r3057, %f2232; mov.b32 %r3056, %f2233; mov.b32 %r3055, %f2234; mov.b32 %r3054, %f2235; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2240, %f2241, %f2242, %f2243}, {%r1924, %r1925, %r1926, %r1927}, {%r2714, %r2715}, {%f2240, %f2241, %f2242, %f2243}; // end inline asm mov.b32 %r3053, %f2240; mov.b32 %r3052, %f2241; mov.b32 %r3051, %f2242; mov.b32 %r3050, %f2243; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2248, %f2249, %f2250, %f2251}, {%r1924, %r1925, %r1926, %r1927}, {%r2717, %r2718}, {%f2248, %f2249, %f2250, %f2251}; // end inline asm mov.b32 %r3049, %f2248; mov.b32 %r3048, %f2249; mov.b32 %r3047, %f2250; mov.b32 %r3046, %f2251; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2256, %f2257, %f2258, %f2259}, {%r1924, %r1925, %r1926, %r1927}, {%r2719, %r2720}, {%f2256, %f2257, %f2258, %f2259}; // end inline asm mov.b32 %r3045, %f2256; mov.b32 %r3044, %f2257; mov.b32 %r3043, %f2258; mov.b32 %r3042, %f2259; 
// begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2264, %f2265, %f2266, %f2267}, {%r1924, %r1925, %r1926, %r1927}, {%r2722, %r2723}, {%f2264, %f2265, %f2266, %f2267}; // end inline asm mov.b32 %r3041, %f2264; mov.b32 %r3040, %f2265; mov.b32 %r3039, %f2266; mov.b32 %r3038, %f2267; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2272, %f2273, %f2274, %f2275}, {%r1924, %r1925, %r1926, %r1927}, {%r2724, %r2725}, {%f2272, %f2273, %f2274, %f2275}; // end inline asm mov.b32 %r3037, %f2272; mov.b32 %r3036, %f2273; mov.b32 %r3035, %f2274; mov.b32 %r3034, %f2275; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2280, %f2281, %f2282, %f2283}, {%r1924, %r1925, %r1926, %r1927}, {%r2727, %r2728}, {%f2280, %f2281, %f2282, %f2283}; // end inline asm mov.b32 %r3033, %f2280; mov.b32 %r3032, %f2281; mov.b32 %r3031, %f2282; mov.b32 %r3030, %f2283; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2288, %f2289, %f2290, %f2291}, {%r1924, %r1925, %r1926, %r1927}, {%r2729, %r2730}, {%f2288, %f2289, %f2290, %f2291}; // end inline asm mov.b32 %r3029, %f2288; mov.b32 %r3028, %f2289; mov.b32 %r3027, %f2290; mov.b32 %r3026, %f2291; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2296, %f2297, %f2298, %f2299}, {%r1924, %r1925, %r1926, %r1927}, {%r2732, %r2733}, {%f2296, %f2297, %f2298, %f2299}; // end inline asm mov.b32 %r3025, %f2296; mov.b32 %r3024, %f2297; mov.b32 %r3023, %f2298; mov.b32 %r3022, %f2299; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2304, %f2305, %f2306, %f2307}, {%r1924, %r1925, %r1926, %r1927}, {%r2734, %r2735}, {%f2304, %f2305, %f2306, %f2307}; // end inline asm mov.b32 %r3021, %f2304; mov.b32 %r3020, %f2305; mov.b32 %r3019, %f2306; mov.b32 %r3018, %f2307; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2312, %f2313, %f2314, %f2315}, {%r1924, %r1925, %r1926, %r1927}, {%r2737, %r2738}, {%f2312, %f2313, 
%f2314, %f2315}; // end inline asm mov.b32 %r3017, %f2312; mov.b32 %r3016, %f2313; mov.b32 %r3015, %f2314; mov.b32 %r3014, %f2315; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f2320, %f2321, %f2322, %f2323}, {%r1924, %r1925, %r1926, %r1927}, {%r2739, %r2740}, {%f2320, %f2321, %f2322, %f2323}; // end inline asm mov.b32 %r3013, %f2320; mov.b32 %r3012, %f2321; mov.b32 %r3011, %f2322; mov.b32 %r3010, %f2323; setp.gt.s32 %p337, %r2815, 16383; selp.b32 %r2822, -16384, 16384, %p337; add.s32 %r3066, %r2822, %r2815; setp.gt.s32 %p339, %r214, 16383; selp.b32 %r2823, -16384, 16384, %p339; add.s32 %r3064, %r2823, %r214; setp.gt.s32 %p340, %r213, 8191; selp.b32 %r2824, -8192, 8192, %p340; add.s32 %r3062, %r2824, %r213; @%p316 bra $L__BB0_5; $L__BB0_20: setp.equ.ftz.f32 %p341, %f3005, 0f00000000; mov.f32 %f3141, 0f3F800000; mov.f32 %f3140, %f3141; @%p341 bra $L__BB0_22; rcp.approx.ftz.f32 %f3140, %f3005; $L__BB0_22: setp.equ.ftz.f32 %p342, %f3004, 0f00000000; @%p342 bra $L__BB0_24; rcp.approx.ftz.f32 %f3141, %f3004; $L__BB0_24: shl.b32 %r2999, %r16, 4; cvt.s64.s32 %rd165, %r2999; mov.b64 %rd164, fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0; mov.u64 %rd163, %rd164; ld.param.u32 %r2998, [%rd163+44]; ld.param.u32 %r2997, [fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0+40]; mov.b32 %f2954, %r3057; mul.ftz.f32 %f2907, %f3140, %f2954; mov.b32 %f2955, %r3056; mul.ftz.f32 %f2906, %f3140, %f2955; mov.b32 %f2956, %r3055; mul.ftz.f32 %f2909, %f3141, %f2956; mov.b32 %f2957, %r3054; mul.ftz.f32 %f2908, %f3141, %f2957; mov.b32 %f2958, %r3053; mul.ftz.f32 %f2911, %f3140, %f2958; mov.b32 %f2959, %r3052; mul.ftz.f32 %f2910, %f3140, %f2959; mov.b32 %f2960, %r3051; mul.ftz.f32 %f2913, %f3141, %f2960; mov.b32 %f2961, %r3050; mul.ftz.f32 %f2912, %f3141, %f2961; mov.b32 %f2962, %r3049; mul.ftz.f32 %f2915, %f3140, %f2962; mov.b32 %f2963, %r3048; mul.ftz.f32 %f2914, %f3140, 
%f2963; mov.b32 %f2964, %r3047; mul.ftz.f32 %f2917, %f3141, %f2964; mov.b32 %f2965, %r3046; mul.ftz.f32 %f2916, %f3141, %f2965; mov.b32 %f2966, %r3045; mul.ftz.f32 %f2919, %f3140, %f2966; mov.b32 %f2967, %r3044; mul.ftz.f32 %f2918, %f3140, %f2967; mov.b32 %f2968, %r3043; mul.ftz.f32 %f2921, %f3141, %f2968; mov.b32 %f2969, %r3042; mul.ftz.f32 %f2920, %f3141, %f2969; mov.b32 %f2970, %r3041; mul.ftz.f32 %f2923, %f3140, %f2970; mov.b32 %f2971, %r3040; mul.ftz.f32 %f2922, %f3140, %f2971; mov.b32 %f2972, %r3039; mul.ftz.f32 %f2925, %f3141, %f2972; mov.b32 %f2973, %r3038; mul.ftz.f32 %f2924, %f3141, %f2973; mov.b32 %f2974, %r3037; mul.ftz.f32 %f2927, %f3140, %f2974; mov.b32 %f2975, %r3036; mul.ftz.f32 %f2926, %f3140, %f2975; mov.b32 %f2976, %r3035; mul.ftz.f32 %f2929, %f3141, %f2976; mov.b32 %f2977, %r3034; mul.ftz.f32 %f2928, %f3141, %f2977; mov.b32 %f2978, %r3033; mul.ftz.f32 %f2931, %f3140, %f2978; mov.b32 %f2979, %r3032; mul.ftz.f32 %f2930, %f3140, %f2979; mov.b32 %f2980, %r3031; mul.ftz.f32 %f2933, %f3141, %f2980; mov.b32 %f2981, %r3030; mul.ftz.f32 %f2932, %f3141, %f2981; mov.b32 %f2982, %r3029; mul.ftz.f32 %f2935, %f3140, %f2982; mov.b32 %f2983, %r3028; mul.ftz.f32 %f2934, %f3140, %f2983; mov.b32 %f2984, %r3027; mul.ftz.f32 %f2937, %f3141, %f2984; mov.b32 %f2985, %r3026; mul.ftz.f32 %f2936, %f3141, %f2985; mov.b32 %f2986, %r3025; mul.ftz.f32 %f2939, %f3140, %f2986; mov.b32 %f2987, %r3024; mul.ftz.f32 %f2938, %f3140, %f2987; mov.b32 %f2988, %r3023; mul.ftz.f32 %f2941, %f3141, %f2988; mov.b32 %f2989, %r3022; mul.ftz.f32 %f2940, %f3141, %f2989; mov.b32 %f2990, %r3021; mul.ftz.f32 %f2943, %f3140, %f2990; mov.b32 %f2991, %r3020; mul.ftz.f32 %f2942, %f3140, %f2991; mov.b32 %f2992, %r3019; mul.ftz.f32 %f2945, %f3141, %f2992; mov.b32 %f2993, %r3018; mul.ftz.f32 %f2944, %f3141, %f2993; mov.b32 %f2994, %r3017; mul.ftz.f32 %f2947, %f3140, %f2994; mov.b32 %f2995, %r3016; mul.ftz.f32 %f2946, %f3140, %f2995; mov.b32 %f2996, %r3015; mul.ftz.f32 %f2949, %f3141, %f2996; mov.b32 
%f2997, %r3014; mul.ftz.f32 %f2948, %f3141, %f2997; mov.b32 %f2998, %r3013; mul.ftz.f32 %f2951, %f3140, %f2998; mov.b32 %f2999, %r3012; mul.ftz.f32 %f2950, %f3140, %f2999; mov.b32 %f3000, %r3011; mul.ftz.f32 %f2953, %f3141, %f3000; mov.b32 %f3001, %r3010; mul.ftz.f32 %f2952, %f3141, %f3001; // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; // begin inline asm cvt.rn.f16x2.f32 %r2825, %f2906, %f2907; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r2826, %f2908, %f2909; // end inline asm shl.b32 %r2938, %r475, 2; and.b32 %r2939, %r2938, 124; add.s32 %r2941, %r2939, %r619; and.b32 %r2942, %r475, 96; shr.u32 %r2943, %r2942, 1; and.b32 %r2944, %r475, 28; shr.u32 %r2945, %r2944, 2; or.b32 %r2946, %r2943, %r2945; shl.b32 %r2947, %r2946, 8; add.s32 %r2827, %r2941, %r2947; // begin inline asm st.shared.b32 [%r2827], %r2825; // end inline asm add.s32 %r2829, %r2827, 2048; // begin inline asm st.shared.b32 [%r2829], %r2826; // end inline asm xor.b32 %r2833, %r2827, 16; // begin inline asm cvt.rn.f16x2.f32 %r2831, %f2910, %f2911; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r2832, %f2912, %f2913; // end inline asm // begin inline asm st.shared.b32 [%r2833], %r2831; // end inline asm add.s32 %r2835, %r2833, 2048; // begin inline asm st.shared.b32 [%r2835], %r2832; // end inline asm xor.b32 %r2839, %r2827, 32; // begin inline asm cvt.rn.f16x2.f32 %r2837, %f2914, %f2915; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r2838, %f2916, %f2917; // end inline asm // begin inline asm st.shared.b32 [%r2839], %r2837; // end inline asm add.s32 %r2841, %r2839, 2048; // begin inline asm st.shared.b32 [%r2841], %r2838; // end inline asm xor.b32 %r2845, %r2827, 48; // begin inline asm cvt.rn.f16x2.f32 %r2843, %f2918, %f2919; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r2844, %f2920, %f2921; // end inline asm // begin inline asm st.shared.b32 [%r2845], %r2843; // end inline asm add.s32 %r2847, %r2845, 2048; // begin inline asm 
st.shared.b32 [%r2847], %r2844; // end inline asm xor.b32 %r2851, %r2827, 64; // begin inline asm cvt.rn.f16x2.f32 %r2849, %f2922, %f2923; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r2850, %f2924, %f2925; // end inline asm // begin inline asm st.shared.b32 [%r2851], %r2849; // end inline asm add.s32 %r2853, %r2851, 2048; // begin inline asm st.shared.b32 [%r2853], %r2850; // end inline asm xor.b32 %r2857, %r2827, 80; // begin inline asm cvt.rn.f16x2.f32 %r2855, %f2926, %f2927; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r2856, %f2928, %f2929; // end inline asm // begin inline asm st.shared.b32 [%r2857], %r2855; // end inline asm add.s32 %r2859, %r2857, 2048; // begin inline asm st.shared.b32 [%r2859], %r2856; // end inline asm xor.b32 %r2863, %r2827, 96; // begin inline asm cvt.rn.f16x2.f32 %r2861, %f2930, %f2931; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r2862, %f2932, %f2933; // end inline asm // begin inline asm st.shared.b32 [%r2863], %r2861; // end inline asm add.s32 %r2865, %r2863, 2048; // begin inline asm st.shared.b32 [%r2865], %r2862; // end inline asm xor.b32 %r2869, %r2827, 112; // begin inline asm cvt.rn.f16x2.f32 %r2867, %f2934, %f2935; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r2868, %f2936, %f2937; // end inline asm // begin inline asm st.shared.b32 [%r2869], %r2867; // end inline asm add.s32 %r2871, %r2869, 2048; // begin inline asm st.shared.b32 [%r2871], %r2868; // end inline asm xor.b32 %r2875, %r2827, 128; // begin inline asm cvt.rn.f16x2.f32 %r2873, %f2938, %f2939; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r2874, %f2940, %f2941; // end inline asm // begin inline asm st.shared.b32 [%r2875], %r2873; // end inline asm add.s32 %r2877, %r2875, 2048; // begin inline asm st.shared.b32 [%r2877], %r2874; // end inline asm xor.b32 %r2881, %r2827, 144; // begin inline asm cvt.rn.f16x2.f32 %r2879, %f2942, %f2943; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r2880, %f2944, 
%f2945; // end inline asm // begin inline asm st.shared.b32 [%r2881], %r2879; // end inline asm add.s32 %r2883, %r2881, 2048; // begin inline asm st.shared.b32 [%r2883], %r2880; // end inline asm xor.b32 %r2887, %r2827, 160; // begin inline asm cvt.rn.f16x2.f32 %r2885, %f2946, %f2947; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r2886, %f2948, %f2949; // end inline asm // begin inline asm st.shared.b32 [%r2887], %r2885; // end inline asm add.s32 %r2889, %r2887, 2048; // begin inline asm st.shared.b32 [%r2889], %r2886; // end inline asm xor.b32 %r2893, %r2827, 176; // begin inline asm cvt.rn.f16x2.f32 %r2891, %f2950, %f2951; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r2892, %f2952, %f2953; // end inline asm // begin inline asm st.shared.b32 [%r2893], %r2891; // end inline asm add.s32 %r2895, %r2893, 2048; // begin inline asm st.shared.b32 [%r2895], %r2892; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r2897, %r2898, %r2899, %r2900}, [%r27]; // end inline asm add.s32 %r2906, %r27, 2048; // begin inline asm ld.shared.v4.b32 {%r2902, %r2903, %r2904, %r2905}, [%r2906]; // end inline asm add.s32 %r2911, %r27, 4096; // begin inline asm ld.shared.v4.b32 {%r2907, %r2908, %r2909, %r2910}, [%r2911]; // end inline asm add.s32 %r2916, %r27, 6144; // begin inline asm ld.shared.v4.b32 {%r2912, %r2913, %r2914, %r2915}, [%r2916]; // end inline asm add.s32 %r2921, %r27, 8192; // begin inline asm ld.shared.v4.b32 {%r2917, %r2918, %r2919, %r2920}, [%r2921]; // end inline asm add.s32 %r2926, %r27, 10240; // begin inline asm ld.shared.v4.b32 {%r2922, %r2923, %r2924, %r2925}, [%r2926]; // end inline asm add.s32 %r2931, %r27, 12288; // begin inline asm ld.shared.v4.b32 {%r2927, %r2928, %r2929, %r2930}, [%r2931]; // end inline asm add.s32 %r2936, %r27, 14336; // begin inline asm ld.shared.v4.b32 {%r2932, %r2933, %r2934, %r2935}, [%r2936]; // end inline asm mul.lo.s32 %r2952, %r2998, %r478; shl.b32 %r2953, %r2952, 1; cvt.s64.s32 %rd122, %r2953; 
add.s64 %rd35, %rd122, %rd165; cvt.u32.u64 %r2954, %rd14; setp.ge.s32 %p343, %r2954, %r2997; @%p343 bra $L__BB0_47; shl.b32 %r3001, %r16, 4; cvt.s64.s32 %rd168, %r3001; mov.b64 %rd167, fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0; mov.u64 %rd166, %rd167; ld.param.u32 %r3000, [%rd166+44]; cvt.u32.u64 %r2955, %rd168; shl.b32 %r2956, %r3000, 1; setp.ge.s32 %p344, %r2955, %r2956; @%p344 bra $L__BB0_27; mul.lo.s64 %rd123, %rd12, %rd14; add.s64 %rd124, %rd35, %rd123; cvta.to.global.u64 %rd125, %rd13; add.s64 %rd126, %rd125, %rd124; st.global.v4.u32 [%rd126], {%r2897, %r2898, %r2899, %r2900}; $L__BB0_27: ld.param.u32 %r3002, [fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0+40]; add.s32 %r2958, %r2954, 8; setp.ge.s32 %p345, %r2958, %r3002; @%p345 bra $L__BB0_47; @%p344 bra $L__BB0_30; add.s64 %rd127, %rd14, 8; mul.lo.s64 %rd128, %rd127, %rd12; add.s64 %rd129, %rd35, %rd128; cvta.to.global.u64 %rd130, %rd13; add.s64 %rd131, %rd130, %rd129; st.global.v4.u32 [%rd131], {%r2902, %r2903, %r2904, %r2905}; $L__BB0_30: ld.param.u32 %r3003, [fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0+40]; add.s32 %r2962, %r2954, 16; setp.ge.s32 %p347, %r2962, %r3003; @%p347 bra $L__BB0_47; @%p344 bra $L__BB0_33; add.s64 %rd132, %rd14, 16; mul.lo.s64 %rd133, %rd132, %rd12; add.s64 %rd134, %rd35, %rd133; cvta.to.global.u64 %rd135, %rd13; add.s64 %rd136, %rd135, %rd134; st.global.v4.u32 [%rd136], {%r2907, %r2908, %r2909, %r2910}; $L__BB0_33: ld.param.u32 %r3004, [fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0+40]; add.s32 %r2966, %r2954, 24; setp.ge.s32 %p349, %r2966, %r3004; @%p349 bra $L__BB0_47; @%p344 bra $L__BB0_36; add.s64 %rd137, %rd14, 24; mul.lo.s64 %rd138, %rd137, %rd12; add.s64 %rd139, %rd35, %rd138; cvta.to.global.u64 %rd140, %rd13; add.s64 %rd141, %rd140, %rd139; st.global.v4.u32 
[%rd141], {%r2912, %r2913, %r2914, %r2915}; $L__BB0_36: ld.param.u32 %r3005, [fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0+40]; add.s32 %r2970, %r2954, 32; setp.ge.s32 %p351, %r2970, %r3005; @%p351 bra $L__BB0_47; @%p344 bra $L__BB0_39; add.s64 %rd142, %rd14, 32; mul.lo.s64 %rd143, %rd142, %rd12; add.s64 %rd144, %rd35, %rd143; cvta.to.global.u64 %rd145, %rd13; add.s64 %rd146, %rd145, %rd144; st.global.v4.u32 [%rd146], {%r2917, %r2918, %r2919, %r2920}; $L__BB0_39: ld.param.u32 %r3006, [fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0+40]; add.s32 %r2974, %r2954, 40; setp.ge.s32 %p353, %r2974, %r3006; @%p353 bra $L__BB0_47; @%p344 bra $L__BB0_42; add.s64 %rd147, %rd14, 40; mul.lo.s64 %rd148, %rd147, %rd12; add.s64 %rd149, %rd35, %rd148; cvta.to.global.u64 %rd150, %rd13; add.s64 %rd151, %rd150, %rd149; st.global.v4.u32 [%rd151], {%r2922, %r2923, %r2924, %r2925}; $L__BB0_42: ld.param.u32 %r3007, [fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0+40]; add.s32 %r2978, %r2954, 48; setp.ge.s32 %p355, %r2978, %r3007; @%p355 bra $L__BB0_47; @%p344 bra $L__BB0_45; add.s64 %rd152, %rd14, 48; mul.lo.s64 %rd153, %rd152, %rd12; add.s64 %rd154, %rd35, %rd153; cvta.to.global.u64 %rd155, %rd13; add.s64 %rd156, %rd155, %rd154; st.global.v4.u32 [%rd156], {%r2927, %r2928, %r2929, %r2930}; $L__BB0_45: ld.param.u32 %r3008, [fmha_v2_flash_attention_fp16_fp32_64_128_S_96_sliding_window_causal_sm86_kernel_nl_tiled_param_0+40]; add.s32 %r2984, %r2954, 56; setp.ge.s32 %p357, %r2984, %r3008; or.pred %p359, %p357, %p344; @%p359 bra $L__BB0_47; add.s64 %rd157, %rd14, 56; mul.lo.s64 %rd158, %rd157, %rd12; add.s64 %rd159, %rd35, %rd158; cvta.to.global.u64 %rd160, %rd13; add.s64 %rd161, %rd160, %rd159; st.global.v4.u32 [%rd161], {%r2932, %r2933, %r2934, %r2935}; $L__BB0_47: ret; }